diff options
author | Erlang/OTP <[email protected]> | 2009-11-20 14:54:40 +0000 |
---|---|---|
committer | Erlang/OTP <[email protected]> | 2009-11-20 14:54:40 +0000 |
commit | 84adefa331c4159d432d22840663c38f155cd4c1 (patch) | |
tree | bff9a9c66adda4df2106dfd0e5c053ab182a12bd /lib/mnesia/src | |
download | otp-84adefa331c4159d432d22840663c38f155cd4c1.tar.gz otp-84adefa331c4159d432d22840663c38f155cd4c1.tar.bz2 otp-84adefa331c4159d432d22840663c38f155cd4c1.zip |
The R13B03 release.OTP_R13B03
Diffstat (limited to 'lib/mnesia/src')
33 files changed, 24961 insertions, 0 deletions
diff --git a/lib/mnesia/src/Makefile b/lib/mnesia/src/Makefile new file mode 100644 index 0000000000..e032f563fa --- /dev/null +++ b/lib/mnesia/src/Makefile @@ -0,0 +1,139 @@ +# +# %CopyrightBegin% +# +# Copyright Ericsson AB 1996-2009. All Rights Reserved. +# +# The contents of this file are subject to the Erlang Public License, +# Version 1.1, (the "License"); you may not use this file except in +# compliance with the License. You should have received a copy of the +# Erlang Public License along with this software. If not, it can be +# retrieved online at http://www.erlang.org/. +# +# Software distributed under the License is distributed on an "AS IS" +# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +# the License for the specific language governing rights and limitations +# under the License. +# +# %CopyrightEnd% +# + +# +include $(ERL_TOP)/make/target.mk + +ifeq ($(TYPE),debug) +ERL_COMPILE_FLAGS += -Ddebug -W +endif + +include $(ERL_TOP)/make/$(TARGET)/otp.mk + +# ---------------------------------------------------- +# Application version +# ---------------------------------------------------- +include ../vsn.mk +VSN=$(MNESIA_VSN) + +# ---------------------------------------------------- +# Release directory specification +# ---------------------------------------------------- +RELSYSDIR = $(RELEASE_PATH)/lib/mnesia-$(VSN) + +# ---------------------------------------------------- +# Target Specs +# ---------------------------------------------------- +MODULES= \ + mnesia \ + mnesia_backup \ + mnesia_bup \ + mnesia_checkpoint \ + mnesia_checkpoint_sup \ + mnesia_controller \ + mnesia_dumper\ + mnesia_event \ + mnesia_frag \ + mnesia_frag_hash \ + mnesia_frag_old_hash \ + mnesia_index \ + mnesia_kernel_sup \ + mnesia_late_loader \ + mnesia_lib\ + mnesia_loader \ + mnesia_locker \ + mnesia_log \ + mnesia_monitor \ + mnesia_recover \ + mnesia_registry \ + mnesia_schema\ + mnesia_snmp_hook \ + mnesia_snmp_sup \ + mnesia_subscr \ + mnesia_sup \ 
+ mnesia_sp \ + mnesia_text \ + mnesia_tm + +HRL_FILES= mnesia.hrl + +ERL_FILES= $(MODULES:%=%.erl) + +TARGET_FILES= $(MODULES:%=$(EBIN)/%.$(EMULATOR)) $(APP_TARGET) $(APPUP_TARGET) + +APP_FILE= mnesia.app + +APP_SRC= $(APP_FILE).src +APP_TARGET= $(EBIN)/$(APP_FILE) + +APPUP_FILE= mnesia.appup + +APPUP_SRC= $(APPUP_FILE).src +APPUP_TARGET= $(EBIN)/$(APPUP_FILE) + + + +# ---------------------------------------------------- +# FLAGS +# ---------------------------------------------------- +ERL_COMPILE_FLAGS += \ + +warn_unused_vars \ + +'{parse_transform,sys_pre_attributes}' \ + +'{attribute,insert,vsn,"mnesia_$(MNESIA_VSN)"}' \ + -W + +# ---------------------------------------------------- +# Targets +# ---------------------------------------------------- + +opt: $(TARGET_FILES) + +debug: + @${MAKE} TYPE=debug + +clean: + rm -f $(TARGET_FILES) + rm -f core + +docs: + +# ---------------------------------------------------- +# Special Build Targets +# ---------------------------------------------------- + +$(APP_TARGET): $(APP_SRC) ../vsn.mk + sed -e 's;%VSN%;$(VSN);' $< > $@ + +$(APPUP_TARGET): $(APPUP_SRC) ../vsn.mk + sed -e 's;%VSN%;$(VSN);' $< > $@ + + +# ---------------------------------------------------- +# Release Target +# ---------------------------------------------------- +include $(ERL_TOP)/make/otp_release_targets.mk + +release_spec: opt + $(INSTALL_DIR) $(RELSYSDIR)/src + $(INSTALL_DATA) $(HRL_FILES) $(ERL_FILES) $(RELSYSDIR)/src + $(INSTALL_DIR) $(RELSYSDIR)/ebin + $(INSTALL_DATA) $(TARGET_FILES) $(RELSYSDIR)/ebin + +release_docs_spec: + diff --git a/lib/mnesia/src/mnesia.app.src b/lib/mnesia/src/mnesia.app.src new file mode 100644 index 0000000000..3715488ec2 --- /dev/null +++ b/lib/mnesia/src/mnesia.app.src @@ -0,0 +1,52 @@ +{application, mnesia, + [{description, "MNESIA CXC 138 12"}, + {vsn, "%VSN%"}, + {modules, [ + mnesia, + mnesia_backup, + mnesia_bup, + mnesia_checkpoint, + mnesia_checkpoint_sup, + mnesia_controller, + mnesia_dumper, + 
mnesia_event, + mnesia_frag, + mnesia_frag_hash, + mnesia_frag_old_hash, + mnesia_index, + mnesia_kernel_sup, + mnesia_late_loader, + mnesia_lib, + mnesia_loader, + mnesia_locker, + mnesia_log, + mnesia_monitor, + mnesia_recover, + mnesia_registry, + mnesia_schema, + mnesia_snmp_hook, + mnesia_snmp_sup, + mnesia_subscr, + mnesia_sup, + mnesia_sp, + mnesia_text, + mnesia_tm + ]}, + {registered, [ + mnesia_dumper_load_regulator, + mnesia_event, + mnesia_fallback, + mnesia_controller, + mnesia_kernel_sup, + mnesia_late_loader, + mnesia_locker, + mnesia_monitor, + mnesia_recover, + mnesia_substr, + mnesia_sup, + mnesia_tm + ]}, + {applications, [kernel, stdlib]}, + {mod, {mnesia_sup, []}}]}. + + diff --git a/lib/mnesia/src/mnesia.appup.src b/lib/mnesia/src/mnesia.appup.src new file mode 100644 index 0000000000..cad63bf8df --- /dev/null +++ b/lib/mnesia/src/mnesia.appup.src @@ -0,0 +1,37 @@ +%% -*- erlang -*- +{"%VSN%", + [ + {"4.4.11", + [ + {update, mnesia_locker, soft, soft_purge, soft_purge, []}, + {update, mnesia_controller, soft, soft_purge, soft_purge, []} + ] + }, + {"4.4.10", + [ + {update, mnesia_locker, soft, soft_purge, soft_purge, []}, + {update, mnesia_controller, soft, soft_purge, soft_purge, []} + ] + }, + {"4.4.9", [{restart_application, mnesia}]}, + {"4.4.8", [{restart_application, mnesia}]}, + {"4.4.7", [{restart_application, mnesia}]} + ], + [ + {"4.4.11", + [ + {update, mnesia_locker, soft, soft_purge, soft_purge, []}, + {update, mnesia_controller, soft, soft_purge, soft_purge, []} + ] + }, + {"4.4.10", + [ + {update, mnesia_locker, soft, soft_purge, soft_purge, []}, + {update, mnesia_controller, soft, soft_purge, soft_purge, []} + ] + }, + {"4.4.9", [{restart_application, mnesia}]}, + {"4.4.8", [{restart_application, mnesia}]}, + {"4.4.7", [{restart_application, mnesia}]} + ] +}. 
diff --git a/lib/mnesia/src/mnesia.erl b/lib/mnesia/src/mnesia.erl new file mode 100644 index 0000000000..9a630f18eb --- /dev/null +++ b/lib/mnesia/src/mnesia.erl @@ -0,0 +1,2883 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 1996-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% +%% This module exports the public interface of the Mnesia DBMS engine + +-module(mnesia). +%-behaviour(mnesia_access). 
+ +-export([ + %% Start, stop and debugging + start/0, start/1, stop/0, % Not for public use + set_debug_level/1, lkill/0, kill/0, % Not for public use + ms/0, + change_config/2, + + %% Activity mgt + abort/1, transaction/1, transaction/2, transaction/3, + sync_transaction/1, sync_transaction/2, sync_transaction/3, + async_dirty/1, async_dirty/2, sync_dirty/1, sync_dirty/2, ets/1, ets/2, + activity/2, activity/3, activity/4, % Not for public use + is_transaction/0, + + %% Access within an activity - Lock acquisition + lock/2, lock/4, + read_lock_table/1, + write_lock_table/1, + + %% Access within an activity - Updates + write/1, s_write/1, write/3, write/5, + delete/1, s_delete/1, delete/3, delete/5, + delete_object/1, s_delete_object/1, delete_object/3, delete_object/5, + + %% Access within an activity - Reads + read/1, read/2, wread/1, read/3, read/5, + match_object/1, match_object/3, match_object/5, + select/1,select/2,select/3,select/4,select/5,select/6, + all_keys/1, all_keys/4, + index_match_object/2, index_match_object/4, index_match_object/6, + index_read/3, index_read/6, + first/1, next/2, last/1, prev/2, + first/3, next/4, last/3, prev/4, + + %% Iterators within an activity + foldl/3, foldl/4, foldr/3, foldr/4, + + %% Dirty access regardless of activities - Updates + dirty_write/1, dirty_write/2, + dirty_delete/1, dirty_delete/2, + dirty_delete_object/1, dirty_delete_object/2, + dirty_update_counter/2, dirty_update_counter/3, + + %% Dirty access regardless of activities - Read + dirty_read/1, dirty_read/2, + dirty_select/2, + dirty_match_object/1, dirty_match_object/2, dirty_all_keys/1, + dirty_index_match_object/2, dirty_index_match_object/3, + dirty_index_read/3, dirty_slot/2, + dirty_first/1, dirty_next/2, dirty_last/1, dirty_prev/2, + + %% Info + table_info/2, table_info/4, schema/0, schema/1, + error_description/1, info/0, system_info/1, + system_info/0, % Not for public use + + %% Database mgt + create_schema/1, delete_schema/1, + backup/1, 
backup/2, traverse_backup/4, traverse_backup/6, + install_fallback/1, install_fallback/2, + uninstall_fallback/0, uninstall_fallback/1, + activate_checkpoint/1, deactivate_checkpoint/1, + backup_checkpoint/2, backup_checkpoint/3, restore/2, + + %% Table mgt + create_table/1, create_table/2, delete_table/1, + add_table_copy/3, del_table_copy/2, move_table_copy/3, + add_table_index/2, del_table_index/2, + transform_table/3, transform_table/4, + change_table_copy_type/3, + read_table_property/2, write_table_property/2, delete_table_property/2, + change_table_frag/2, + clear_table/1, clear_table/4, + + %% Table load + dump_tables/1, wait_for_tables/2, force_load_table/1, + change_table_access_mode/2, change_table_load_order/2, + set_master_nodes/1, set_master_nodes/2, + + %% Misc admin + dump_log/0, subscribe/1, unsubscribe/1, report_event/1, + + %% Snmp + snmp_open_table/2, snmp_close_table/1, + snmp_get_row/2, snmp_get_next_index/2, snmp_get_mnesia_key/2, + + %% Textfile access + load_textfile/1, dump_to_textfile/1, + + %% QLC functions + table/1, table/2, + + %% Mnemosyne exclusive + get_activity_id/0, put_activity_id/1, % Not for public use + + %% Mnesia internal functions + dirty_rpc/4, % Not for public use + has_var/1, fun_select/7, fun_select/10, select_cont/3, dirty_sel_init/5, + foldl/6, foldr/6, + + %% Module internal callback functions + raw_table_info/2, % Not for public use + remote_dirty_match_object/2, % Not for public use + remote_dirty_select/2 % Not for public use + ]). + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-include("mnesia.hrl"). +-import(mnesia_lib, [verbose/2]). + +-define(DEFAULT_ACCESS, ?MODULE). + +%% Select +-define(PATTERN_TO_OBJECT_MATCH_SPEC(Pat), [{Pat,[],['$_']}]). +-define(PATTERN_TO_BINDINGS_MATCH_SPEC(Pat), [{Pat,[],['$$']}]). 
%% Local function in order to avoid external function call.
%% Looks Var up through ?catch_val; an 'EXIT' result is handed on to
%% mnesia_lib:other_val/2 together with its reason.
val(Var) ->
    case ?catch_val(Var) of
        {'EXIT', Reason} -> mnesia_lib:other_val(Var, Reason);
        Value -> Value
    end.

%% True only for atoms spelled as '$' followed by one or more decimal
%% digits ('$1', '$42', ...).  A bare '$' is an ordinary atom and not a
%% match variable, so at least one digit is required.
is_dollar_digits(Var) ->
    case atom_to_list(Var) of
        [$$, Dig | Digs] -> is_digits([Dig | Digs]);
        _ -> false
    end.

%% True when every character of the list is a decimal digit
%% (vacuously true for the empty list).
is_digits([Dig | Tail]) when $0 =< Dig, Dig =< $9 ->
    is_digits(Tail);
is_digits([_ | _]) ->
    false;
is_digits([]) ->
    true.

%% has_var(Term) -> boolean()
%% Tells whether Term contains a match variable: the atom '_' or an
%% atom of the form '$<digits>', looked for at any depth inside tuples
%% and lists.  Every other term yields false.
has_var('_') ->
    true;
has_var(X) when is_atom(X) ->
    is_dollar_digits(X);
has_var(X) when is_tuple(X) ->
    e_has_var(X, tuple_size(X));
has_var([H | T]) ->
    case has_var(H) of
        false -> has_var(T);
        Other -> Other
    end;
has_var(_) ->
    false.

%% Scans the elements of tuple X from position Pos down to 1.
e_has_var(_, 0) ->
    false;
e_has_var(X, Pos) ->
    case has_var(element(Pos, X)) of
        false -> e_has_var(X, Pos - 1);
        Other -> Other
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Start and stop

%% Starts the application as temporary and reports (verbosely) how many
%% whole seconds the start took.  An already running application counts
%% as success.
start() ->
    {Time, Res} = timer:tc(application, start, [?APPLICATION, temporary]),

    %% timer:tc/3 reports elapsed time in microseconds
    Secs = Time div 1000000,
    case Res of
        ok ->
            verbose("Mnesia started, ~p seconds~n", [Secs]),
            ok;
        {error, {already_started, mnesia}} ->
            verbose("Mnesia already started, ~p seconds~n", [Secs]),
            ok;
        {error, R} ->
            verbose("Mnesia failed to start, ~p seconds: ~p~n", [Secs, R]),
            {error, R}
    end.

%% start(ExtraEnv): ExtraEnv must be a list of {EnvAtom, Value} pairs.
%% Each pair is applied with mnesia_monitor:patch_env/2 before the
%% ordinary start/0 is run.  Anything that is not a list is rejected
%% with {error, {badarg, ExtraEnv}}.
start(ExtraEnv) when is_list(ExtraEnv) ->
    case mnesia_lib:ensure_loaded(?APPLICATION) of
        ok ->
            patched_start(ExtraEnv);
        Error ->
            Error
    end;
start(ExtraEnv) ->
    {error, {badarg, ExtraEnv}}.

%% Applies the environment patches one by one; stops at the first
%% failing patch or malformed entry, otherwise ends by calling start/0.
patched_start([{Env, Val} | Tail]) when is_atom(Env) ->
    case mnesia_monitor:patch_env(Env, Val) of
        {error, Reason} ->
            {error, Reason};
        _NewVal ->
            patched_start(Tail)
    end;
patched_start([Head | _]) ->
    {error, {bad_type, Head}};
patched_start([]) ->
    start().

%% Stops the application.  Both a successful stop and "was not started"
%% are reported as stopped; any other result is returned unchanged.
stop() ->
    case application:stop(?APPLICATION) of
        ok -> stopped;
        {error, {not_started, ?APPLICATION}} -> stopped;
        Other -> Other
    end.
%% change_config(Key, Value)
%% Supported keys:
%%   extra_db_nodes - Value is a list of nodes, handed to
%%                    mnesia_controller:connect_nodes/1.
%%   dc_dump_limit  - Value is a positive number; it is stored through
%%                    mnesia_lib:set/2, but only while mnesia_lib:is_running()
%%                    answers yes.
%% Every other key/value combination yields {error, {badarg, Key}}.
change_config(extra_db_nodes, Ns) when is_list(Ns) ->
    mnesia_controller:connect_nodes(Ns);
change_config(dc_dump_limit, N) when is_number(N), N > 0 ->
    case mnesia_lib:is_running() of
        yes ->
            mnesia_lib:set(dc_dump_limit, N),
            {ok, N};
        _ ->
            {error, {not_started, ?APPLICATION}}
    end;
change_config(BadKey, _BadVal) ->
    {error, {badarg, BadKey}}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Debugging

%% Delegates to mnesia_subscr:set_debug_level/1.
set_debug_level(Level) ->
    mnesia_subscr:set_debug_level(Level).

%% Local kill: delegates to mnesia_sup:kill/0 on this node.
lkill() ->
    mnesia_sup:kill().

%% Runs mnesia_sup:kill/0 through rpc:multicall/3 and returns its result.
kill() ->
    rpc:multicall(mnesia_sup, kill, []).

%% Returns the list of module names shown below.
ms() ->
    [
     mnesia,
     mnesia_backup,
     mnesia_bup,
     mnesia_checkpoint,
     mnesia_checkpoint_sup,
     mnesia_controller,
     mnesia_dumper,
     mnesia_loader,
     mnesia_frag,
     mnesia_frag_hash,
     mnesia_frag_old_hash,
     mnesia_index,
     mnesia_kernel_sup,
     mnesia_late_loader,
     mnesia_lib,
     mnesia_log,
     mnesia_registry,
     mnesia_schema,
     mnesia_snmp_hook,
     mnesia_snmp_sup,
     mnesia_subscr,
     mnesia_sup,
     mnesia_text,
     mnesia_tm,
     mnesia_recover,
     mnesia_locker,

     %% Keep these last in the list, so
     %% mnesia_sup kills these last
     mnesia_monitor,
     mnesia_event
    ].


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Activity mgt

%% Written in the function-style spec form; the older
%% "-spec(Name/Arity :: Type)" form is not accepted by current compilers.
-spec abort(_) -> no_return().

%% Terminates the calling process with exit reason {aborted, Reason}.
abort(Reason) ->
    exit({aborted, Reason}).

%% True when the process dictionary holds an activity state whose second
%% element is a tuple tagged tid; false in every other case (including
%% when no activity state is stored at all).
is_transaction() ->
    case get(mnesia_activity_state) of
        {_, Tid, _Ts} when element(1, Tid) == tid ->
            true;
        _ ->
            false
    end.

%% transaction(Fun): no arguments, unlimited retries, default access
%% module, async kind.
transaction(Fun) ->
    transaction(get(mnesia_activity_state), Fun, [], infinity, ?DEFAULT_ACCESS, async).
%% transaction/2: the second argument is taken as a retry budget when it
%% is a non-negative integer or the atom infinity; anything else is taken
%% as the argument list for Fun (with unlimited retries).
transaction(Fun, Retries) when is_integer(Retries), Retries >= 0;
                               Retries == infinity ->
    State = get(mnesia_activity_state),
    transaction(State, Fun, [], Retries, ?DEFAULT_ACCESS, async);
transaction(Fun, Args) ->
    State = get(mnesia_activity_state),
    transaction(State, Fun, Args, infinity, ?DEFAULT_ACCESS, async).

%% transaction/3: explicit argument list and retry budget.
transaction(Fun, Args, Retries) ->
    State = get(mnesia_activity_state),
    transaction(State, Fun, Args, Retries, ?DEFAULT_ACCESS, async).

%% The sync_transaction family mirrors transaction/1,2,3 but passes the
%% kind sync instead of async.
sync_transaction(Fun) ->
    State = get(mnesia_activity_state),
    transaction(State, Fun, [], infinity, ?DEFAULT_ACCESS, sync).

sync_transaction(Fun, Retries) when is_integer(Retries), Retries >= 0;
                                    Retries == infinity ->
    State = get(mnesia_activity_state),
    transaction(State, Fun, [], Retries, ?DEFAULT_ACCESS, sync);
sync_transaction(Fun, Args) ->
    State = get(mnesia_activity_state),
    transaction(State, Fun, Args, infinity, ?DEFAULT_ACCESS, sync).

sync_transaction(Fun, Args, Retries) ->
    State = get(mnesia_activity_state),
    transaction(State, Fun, Args, Retries, ?DEFAULT_ACCESS, sync).


%% transaction/6: validates the arguments and delegates to
%% mnesia_tm:transaction/6.  Fun must be a fun, Args a list, Mod an atom
%% and Retries either infinity or a non-negative integer; otherwise the
%% call answers {aborted, {badarg, Fun, Args, Retries, Mod}} without
%% running anything.
transaction(State, Fun, Args, Retries, Mod, Kind)
  when is_function(Fun), is_list(Args), is_atom(Mod),
       (Retries == infinity orelse
        (is_integer(Retries) andalso Retries >= 0)) ->
    mnesia_tm:transaction(State, Fun, Args, Retries, Mod, Kind);
transaction(_State, Fun, Args, Retries, Mod, _Kind) ->
    {aborted, {badarg, Fun, Args, Retries, Mod}}.

%% non_transaction/5: validates Fun, Args and Mod and delegates to
%% mnesia_tm:non_transaction/5; bad arguments answer
%% {aborted, {badarg, Fun, Args}}.
non_transaction(State, Fun, Args, ActivityKind, Mod)
  when is_function(Fun), is_list(Args), is_atom(Mod) ->
    mnesia_tm:non_transaction(State, Fun, Args, ActivityKind, Mod);
non_transaction(_State, Fun, Args, _ActivityKind, _Mod) ->
    {aborted, {badarg, Fun, Args}}.
%% Runs Fun (optionally applied to Args) as an async_dirty activity with
%% the default access module.
async_dirty(Fun) ->
    async_dirty(Fun, []).
async_dirty(Fun, Args) ->
    State = get(mnesia_activity_state),
    non_transaction(State, Fun, Args, async_dirty, ?DEFAULT_ACCESS).

%% Same as async_dirty, but with activity kind sync_dirty.
sync_dirty(Fun) ->
    sync_dirty(Fun, []).
sync_dirty(Fun, Args) ->
    State = get(mnesia_activity_state),
    non_transaction(State, Fun, Args, sync_dirty, ?DEFAULT_ACCESS).

%% Same as async_dirty, but with activity kind ets.
ets(Fun) ->
    ets(Fun, []).
ets(Fun, Args) ->
    State = get(mnesia_activity_state),
    non_transaction(State, Fun, Args, ets, ?DEFAULT_ACCESS).

%% activity/2,3: a list as third argument is the argument list for Fun
%% (the access module is then read from the access_module environment
%% value); anything else is taken as the access module itself.
activity(Kind, Fun) ->
    activity(Kind, Fun, []).
activity(Kind, Fun, Args) when is_list(Args) ->
    activity(Kind, Fun, Args, mnesia_monitor:get_env(access_module));
activity(Kind, Fun, Mod) ->
    activity(Kind, Fun, [], Mod).

%% activity/4: dispatches on Kind.  The dirty/ets kinds go through
%% non_transaction/5; the transaction kinds go through wrap_trans/6,
%% with unlimited retries unless Kind carries an explicit retry count.
%% An unknown Kind answers {aborted, {bad_type, Kind}}.
activity(Kind, Fun, Args, Mod) ->
    State = get(mnesia_activity_state),
    case Kind of
        Dirty when Dirty == ets; Dirty == async_dirty; Dirty == sync_dirty ->
            non_transaction(State, Fun, Args, Kind, Mod);
        transaction ->
            wrap_trans(State, Fun, Args, infinity, Mod, async);
        sync_transaction ->
            wrap_trans(State, Fun, Args, infinity, Mod, sync);
        {transaction, Retries} ->
            wrap_trans(State, Fun, Args, Retries, Mod, async);
        {sync_transaction, Retries} ->
            wrap_trans(State, Fun, Args, Retries, Mod, sync);
        _ ->
            {aborted, {bad_type, Kind}}
    end.

%% Runs the transaction and unwraps {atomic, Result} to Result; any
%% other outcome becomes the exit reason of the calling process.
wrap_trans(State, Fun, Args, Retries, Mod, Kind) ->
    Outcome = transaction(State, Fun, Args, Retries, Mod, Kind),
    case Outcome of
        {atomic, GoodRes} -> GoodRes;
        _ -> exit(Outcome)
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Access within an activity - lock acquisition

%% Grab a lock on an item in the global lock table.
%% Item may be any term. Lock may be write or read.
%% A write lock is set on all the given nodes;
%% a read lock is only set on the first node.
%% Nodes may either be a list of nodes or one node as an atom.
%% Mnesia on all Nodes must be connected to each other, but
%% it is not necessary that they are up and running.
%% lock/2: resolves the current activity from the process dictionary and
%% forwards to lock/4 of the active access module; aborts with
%% no_transaction when no activity state is stored.
lock(LockItem, LockKind) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            lock(Tid, Ts, LockItem, LockKind);
        {Mod, Tid, Ts} ->
            Mod:lock(Tid, Ts, LockItem, LockKind);
        _ ->
            abort(no_transaction)
    end.

%% lock/4: inside a transaction (Tid tagged tid) the lock item selects
%% record, table or global locking; an unknown item aborts with
%% {bad_type, LockItem}.  For any other activity tag nothing is locked
%% and [] is returned.
lock(Tid, Ts, LockItem, LockKind) ->
    case {element(1, Tid), LockItem} of
        {tid, {record, Tab, Key}} ->
            lock_record(Tid, Ts, Tab, Key, LockKind);
        {tid, {table, Tab}} ->
            lock_table(Tid, Ts, Tab, LockKind);
        {tid, {global, GlobalKey, Nodes}} ->
            global_lock(Tid, Ts, GlobalKey, LockKind, Nodes);
        {tid, _} ->
            abort({bad_type, LockItem});
        {_Protocol, _} ->
            []
    end.

%% Grab a read lock on a whole table
read_lock_table(Tab) ->
    lock({table, Tab}, read),
    ok.

%% Grab a write lock on a whole table
write_lock_table(Tab) ->
    lock({table, Tab}, write),
    ok.

%% Locks the single record {Tab, Key} with the requested lock kind.
%% The kind none takes no lock and returns []; an unknown kind or a
%% non-atom table name aborts with bad_type.
lock_record(Tid, Ts, Tab, Key, LockKind) when is_atom(Tab) ->
    Store = Ts#tidstore.store,
    Oid = {Tab, Key},
    case LockKind of
        none ->
            [];
        read ->
            mnesia_locker:rlock(Tid, Store, Oid);
        write ->
            mnesia_locker:wlock(Tid, Store, Oid);
        sticky_write ->
            mnesia_locker:sticky_wlock(Tid, Store, Oid);
        _ ->
            abort({bad_type, Tab, LockKind})
    end;
lock_record(_Tid, _Ts, Tab, _Key, _LockKind) ->
    abort({bad_type, Tab}).

%% Table-level counterpart of lock_record/5.
lock_table(Tid, Ts, Tab, LockKind) when is_atom(Tab) ->
    Store = Ts#tidstore.store,
    case LockKind of
        none ->
            [];
        read ->
            mnesia_locker:rlock_table(Tid, Store, Tab);
        write ->
            mnesia_locker:wlock_table(Tid, Store, Tab);
        sticky_write ->
            mnesia_locker:sticky_wlock_table(Tid, Store, Tab);
        _ ->
            abort({bad_type, Tab, LockKind})
    end;
lock_table(_Tid, _Ts, Tab, _LockKind) ->
    abort({bad_type, Tab}).
%% global_lock/5: takes a read or write lock on Item via
%% mnesia_locker:global_lock/5, restricted to the nodes accepted by
%% good_global_nodes/1.  Any other Kind aborts with {bad_type, Kind}, a
%% non-list Nodes aborts with {bad_type, Nodes}.  Outside a transaction
%% (Tid not tagged tid) nothing is locked and [] is returned.
global_lock(Tid, Ts, Item, Kind, Nodes) when is_list(Nodes) ->
    case element(1, Tid) of
        tid ->
            Store = Ts#tidstore.store,
            GoodNs = good_global_nodes(Nodes),
            if
                Kind /= read, Kind /= write ->
                    abort({bad_type, Kind});
                true ->
                    mnesia_locker:global_lock(Tid, Store, Item, Kind, GoodNs)
            end;
        _Protocol ->
            []
    end;
global_lock(_Tid, _Ts, _Item, _Kind, Nodes) ->
    abort({bad_type, Nodes}).

%% Intersects Nodes with the local node plus the value stored under
%% recover_nodes.
good_global_nodes(Nodes) ->
    Recover = [node() | val(recover_nodes)],
    mnesia_lib:intersect(Nodes, Recover).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Access within an activity - updates

%% write/1: Val must be a tuple with more than two elements; its first
%% element names the table.  Anything else aborts with {bad_type, Val}.
write(Val) when is_tuple(Val), tuple_size(Val) > 2 ->
    Tab = element(1, Val),
    write(Tab, Val, write);
write(Val) ->
    abort({bad_type, Val}).

%% s_write/1: like write/1 but with a sticky_write lock.  Malformed
%% input aborts with {bad_type, Val}, in line with write/1, s_delete/1
%% and s_delete_object/1, instead of failing with function_clause.
s_write(Val) when is_tuple(Val), tuple_size(Val) > 2 ->
    Tab = element(1, Val),
    write(Tab, Val, sticky_write);
s_write(Val) ->
    abort({bad_type, Val}).

%% write/3: resolves the current activity and forwards to write/5 of the
%% active access module; aborts with no_transaction outside an activity.
write(Tab, Val, LockKind) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            write(Tid, Ts, Tab, Val, LockKind);
        {Mod, Tid, Ts} ->
            Mod:write(Tid, Ts, Tab, Val, LockKind);
        _ ->
            abort(no_transaction)
    end.

%% write/5: performs the write according to the activity tag in Tid:
%%   ets - inserts straight into the table;
%%   tid - takes the requested lock (write or sticky_write) on
%%         {Tab, Key}, where Key is the second element of Val, and
%%         records the write in the transaction store;
%%   any other tag - delegates to do_dirty_write/3.
%% The schema table, non-atom table names and values that are not tuples
%% of more than two elements abort with bad_type.
write(Tid, Ts, Tab, Val, LockKind)
  when is_atom(Tab), Tab /= schema, is_tuple(Val), tuple_size(Val) > 2 ->
    case element(1, Tid) of
        ets ->
            ?ets_insert(Tab, Val),
            ok;
        tid ->
            Store = Ts#tidstore.store,
            Oid = {Tab, element(2, Val)},
            case LockKind of
                write ->
                    mnesia_locker:wlock(Tid, Store, Oid);
                sticky_write ->
                    mnesia_locker:sticky_wlock(Tid, Store, Oid);
                _ ->
                    abort({bad_type, Tab, LockKind})
            end,
            write_to_store(Tab, Store, Oid, Val);
        Protocol ->
            do_dirty_write(Protocol, Tab, Val)
    end;
write(_Tid, _Ts, Tab, Val, LockKind) ->
    abort({bad_type, Tab, Val, LockKind}).
%% Records a pending write of Val in the transaction store.
%% The table's record_validation entry must agree with Val on record
%% name and arity, otherwise the call aborts with {bad_type, Val}; a
%% failed lookup of that entry aborts with {no_exists, Tab}.
write_to_store(Tab, Store, Oid, Val) ->
    case ?catch_val({Tab, record_validation}) of
        {RecName, Arity, Type}
          when tuple_size(Val) == Arity, RecName == element(1, Val) ->
            %% For every type except bag, earlier store entries for this
            %% Oid are dropped before the new write is inserted.
            case Type of
                bag -> ok;
                _ -> ?ets_delete(Store, Oid)
            end,
            ?ets_insert(Store, {Oid, Val, write}),
            ok;
        {'EXIT', _} ->
            abort({no_exists, Tab});
        _ ->
            abort({bad_type, Val})
    end.

%% delete/1: expects {Tab, Key}; uses a write lock.
delete({Tab, Key}) ->
    delete(Tab, Key, write);
delete(Oid) ->
    abort({bad_type, Oid}).

%% s_delete/1: expects {Tab, Key}; uses a sticky_write lock.
s_delete({Tab, Key}) ->
    delete(Tab, Key, sticky_write);
s_delete(Oid) ->
    abort({bad_type, Oid}).

%% delete/3: resolves the current activity and forwards to delete/5 of
%% the active access module; aborts with no_transaction outside an
%% activity.
delete(Tab, Key, LockKind) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            delete(Tid, Ts, Tab, Key, LockKind);
        {Mod, Tid, Ts} ->
            Mod:delete(Tid, Ts, Tab, Key, LockKind);
        _ ->
            abort(no_transaction)
    end.

%% delete/5: performs the delete according to the activity tag in Tid:
%%   ets - deletes straight from the table;
%%   tid - locks {Tab, Key} and replaces whatever the transaction store
%%         holds for it with a single delete marker;
%%   any other tag - delegates to do_dirty_delete/3.
delete(Tid, Ts, Tab, Key, LockKind)
  when is_atom(Tab), Tab /= schema ->
    case element(1, Tid) of
        tid ->
            Store = Ts#tidstore.store,
            Oid = {Tab, Key},
            case LockKind of
                sticky_write ->
                    mnesia_locker:sticky_wlock(Tid, Store, Oid);
                write ->
                    mnesia_locker:wlock(Tid, Store, Oid);
                _ ->
                    abort({bad_type, Tab, LockKind})
            end,
            ?ets_delete(Store, Oid),
            ?ets_insert(Store, {Oid, Oid, delete}),
            ok;
        ets ->
            ?ets_delete(Tab, Key),
            ok;
        Protocol ->
            do_dirty_delete(Protocol, Tab, Key)
    end;
delete(_Tid, _Ts, Tab, _Key, _LockKind) ->
    abort({bad_type, Tab}).

%% delete_object/1: Val must be a tuple with more than two elements; its
%% first element names the table.  Uses a write lock.
delete_object(Val) when is_tuple(Val), tuple_size(Val) > 2 ->
    delete_object(element(1, Val), Val, write);
delete_object(Val) ->
    abort({bad_type, Val}).

%% s_delete_object/1: as delete_object/1 with a sticky_write lock.
s_delete_object(Val) when is_tuple(Val), tuple_size(Val) > 2 ->
    delete_object(element(1, Val), Val, sticky_write);
s_delete_object(Val) ->
    abort({bad_type, Val}).
%% delete_object/3: resolves the current activity and forwards to
%% delete_object/5 of the active access module; aborts with
%% no_transaction outside an activity.
delete_object(Tab, Val, LockKind) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            delete_object(Tid, Ts, Tab, Val, LockKind);
        {Mod, Tid, Ts} ->
            Mod:delete_object(Tid, Ts, Tab, Val, LockKind);
        _ ->
            abort(no_transaction)
    end.

%% delete_object/5: Val has to be a fully bound record; a value holding
%% match variables aborts with {bad_type, Tab, Val}.  A bad table name
%% or malformed value aborts with {bad_type, Tab}.
delete_object(Tid, Ts, Tab, Val, LockKind)
  when is_atom(Tab), Tab /= schema, is_tuple(Val), tuple_size(Val) > 2 ->
    case has_var(Val) of
        true ->
            abort({bad_type, Tab, Val});
        false ->
            do_delete_object(Tid, Ts, Tab, Val, LockKind)
    end;
delete_object(_Tid, _Ts, Tab, _Key, _LockKind) ->
    abort({bad_type, Tab}).

%% Performs the object delete according to the activity tag in Tid:
%%   ets - match-deletes straight from the table;
%%   tid - locks the object's key and records the delete in the
%%         transaction store (see below);
%%   any other tag - delegates to do_dirty_delete_object/3.
do_delete_object(Tid, Ts, Tab, Val, LockKind) ->
    case element(1, Tid) of
        tid ->
            Store = Ts#tidstore.store,
            Oid = {Tab, element(2, Val)},
            case LockKind of
                sticky_write ->
                    mnesia_locker:sticky_wlock(Tid, Store, Oid);
                write ->
                    mnesia_locker:wlock(Tid, Store, Oid);
                _ ->
                    abort({bad_type, Tab, LockKind})
            end,
            %% Writes already pending for this key are only looked up
            %% for non-bag tables.
            PendingWrites =
                case val({Tab, setorbag}) of
                    bag -> [];
                    _ -> ?ets_match_object(Store, {Oid, '_', write})
                end,
            case PendingWrites of
                [] ->
                    %% Replace any store entry for this exact object with
                    %% a delete_object marker.
                    ?ets_match_delete(Store, {Oid, Val, '_'}),
                    ?ets_insert(Store, {Oid, Val, delete_object});
                _ ->
                    %% A write is pending on the key: the whole key is
                    %% marked deleted instead.
                    ?ets_delete(Store, Oid),
                    ?ets_insert(Store, {Oid, Oid, delete})
            end,
            ok;
        ets ->
            ?ets_match_delete(Tab, Val),
            ok;
        Protocol ->
            do_dirty_delete_object(Protocol, Tab, Val)
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Access within an activity - read

%% read/2: read with a read lock.
read(Tab, Key) ->
    read(Tab, Key, read).

%% read/1: expects {Tab, Key}; read with a read lock.
read({Tab, Key}) ->
    read(Tab, Key, read);
read(Oid) ->
    abort({bad_type, Oid}).

%% wread/1: expects {Tab, Key}; read with a write lock.
wread({Tab, Key}) ->
    read(Tab, Key, write);
wread(Oid) ->
    abort({bad_type, Oid}).

%% read/3: resolves the current activity and forwards to read/5 of the
%% active access module; aborts with no_transaction outside an activity.
read(Tab, Key, LockKind) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            read(Tid, Ts, Tab, Key, LockKind);
        {Mod, Tid, Ts} ->
            Mod:read(Tid, Ts, Tab, Key, LockKind);
        _ ->
            abort(no_transaction)
    end.
%% read/5: performs the read according to the activity tag in Tid:
%%   ets - plain lookup in the table;
%%   tid - takes the requested lock, then merges the locker's answer
%%         with what the transaction store holds for the key
%%         (add_written/3);
%%   any other tag - dirty_read/2.
%% The schema table and non-atom table names abort with {bad_type, Tab}.
read(Tid, Ts, Tab, Key, LockKind)
  when is_atom(Tab), Tab /= schema ->
    case element(1, Tid) of
        tid ->
            Store = Ts#tidstore.store,
            Oid = {Tab, Key},
            Locked =
                case LockKind of
                    read ->
                        mnesia_locker:rlock(Tid, Store, Oid);
                    write ->
                        mnesia_locker:rwlock(Tid, Store, Oid);
                    sticky_write ->
                        mnesia_locker:sticky_rwlock(Tid, Store, Oid);
                    _ ->
                        abort({bad_type, Tab, LockKind})
                end,
            %% The store is consulted only after the lock call returned.
            Written = ?ets_lookup(Store, Oid),
            add_written(Written, Tab, Locked);
        ets ->
            ?ets_lookup(Tab, Key);
        _Protocol ->
            dirty_read(Tab, Key)
    end;
read(_Tid, _Ts, Tab, _Key, _LockKind) ->
    abort({bad_type, Tab}).

%% first/1: resolves the current activity and forwards to first/3 of the
%% active access module; aborts with no_transaction outside an activity.
first(Tab) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            first(Tid, Ts, Tab);
        {Mod, Tid, Ts} ->
            Mod:first(Tid, Ts, Tab);
        _ ->
            abort(no_transaction)
    end.

%% first/3: inside a transaction the table is read-locked and fixed, and
%% the dirty first key is then adjusted by stored_keys/6 for what the
%% transaction itself has written or deleted.
first(Tid, Ts, Tab)
  when is_atom(Tab), Tab /= schema ->
    case element(1, Tid) of
        tid ->
            lock_table(Tid, Ts, Tab, read),
            do_fixtable(Tab, Ts),
            DirtyKey = dirty_first(Tab),
            stored_keys(Tab, DirtyKey, '$end_of_table', Ts, next,
                        val({Tab, setorbag}));
        ets ->
            ?ets_first(Tab);
        _Protocol ->
            dirty_first(Tab)
    end;
first(_Tid, _Ts, Tab) ->
    abort({bad_type, Tab}).

%% last/1: resolves the current activity and forwards to last/3 of the
%% active access module; aborts with no_transaction outside an activity.
last(Tab) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            last(Tid, Ts, Tab);
        {Mod, Tid, Ts} ->
            Mod:last(Tid, Ts, Tab);
        _ ->
            abort(no_transaction)
    end.

%% last/3: mirror image of first/3, walking in the prev direction.
last(Tid, Ts, Tab)
  when is_atom(Tab), Tab /= schema ->
    case element(1, Tid) of
        tid ->
            lock_table(Tid, Ts, Tab, read),
            do_fixtable(Tab, Ts),
            DirtyKey = dirty_last(Tab),
            stored_keys(Tab, DirtyKey, '$end_of_table', Ts, prev,
                        val({Tab, setorbag}));
        ets ->
            ?ets_last(Tab);
        _Protocol ->
            dirty_last(Tab)
    end;
last(_Tid, _Ts, Tab) ->
    abort({bad_type, Tab}).

%% next/2: resolves the current activity and forwards to next/4 of the
%% active access module; aborts with no_transaction outside an activity.
next(Tab, Key) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            next(Tid, Ts, Tab, Key);
        {Mod, Tid, Ts} ->
            Mod:next(Tid, Ts, Tab, Key);
        _ ->
            abort(no_transaction)
    end.
%% next/4: inside a transaction the table is read-locked and fixed; the
%% result of dirty_next/2 (or the caught exit, if it fails) is then
%% adjusted by stored_keys/6 for the transaction's own writes/deletes.
next(Tid, Ts, Tab, Key)
  when is_atom(Tab), Tab /= schema ->
    case element(1, Tid) of
        tid ->
            lock_table(Tid, Ts, Tab, read),
            do_fixtable(Tab, Ts),
            %% A failure is deliberately caught: stored_keys/6 decides
            %% whether it is a real error.
            DirtyNext = (catch dirty_next(Tab, Key)),
            stored_keys(Tab, DirtyNext, Key, Ts, next,
                        val({Tab, setorbag}));
        ets ->
            ?ets_next(Tab, Key);
        _Protocol ->
            dirty_next(Tab, Key)
    end;
next(_Tid, _Ts, Tab, _) ->
    abort({bad_type, Tab}).

%% prev/2: resolves the current activity and forwards to prev/4 of the
%% active access module; aborts with no_transaction outside an activity.
prev(Tab, Key) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            prev(Tid, Ts, Tab, Key);
        {Mod, Tid, Ts} ->
            Mod:prev(Tid, Ts, Tab, Key);
        _ ->
            abort(no_transaction)
    end.

%% prev/4: mirror image of next/4.
prev(Tid, Ts, Tab, Key)
  when is_atom(Tab), Tab /= schema ->
    case element(1, Tid) of
        tid ->
            lock_table(Tid, Ts, Tab, read),
            do_fixtable(Tab, Ts),
            DirtyPrev = (catch dirty_prev(Tab, Key)),
            stored_keys(Tab, DirtyPrev, Key, Ts, prev,
                        val({Tab, setorbag}));
        ets ->
            ?ets_prev(Tab, Key);
        _Protocol ->
            dirty_prev(Tab, Key)
    end;
prev(_Tid, _Ts, Tab, _) ->
    abort({bad_type, Tab}).

%% Compensate for transaction written and/or deleted records
%%
%% stored_keys(Tab, DirtyResult, PrevKey, TidStore, Op, Type)
%% DirtyResult is what the dirty traversal produced: a key,
%% '$end_of_table', or a caught {'EXIT', ...}.  Op is next or prev.
stored_keys(Tab, '$end_of_table', Prev, Ts, Op, Type) ->
    %% The table itself is exhausted; continue among the keys that only
    %% exist in the transaction store.
    case ts_keys(Ts#tidstore.store, Tab, Op, Type, []) of
        [] ->
            '$end_of_table';
        Keys when Type == ordered_set ->
            get_ordered_tskey(Prev, Keys, Op);
        Keys ->
            get_next_tskey(Prev, Keys, Tab)
    end;
stored_keys(Tab, {'EXIT', {aborted, R = {badarg, [Tab, Key]}}},
            Key, #tidstore{store = Store}, Op, Type) ->
    %% Had to match on error, ouch..
    %% The dirty step failed on exactly the key we came from.  That is
    %% acceptable only when the store has entries for the key and the
    %% last of them is not a delete.
    Ops = ?ets_match(Store, {{Tab, Key}, '_', '$1'}),
    case Ops =/= [] andalso lists:last(Ops) of
        false ->
            abort(R);
        [delete] ->
            abort(R);
        _ ->
            case ts_keys(Store, Tab, Op, Type, []) of
                [] -> '$end_of_table';
                Keys -> get_next_tskey(Key, Keys, Tab)
            end
    end;
stored_keys(_, {'EXIT', {aborted, R}}, _, _, _, _) ->
    abort(R);
stored_keys(Tab, Key, Prev, #tidstore{store = Store}, Op, ordered_set) ->
    Ops = ?ets_match(Store, {{Tab, Key}, '_', '$1'}),
    case Ops =/= [] andalso lists:last(Ops) of
        [delete] ->
            %% Key was deleted by this transaction: step past it.
            mnesia:Op(Tab, Key);
        _ ->
            Keys = ts_keys(Store, Tab, Op, ordered_set, [Key]),
            get_ordered_tskey(Prev, Keys, Op)
    end;
stored_keys(Tab, Key, _, #tidstore{store = Store}, Op, _) ->
    Ops = ?ets_match(Store, {{Tab, Key}, '_', '$1'}),
    case Ops =/= [] andalso lists:last(Ops) of
        [delete] -> mnesia:Op(Tab, Key);
        _ -> Key
    end.

%% Picks the first key in Keys that lies beyond Prev in direction Op;
%% with Prev == '$end_of_table' the head of Keys is taken.
get_ordered_tskey('$end_of_table', [First | _], _) ->
    First;
get_ordered_tskey(Prev, [First | _], next) when Prev < First ->
    First;
get_ordered_tskey(Prev, [First | _], prev) when Prev > First ->
    First;
get_ordered_tskey(Prev, [_ | Rest], Op) ->
    get_ordered_tskey(Prev, Rest, Op);
get_ordered_tskey(_, [], _) ->
    '$end_of_table'.

%% Picks the key following Key in Keys (the head of Keys when Key is
%% '$end_of_table' or does not occur in the list) and skips candidates
%% for which dirty_read/2 already finds objects in the table.
get_next_tskey(Key, Keys, Tab) ->
    Candidate =
        case Key of
            '$end_of_table' ->
                hd(Keys);
            _ ->
                case lists:dropwhile(fun(A) -> A /= Key end, Keys) of
                    [] -> hd(Keys); %% First stored key
                    [Key] -> '$end_of_table';
                    [Key, Following | _] -> Following
                end
        end,
    case Candidate of
        '$end_of_table' ->
            '$end_of_table';
        _ -> %% Really slow anybody got another solution??
            case dirty_read(Tab, Candidate) of
                [] ->
                    Candidate;
                _ ->
                    %% Updated value we already returned this key
                    get_next_tskey(Candidate, Keys, Tab)
            end
    end.
%% Collects the keys of Tab that the transaction store mentions
%% (starting from the accumulator Def) and arranges them for traversal:
%% sorted for ordered_set (descending when Op is prev), reversed for a
%% next walk over other table types, untouched otherwise.
ts_keys(Store, Tab, Op, Type, Def) ->
    All = ?ets_match(Store, {{Tab, '$1'}, '_', '$2'}),
    Keys = ts_keys_1(All, Def),
    case {Type, Op} of
        {ordered_set, prev} -> lists:reverse(lists:sort(Keys));
        {ordered_set, _} -> lists:sort(Keys);
        {_, next} -> lists:reverse(Keys);
        _ -> Keys
    end.

%% Folds the [Key, Operation] pairs into a key list:
%%   write  - pushes Key unless it already is the head of the list;
%%   delete - pops Key if it is the head of the list;
%%   anything else leaves the list untouched.
ts_keys_1(Ops, Def) ->
    lists:foldl(fun([Key, write], [Key | _] = Acc) -> Acc;
                   ([Key, write], Acc) -> [Key | Acc];
                   ([Key, delete], [Key | Acc]) -> Acc;
                   (_, Acc) -> Acc
                end, Def, Ops).


%%%%%%%%%%%%%%%%%%%%%
%% Iterators

%% foldl/3: fold with a read lock.
foldl(Fun, Acc, Tab) ->
    foldl(Fun, Acc, Tab, read).

%% foldl/4: resolves the current activity and forwards to foldl/6 of the
%% active access module; aborts with no_transaction outside an activity.
foldl(Fun, Acc, Tab, LockKind) when is_function(Fun) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            foldl(Tid, Ts, Fun, Acc, Tab, LockKind);
        {Mod, Tid, Ts} ->
            Mod:foldl(Tid, Ts, Fun, Acc, Tab, LockKind);
        _ ->
            abort(no_transaction)
    end.

%% foldl/6: sets the iteration up, walks the table from dirty_first/1
%% onwards and always hands the (possibly caught) outcome to
%% close_iteration/2.
foldl(ActivityId, Opaque, Fun, Acc, Tab, LockKind) ->
    {Type, Prev} = init_iteration(ActivityId, Opaque, Tab, LockKind),
    Start = dirty_first(Tab),
    Res = (catch do_foldl(ActivityId, Opaque, Tab, Start, Fun, Acc, Type, Prev)),
    close_iteration(Res, Tab).

%% do_foldl/8 folds Fun over the records of Tab. Table keys are visited with
%% dirty_next/2 and every key is read through read/5. Stored is the key list
%% handed in by foldl/6 (it comes from init_iteration/4).
%%
%% End of table: fold over the keys that are still left in Stored.
do_foldl(A, O, Tab, '$end_of_table', Fun, RAcc, _Type, Stored) ->
    lists:foldl(fun(Key, Acc) ->
                        lists:foldl(Fun, Acc, read(A, O, Tab, Key, read))
                end, RAcc, Stored);
%% ordered_set, stored head equals the table key: process it once and
%% advance both the table and the stored list.
do_foldl(A, O, Tab, Key, Fun, Acc, ordered_set, [H | Stored]) when H == Key ->
    NewAcc = lists:foldl(Fun, Acc, read(A, O, Tab, Key, read)),
    {_, Tid, Ts} = get(mnesia_activity_state),
    do_foldl(Tid, Ts, Tab, dirty_next(Tab, Key), Fun, NewAcc, ordered_set, Stored);
%% ordered_set, stored head sorts before the table key: process the stored
%% key first and stay on the same table key.
do_foldl(A, O, Tab, Key, Fun, Acc, ordered_set, [H | Stored]) when H < Key ->
    NewAcc = lists:foldl(Fun, Acc, read(A, O, Tab, H, read)),
    {_, Tid, Ts} = get(mnesia_activity_state),
    do_foldl(Tid, Ts, Tab, Key, Fun, NewAcc, ordered_set, Stored);
%% ordered_set, stored head sorts after the table key: process the table key
%% and keep the stored head for a later step.
do_foldl(A, O, Tab, Key, Fun, Acc, ordered_set, [H | Stored]) when H > Key ->
    NewAcc = lists:foldl(Fun, Acc, read(A, O, Tab, Key, read)),
    {_, Tid, Ts} = get(mnesia_activity_state),
    do_foldl(Tid, Ts, Tab, dirty_next(Tab, Key), Fun, NewAcc, ordered_set, [H |Stored]);
%% Remaining cases: process the table key and remove it from Stored so it is
%% not processed again by the '$end_of_table' clause.
do_foldl(A, O, Tab, Key, Fun, Acc, Type, Stored) ->  %% Type is set or bag
    NewAcc = lists:foldl(Fun, Acc, read(A, O, Tab, Key, read)),
    NewStored = ordsets:del_element(Key, Stored),
    {_, Tid, Ts} = get(mnesia_activity_state),
    do_foldl(Tid, Ts, Tab, dirty_next(Tab, Key), Fun, NewAcc, Type, NewStored).

%% foldr/3: same as foldr/4 with a read lock.
foldr(Fun, Acc, Tab) ->
    foldr(Fun, Acc, Tab, read).
%% foldr/4: dispatches on the activity state stored in the process
%% dictionary; calls abort(no_transaction) when there is none.
foldr(Fun, Acc, Tab, LockKind) when is_function(Fun) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            foldr(Tid, Ts, Fun, Acc, Tab, LockKind);
        {Mod, Tid, Ts} ->
            Mod:foldr(Tid, Ts, Fun, Acc, Tab, LockKind);
        _ ->
            abort(no_transaction)
    end.

%% foldr/6: sets up the iteration, runs do_foldr/8 from the last key of Tab
%% under a catch, and passes the outcome through close_iteration/2.
foldr(ActivityId, Opaque, Fun, Acc, Tab, LockKind) ->
    {Type, TempPrev} = init_iteration(ActivityId, Opaque, Tab, LockKind),
    Prev =
        if
            Type == ordered_set ->
                %% do_foldr/8 walks backwards, so it needs the keys reversed
                lists:reverse(TempPrev);
            true ->      %% Order doesn't matter for set and bag
                TempPrev %% Keep the order so we can use ordsets:del_element
        end,
    Res = (catch do_foldr(ActivityId, Opaque, Tab, dirty_last(Tab), Fun, Acc, Type, Prev)),
    close_iteration(Res, Tab).

%% do_foldr/8 mirrors do_foldl/8 but steps with dirty_prev/2.
%%
%% End of table: fold over the keys that are still left in Stored.
do_foldr(A, O, Tab, '$end_of_table', Fun, RAcc, _Type, Stored) ->
    lists:foldl(fun(Key, Acc) ->
                        lists:foldl(Fun, Acc, read(A, O, Tab, Key, read))
                end, RAcc, Stored);
%% ordered_set, stored head equals the table key: process it once and
%% advance both the table and the stored list.
do_foldr(A, O, Tab, Key, Fun, Acc, ordered_set, [H | Stored]) when H == Key ->
    NewAcc = lists:foldl(Fun, Acc, read(A, O, Tab, Key, read)),
    {_, Tid, Ts} = get(mnesia_activity_state),
    do_foldr(Tid, Ts, Tab, dirty_prev(Tab, Key), Fun, NewAcc, ordered_set, Stored);
%% ordered_set, stored head sorts after the table key: process the stored
%% key first and stay on the same table key.
do_foldr(A, O, Tab, Key, Fun, Acc, ordered_set, [H | Stored]) when H > Key ->
    NewAcc = lists:foldl(Fun, Acc, read(A, O, Tab, H, read)),
    {_, Tid, Ts} = get(mnesia_activity_state),
    do_foldr(Tid, Ts, Tab, Key, Fun, NewAcc, ordered_set, Stored);
%% ordered_set, stored head sorts before the table key: process the table
%% key and keep the stored head for a later step.
do_foldr(A, O, Tab, Key, Fun, Acc, ordered_set, [H | Stored]) when H < Key ->
    NewAcc = lists:foldl(Fun, Acc, read(A, O, Tab, Key, read)),
    {_, Tid, Ts} = get(mnesia_activity_state),
    do_foldr(Tid, Ts, Tab, dirty_prev(Tab, Key), Fun, NewAcc, ordered_set, [H |Stored]);
%% Remaining cases: process the table key and remove it from Stored so it is
%% not processed again by the '$end_of_table' clause.
do_foldr(A, O, Tab, Key, Fun, Acc, Type, Stored) ->  %% Type is set or bag
    NewAcc = lists:foldl(Fun, Acc, read(A, O, Tab, Key, read)),
    NewStored = ordsets:del_element(Key, Stored),
    {_, Tid, Ts} = get(mnesia_activity_state),
    do_foldr(Tid, Ts, Tab, dirty_prev(Tab, Key), Fun, NewAcc, Type, NewStored).

%% Prepares a fold over Tab: takes a table lock of kind LockKind, looks up
%% the table type, collects the keys already written in this activity and
%% fixes the table (unless its storage type is unknown).
%% Returns {Type, Previous}.
init_iteration(ActivityId, Opaque, Tab, LockKind) ->
    lock(ActivityId, Opaque, {table, Tab}, LockKind),
    Type = val({Tab, setorbag}),
    Previous = add_previous(ActivityId, Opaque, Type, Tab),
    %% Same shape as the un-fix in close_iteration/2
    case val({Tab, storage_type}) of
        unknown ->
            ignore;
        St ->
            mnesia_lib:db_fixtable(St, Tab, true)
    end,
    {Type, Previous}.

%% Ends a fold: releases the table fix taken by init_iteration/4 and then
%% turns a caught 'EXIT' into abort/1; any other result is returned as is.
close_iteration(Res, Tab) ->
    case val({Tab, storage_type}) of
        unknown ->
            ignore;
        St ->
            mnesia_lib:db_fixtable(St, Tab, false)
    end,
    case Res of
        {'EXIT', {aborted, What}} ->
            abort(What);
        {'EXIT', What} ->
            abort(What);
        _ ->
            Res
    end.

%% Returns the sorted list of keys of Tab that have a 'write' entry in the
%% store; [] when the second argument is non_transaction.
add_previous(_ActivityId, non_transaction, _Type, _Tab) ->
    [];
add_previous(_Tid, Ts, _Type, Tab) ->
    %% The match returns one single-element list ([Key]) per entry.
    %% lists:append/1 joins them; lists:concat/1 is meant for building
    %% text out of atoms, numbers and strings.
    Previous = ?ets_match(Ts#tidstore.store, {{Tab, '$1'}, '_', write}),
    lists:sort(lists:append(Previous)).

%% This routine fixes up the return value from read/1 so that
%% it is correct with respect to what this particular transaction
%% has already written, deleted .... etc

add_written([], _Tab, Objs) ->
    Objs;  % standard normal fast case
add_written(Written, Tab, Objs) ->
    case val({Tab, setorbag}) of
        bag ->
            add_written_to_bag(Written, Objs, []);
        _   ->
            add_written_to_set(Written)
    end.

%% For non-bag tables only the last operation on the key decides the result.
add_written_to_set(Ws) ->
    case lists:last(Ws) of
        {_, _, delete} -> [];
        {_, Val, write} -> [Val];
        {_, _, delete_object} -> []
    end.

%% Replays the operations in order on top of Objs. Ack holds the values
%% written so far, newest first.
add_written_to_bag([{_, Val, write} | Tail], Objs, Ack) ->
    add_written_to_bag(Tail, lists:delete(Val, Objs), [Val | Ack]);
add_written_to_bag([], Objs, Ack) ->
    Objs ++ lists:reverse(Ack); %% Oldest write first as in ets
add_written_to_bag([{_, _ , delete} | Tail], _Objs, _Ack) ->
    %% This transaction just deleted all objects
    %% with this key
    add_written_to_bag(Tail, [], []);
add_written_to_bag([{_, Val, delete_object} | Tail], Objs, Ack) ->
    add_written_to_bag(Tail, lists:delete(Val, Objs), lists:delete(Val, Ack)).

%% match_object/1: the table name is the first element of the pattern.
match_object(Pat) when is_tuple(Pat), tuple_size(Pat) > 2 ->
    match_object(element(1, Pat), Pat, read);
match_object(Pat) ->
    abort({bad_type, Pat}).

%% match_object/3: picks the access module from the activity state stored
%% in the process dictionary; calls abort(no_transaction) when there is none.
match_object(Tab, Pat, LockKind) ->
    Activity = get(mnesia_activity_state),
    case Activity of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            match_object(Tid, Ts, Tab, Pat, LockKind);
        {Mod, Tid, Ts} ->
            Mod:match_object(Tid, Ts, Tab, Pat, LockKind);
        _ ->
            abort(no_transaction)
    end.

%% match_object/5: behaviour depends on the tag in element 1 of Tid.
%%   tid   - lock (record lock when the key has no variables, else table
%%           lock), match dirty, then merge in this transaction's writes
%%   ets   - match directly via mnesia_lib as ram_copies
%%   other - plain dirty match
match_object(Tid, Ts, Tab, Pat, LockKind)
  when is_atom(Tab), Tab /= schema, is_tuple(Pat), tuple_size(Pat) > 2 ->
    case element(1, Tid) of
        tid ->
            KeyPat = element(2, Pat),
            case has_var(KeyPat) of
                true -> lock_table(Tid, Ts, Tab, LockKind);
                false -> lock_record(Tid, Ts, Tab, KeyPat, LockKind)
            end,
            Found = dirty_match_object(Tab, Pat),
            add_written_match(Ts#tidstore.store, Pat, Tab, Found);
        ets ->
            mnesia_lib:db_match_object(ram_copies, Tab, Pat);
        _Protocol ->
            dirty_match_object(Tab, Pat)
    end;
match_object(_Tid, _Ts, Tab, Pat, _LockKind) ->
    abort({bad_type, Tab, Pat}).

%% Applies the operations found in store S for Tab/Pat on top of Objs.
add_written_match(S, Pat, Tab, Objs) ->
    Pending = find_ops(S, Tab, Pat),
    TabType = val({Tab, setorbag}),
    add_match(Pending, Objs, TabType).

%% Selects from S the writes and delete_objects whose value matches Pat,
%% plus every delete, for table Tab. Whole store entries are returned.
find_ops(S, Tab, Pat) ->
    Oid = {Tab, '_'},
    MatchSpec = [{{Oid, Pat, write}, [], ['$_']},
                 {{Oid, '_', delete}, [], ['$_']},
                 {{Oid, Pat, delete_object}, [], ['$_']}],
    ets:select(S, MatchSpec).

%% Replays the store operations on a list of objects, by table type.
add_match([], Objs, _Type) ->
    Objs;
add_match(Ops, Objs, ordered_set) ->
    %% Must use keysort which is stable
    add_ordered_match(lists:keysort(1,Ops), Objs, []);
add_match([{Oid, _, delete}|Ops], Objs, Type) ->
    add_match(Ops, deloid(Oid, Objs), Type);
add_match([{_Oid, Rec, delete_object}|Ops], Objs, Type) ->
    add_match(Ops, lists:delete(Rec, Objs), Type);
add_match([{_Oid, Rec, write}|Ops], Objs, bag) ->
    add_match(Ops, [Rec | lists:delete(Rec, Objs)], bag);
add_match([{Oid, Rec, write}|Ops], Objs, set) ->
    add_match(Ops, [Rec | deloid(Oid,Objs)],set).

%% For ordered_set only !!
%% add_ordered_match/3 merges store operations (first argument, entries of
%% the form {{Tab, Key}, Val, Op}) into a list of objects compared on their
%% element 2. Acc collects already passed objects in reverse order.
%% add_match/3 calls it with the operations key-sorted.
%%
%% Operation key is greater than the current object: pass the object.
add_ordered_match(Written = [{{_, Key}, _, _}|_], [Obj|Objs], Acc)
  when Key > element(2, Obj) ->
    add_ordered_match(Written, Objs, [Obj|Acc]);
%% Write with a key smaller than the current object: insert the value
%% in front of the remaining objects.
add_ordered_match([{{_, Key}, Val, write}|Rest], Objs =[Obj|_], Acc)
  when Key < element(2, Obj) ->
    add_ordered_match(Rest, [Val|Objs],Acc);
%% Any other operation with a smaller key has nothing to act on: skip it.
add_ordered_match([{{_, Key}, _, _DelOP}|Rest], Objs =[Obj|_], Acc)
  when Key < element(2, Obj) ->
    add_ordered_match(Rest,Objs,Acc);
%% Greater than last object
add_ordered_match([{_, Val, write}|Rest], [], Acc) ->
    add_ordered_match(Rest, [Val], Acc);
add_ordered_match([_|Rest], [], Acc) ->
    add_ordered_match(Rest, [], Acc);
%% Keys are equal from here
add_ordered_match([{_, Val, write}|Rest], [_Obj|Objs], Acc) ->
    %% The written value replaces the current object
    add_ordered_match(Rest, [Val|Objs], Acc);
add_ordered_match([{_, _Val, delete}|Rest], [_Obj|Objs], Acc) ->
    add_ordered_match(Rest, Objs, Acc);
add_ordered_match([{_, Val, delete_object}|Rest], [Val|Objs], Acc) ->
    %% Only removed when the current object is exactly Val
    add_ordered_match(Rest, Objs, Acc);
add_ordered_match([{_, _, delete_object}|Rest], Objs, Acc) ->
    add_ordered_match(Rest, Objs, Acc);
%% No operations left: put the passed objects back in front, in order.
add_ordered_match([], Objs, Acc) ->
    lists:reverse(Acc, Objs).

%% For select chunk
%% add_sel_match/3 picks the merge routine by table type: ordered_set uses
%% add_sel_ordered_match/3, everything else add_sel_match/4.
add_sel_match(Sorted, Objs, ordered_set) ->
    add_sel_ordered_match(Sorted, Objs, []);
add_sel_match(Written, Objs, Type) ->
    add_sel_match(Written, Objs, Type, []).

%% add_sel_match/4 applies store operations to one chunk of objects.
%% Returns {Objs, Ops}: the adjusted chunk and the operations that are kept
%% (in their original order) in the Acc argument.
add_sel_match([], Objs, _Type, Acc) ->
    {Objs,lists:reverse(Acc)};
add_sel_match([Op={Oid, _, delete}|R], Objs, Type, Acc) ->
    case deloid(Oid, Objs) of
        Objs ->
            %% Nothing removed from this chunk: keep the operation
            add_sel_match(R, Objs, Type, [Op|Acc]);
        NewObjs when Type == set ->
            %% Removed from a set: the operation is dropped
            add_sel_match(R, NewObjs, Type, Acc);
        NewObjs ->  %% If bag we may get more in next chunk
            add_sel_match(R, NewObjs, Type, [Op|Acc])
    end;
add_sel_match([Op = {_Oid, Val, delete_object}|R], Objs, Type, Acc) ->
    case lists:delete(Val, Objs) of
        Objs ->
            add_sel_match(R, Objs, Type, [Op|Acc]);
        NewObjs when Type == set ->
            add_sel_match(R, NewObjs, Type, Acc);
        NewObjs ->
            add_sel_match(R, NewObjs, Type, [Op|Acc])
    end;
add_sel_match([Op={Oid={_,Key}, Val, write}|R], Objs, bag, Acc) ->
    case lists:keymember(Key, 2, Objs) of
        true ->
            %% The key occurs in this chunk: emit Val here and keep a
            %% delete_object for Val in the kept operations
            add_sel_match(R,[Val|lists:delete(Val,Objs)],bag,
                          [{Oid,Val,delete_object}|Acc]);
        false ->
            add_sel_match(R,Objs,bag,[Op|Acc])
    end;
add_sel_match([Op={Oid, Val, write}|R], Objs, set, Acc) ->
    case deloid(Oid,Objs) of
        Objs ->
            %% Key not in this chunk: keep the write
            add_sel_match(R, Objs,set, [Op|Acc]);
        NewObjs ->
            %% Key was in this chunk: Val takes the place of the old object
            add_sel_match(R, [Val | NewObjs],set, Acc)
    end.

%% For ordered_set only !!
%% add_sel_ordered_match/3 is the chunked variant of the ordered merge.
%% It returns {Objs, RemainingOps}; operations beyond the last object of the
%% chunk are returned unapplied in the second element.
add_sel_ordered_match(Written = [{{_, Key}, _, _}|_], [Obj|Objs],Acc)
  when Key > element(2, Obj) ->
    add_sel_ordered_match(Written, Objs, [Obj|Acc]);
add_sel_ordered_match([{{_, Key}, Val, write}|Rest], Objs =[Obj|_],Acc)
  when Key < element(2, Obj) ->
    add_sel_ordered_match(Rest,[Val|Objs],Acc);
add_sel_ordered_match([{{_, Key}, _, _DelOP}|Rest], Objs =[Obj|_], Acc)
  when Key < element(2, Obj) ->
    add_sel_ordered_match(Rest,Objs,Acc);
%% Greater than last object
add_sel_ordered_match(Ops1, [], Acc) ->
    {lists:reverse(Acc), Ops1};
%% Keys are equal from here
add_sel_ordered_match([{_, Val, write}|Rest], [_Obj|Objs], Acc) ->
    add_sel_ordered_match(Rest, [Val|Objs], Acc);
add_sel_ordered_match([{_, _Val, delete}|Rest], [_Obj|Objs], Acc) ->
    add_sel_ordered_match(Rest, Objs, Acc);
add_sel_ordered_match([{_, Val, delete_object}|Rest], [Val|Objs], Acc) ->
    add_sel_ordered_match(Rest, Objs, Acc);
add_sel_ordered_match([{_, _, delete_object}|Rest], Objs, Acc) ->
    add_sel_ordered_match(Rest, Objs, Acc);
add_sel_ordered_match([], Objs, Acc) ->
    {lists:reverse(Acc, Objs),[]}.


%% deloid/2 removes every object whose element 2 equals the Key of the
%% {Tab, Key} oid. Objects for which the guard does not hold are kept.
deloid(_Oid, []) ->
    [];
deloid({Tab, Key}, [H | T]) when element(2, H) == Key ->
    deloid({Tab, Key}, T);
deloid(Oid, [H | T]) ->
    [H | deloid(Oid, T)].

%%%%%%%%%%%%%%%%%%
% select

%% select/2: same as select/3 with a read lock.
select(Tab, Pat) ->
    select(Tab, Pat, read).
%% select/3: dispatches on the activity state stored in the process
%% dictionary. Arguments that fail the guard end in abort({badarg, ...}).
select(Tab, Pat, LockKind)
  when is_atom(Tab), Tab /= schema, is_list(Pat) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            select(Tid, Ts, Tab, Pat, LockKind);
        {Mod, Tid, Ts} ->
            Mod:select(Tid, Ts, Tab, Pat, LockKind);
        _ ->
            abort(no_transaction)
    end;
select(Tab, Pat, _Lock) ->
    abort({badarg, Tab, Pat}).

%% select/5: runs fun_select/7 with dirty_select/2 as the table reader.
select(Tid, Ts, Tab, Spec, LockKind) ->
    SelectFun = fun(FixedSpec) -> dirty_select(Tab, FixedSpec) end,
    fun_select(Tid, Ts, Tab, Spec, LockKind, Tab, SelectFun).

%% fun_select/7: select with the table reader passed in as SelectFun.
%% Behaviour depends on the tag in element 1 of Tid:
%%   ets   - select directly via mnesia_lib as ram_copies
%%   tid   - lock, then either call SelectFun(Spec) when the store has no
%%           entries for TabPat, or fetch whole records, merge in the store
%%           entries and run the original Spec over the merged list
%%   other - SelectFun(Spec)
fun_select(Tid, Ts, Tab, Spec, LockKind, TabPat, SelectFun) ->
    case element(1, Tid) of
        ets ->
            mnesia_lib:db_select(ram_copies, Tab, Spec);
        tid ->
            select_lock(Tid,Ts,LockKind,Spec,Tab),
            TxStore = Ts#tidstore.store,
            case ?ets_match_object(TxStore, {{TabPat, '_'}, '_', '_'}) of
                [] ->
                    %% Nothing changed in the table during this transaction,
                    %% Simple case get results from [d]ets
                    SelectFun(Spec);
                Pending ->
                    %% Hard (slow case) records added or deleted earlier
                    %% in the transaction, have to cope with that.
                    TabType = val({Tab, setorbag}),
                    RecordSpec = get_record_pattern(Spec),
                    Stored = SelectFun(RecordSpec),
                    Merged = add_match(Pending, Stored, TabType),
                    Compiled = ets:match_spec_compile(Spec),
                    ets:match_spec_run(Merged, Compiled)
            end;
        _Protocol ->
            SelectFun(Spec)
    end.

%% Takes a record lock when Spec is a single clause whose head is a tuple
%% of more than two elements with a variable-free key; a table lock otherwise.
select_lock(Tid,Ts,LockKind,[{HeadPat,_, _}],Tab)
  when is_tuple(HeadPat), tuple_size(HeadPat) > 2 ->
    %% Avoid table lock if possible
    KeyPat = element(2, HeadPat),
    case has_var(KeyPat) of
        true -> lock_table(Tid, Ts, Tab, LockKind);
        false -> lock_record(Tid, Ts, Tab, KeyPat, LockKind)
    end;
select_lock(Tid,Ts,LockKind,_Spec,Tab) ->
    lock_table(Tid, Ts, Tab, LockKind).

%% Breakable Select
%% select/4: like select/3 but limited to NObjects per call.
select(Tab, Pat, NObjects, LockKind)
  when is_atom(Tab), Tab /= schema, is_list(Pat), is_integer(NObjects) ->
    Activity = get(mnesia_activity_state),
    case Activity of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            select(Tid, Ts, Tab, Pat, NObjects, LockKind);
        {Mod, Tid, Ts} ->
            Mod:select(Tid, Ts, Tab, Pat, NObjects, LockKind);
        _ ->
            abort(no_transaction)
    end;
select(Tab, Pat, NObjects, _Lock) ->
    abort({badarg, Tab, Pat, NObjects}).

%% select/6: resolves the node to read from and its storage type, then
%% hands an initialising fun to fun_select/10.
select(Tid, Ts, Tab, Spec, NObjects, LockKind) ->
    ReadNode = val({Tab,where_to_read}),
    Storage = mnesia_lib:storage_type_at_node(ReadNode,Tab),
    Start = fun(FixedSpec) ->
                    dirty_sel_init(ReadNode,Tab,FixedSpec,NObjects,Storage)
            end,
    fun_select(Tid,Ts,Tab,Spec,LockKind,Tab,Start,NObjects,ReadNode,Storage).

%% State carried between chunked select calls. 'written' holds the store
%% operations still to be merged, 'spec' the compiled match spec and 'orig'
%% the match spec passed on to db_select_cont (see dirty_sel_cont/1).
-record(mnesia_select, {tab,tid,node,storage,cont,written=[],spec,type,orig}).

%% fun_select/10: chunked counterpart of fun_select/7. Init starts the
%% select; the result is wrapped together with a #mnesia_select{} state
%% by select_state/2 or trans_select/2.
fun_select(Tid, Ts, Tab, Spec, LockKind, TabPat, Init, NObjects, Node, Storage) ->
    Def = #mnesia_select{tid=Tid,node=Node,storage=Storage,tab=Tab,orig=Spec},
    case element(1, Tid) of
        ets ->
            select_state(mnesia_lib:db_select_init(ram_copies,Tab,Spec,NObjects),Def);
        tid ->
            select_lock(Tid,Ts,LockKind,Spec,Tab),
            Store = Ts#tidstore.store,
            do_fixtable(Tab, Store),

            Written0 = ?ets_match_object(Store, {{TabPat, '_'}, '_', '_'}),
            case Written0 of
                [] ->
                    %% Nothing changed in the table during this transaction,
                    %% Simple case get results from [d]ets
                    select_state(Init(Spec),Def);
                _ ->
                    %% Hard (slow case) records added or deleted earlier
                    %% in the transaction, have to cope with that.
                    Type = val({Tab, setorbag}),
                    Written =
                        if Type == ordered_set -> %% Sort stable
                                lists:keysort(1,Written0);
                           true ->
                                Written0
                        end,
                    %% Fetch whole records; the original Spec is compiled
                    %% and kept in the state to be run after merging
                    FixedSpec = get_record_pattern(Spec),
                    CMS = ets:match_spec_compile(Spec),
                    trans_select(Init(FixedSpec),
                                 Def#mnesia_select{written=Written,spec=CMS,type=Type, orig=FixedSpec})
            end;
        _Protocol ->
            select_state(Init(Spec),Def)
    end.

%% select/1: continues a chunked select. Dispatches on the activity state
%% stored in the process dictionary.
select(Cont) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            select_cont(Tid,Ts,Cont);
        {Mod, Tid, Ts} ->
            Mod:select_cont(Tid,Ts,Cont);
        _ ->
            abort(no_transaction)
    end.

%% select_cont/3 fetches the next chunk for a #mnesia_select{} state.
%% The Tid in the state must be the Tid of the caller.
select_cont(_Tid,_Ts,'$end_of_table') ->
    '$end_of_table';
select_cont(Tid,_Ts,State=#mnesia_select{tid=Tid,cont=Cont, orig=Ms})
  when element(1,Tid) == ets ->
    case Cont of
        '$end_of_table' -> '$end_of_table';
        _ -> select_state(mnesia_lib:db_select_cont(ram_copies,Cont,Ms),State)
    end;
%% No pending store operations: plain continuation
select_cont(Tid,_,State=#mnesia_select{tid=Tid,written=[]}) ->
    select_state(dirty_sel_cont(State),State);
%% Pending store operations: merge them into the chunk
select_cont(Tid,_Ts,State=#mnesia_select{tid=Tid}) ->
    trans_select(dirty_sel_cont(State), State);
select_cont(_Tid2,_,#mnesia_select{tid=_Tid1}) ->  % Mismatching tids
    abort(wrong_transaction);
select_cont(_,_,Cont) ->
    abort({badarg, Cont}).

%% trans_select/2 merges the pending store operations into a chunk and runs
%% the compiled match spec over the result.
%% At end of table the remaining operations are applied to an empty list.
trans_select('$end_of_table', #mnesia_select{written=Written0,spec=CMS,type=Type}) ->
    Written = add_match(Written0, [], Type),
    {ets:match_spec_run(Written, CMS), '$end_of_table'};
trans_select({TabRecs,Cont}, State = #mnesia_select{written=Written0,spec=CMS,type=Type}) ->
    {FixedRes,Written} = add_sel_match(Written0, TabRecs, Type),
    select_state({ets:match_spec_run(FixedRes, CMS),Cont},
                 State#mnesia_select{written=Written}).

%% select_state/2 stores the continuation in the state record.
select_state({Matches, Cont}, MS) ->
    {Matches, MS#mnesia_select{cont=Cont}};
select_state('$end_of_table',_) -> '$end_of_table'.

%% get_record_pattern/1 keeps head and guards of every match spec clause
%% but replaces the body with ['$_'].
get_record_pattern([]) -> [];
get_record_pattern([{M,C,_B}|R]) ->
    [{M,C,['$_']} | get_record_pattern(R)].

%% all_keys/1: dispatches on the activity state stored in the process
%% dictionary, always with a read lock.
all_keys(Tab) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            all_keys(Tid, Ts, Tab, read);
        {Mod, Tid, Ts} ->
            Mod:all_keys(Tid, Ts, Tab, read);
        _ ->
            abort(no_transaction)
    end.

%% all_keys/4: selects element 2 of every record of Tab; for bag tables the
%% result goes through mnesia_lib:uniq/1.
all_keys(Tid, Ts, Tab, LockKind)
  when is_atom(Tab), Tab /= schema ->
    Pat0 = val({Tab, wild_pattern}),
    Pat = setelement(2, Pat0, '$1'),
    Keys = select(Tid, Ts, Tab, [{Pat, [], ['$1']}], LockKind),
    case val({Tab, setorbag}) of
        bag ->
            mnesia_lib:uniq(Keys);
        _ ->
            Keys
    end;
all_keys(_Tid, _Ts, Tab, _LockKind) ->
    abort({bad_type, Tab}).

%% index_match_object/2: the table name is the first element of Pat.
index_match_object(Pat, Attr) when is_tuple(Pat), tuple_size(Pat) > 2 ->
    Tab = element(1, Pat),
    index_match_object(Tab, Pat, Attr, read);
index_match_object(Pat, _Attr) ->
    abort({bad_type, Pat}).

%% index_match_object/4: dispatches on the activity state stored in the
%% process dictionary.
index_match_object(Tab, Pat, Attr, LockKind) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            index_match_object(Tid, Ts, Tab, Pat, Attr, LockKind);
        {Mod, Tid, Ts} ->
            Mod:index_match_object(Tid, Ts, Tab, Pat, Attr, LockKind);
        _ ->
            abort(no_transaction)
    end.

%% index_match_object/6: inside a 'tid' activity only LockKind 'read' is
%% accepted; the table is read-locked and the dirty result is merged with
%% the operations in the transaction store.
index_match_object(Tid, Ts, Tab, Pat, Attr, LockKind)
  when is_atom(Tab), Tab /= schema, is_tuple(Pat), tuple_size(Pat) > 2 ->
    case element(1, Tid) of
        ets ->
            dirty_index_match_object(Tab, Pat, Attr); % Should be optimized?
        tid ->
            case mnesia_schema:attr_tab_to_pos(Tab, Attr) of
                Pos when Pos =< tuple_size(Pat) ->
                    case LockKind of
                        read ->
                            Store = Ts#tidstore.store,
                            mnesia_locker:rlock_table(Tid, Store, Tab),
                            Objs = dirty_index_match_object(Tab, Pat, Attr),
                            add_written_match(Store, Pat, Tab, Objs);
                        _ ->
                            abort({bad_type, Tab, LockKind})
                    end;
                BadPos ->
                    abort({bad_type, Tab, BadPos})
            end;
        _Protocol ->
            dirty_index_match_object(Tab, Pat, Attr)
    end;
index_match_object(_Tid, _Ts, Tab, Pat, _Attr, _LockKind) ->
    abort({bad_type, Tab, Pat}).

%% index_read/3: dispatches on the activity state stored in the process
%% dictionary, always with a read lock.
index_read(Tab, Key, Attr) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            index_read(Tid, Ts, Tab, Key, Attr, read);
        {Mod, Tid, Ts} ->
            Mod:index_read(Tid, Ts, Tab, Key, Attr, read);
        _ ->
            abort(no_transaction)
    end.

%% index_read/6: inside a 'tid' activity only LockKind 'read' and a
%% variable-free Key are accepted. The index result is merged with the
%% operations in the transaction store.
index_read(Tid, Ts, Tab, Key, Attr, LockKind)
  when is_atom(Tab), Tab /= schema ->
    case element(1, Tid) of
        ets ->
            dirty_index_read(Tab, Key, Attr); % Should be optimized?
        tid ->
            Pos = mnesia_schema:attr_tab_to_pos(Tab, Attr),
            case LockKind of
                read ->
                    case has_var(Key) of
                        false ->
                            Store = Ts#tidstore.store,
                            Objs = mnesia_index:read(Tid, Store, Tab, Key, Pos),
                            %% Wild pattern with Key at the indexed position
                            Pat = setelement(Pos, val({Tab, wild_pattern}), Key),
                            add_written_match(Store, Pat, Tab, Objs);
                        true ->
                            abort({bad_type, Tab, Attr, Key})
                    end;
                _ ->
                    abort({bad_type, Tab, LockKind})
            end;
        _Protocol ->
            dirty_index_read(Tab, Key, Attr)
    end;
index_read(_Tid, _Ts, Tab, _Key, _Attr, _LockKind) ->
    abort({bad_type, Tab}).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Dirty access regardless of activities - updates

%% dirty_write/1: the table name is the first element of Val.
dirty_write(Val) when is_tuple(Val), tuple_size(Val) > 2  ->
    Tab = element(1, Val),
    dirty_write(Tab, Val);
dirty_write(Val) ->
    abort({bad_type, Val}).

dirty_write(Tab, Val) ->
    do_dirty_write(async_dirty, Tab, Val).

%% do_dirty_write/3: checks Val against the {Tab, record_validation} entry
%% (record name and arity) before handing the write to mnesia_tm:dirty/2.
do_dirty_write(SyncMode, Tab, Val)
  when is_atom(Tab), Tab /= schema, is_tuple(Val), tuple_size(Val) > 2 ->
    case ?catch_val({Tab, record_validation}) of
        {RecName, Arity, _Type}
        when tuple_size(Val) == Arity, RecName == element(1, Val) ->
            Oid = {Tab, element(2, Val)},
            mnesia_tm:dirty(SyncMode, {Oid, Val, write});
        {'EXIT', _} ->
            abort({no_exists, Tab});
        _ ->
            abort({bad_type, Val})
    end;
do_dirty_write(_SyncMode, Tab, Val) ->
    abort({bad_type, Tab, Val}).

%% dirty_delete/1 takes an oid {Tab, Key}.
dirty_delete({Tab, Key}) ->
    dirty_delete(Tab, Key);
dirty_delete(Oid) ->
    abort({bad_type, Oid}).

dirty_delete(Tab, Key) ->
    do_dirty_delete(async_dirty, Tab, Key).

%% do_dirty_delete/3: the oid is passed both as key and as value of the
%% delete operation.
do_dirty_delete(SyncMode, Tab, Key) when is_atom(Tab), Tab /= schema  ->
    Oid = {Tab, Key},
    mnesia_tm:dirty(SyncMode, {Oid, Oid, delete});
do_dirty_delete(_SyncMode, Tab, _Key) ->
    abort({bad_type, Tab}).

%% dirty_delete_object/1: the table name is the first element of Val.
dirty_delete_object(Val) when is_tuple(Val), tuple_size(Val) > 2 ->
    Tab = element(1, Val),
    dirty_delete_object(Tab, Val);
dirty_delete_object(Val) ->
    abort({bad_type, Val}).

%% dirty_delete_object/2: asynchronous dirty delete of one object.
dirty_delete_object(Tab, Val) ->
    do_dirty_delete_object(async_dirty, Tab, Val).

%% do_dirty_delete_object/3: Val must not contain variables (has_var/1);
%% otherwise, or when the guard fails, abort({bad_type, Tab, Val}) is called.
do_dirty_delete_object(SyncMode, Tab, Val)
  when is_atom(Tab), Tab /= schema, is_tuple(Val), tuple_size(Val) > 2 ->
    Oid = {Tab, element(2, Val)},
    case has_var(Val) of
        true ->
            abort({bad_type, Tab, Val});
        false ->
            mnesia_tm:dirty(SyncMode, {Oid, Val, delete_object})
    end;
do_dirty_delete_object(_SyncMode, Tab, Val) ->
    abort({bad_type, Tab, Val}).

%% A Counter is an Oid being {CounterTab, CounterName}

dirty_update_counter({Tab, Key}, Incr) ->
    dirty_update_counter(Tab, Key, Incr);
dirty_update_counter(Counter, _Incr) ->
    abort({bad_type, Counter}).

dirty_update_counter(Tab, Key, Incr) ->
    do_dirty_update_counter(async_dirty, Tab, Key, Incr).

%% do_dirty_update_counter/4: only accepted when the record_validation
%% entry of Tab is {RecName, 3, set}.
do_dirty_update_counter(SyncMode, Tab, Key, Incr)
  when is_atom(Tab), Tab /= schema, is_integer(Incr) ->
    Validation = ?catch_val({Tab, record_validation}),
    case Validation of
        {RecName, 3, set} ->
            Op = {{Tab, Key}, {RecName, Incr}, update_counter},
            mnesia_tm:dirty(SyncMode, Op);
        _ ->
            abort({combine_error, Tab, update_counter})
    end;
do_dirty_update_counter(_SyncMode, Tab, _Key, Incr) ->
    abort({bad_type, Tab, Incr}).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Dirty access regardless of activities - read

%% dirty_read/1 takes an oid {Tab, Key}.
dirty_read({Tab, Key}) ->
    dirty_read(Tab, Key);
dirty_read(Oid) ->
    abort({bad_type, Oid}).

%% dirty_read/2 always goes through dirty_rpc/4 to mnesia_lib:db_get/2.
dirty_read(Tab, Key)
  when is_atom(Tab), Tab /= schema ->
%%    case catch ?ets_lookup(Tab, Key) of
%%        {'EXIT', _} ->
            %% Bad luck, we have to perform a real lookup
            dirty_rpc(Tab, mnesia_lib, db_get, [Tab, Key]);
%%        Val ->
%%            Val
%%    end;
dirty_read(Tab, _Key) ->
    abort({bad_type, Tab}).

%% dirty_match_object/1: the table name is the first element of Pat.
dirty_match_object(Pat) when is_tuple(Pat), tuple_size(Pat) > 2 ->
    dirty_match_object(element(1, Pat), Pat);
dirty_match_object(Pat) ->
    abort({bad_type, Pat}).

%% dirty_match_object/2: runs remote_dirty_match_object/2 through
%% dirty_rpc/4.
dirty_match_object(Tab, Pat)
  when is_atom(Tab), Tab /= schema, is_tuple(Pat), tuple_size(Pat) > 2 ->
    dirty_rpc(Tab, ?MODULE, remote_dirty_match_object, [Tab, Pat]);
dirty_match_object(Tab, Pat) ->
    abort({bad_type, Tab, Pat}).

%% remote_dirty_match_object/2: a variable-free key goes straight to
%% mnesia_lib:db_match_object/2; otherwise the index positions of Tab are
%% tried first.
remote_dirty_match_object(Tab, Pat) ->
    KeyPat = element(2, Pat),
    case has_var(KeyPat) of
        true ->
            Positions = val({Tab, index}),
            remote_dirty_match_object(Tab, Pat, Positions);
        false ->
            mnesia_lib:db_match_object(Tab, Pat)
    end.

%% remote_dirty_match_object/3: uses the first index position whose pattern
%% element has no variables; falls back to a plain match when the position
%% list runs out. A position beyond the pattern size ends in abort/1.
remote_dirty_match_object(Tab, Pat, [Pos | Rest]) when Pos =< tuple_size(Pat) ->
    case has_var(element(Pos, Pat)) of
        true ->
            remote_dirty_match_object(Tab, Pat, Rest);
        false ->
            mnesia_index:dirty_match_object(Tab, Pat, Pos)
    end;
remote_dirty_match_object(Tab, Pat, []) ->
    mnesia_lib:db_match_object(Tab, Pat);
remote_dirty_match_object(Tab, Pat, _PosList) ->
    abort({bad_type, Tab, Pat}).

%% dirty_select/2: runs remote_dirty_select/2 through dirty_rpc/4.
dirty_select(Tab, Spec) when is_atom(Tab), Tab /= schema, is_list(Spec) ->
    dirty_rpc(Tab, ?MODULE, remote_dirty_select, [Tab, Spec]);
dirty_select(Tab, Spec) ->
    abort({bad_type, Tab, Spec}).

%% remote_dirty_select/2: only a single-clause spec whose head is a tuple
%% of more than two elements with variables in the key is tried against the
%% index positions; everything else is a plain mnesia_lib:db_select/2.
remote_dirty_select(Tab, [{HeadPat, _, _}] = Spec)
  when is_tuple(HeadPat), tuple_size(HeadPat) > 2 ->
    KeyPat = element(2, HeadPat),
    case has_var(KeyPat) of
        true ->
            Positions = val({Tab, index}),
            remote_dirty_select(Tab, Spec, Positions);
        false ->
            mnesia_lib:db_select(Tab, Spec)
    end;
remote_dirty_select(Tab, Spec) ->
    mnesia_lib:db_select(Tab, Spec).

%% remote_dirty_select/3: uses the first index position whose head pattern
%% element has no variables; when none qualifies (or the spec has another
%% shape) it falls back to mnesia_lib:db_select/2.
remote_dirty_select(Tab, [{HeadPat,_, _}] = Spec, [Pos | Tail])
  when is_tuple(HeadPat), tuple_size(HeadPat) > 2, Pos =< tuple_size(HeadPat) ->
    Key = element(Pos, HeadPat),
    case has_var(Key) of
        false ->
            Recs = mnesia_index:dirty_select(Tab, HeadPat, Pos),
            %% Recs are the records fetched via the index; the complete
            %% match spec is compiled and run over them right here.
            CMS = ets:match_spec_compile(Spec),
            case val({Tab, setorbag}) of
                ordered_set ->
                    ets:match_spec_run(lists:sort(Recs), CMS);
                _ ->
                    ets:match_spec_run(Recs, CMS)
            end;
        true  ->
            remote_dirty_select(Tab, Spec, Tail)
    end;
remote_dirty_select(Tab, Spec, _) ->
    mnesia_lib:db_select(Tab, Spec).

%% dirty_sel_init/5: starts a chunked select on Node via do_dirty_rpc/5.
dirty_sel_init(Node,Tab,Spec,NObjects,Type) ->
    do_dirty_rpc(Tab,Node,mnesia_lib,db_select_init,[Type,Tab,Spec,NObjects]).

%% dirty_sel_cont/1: continues a chunked select on the node stored in the
%% state; a state that already is at '$end_of_table' makes no call.
dirty_sel_cont(#mnesia_select{cont='$end_of_table'}) -> '$end_of_table';
dirty_sel_cont(#mnesia_select{node=Node,tab=Tab,storage=Type,cont=Cont,orig=Ms}) ->
    do_dirty_rpc(Tab,Node,mnesia_lib,db_select_cont,[Type,Cont,Ms]).

%% dirty_all_keys/1: selects element 2 of every record of Tab; for bag
%% tables the result goes through mnesia_lib:uniq/1.
dirty_all_keys(Tab) when is_atom(Tab), Tab /= schema ->
    case ?catch_val({Tab, wild_pattern}) of
        {'EXIT', _} ->
            abort({no_exists, Tab});
        Pat0 ->
            Pat = setelement(2, Pat0, '$1'),
            Keys = dirty_select(Tab, [{Pat, [], ['$1']}]),
            case val({Tab, setorbag}) of
                bag -> mnesia_lib:uniq(Keys);
                _ -> Keys
            end
    end;
dirty_all_keys(Tab) ->
    abort({bad_type, Tab}).

%% dirty_index_match_object/2: the table name is the first element of Pat.
dirty_index_match_object(Pat, Attr) when is_tuple(Pat), tuple_size(Pat) > 2 ->
    Tab = element(1, Pat),
    dirty_index_match_object(Tab, Pat, Attr);
dirty_index_match_object(Pat, _Attr) ->
    abort({bad_type, Pat}).

%% dirty_index_match_object/3: Attr is translated to a position with
%% mnesia_schema:attr_tab_to_pos/2.
%%   - variable-free key: plain dirty_match_object/2
%%   - otherwise the pattern element at the index position must be
%%     variable-free and the match is done by mnesia_index through
%%     dirty_rpc/4
dirty_index_match_object(Tab, Pat, Attr)
  when is_atom(Tab), Tab /= schema, is_tuple(Pat), tuple_size(Pat) > 2 ->
    case mnesia_schema:attr_tab_to_pos(Tab, Attr) of
        Pos when Pos =< tuple_size(Pat) ->
            case has_var(element(2, Pat)) of
                true ->
                    IxPat = element(Pos, Pat),
                    case has_var(IxPat) of
                        true ->
                            abort({bad_type, Tab, Attr, IxPat});
                        false ->
                            dirty_rpc(Tab, mnesia_index, dirty_match_object,
                                      [Tab, Pat, Pos])
                    end;
                false ->
                    dirty_match_object(Tab, Pat)
            end;
        BadPos ->
            abort({bad_type, Tab, BadPos})
    end;
dirty_index_match_object(Tab, Pat, _Attr) ->
    abort({bad_type, Tab, Pat}).

%% dirty_index_read/3: Key must not contain variables.
dirty_index_read(Tab, Key, Attr) when is_atom(Tab), Tab /= schema ->
    Pos = mnesia_schema:attr_tab_to_pos(Tab, Attr),
    case has_var(Key) of
        true ->
            abort({bad_type, Tab, Attr, Key});
        false ->
            mnesia_index:dirty_read(Tab, Key, Pos)
    end;
dirty_index_read(Tab, _Key, _Attr) ->
    abort({bad_type, Tab}).

%% The functions below forward to the mnesia_lib function named in the
%% call, through dirty_rpc/4. The schema table and non-atom table names
%% are rejected with abort/1.

dirty_slot(Tab, Slot) when is_atom(Tab), Tab /= schema, is_integer(Slot)  ->
    dirty_rpc(Tab, mnesia_lib, db_slot, [Tab, Slot]);
dirty_slot(Tab, Slot) ->
    abort({bad_type, Tab, Slot}).

dirty_first(Tab) when is_atom(Tab), Tab /= schema ->
    dirty_rpc(Tab, mnesia_lib, db_first, [Tab]);
dirty_first(Tab) ->
    abort({bad_type, Tab}).

dirty_last(Tab) when is_atom(Tab), Tab /= schema ->
    dirty_rpc(Tab, mnesia_lib, db_last, [Tab]);
dirty_last(Tab) ->
    abort({bad_type, Tab}).

dirty_next(Tab, Key) when is_atom(Tab), Tab /= schema ->
    dirty_rpc(Tab, mnesia_lib, db_next_key, [Tab, Key]);
dirty_next(Tab, _Key) ->
    abort({bad_type, Tab}).

dirty_prev(Tab, Key) when is_atom(Tab), Tab /= schema ->
    dirty_rpc(Tab, mnesia_lib, db_prev_key, [Tab, Key]);
dirty_prev(Tab, _Key) ->
    abort({bad_type, Tab}).


%% dirty_rpc/4: calls M:F(Args) on the node registered as where_to_read
%% for Tab.
dirty_rpc(Tab, M, F, Args) ->
    do_dirty_rpc(Tab, val({Tab, where_to_read}), M, F, Args).

%% do_dirty_rpc/5 runs M:F(Args) on Node with rpc:call/4.
%% A node of 'nowhere' aborts at once. On {badrpc, Reason} the controller is
%% asked (check_w2r) which node to read from now:
%%   - same node: abort with an error tag derived from Reason
%%   - other node, inside a #tid{} activity: abort (see comment below)
%%   - other node, otherwise: retry the call on the new node
do_dirty_rpc(_Tab, nowhere, _, _, Args) ->
    mnesia:abort({no_exists, Args});
do_dirty_rpc(Tab, Node, M, F, Args) ->
    case rpc:call(Node, M, F, Args) of
        {badrpc, Reason} ->
            timer:sleep(20), %% Do not be too eager, and can't use yield on SMP
            %% Sync with mnesia_monitor
            %% (any failure of the status call is deliberately ignored)
            try sys:get_status(mnesia_monitor) catch _:_ -> ok end,
            case mnesia_controller:call({check_w2r, Node, Tab}) of % Sync
                NewNode when NewNode =:= Node ->
                    ErrorTag = mnesia_lib:dirty_rpc_error_tag(Reason),
                    mnesia:abort({ErrorTag, Args});
                NewNode ->
                    case get(mnesia_activity_state) of
                        {_Mod, Tid, _Ts} when is_record(Tid, tid) ->
                            %% In order to perform a consistent
                            %% retry of a transaction we need
                            %% to acquire the lock on the NewNode.
                            %% In this context we do neither know
                            %% the kind or granularity of the lock.
                            %% --> Abort the transaction
                            mnesia:abort({node_not_running, Node});
                        {error, {node_not_running, _}} ->
                            %% Mnesia is stopping
                            mnesia:abort({no_exists, Args});
                        _ ->
                            %% Splendid! A dirty retry is safe
                            %% 'Node' probably went down now
                            %% Let mnesia_controller get broken link message first
                            do_dirty_rpc(Tab, NewNode, M, F, Args)
                    end
            end;
        Other ->
            Other
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Info

%% Info about one table
%% Works without an activity as well (activity state 'undefined').
table_info(Tab, Item) ->
    case get(mnesia_activity_state) of
        undefined ->
            any_table_info(Tab, Item);
        {?DEFAULT_ACCESS, _Tid, _Ts} ->
            any_table_info(Tab, Item);
        {Mod, Tid, Ts} ->
            Mod:table_info(Tid, Ts, Tab, Item);
        _ ->
            abort(no_transaction)
    end.

%% table_info/4: the activity arguments are not used.
table_info(_Tid, _Ts, Tab, Item) ->
    any_table_info(Tab, Item).


%% any_table_info/2 answers one table info item.
%%   master_nodes   - asked from mnesia_recover
%%   size, memory   - raw_table_info/2
%%   type           - the {Tab, setorbag} value
%%   all            - the table properties, with setorbag renamed to type
%%   name           - Tab itself
%%   anything else  - the {Tab, Item} value
%% A missing value ends in bad_info_reply/2.
any_table_info(Tab, Item) when is_atom(Tab) ->
    if
        Item =:= master_nodes ->
            mnesia_recover:get_master_nodes(Tab);
%       checkpoints ->
%           case ?catch_val({Tab, commit_work}) of
%               [{checkpoints, List} | _] -> List;
%               No_chk when is_list(No_chk) ->  [];
%               Else -> info_reply(Else, Tab, Item)
%           end;
        Item =:= size; Item =:= memory ->
            raw_table_info(Tab, Item);
        Item =:= type ->
            case ?catch_val({Tab, setorbag}) of
                {'EXIT', _} ->
                    bad_info_reply(Tab, Item);
                TabType ->
                    TabType
            end;
        Item =:= all ->
            case mnesia_schema:get_table_properties(Tab) of
                [] ->
                    abort({no_exists, Tab, Item});
                Props ->
                    Rename = fun({setorbag, Type}) -> {type, Type};
                                (Prop) -> Prop
                             end,
                    lists:map(Rename, Props)
            end;
        Item =:= name ->
            Tab;
        true ->
            case ?catch_val({Tab, Item}) of
                {'EXIT', _} ->
                    bad_info_reply(Tab, Item);
                Val ->
                    Val
            end
    end;
any_table_info(Tab, _Item) ->
    abort({bad_type, Tab}).

%% raw_table_info/2 asks ets or dets for Item, chosen by the storage type
%% of Tab; an unknown or unreadable storage type ends in bad_info_reply/2.
raw_table_info(Tab, Item) ->
    case ?catch_val({Tab, storage_type}) of
        Storage when Storage =:= ram_copies; Storage =:= disc_copies ->
            info_reply(catch ?ets_info(Tab, Item), Tab, Item);
        disc_only_copies ->
            info_reply(catch dets:info(Tab, Item), Tab, Item);
        Missing when Missing =:= unknown; element(1, Missing) =:= 'EXIT',
                                           tuple_size(Missing) =:= 2 ->
            bad_info_reply(Tab, Item)
    end.

%% info_reply/3 maps a caught exit or an {error, _} answer to
%% bad_info_reply/2 and passes every other value through.
info_reply({'EXIT', _Reason}, Tab, Item) ->
    bad_info_reply(Tab, Item);
info_reply({error, _Reason}, Tab, Item) ->
    bad_info_reply(Tab, Item);
info_reply(Val, _Tab, _Item) ->
    Val.

%% bad_info_reply/2: size and memory default to 0, everything else aborts.
bad_info_reply(_Tab, Item) when Item =:= size; Item =:= memory -> 0;
bad_info_reply(Tab, Item) -> abort({no_exists, Tab, Item}).

%% Raw info about all tables
schema() ->
    mnesia_schema:info().

%% Raw info about one tables
schema(Tab) ->
    mnesia_schema:info(Tab).

%% error_description/1 forwards to mnesia_lib:error_desc/1.
error_description(Err) ->
    mnesia_lib:error_desc(Err).

%% info/0 prints system information with io:format and returns ok.
%% When mnesia_lib:is_running() is 'yes' it lists held locks, queued lock
%% requests, transaction manager info, uncertain transactions and the
%% controller info; otherwise only mini_info/0 is printed.
info() ->
    case mnesia_lib:is_running() of
        yes ->
            TmInfo = mnesia_tm:get_info(10000),
            Held = system_info(held_locks),
            Queued = system_info(lock_queue),

            io:format("---> Processes holding locks <--- ~n", []),
            lists:foreach(fun(L) -> io:format("Lock: ~p~n", [L]) end,
                          Held),

            io:format( "---> Processes waiting for locks <--- ~n", []),
            lists:foreach(fun({Oid, Op, _Pid, Tid, OwnerTid}) ->
                                  io:format("Tid ~p waits for ~p lock "
                                            "on oid ~p owned by ~p ~n",
                                            [Tid, Op, Oid, OwnerTid])
                          end, Queued),
            mnesia_tm:display_info(group_leader(), TmInfo),

            %% Entries of the mnesia_decision table marked 'unclear'
            Pat = {'_', unclear, '_'},
            Uncertain = ets:match_object(mnesia_decision, Pat),

            io:format( "---> Uncertain transactions <--- ~n", []),
            lists:foreach(fun({Tid, _, Nodes}) ->
                                  io:format("Tid ~w waits for decision "
                                            "from ~w~n",
                                            [Tid, Nodes])
                          end, Uncertain),

            mnesia_controller:info(),
            display_system_info(Held, Queued, TmInfo, Uncertain);
        _ ->
            mini_info()
    end,
    ok.

%% mini_info/0 prints version, debug level, schema location, directory
%% usage, the fallback flag and the running and stopped db nodes.
mini_info() ->
    io:format("===> System info in version ~p, debug level = ~p <===~n",
              [system_info(version), system_info(debug)]),
    Not =
        case system_info(use_dir) of
            true -> "";
            false  -> "NOT "
        end,

    io:format("~w. Directory ~p is ~sused.~n",
              [system_info(schema_location), system_info(directory), Not]),
    io:format("use fallback at restart = ~w~n",
              [system_info(fallback_activated)]),
    Running = system_info(running_db_nodes),
    io:format("running db nodes   = ~w~n", [Running]),
    All = mnesia_lib:all_nodes(),
    %% Stopped nodes are all nodes minus the running ones
    io:format("stopped db nodes   = ~w ~n", [All -- Running]).
%% Print the general system summary: mini_info/0, the per-table
%% overview, transaction counters, lock/transaction counts and the
%% nodes that uncertain transactions are waiting for.
%%
%% Held and Queued are lists, TmInfo is {timeout, _} or {info, P, A},
%% and Uncertain is a list of 3-tuples whose third element is a node
%% list.
display_system_info(Held, Queued, TmInfo, Uncertain) ->
    mini_info(),
    display_tab_info(),

    CounterItems = [transaction_commits, transaction_failures,
                    transaction_restarts, transaction_log_writes],
    Counters = [system_info(Item) || Item <- CounterItems],
    io:format("~w transactions committed, ~w aborted, "
              "~w restarted, ~w logged to disc~n",
              Counters),

    %% A timed-out transaction manager cannot report counts.
    {Active, Pending} =
        case TmInfo of
            {timeout, _} -> {infinity, infinity};
            {info, P, A} -> {length(A), length(P)}
        end,
    io:format("~w held locks, ~w in queue; "
              "~w local transactions, ~w remote~n",
              [length(Held), length(Queued), Active, Pending]),

    %% Count the uncertain transactions and collect the nodes they
    %% wait for, skipping nodes already collected.
    Collect = fun({_, _, Nodes}, {Count, Seen}) ->
                      Unseen = lists:filter(
                                 fun(Node) -> not lists:member(Node, Seen) end,
                                 Nodes),
                      {Count + 1, Unseen ++ Seen}
              end,
    {Ucount, Unodes} = lists:foldl(Collect, {0, []}, Uncertain),
    io:format("~w transactions waits for other nodes: ~p~n",
              [Ucount, Unodes]).

%% Print the master node tables, the tables grouped by local storage
%% type, and the tables grouped by replication pattern.
display_tab_info() ->
    MasterTabs = mnesia_recover:get_master_node_tables(),
    io:format("master node tables = ~p~n", [lists:sort(MasterTabs)]),

    Tabs = system_info(tables),
    {Unknown, Ram, Disc, DiscOnly} =
        lists:foldl(fun storage_count/2, {[], [], [], []}, Tabs),

    io:format("remote = ~p~n", [lists:sort(Unknown)]),
    io:format("ram_copies = ~p~n", [lists:sort(Ram)]),
    io:format("disc_copies = ~p~n", [lists:sort(Disc)]),
    io:format("disc_only_copies = ~p~n", [lists:sort(DiscOnly)]),

    %% Group tables that share the same replication pattern.
    GroupByPattern =
        fun(Tab, Groups) ->
                Pattern =
                    case val({Tab, access_mode}) of
                        read_only ->
                            Replicas = val({Tab, active_replicas}),
                            lists:sort([{Node, read_only} || Node <- Replicas]);
                        read_write ->
                            table_info(Tab, where_to_commit)
                    end,
                case lists:keyfind(Pattern, 1, Groups) of
                    {_, Members} ->
                        lists:keyreplace(Pattern, 1, Groups,
                                         {Pattern, [Tab | Members]});
                    false ->
                        [{Pattern, [Tab]} | Groups]
                end
        end,
    Grouped = lists:foldl(GroupByPattern, [], Tabs),
    lists:foreach(fun({Pattern, Members}) ->
                          io:format("~p = ~p~n", [Pattern, Members])
                  end,
                  lists:sort(Grouped)).
%% Fold helper for display_tab_info/0: put table T into the bucket
%% that matches its local storage type.  The accumulator is
%% {Unknown, RamCopies, DiscCopies, DiscOnlyCopies}.
storage_count(T, {U, R, D, DO}) ->
    case table_info(T, storage_type) of
        unknown -> {[T | U], R, D, DO};
        ram_copies -> {U, [T | R], D, DO};
        disc_copies -> {U, R, [T | D], DO};
        disc_only_copies -> {U, R, D, [T | DO]}
    end.

%% Return system property Item.  An exit or error raised while the
%% value is computed is re-raised through abort/1.
system_info(Item) ->
    case catch system_info2(Item) of
        {'EXIT',Error} -> abort(Error);
        Other -> Other
    end.

%% Worker for system_info/1; exits with {badarg, Item} for an
%% unknown item.
system_info2(all) ->
    Items = system_info_items(mnesia_lib:is_running()),
    [{I, system_info(I)} || I <- Items];

system_info2(db_nodes) ->
    %% Prefer the cached schema replicas; read the nodes through
    %% mnesia_schema when either cache entry is unavailable.
    case {?catch_val({schema, disc_copies}), ?catch_val({schema, ram_copies})} of
        {DiscNs, RamNs} when is_list(DiscNs), is_list(RamNs) ->
            DiscNs ++ RamNs;
        _ ->
            case mnesia_schema:read_nodes() of
                {ok, Nodes} -> Nodes;
                {error,Reason} -> exit(Reason)
            end
    end;

system_info2(running_db_nodes) ->
    cached_or_loaded({current, db_nodes},
                     fun() -> mnesia_lib:running_nodes() end);
system_info2(extra_db_nodes) ->
    cached_or_loaded(extra_db_nodes,
                     fun() -> mnesia_monitor:get_env(extra_db_nodes) end);
system_info2(directory) ->
    cached_or_loaded(directory,
                     fun() -> mnesia_monitor:get_env(dir) end);
system_info2(use_dir) ->
    cached_or_loaded(use_dir,
                     fun() -> mnesia_monitor:use_dir() end);
system_info2(schema_location) ->
    cached_or_loaded(schema_location,
                     fun() -> mnesia_monitor:get_env(schema_location) end);
system_info2(fallback_activated) ->
    cached_or_loaded(fallback_activated,
                     fun() -> mnesia_bup:fallback_exists() end);

system_info2(version) ->
    case ?catch_val(version) of
        {'EXIT', _} ->
            Apps = application:loaded_applications(),
            case lists:keysearch(?APPLICATION, 1, Apps) of
                {value, {_Name, _Desc, Version}} ->
                    Version;
                false ->
                    %% Ensure that it does not match
                    {mnesia_not_loaded, node(), now()}
            end;
        Version ->
            Version
    end;

system_info2(is_running) -> mnesia_lib:is_running();
system_info2(log_version) -> mnesia_log:version();
system_info2(protocol_version) -> mnesia_monitor:protocol_version();
system_info2(schema_version) -> mnesia_schema:version(); %backward compatibility
system_info2(tables) -> val({schema, tables});
system_info2(local_tables) -> val({schema, local_tables});
system_info2(master_node_tables) -> mnesia_recover:get_master_node_tables();
system_info2(subscribers) -> mnesia_subscr:subscribers();
system_info2(checkpoints) -> mnesia_checkpoint:checkpoints();
system_info2(held_locks) -> mnesia_locker:get_held_locks();
system_info2(lock_queue) -> mnesia_locker:get_lock_queue();
system_info2(transactions) -> mnesia_tm:get_transactions();
system_info2(transaction_failures) -> mnesia_lib:read_counter(trans_failures);
system_info2(transaction_commits) -> mnesia_lib:read_counter(trans_commits);
system_info2(transaction_restarts) -> mnesia_lib:read_counter(trans_restarts);
system_info2(transaction_log_writes) -> mnesia_dumper:get_log_writes();

system_info2(Item) ->
    %% Items answered by the environment parameter of the same name.
    EnvItems = [access_module, auto_repair, backup_module, event_module,
                debug, dump_log_load_regulation, dump_log_write_threshold,
                dump_log_time_threshold, dump_log_update_in_place,
                max_wait_for_decision, embedded_mnemosyne,
                ignore_fallback_at_startup, fallback_error_function,
                core_dir, no_table_loaders, dc_dump_limit],
    case lists:member(Item, EnvItems) of
        true -> mnesia_monitor:get_env(Item);
        false -> exit({badarg, Item})
    end.

%% Return the value cached in mnesia_gvar under Key.  When the lookup
%% exits, make sure the application is loaded and compute the value
%% with Fetch instead.
%%
%% Loading ensures that we access the intended Mnesia directory.
%% This path may not be used during startup since it will cause the
%% application_controller to get into deadlock.
cached_or_loaded(Key, Fetch) ->
    case ?catch_val(Key) of
        {'EXIT',_} ->
            load_mnesia_or_abort(),
            Fetch();
        Cached ->
            Cached
    end.
%% List the items reported by system_info(all).  The argument is the
%% result of mnesia_lib:is_running(): 'yes' gives the full list, 'no'
%% the subset listed for a node that is not running.
system_info_items(yes) ->
    [access_module, auto_repair, backup_module, checkpoints, db_nodes,
     debug, directory,
     dump_log_load_regulation, dump_log_time_threshold,
     dump_log_update_in_place, dump_log_write_threshold,
     embedded_mnemosyne, event_module, extra_db_nodes,
     fallback_activated, held_locks,
     ignore_fallback_at_startup, fallback_error_function,
     is_running, local_tables, lock_queue, log_version,
     master_node_tables, max_wait_for_decision, protocol_version,
     running_db_nodes, schema_location, schema_version,
     subscribers, tables,
     transaction_commits, transaction_failures,
     transaction_log_writes, transaction_restarts, transactions,
     use_dir, core_dir, no_table_loaders, dc_dump_limit, version];
system_info_items(no) ->
    [auto_repair, backup_module, db_nodes, debug, directory,
     dump_log_load_regulation, dump_log_time_threshold,
     dump_log_update_in_place, dump_log_write_threshold,
     event_module, extra_db_nodes,
     ignore_fallback_at_startup, fallback_error_function,
     is_running, log_version, max_wait_for_decision,
     protocol_version, running_db_nodes, schema_location,
     schema_version, use_dir, core_dir, version].

%% Print the system summary (the full one when Mnesia is running,
%% otherwise the short one) and return the running state as reported
%% by mnesia_lib:is_running().
system_info() ->
    Running = mnesia_lib:is_running(),
    case Running =:= yes of
        true ->
            TmInfo = mnesia_tm:get_info(10000),
            HeldLocks = system_info(held_locks),
            LockQueue = system_info(lock_queue),
            Unclear = ets:match_object(mnesia_decision, {'_', unclear, '_'}),
            display_system_info(HeldLocks, LockQueue, TmInfo, Unclear);
        false ->
            mini_info()
    end,
    Running.

%% Make sure the application is loaded; abort with the reported
%% reason when it cannot be.
load_mnesia_or_abort() ->
    case mnesia_lib:ensure_loaded(?APPLICATION) of
        {error, Reason} -> abort(Reason);
        ok -> ok
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Database mgt

%% Delegates to mnesia_bup:create_schema/1.
create_schema(Ns) ->
    mnesia_bup:create_schema(Ns).

%% Delegates to mnesia_schema:delete_schema/1.
delete_schema(Ns) ->
    mnesia_schema:delete_schema(Ns).
%% Backup, fallback, checkpoint and restore entry points.  Each one
%% delegates unchanged to the module that implements it.

backup(Opaque) ->
    mnesia_log:backup(Opaque).

backup(Opaque, Mod) ->
    mnesia_log:backup(Opaque, Mod).

traverse_backup(S, T, Fun, Acc) ->
    mnesia_bup:traverse_backup(S, T, Fun, Acc).

traverse_backup(S, SM, T, TM, F, A) ->
    mnesia_bup:traverse_backup(S, SM, T, TM, F, A).

install_fallback(Opaque) ->
    mnesia_bup:install_fallback(Opaque).

install_fallback(Opaque, Mod) ->
    mnesia_bup:install_fallback(Opaque, Mod).

uninstall_fallback() ->
    mnesia_bup:uninstall_fallback().

uninstall_fallback(Args) ->
    mnesia_bup:uninstall_fallback(Args).

activate_checkpoint(Args) ->
    mnesia_checkpoint:activate(Args).

deactivate_checkpoint(Name) ->
    mnesia_checkpoint:deactivate(Name).

backup_checkpoint(Name, Opaque) ->
    mnesia_log:backup_checkpoint(Name, Opaque).

backup_checkpoint(Name, Opaque, Mod) ->
    mnesia_log:backup_checkpoint(Name, Opaque, Mod).

restore(Opaque, Args) ->
    mnesia_schema:restore(Opaque, Args).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Table mgt

%% Create a table from a full option list (which carries the name).
create_table(Arg) ->
    mnesia_schema:create_table(Arg).

%% Create table Name from the option list Arg.
%%
%% A non-list Arg is rejected with {aborted, {badarg, Name, Arg}}.
%% The reason is wrapped in a single term so that the result has the
%% two-element {aborted, Reason} shape; the previous four-element
%% tuple could not be matched by an {aborted, _} pattern.
create_table(Name, Arg) when is_list(Arg) ->
    mnesia_schema:create_table([{name, Name}| Arg]);
create_table(Name, Arg) ->
    {aborted, {badarg, Name, Arg}}.

delete_table(Tab) ->
    mnesia_schema:delete_table(Tab).

add_table_copy(Tab, N, S) ->
    mnesia_schema:add_table_copy(Tab, N, S).

del_table_copy(Tab, N) ->
    mnesia_schema:del_table_copy(Tab, N).

move_table_copy(Tab, From, To) ->
    mnesia_schema:move_table(Tab, From, To).

add_table_index(Tab, Ix) ->
    mnesia_schema:add_table_index(Tab, Ix).

del_table_index(Tab, Ix) ->
    mnesia_schema:del_table_index(Tab, Ix).

%% Transform table Tab with Fun and the new attribute list NewA,
%% keeping the table's current record name.  Aborts when the record
%% name cannot be looked up.
transform_table(Tab, Fun, NewA) ->
    case catch val({Tab, record_name}) of
        {'EXIT', Reason} ->
            mnesia:abort(Reason);
        OldRN ->
            mnesia_schema:transform_table(Tab, Fun, NewA, OldRN)
    end.

%% As transform_table/3 but with an explicit new record name.
transform_table(Tab, Fun, NewA, NewRN) ->
    mnesia_schema:transform_table(Tab, Fun, NewA, NewRN).
%% Delegates to mnesia_schema:change_table_copy_type/3.
change_table_copy_type(T, N, S) ->
    mnesia_schema:change_table_copy_type(T, N, S).

%% Delete all records in Tab.
%%
%% The work is run in a sync transaction.  It is started from the
%% current activity when that activity is not itself a transaction,
%% or from scratch when there is no activity.  Calling this inside a
%% transaction is not allowed and aborts.
clear_table(Tab) ->
    Clear = fun() -> do_clear_table(Tab) end,
    case get(mnesia_activity_state) of
        {Mod, Tid, _Ts} = Activity when element(1, Tid) =/= tid ->
            transaction(Activity, Clear, [], infinity, Mod, sync);
        undefined ->
            transaction(undefined, Clear, [], infinity, ?DEFAULT_ACCESS, sync);
        _ ->
            %% Not allowed for clear_table
            %% NOTE(review): the reason passed to abort already carries
            %% the 'aborted' tag; confirm whether the nesting is intended.
            mnesia:abort({aborted, nested_transaction})
    end.

%% Runs inside the transaction started by clear_table/1 and hands the
%% request to the access module of the current activity.
do_clear_table(Tab) ->
    case get(mnesia_activity_state) of
        {?DEFAULT_ACCESS, Tid, Ts} ->
            clear_table(Tid, Ts, Tab, '_');
        {AccessMod, Tid, Ts} ->
            AccessMod:clear_table(Tid, Ts, Tab, '_');
        _ ->
            abort(no_transaction)
    end.

%% Write-lock the whole table and record a clear_table operation for
%% it in the transaction store.  Only defined for a real transaction
%% id.
clear_table(Tid, Ts, Tab, Obj) when element(1, Tid) =:= tid ->
    TxStore = Ts#tidstore.store,
    mnesia_locker:wlock_table(Tid, TxStore, Tab),
    ?ets_insert(TxStore, {{Tab, '_'}, Obj, clear_table}),
    ok.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Table mgt - user properties

%% Read the user property stored under PropKey for table Tab.
read_table_property(Tab, PropKey) ->
    val({Tab, user_property, PropKey}).

write_table_property(Tab, Prop) ->
    mnesia_schema:write_table_property(Tab, Prop).

delete_table_property(Tab, PropKey) ->
    mnesia_schema:delete_table_property(Tab, PropKey).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Table mgt - fragmentation properties

change_table_frag(Tab, FragProp) ->
    mnesia_schema:change_table_frag(Tab, FragProp).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Table mgt - table load

%% Dump a ram table to disc
dump_tables(Tabs) ->
    mnesia_schema:dump_tables(Tabs).

%% allow the user to wait for some tables to be loaded
wait_for_tables(Tabs, Timeout) ->
    mnesia_controller:wait_for_tables(Tabs, Timeout).

%% Force table Tab to be loaded.  A successful load is reported as
%% 'yes' for backwards compatibility; any other result is returned
%% unchanged.
force_load_table(Tab) ->
    Result = mnesia_controller:force_load_table(Tab),
    case Result of
        ok -> yes;
        _ -> Result
    end.
%% Delegates to mnesia_schema:change_table_access_mode/2.
change_table_access_mode(T, Access) ->
    mnesia_schema:change_table_access_mode(T, Access).

%% Delegates to mnesia_schema:change_table_load_order/2.
change_table_load_order(T, O) ->
    mnesia_schema:change_table_load_order(T, O).

%% Set the master nodes of every table to those of Nodes that hold a
%% copy of the table.
%%
%% When Mnesia is running the table definitions are taken from
%% mnesia_gvar.  When it is not running they are read from disc under
%% the schema table lock; without a Mnesia directory nothing is done
%% and ok is returned.  Returns the result of logging the master
%% nodes, {error, Reason} when the definitions cannot be read, or
%% {error, {bad_type, Nodes}} when Nodes is not a list.
set_master_nodes(Nodes) when is_list(Nodes) ->
    UseDir = system_info(use_dir),
    IsRunning = system_info(is_running),
    case IsRunning of
        yes ->
            CsPat = {{'_', cstruct}, '_'},
            Cstructs0 = ?ets_match_object(mnesia_gvar, CsPat),
            Cstructs = [Cs || {_, Cs} <- Cstructs0],
            log_valid_master_nodes(Cstructs, Nodes, UseDir, IsRunning);
        _NotRunning ->
            case UseDir of
                true ->
                    mnesia_lib:lock_table(schema),
                    %% Release the lock even when reading the schema
                    %% or logging raises; it used to be left held.
                    try
                        case mnesia_schema:read_cstructs_from_disc() of
                            {ok, Cstructs} ->
                                log_valid_master_nodes(Cstructs, Nodes,
                                                       UseDir, IsRunning);
                            {error, Reason} ->
                                {error, Reason}
                        end
                    after
                        mnesia_lib:unlock_table(schema)
                    end;
                false ->
                    ok
            end
    end;
set_master_nodes(Nodes) ->
    {error, {bad_type, Nodes}}.

%% For each table definition pair the table name with the nodes from
%% Nodes that hold a copy of it, then log the result as the new
%% master nodes.
log_valid_master_nodes(Cstructs, Nodes, UseDir, IsRunning) ->
    Fun = fun(Cs) ->
                  Copies = mnesia_lib:copy_holders(Cs),
                  Valid = mnesia_lib:intersect(Nodes, Copies),
                  {Cs#cstruct.name, Valid}
          end,
    Args = lists:map(Fun, Cstructs),
    mnesia_recover:log_master_nodes(Args, UseDir, IsRunning).
%% Set the master nodes of table Tab to Nodes.
%%
%% Every node in Nodes must hold a copy of Tab, otherwise
%% {error, {no_exists, Tab, BadNodes}} is returned.  An unknown table
%% gives {error, {no_exists, Tab}}.  When Mnesia is not running the
%% table definition is read from disc under the schema table lock;
%% without a Mnesia directory nothing is done and ok is returned.
%% A non-list Nodes gives {error, {bad_type, Tab, Nodes}}.
set_master_nodes(Tab, Nodes) when is_list(Nodes) ->
    UseDir = system_info(use_dir),
    IsRunning = system_info(is_running),
    case IsRunning of
        yes ->
            case ?catch_val({Tab, cstruct}) of
                {'EXIT', _} ->
                    {error, {no_exists, Tab}};
                Cs ->
                    log_tab_master_nodes(Tab, Nodes, Cs, UseDir, IsRunning)
            end;
        _NotRunning ->
            case UseDir of
                true ->
                    mnesia_lib:lock_table(schema),
                    %% Release the lock even when reading the schema
                    %% or logging raises; it used to be left held.
                    try
                        case mnesia_schema:read_cstructs_from_disc() of
                            {ok, Cstructs} ->
                                %% The table name is the second element
                                %% of a cstruct record.
                                case lists:keysearch(Tab, 2, Cstructs) of
                                    {value, Cs} ->
                                        log_tab_master_nodes(Tab, Nodes, Cs,
                                                             UseDir, IsRunning);
                                    false ->
                                        {error, {no_exists, Tab}}
                                end;
                            {error, Reason} ->
                                {error, Reason}
                        end
                    after
                        mnesia_lib:unlock_table(schema)
                    end;
                false ->
                    ok
            end
    end;
set_master_nodes(Tab, Nodes) ->
    {error, {bad_type, Tab, Nodes}}.

%% Log Nodes as master nodes of Tab, provided that all of them hold a
%% copy according to the table definition Cs.
log_tab_master_nodes(Tab, Nodes, Cs, UseDir, IsRunning) ->
    case Nodes -- mnesia_lib:copy_holders(Cs) of
        [] ->
            Args = [{Tab , Nodes}],
            mnesia_recover:log_master_nodes(Args, UseDir, IsRunning);
        BadNodes ->
            {error, {no_exists, Tab, BadNodes}}
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Misc admin

%% Request a synchronous log dump on behalf of the user.
dump_log() ->
    mnesia_controller:sync_dump_log(user).

%% Subscribe the calling process to events of category What.
subscribe(What) ->
    mnesia_subscr:subscribe(self(), What).

%% Cancel the calling process's subscription to What.
unsubscribe(What) ->
    mnesia_subscr:unsubscribe(self(), What).

%% Report a user defined event, tagged as mnesia_user.
report_event(Event) ->
    mnesia_lib:report_system_event({mnesia_user, Event}).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Snmp

snmp_open_table(Tab, Us) ->
    mnesia_schema:add_snmp(Tab, Us).

snmp_close_table(Tab) ->
    mnesia_schema:del_snmp(Tab).
%% Look up the row of Tab addressed by the SNMP index RowIndex.
%%
%% Inside a transaction the row is read through the access module.
%% When the index cannot be mapped to a key, the operations pending
%% in the transaction store are searched instead.  Outside a
%% transaction the lookup is done with a dirty call to
%% mnesia_snmp_hook.  Returns {ok, Row} or undefined; aborts with
%% {bad_type, Tab} on bad arguments.
snmp_get_row(Tab, RowIndex) when is_atom(Tab), Tab /= schema, is_list(RowIndex) ->
    case get(mnesia_activity_state) of
        {Mod, Tid, Ts=#tidstore{store=Store}} when element(1, Tid) =:= tid ->
            case snmp_oid_to_mnesia_key(RowIndex, Tab) of
                unknown -> %% Arrg contains fix_string
                    Pending = find_ops(Store, Tab, val({Tab, wild_pattern})),
                    SnmpType = val({Tab,snmp}),
                    %% The last pending operation on the wanted index
                    %% decides the outcome.
                    Pick =
                        fun({{_,Key},Row,Op}, Found) ->
                                Oid = mnesia_snmp_hook:key_to_oid(Tab,Key,SnmpType),
                                if
                                    Oid =:= RowIndex, Op =:= write -> {ok, Row};
                                    Oid =:= RowIndex -> undefined;
                                    true -> Found
                                end
                        end,
                    lists:foldl(Pick, undefined, Pending);
                Key ->
                    case Mod:read(Tid, Ts, Tab, Key, read) of
                        [Row] -> {ok, Row};
                        _ -> undefined
                    end
            end;
        _ ->
            dirty_rpc(Tab, mnesia_snmp_hook, get_row, [Tab, RowIndex])
    end;
snmp_get_row(Tab, _RowIndex) ->
    abort({bad_type, Tab}).

%%%%%%%%%%%%%

%% Return {ok, NextIndex} for the SNMP index that follows RowIndex in
%% Tab, or endOfTable.  Inside a transaction the keys touched by the
%% transaction are taken into account.
snmp_get_next_index(Tab, RowIndex) when is_atom(Tab), Tab /= schema, is_list(RowIndex) ->
    {Next,OrigKey} = dirty_rpc(Tab, mnesia_snmp_hook, get_next_index, [Tab, RowIndex]),
    case get(mnesia_activity_state) of
        {_Mod, Tid, #tidstore{store=Store}} when element(1, Tid) =:= tid ->
            snmp_next_in_transaction(Store, Tab, RowIndex, Next, OrigKey);
        _ ->
            case Next of
                endOfTable -> endOfTable;
                _ -> {ok, Next}
            end
    end;
snmp_get_next_index(Tab, _RowIndex) ->
    abort({bad_type, Tab}).

%% Transaction part of snmp_get_next_index/2.  OrigKey is the key of
%% the dirty candidate Next, or undefined when there is none.
snmp_next_in_transaction(Store, Tab, RowIndex, _Next, undefined) ->
    snmp_order_keys(Store, Tab, RowIndex, []);
snmp_next_in_transaction(Store, Tab, RowIndex, Next, OrigKey) ->
    case ?ets_match(Store, {{Tab,OrigKey}, '_', '$1'}) of
        [] ->
            snmp_order_keys(Store,Tab,RowIndex,[OrigKey]);
        Ops ->
            case lists:last(Ops) of
                [delete] ->
                    %% The candidate was deleted in this transaction;
                    %% continue the search after it.
                    snmp_get_next_index(Tab, Next);
                _ ->
                    snmp_order_keys(Store,Tab,RowIndex,[OrigKey])
            end
    end.

%% Convert the keys found in the transaction store (seeded with Def)
%% to SNMP indexes and return the first one that sorts after
%% RowIndex.
snmp_order_keys(Store,Tab,RowIndex,Def) ->
    All = ?ets_match(Store, {{Tab,'$1'},'_','$2'}),
    SnmpType = val({Tab,snmp}),
    ToOid = fun(Key) -> mnesia_snmp_hook:key_to_oid(Tab,Key,SnmpType) end,
    Oids = lists:map(ToOid, ts_keys_1(All, Def)),
    get_ordered_snmp_key(RowIndex, lists:sort(Oids)).
%% Return {ok, Key} for the first element of the list that is greater
%% than Prev, or endOfTable when there is none.
get_ordered_snmp_key(Prev, Keys) ->
    NotAfter = fun(Key) -> not (Prev < Key) end,
    case lists:dropwhile(NotAfter, Keys) of
        [First|_] -> {ok, First};
        [] -> endOfTable
    end.

%%%%%%%%%%

%% Translate the SNMP index RowIndex of Tab into a Mnesia key.
%% Returns {ok, Key} or undefined.  Inside a transaction the dirty
%% answer is adjusted for operations pending in the transaction
%% store.  Aborts with {bad_type, Tab} on bad arguments.
snmp_get_mnesia_key(Tab, RowIndex) when is_atom(Tab), Tab /= schema, is_list(RowIndex) ->
    case get(mnesia_activity_state) of
        {_Mod, Tid, Ts} when element(1, Tid) =:= tid ->
            Dirty = dirty_rpc(Tab,mnesia_snmp_hook,get_mnesia_key,[Tab,RowIndex]),
            snmp_filter_key(Dirty, RowIndex, Tab, Ts#tidstore.store);
        _ ->
            dirty_rpc(Tab, mnesia_snmp_hook, get_mnesia_key, [Tab, RowIndex])
    end;
snmp_get_mnesia_key(Tab, _RowIndex) ->
    abort({bad_type, Tab}).

%% Map RowIndex to a Mnesia key, falling back to a dirty lookup when
%% the index cannot be converted locally.  Returns the key or
%% 'unknown'.
snmp_oid_to_mnesia_key(RowIndex, Tab) ->
    case mnesia_snmp_hook:oid_to_key(RowIndex, Tab) of
        unknown -> %% Contains fix_string needs lookup
            case dirty_rpc(Tab,mnesia_snmp_hook,get_mnesia_key,[Tab,RowIndex]) of
                {ok, MnesiaKey} -> MnesiaKey;
                undefined -> unknown
            end;
        MnesiaKey ->
            MnesiaKey
    end.

%% Reconcile a dirty key lookup with the operations pending in the
%% transaction store Store.
%%
%% A key found by the dirty lookup is kept unless the last pending
%% operation on it is something other than a write.  A key not found
%% by the dirty lookup is reported only when the last pending
%% operation on it is a write.
snmp_filter_key(Res = {ok,Key}, _RowIndex, Tab, Store) ->
    case ?ets_lookup(Store, {Tab,Key}) of
        [] ->
            Res;
        Ops ->
            case lists:last(Ops) of
                {_, _, write} -> Res;
                _ -> undefined
            end
    end;
snmp_filter_key(undefined, RowIndex, Tab, Store) ->
    case mnesia_snmp_hook:oid_to_key(RowIndex, Tab) of
        unknown -> %% Arrg contains fix_string
            Pending = find_ops(Store, Tab, val({Tab, wild_pattern})),
            SnmpType = val({Tab,snmp}),
            %% The last pending operation on the wanted index decides.
            Pick =
                fun({{_,Key},_,Op}, Found) ->
                        Oid = mnesia_snmp_hook:key_to_oid(Tab,Key,SnmpType),
                        if
                            Oid =:= RowIndex, Op =:= write -> {ok, Key};
                            Oid =:= RowIndex -> undefined;
                            true -> Found
                        end
                end,
            lists:foldl(Pick, undefined, Pending);
        Key ->
            case ?ets_lookup(Store, {Tab,Key}) of
                [] ->
                    undefined;
                Ops ->
                    case lists:last(Ops) of
                        {_, _, write} -> {ok, Key};
                        _ -> undefined
                    end
            end
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Textfile access

%% Delegates to mnesia_text:load_textfile/1.
load_textfile(F) ->
    mnesia_text:load_textfile(F).
%% Delegates to mnesia_text:dump_to_textfile/1.
dump_to_textfile(F) ->
    mnesia_text:dump_to_textfile(F).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% QLC Handles
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% QLC handle for Tab with default options.
table(Tab) ->
    table(Tab, []).

%% Build a QLC handle for Tab.
%%
%% Recognised options (defaults in parentheses): traverse (select),
%% lock (read) and n_objects (100).  Remaining options are passed on
%% to qlc:table/2.  A traverse value other than select or
%% {select, MatchSpec} raises {badarg, {Trav, [Tab, Opts]}}.
table(Tab,Opts) ->
    Defaults = [{traverse,select},{lock,read},{n_objects,100}],
    {[Trav,Lock,NObjects],QlcOptions0} = qlc_opts(Opts, Defaults),
    Chunked = fun(Spec) -> qlc_select(select(Tab,Spec,NObjects,Lock)) end,
    %% With a fixed match spec no lookup function is offered.
    {TraverseFun, Lookup} =
        case Trav of
            {select,Ms} ->
                {fun() -> Chunked(Ms) end, []};
            select ->
                ByKey = fun(Key) -> read(Tab,Key,Lock) end,
                LFun = fun(2, Keys) ->
                               lists:flatmap(ByKey, Keys);
                          (Index,Keys) ->
                               ByIndex = fun(Key) -> index_read(Tab,Key,Index) end,
                               lists:flatmap(ByIndex, Keys)
                       end,
                {fun(Ms) -> Chunked(Ms) end, [{lookup_fun, LFun}]};
            _ ->
                erlang:error({badarg, {Trav,[Tab, Opts]}})
        end,
    Hooks = [{pre_fun, fun(Arg) -> pre_qlc(Arg, Tab) end},
             {post_fun, fun() -> post_qlc(Tab) end},
             {info_fun, fun(Tag) -> qlc_info(Tab, Tag) end},
             {parent_fun, fun() -> {mnesia_activity, mnesia:get_activity_id()} end},
             {format_fun, fun(Type) -> qlc_format(Type, Tab, NObjects, Lock, Opts) end}],
    qlc:table(TraverseFun, Hooks ++ Lookup ++ QlcOptions0).

%% Called by qlc before the table is traversed.
%%
%% Makes sure there is an activity: when the calling process has
%% none, the parent's activity from Opts is adopted; without one the
%% call aborts with no_transaction.  Outside a transaction, tables
%% that are not ordered_set are fixed for the traversal.
pre_qlc(Opts, Tab) ->
    {_,Tid,_} =
        case get(mnesia_activity_state) of
            undefined ->
                case lists:keyfind(parent_value, 1, Opts) of
                    {parent_value,{mnesia_activity,undefined}} ->
                        abort(no_transaction);
                    {parent_value,{mnesia_activity,Aid}} ->
                        {stop_fun,Stop} = lists:keyfind(stop_fun,1,Opts),
                        put_activity_id(Aid,Stop),
                        Aid;
                    _ ->
                        abort(no_transaction)
                end;
            Current ->
                Current
        end,
    InTransaction = element(1,Tid) =:= tid,
    case InTransaction orelse ?catch_val({Tab, setorbag}) of
        true ->
            ok;
        ordered_set ->
            ok;
        _ ->
            dirty_rpc(Tab, mnesia_tm, fixtable, [Tab,true,self()]),
            ok
    end.
%% Called by qlc after the traversal.  Outside a transaction, release
%% the fix on tables that are not ordered_set.
post_qlc(Tab) ->
    case catch get(mnesia_activity_state) of
        {_,#tid{},_} ->
            ok;
        _ ->
            case ?catch_val({Tab, setorbag}) of
                ordered_set ->
                    ok;
                _ ->
                    dirty_rpc(Tab, mnesia_tm, fixtable, [Tab,false,self()]),
                    ok
            end
    end.

%% Turn a select result into the lazy list format qlc expects: the
%% objects of this chunk followed by a fun producing the rest.  Empty
%% chunks are skipped.
qlc_select(Chunk) ->
    case Chunk of
        '$end_of_table' ->
            [];
        {[], Cont} ->
            qlc_select(select(Cont));
        {Objects, Cont} ->
            Objects ++ fun() -> qlc_select(select(Cont)) end
    end.

%% Split an option list (or a single option) into the values of the
%% {Key, Default} pairs in Keys, in the same order, and the options
%% that remain.
qlc_opts(Opts, Keys) when is_list(Opts) ->
    qlc_opts(Opts, Keys, []);
qlc_opts(Option, Keys) ->
    qlc_opts([Option], Keys, []).

qlc_opts(Rest, [], Picked) ->
    {lists:reverse(Picked), Rest};
qlc_opts(Opts, [{Key,Default}|Keys], Picked) ->
    Value = case lists:keyfind(Key, 1, Opts) of
                {Key,Given} -> Given;
                false -> Default
            end,
    qlc_opts(lists:keydelete(Key,1,Opts), Keys, [Value|Picked]).

%% Answer qlc's questions about table Tab; unknown tags give
%% undefined.
qlc_info(Tab, num_of_objects) ->
    dirty_rpc(Tab, ?MODULE, raw_table_info, [Tab, size]);
qlc_info(_, keypos) ->
    2;
qlc_info(_, is_unique_objects) ->
    true;
qlc_info(Tab, is_unique_keys) ->
    Type = val({Tab, type}),
    Type =:= set orelse Type =:= ordered_set;
qlc_info(Tab, is_sorted_objects) ->
    case val({Tab, type}) of
        ordered_set ->
            case ?catch_val({Tab, frag_hash}) of
                {'EXIT', _} -> ascending;
                _ -> no %% Fragmented tables are not ordered
            end;
        _ ->
            no
    end;
qlc_info(Tab, indices) ->
    val({Tab,index});
qlc_info(_Tab, _) ->
    undefined.

%% Describe for qlc how the table is accessed: an {M, F, Args} call
%% for full and match spec traversals, printable text for key and
%% index lookups.
qlc_format(all, Tab, NObjects, Lock, Opts) ->
    {?MODULE, table, [Tab,[{n_objects, NObjects}, {lock,Lock}|Opts]]};
qlc_format({match_spec, Ms}, Tab, NObjects, Lock, Opts) ->
    {?MODULE, table, [Tab,[{traverse,{select,Ms}},{n_objects, NObjects}, {lock,Lock}|Opts]]};
qlc_format({lookup, 2, Keys}, Tab, _, Lock, _) ->
    io_lib:format("lists:flatmap(fun(V) -> "
                  "~w:read(~w, V, ~w) end, ~w)",
                  [?MODULE, Tab, Lock, Keys]);
qlc_format({lookup, Index,Keys}, Tab, _, _, _) ->
    io_lib:format("lists:flatmap(fun(V) -> "
                  "~w:index_read(~w, V, ~w) end, ~w)",
                  [?MODULE, Tab, Index, Keys]).
+ + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +do_fixtable(Tab, #tidstore{store=Store}) -> + do_fixtable(Tab,Store); +do_fixtable(Tab, Store) -> + case ?catch_val({Tab, setorbag}) of + ordered_set -> + ok; + _ -> + case ?ets_match_object(Store, {fixtable, {Tab, '_'}}) of + [] -> + Node = dirty_rpc(Tab, mnesia_tm, fixtable, [Tab,true,self()]), + ?ets_insert(Store, {fixtable, {Tab, Node}}); + _ -> + ignore + end, + ok + end. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% Mnemosyne exclusive + +get_activity_id() -> + get(mnesia_activity_state). + +put_activity_id(Activity) -> + mnesia_tm:put_activity_id(Activity). +put_activity_id(Activity,Fun) -> + mnesia_tm:put_activity_id(Activity,Fun). diff --git a/lib/mnesia/src/mnesia.hrl b/lib/mnesia/src/mnesia.hrl new file mode 100644 index 0000000000..d488d9364a --- /dev/null +++ b/lib/mnesia/src/mnesia.hrl @@ -0,0 +1,121 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 1996-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% + +-define(APPLICATION, mnesia). + +-define(ets_lookup(Tab, Key), ets:lookup(Tab, Key)). +-define(ets_lookup_element(Tab, Key, Pos), ets:lookup_element(Tab, Key, Pos)). +-define(ets_insert(Tab, Rec), ets:insert(Tab, Rec)). +-define(ets_delete(Tab, Key), ets:delete(Tab, Key)). +-define(ets_match_delete(Tab, Pat), ets:match_delete(Tab, Pat)). 
+-define(ets_match_object(Tab, Pat), ets:match_object(Tab, Pat)). +-define(ets_match(Tab, Pat), ets:match(Tab, Pat)). +-define(ets_info(Tab, Item), ets:info(Tab, Item)). +-define(ets_update_counter(Tab, Key, Incr), ets:update_counter(Tab, Key, Incr)). +-define(ets_first(Tab), ets:first(Tab)). +-define(ets_next(Tab, Key), ets:next(Tab, Key)). +-define(ets_last(Tab), ets:last(Tab)). +-define(ets_prev(Tab, Key), ets:prev(Tab, Key)). +-define(ets_slot(Tab, Pos), ets:slot(Tab, Pos)). +-define(ets_new_table(Tab, Props), ets:new(Tab, Props)). +-define(ets_delete_table(Tab), ets:delete(Tab)). +-define(ets_fixtable(Tab, Bool), ets:fixtable(Tab, Bool)). + +-define(catch_val(Var), (catch ?ets_lookup_element(mnesia_gvar, Var, 2))). + +%% It's important that counter is first, since we compare tid's + +-record(tid, + {counter, %% serial no for tid + pid}). %% owner of tid + + +-record(tidstore, + {store, %% current ets table for tid + up_stores = [], %% list of upper layer stores for nested trans + level = 1}). %% transaction level + +-define(unique_cookie, {erlang:now(), node()}). + +-record(cstruct, {name, % Atom + type = set, % set | ordered_set | bag + ram_copies = [], % [Node] + disc_copies = [], % [Node] + disc_only_copies = [], % [Node] + load_order = 0, % Integer + access_mode = read_write, % read_write | read_only + index = [], % [Integer] + snmp = [], % Snmp Ustruct + local_content = false, % true | false + record_name = {bad_record_name}, % Atom (Default = Name) + attributes = [key, val], % [Atom] + user_properties = [], % [Record] + frag_properties = [], % [{Key, Val}] + cookie = ?unique_cookie, % Term + version = {{2, 0}, []}}). % {{Integer, Integer}, [Node]} + +%% Record for the head structure in Mnesia's log files +%% +%% The definition of this record may *NEVER* be changed +%% since it may be written to very old backup files. +%% By holding this record definition stable we are +%% able to comprehend backups from timepoint 0.
It also +%% allows us to use the backup format as an interchange +%% format between Mnesia releases. + +-record(log_header,{log_kind, + log_version, + mnesia_version, + node, + now}). + +%% Commit records stored in the transaction log +-record(commit, {node, + decision, % presume_commit | Decision + ram_copies = [], + disc_copies = [], + disc_only_copies = [], + snmp = [], + schema_ops = [] + }). + +-record(decision, {tid, + outcome, % presume_abort | committed + disc_nodes, + ram_nodes}). + +%% Maybe cyclic wait +-record(cyclic, {node = node(), + oid, % {Tab, Key} + op, % read | write + lock, % read | write + lucky + }). + +%% Managing conditional debug functions + +-ifdef(debug). + -define(eval_debug_fun(I, C), + mnesia_lib:eval_debug_fun(I, C, ?FILE, ?LINE)). +-else. + -define(eval_debug_fun(I, C), ok). +-endif. + diff --git a/lib/mnesia/src/mnesia_backup.erl b/lib/mnesia/src/mnesia_backup.erl new file mode 100644 index 0000000000..f372ca0be5 --- /dev/null +++ b/lib/mnesia/src/mnesia_backup.erl @@ -0,0 +1,201 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 1996-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% + +%%-behaviour(mnesia_backup). +%0 + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% +%% This module contains one implementation of callback functions +%% used by Mnesia at backup and restore. 
The user may however +%% write an own module the same interface as mnesia_backup and +%% configure Mnesia so the alternate module performs the actual +%% accesses to the backup media. This means that the user may put +%% the backup on medias that Mnesia does not know about, possibly +%% on hosts where Erlang is not running. +%% +%% The OpaqueData argument is never interpreted by other parts of +%% Mnesia. It is the property of this module. Alternate implementations +%% of this module may have different interpretations of OpaqueData. +%% The OpaqueData argument given to open_write/1 and open_read/1 +%% are forwarded directly from the user. +%% +%% All functions must return {ok, NewOpaqueData} or {error, Reason}. +%% +%% The NewOpaqueData arguments returned by backup callback functions will +%% be given as input when the next backup callback function is invoked. +%% If any return value does not match {ok, _} the backup will be aborted. +%% +%% The NewOpaqueData arguments returned by restore callback functions will +%% be given as input when the next restore callback function is invoked +%% If any return value does not match {ok, _} the restore will be aborted. +%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-module(mnesia_backup). + +-include_lib("kernel/include/file.hrl"). + +-export([ + %% Write access + open_write/1, + write/2, + commit_write/1, + abort_write/1, + + %% Read access + open_read/1, + read/1, + close_read/1 + ]). + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% Backup callback interface +-record(backup, {tmp_file, file, file_desc}). 
%% Opens backup media for write
%%
%% OpaqueData is the name of the backup file.  Any earlier backup
%% with that name and any leftover temporary file are deleted, and
%% the backup is written to a temporary file until commit_write/1
%% renames it.
%%
%% Returns {ok, OpaqueData} or {error, Reason}
open_write(OpaqueData) ->
    File = OpaqueData,
    Tmp = lists:concat([File,".BUPTMP"]),
    file:delete(Tmp),
    file:delete(File),
    LogOpts = [{name, make_ref()},
               {file, Tmp},
               {repair, false},
               {linkto, self()}],
    case disk_log:open(LogOpts) of
        {ok, Fd} ->
            {ok, #backup{tmp_file = Tmp, file = File, file_desc = Fd}};
        {error, Reason} ->
            {error, Reason}
    end.

%% Writes BackupItems to the backup media
%%
%% A failed write also aborts the backup (see abort_write/1).
%%
%% Returns {ok, OpaqueData} or {error, Reason}
write(OpaqueData, BackupItems) ->
    Backup = OpaqueData,
    case disk_log:log_terms(Backup#backup.file_desc, BackupItems) of
        {error, Reason} ->
            abort_write(Backup),
            {error, Reason};
        ok ->
            {ok, Backup}
    end.

%% Closes the backup media after a successful backup
%%
%% The log is synced and closed, and the temporary file is renamed to
%% the final backup file.  The first failing step is reported.
%%
%% Returns {ok, ReturnValueToUser} or {error, Reason}
commit_write(OpaqueData) ->
    Backup = OpaqueData,
    case disk_log:sync(Backup#backup.file_desc) of
        ok -> close_backup_log(Backup);
        {error, Reason} -> {error, Reason}
    end.

%% Second step of commit_write/1: close the log.
close_backup_log(Backup) ->
    case disk_log:close(Backup#backup.file_desc) of
        ok -> publish_backup_file(Backup);
        {error, Reason} -> {error, Reason}
    end.

%% Last step of commit_write/1: move the temporary file into place.
publish_backup_file(Backup) ->
    case file:rename(Backup#backup.tmp_file, Backup#backup.file) of
        ok -> {ok, Backup#backup.file};
        {error, Reason} -> {error, Reason}
    end.

%% Closes the backup media after an interrupted backup
%%
%% The temporary file is deleted whether or not the log could be
%% closed.
%%
%% Returns {ok, ReturnValueToUser} or {error, Reason}
abort_write(BackupRef) ->
    Closed = disk_log:close(BackupRef#backup.file_desc),
    file:delete(BackupRef#backup.tmp_file),
    case Closed of
        {error, Reason} ->
            {error, Reason};
        ok ->
            {ok, BackupRef#backup.file}
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Restore callback interface

-record(restore, {file, file_desc, cont}).

%% Opens the backup media for reading.
%%
%% OpaqueData is the name of the backup file. The file must exist;
%% otherwise the error from file:read_file_info/1 is returned and no
%% log is opened. The log is opened read-only and without repair.
%%
%% Returns {ok, OpaqueData} or {error, Reason}
open_read(OpaqueData) ->
    Source = OpaqueData,
    case file:read_file_info(Source) of
        {error, _} = Missing ->
            Missing;
        _Exists ->
            LogOpts = [{file, Source},
                       {name, make_ref()},
                       {repair, false},
                       {mode, read_only},
                       {linkto, self()}],
            case disk_log:open(LogOpts) of
                {ok, Log} ->
                    {ok, #restore{file = Source, file_desc = Log, cont = start}};
                {repaired, Log, _, _} ->
                    %% Accepted regardless of how many bad bytes were reported
                    {ok, #restore{file = Source, file_desc = Log, cont = start}};
                {error, _} = Failure ->
                    Failure
            end
    end.

%% Reads the next bunch of BackupItems from the backup media.
%%
%% Empty chunks are skipped, so an empty item list is only ever
%% returned at end of file.
%%
%% Returns {ok, OpaqueData, BackupItems} or {error, Reason}
%%
%% BackupItems == [] is interpreted as eof
read(OpaqueData) ->
    Log = OpaqueData#restore.file_desc,
    case disk_log:chunk(Log, OpaqueData#restore.cont) of
        {error, Reason} ->
            {error, {"Possibly truncated", Reason}};
        eof ->
            {ok, OpaqueData, []};
        {Next, []} ->
            read(OpaqueData#restore{cont = Next});
        {Next, Items, _BadBytes} ->
            {ok, OpaqueData#restore{cont = Next}, Items};
        {Next, Items} ->
            {ok, OpaqueData#restore{cont = Next}, Items}
    end.

%% Closes the backup media after a restore.
%%
%% Returns {ok, ReturnValueToUser} or {error, Reason}
close_read(OpaqueData) ->
    case disk_log:close(OpaqueData#restore.file_desc) of
        ok ->
            {ok, OpaqueData#restore.file};
        {error, _} = Failure ->
            Failure
    end.
%0 + diff --git a/lib/mnesia/src/mnesia_bup.erl b/lib/mnesia/src/mnesia_bup.erl new file mode 100644 index 0000000000..37a8258d74 --- /dev/null +++ b/lib/mnesia/src/mnesia_bup.erl @@ -0,0 +1,1186 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 1996-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License.
You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% +-module(mnesia_bup). +-export([ + %% Public interface + iterate/4, + read_schema/2, + fallback_bup/0, + fallback_exists/0, + tm_fallback_start/1, + create_schema/1, + install_fallback/1, + install_fallback/2, + uninstall_fallback/0, + uninstall_fallback/1, + traverse_backup/4, + traverse_backup/6, + make_initial_backup/3, + fallback_to_schema/0, + lookup_schema/2, + schema2bup/1, + refresh_cookie/2, + + %% Internal + fallback_receiver/2, + install_fallback_master/2, + uninstall_fallback_master/2, + local_uninstall_fallback/2, + do_traverse_backup/7, + trav_apply/4 + ]). + +-include("mnesia.hrl"). +-import(mnesia_lib, [verbose/2, dbg_out/2]). + +-record(restore, {mode, bup_module, bup_data}). + +-record(fallback_args, {opaque, + scope = global, + module = mnesia_monitor:get_env(backup_module), + use_default_dir = true, + mnesia_dir, + fallback_bup, + fallback_tmp, + skip_tables = [], + keep_tables = [], + default_op = keep_tables + }). + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% Backup iterator + +%% Reads schema section and iterates over all records in a backup. +%% +%% Fun(BunchOfRecords, Header, Schema, Acc) is applied when a suitable amount +%% of records has been collected. +%% +%% BunchOfRecords will be [] when the iteration is done. 
%% Reads the schema section of a backup and then feeds every following
%% bunch of backup items to Fun(Items, Header, Schema, Acc).
%% Fun is called one last time with Items =:= [] when the media is
%% exhausted; its result becomes the value returned in {ok, Res}.
%% The media is closed (best effort) on every path once it was opened.
iterate(Mod, Fun, Opaque, Acc) ->
    %% Closing is best effort: failures while closing are swallowed
    Close = fun(Rs) ->
                    catch safe_apply(Rs, close_read, [Rs#restore.bup_data])
            end,
    Start = #restore{bup_module = Mod, bup_data = Opaque},
    case catch read_schema_section(Start) of
        {error, Reason} ->
            {error, Reason};
        {Opened, {Header, Schema, Rest}} ->
            case catch iter(Opened, Header, Schema, Fun, Acc, Rest) of
                {ok, Last, Res} ->
                    Close(Last),
                    {ok, Res};
                {error, Reason} ->
                    Close(Opened),
                    {error, Reason};
                {'EXIT', Pid, Reason} ->
                    Close(Opened),
                    {error, {'EXIT', Pid, Reason}};
                {'EXIT', Reason} ->
                    Close(Opened),
                    {error, {'EXIT', Reason}}
            end
    end.

%% Iteration loop. The last argument holds items that have been read
%% but not yet handed to Fun; when it is empty the next bunch is read.
iter(R, Header, Schema, Fun, Acc, []) ->
    case safe_apply(R, read, [R#restore.bup_data]) of
        {Next, []} ->
            %% End of media: give Fun its final, empty call
            {ok, Next, Fun([], Header, Schema, Acc)};
        {Next, Items} ->
            iter(Next, Header, Schema, Fun, Acc, Items)
    end;
iter(R, Header, Schema, Fun, Acc, Pending) ->
    NewAcc = Fun(Pending, Header, Schema, Acc),
    iter(R, Header, Schema, Fun, NewAcc, []).

%% Applies the callback What of the backup module held in R.
%% A write of an empty item list is a no-op. For read the result is
%% {NewR, Items}; for every other callback it is NewR. Any other
%% outcome, including a crash in the callback, aborts the restore.
safe_apply(R, write, [_, []]) ->
    R;
safe_apply(R, What, Args) ->
    Mod = R#restore.bup_module,
    case catch apply(Mod, What, Args) of
        {ok, Opaque, Items} when What =:= read ->
            {R#restore{bup_data = Opaque}, Items};
        {ok, Opaque} when What =/= read ->
            R#restore{bup_data = Opaque};
        {error, Why} ->
            abort_restore(R, What, Args, Why);
        Unexpected ->
            abort_restore(R, What, Args, Unexpected)
    end.

%% Logs the failure, tries to close the media and throws {error, Reason}.
abort_restore(R, What, Args, Reason) ->
    Mod = R#restore.bup_module,
    Opaque = R#restore.bup_data,
    dbg_out("Restore aborted. ~p:~p~p -> ~p~n",
            [Mod, What, Args, Reason]),
    catch apply(Mod, close_read, [Opaque]),
    throw({error, Reason}).

%% Extracts the schema definition from the installed fallback file.
fallback_to_schema() ->
    fallback_to_schema(fallback_bup()).

%% Extracts the create list of the schema table from the backup file
%% Fname, which is read with the default backup module.
%%
%% Returns {ok, fallback, CreateList} or {error, Reason}
fallback_to_schema(Fname) ->
    Mod = mnesia_backup,
    case read_schema(Mod, Fname) of
        {error, Reason} ->
            {error, Reason};
        Schema ->
            case catch lookup_schema(schema, Schema) of
                {error, _} ->
                    {error, "No schema in fallback"};
                List ->
                    {ok, fallback, List}
            end
    end.

%% Opens Opaque, reads the schema section and then closes the media.
%%
%% Returns the list of schema items or {error, Reason}
read_schema(Mod, Opaque) ->
    R = #restore{bup_module = Mod, bup_data = Opaque},
    case catch read_schema_section(R) of
        {error, Reason} ->
            {error, Reason};
        {R2, {_Header, Schema, _}} ->
            catch safe_apply(R2, close_read, [R2#restore.bup_data]),
            Schema
    end.

%% Opens the backup media, verifies the header and extracts the schema
%% section, converted to the current backup format. The media is left
%% open on success and closed (best effort) on failure.
%%
%% Returns {R, {Header, Schema, Rest}} or {error, Reason}, where Rest
%% holds the items that were read after the last schema item but do
%% not belong to the schema section.
read_schema_section(R) ->
    case catch do_read_schema_section(R) of
        {'EXIT', Reason} ->
            catch safe_apply(R, close_read, [R#restore.bup_data]),
            {error, {'EXIT', Reason}};
        {error, Reason} ->
            catch safe_apply(R, close_read, [R#restore.bup_data]),
            {error, Reason};
        {R2, {H, Schema, Rest}} ->
            Schema2 = convert_schema(H#log_header.log_version, Schema),
            {R2, {H, Schema2, Rest}}
    end.

do_read_schema_section(R) ->
    R2 = safe_apply(R, open_read, [R#restore.bup_data]),
    {R3, RawSchema} = safe_apply(R2, read, [R2#restore.bup_data]),
    do_read_schema_section(R3, verify_header(RawSchema), []).

%% Collects the leading run of schema items. The third argument
%% accumulates them in reverse order; it is reversed once when the
%% section ends, so the result keeps the order found in the backup.
do_read_schema_section(R, {ok, B, C, []}, RevAcc) ->
    %% Current bunch exhausted: fetch the next one
    case safe_apply(R, read, [R#restore.bup_data]) of
        {R2, []} ->
            {R2, {B, lists:reverse(RevAcc), []}};
        {R2, RawSchema} ->
            do_read_schema_section(R2, {ok, B, C, RawSchema}, RevAcc)
    end;

do_read_schema_section(R, {ok, B, C, [Head | Tail]}, RevAcc)
  when element(1, Head) =:= schema ->
    do_read_schema_section(R, {ok, B, C, Tail}, [Head | RevAcc]);

do_read_schema_section(R, {ok, B, _C, Rest}, RevAcc) ->
    %% First non-schema item: the schema section is complete
    {R, {B, lists:reverse(RevAcc), Rest}};

do_read_schema_section(_R, {error, Reason}, _RevAcc) ->
    {error, Reason}.

%% Checks that the first backup item is a log header of the same kind
%% as the current backup header and of a supported version.
%%
%% Returns {ok, Header, CurrentHeader, RemainingItems} or {error, Reason}
verify_header([H | RawSchema]) when is_record(H, log_header) ->
    Current = mnesia_log:backup_log_header(),
    Kind = H#log_header.log_kind,
    case Current of
        #log_header{log_kind = Kind, log_version = CurrentVsn} ->
            Accepted = ["0.1", "1.1", CurrentVsn],
            case lists:member(H#log_header.log_version, Accepted) of
                true ->
                    {ok, H, Current, RawSchema};
                false ->
                    {error, {"Bad header version. Cannot be used as backup.", H}}
            end;
        _ ->
            {error, {"Bad kind of header. Cannot be used as backup.", H}}
    end;
verify_header(RawSchema) ->
    {error, {"Missing header. Cannot be used as backup.", catch hd(RawSchema)}}.

%% Replaces the cookie in the schema table definition found in Schema.
%% Throws {error, Reason} if Schema has no schema table definition.
refresh_cookie(Schema, NewCookie) ->
    case lists:keysearch(schema, 2, Schema) of
        false ->
            Reason = "No schema found. Cannot be used as backup.",
            throw({error, {Reason, Schema}});
        {value, {schema, schema, List}} ->
            OldCs = mnesia_schema:list2cs(List),
            NewCs = OldCs#cstruct{cookie = NewCookie},
            NewItem = {schema, schema, mnesia_schema:cs2list(NewCs)},
            lists:keyreplace(schema, 2, Schema, NewItem)
    end.

%% Convert schema items from an external backup.
%% If the backup format is the latest one, no conversion is needed.
%% Every supported backup format has its converter here as a
%% separate function clause.
convert_schema("0.1", Schema) ->
    convert_0_1(Schema);
convert_schema("1.1", Schema) ->
    %% The new backup format is a pure extension of the old one
    Current = mnesia_log:backup_log_header(),
    convert_schema(Current#log_header.log_version, Schema);
convert_schema(Latest, Schema) ->
    case mnesia_log:backup_log_header() of
        #log_header{log_version = Latest} ->
            Schema;
        H ->
            Reason = "Bad backup header version. Cannot convert schema.",
            throw({error, {Reason, H}})
    end.

%% Backward compatibility for 0.1
%%
%% Starts from the schema table definition found in the backup, or
%% from a fresh disc_copies schema for the local node if there is
%% none, and folds the old-style items into it.
convert_0_1(Schema) ->
    {Items, CreateList} =
        case lists:keysearch(schema, 2, Schema) of
            {value, {schema, schema, Found}} ->
                {lists:keydelete(schema, 2, Schema), Found};
            false ->
                {Schema, mnesia_schema:get_initial_schema(disc_copies, [node()])}
        end,
    convert_0_1(Items, [], mnesia_schema:list2cs(CreateList)).

%% The old cookie, db_nodes and version items are merged into the
%% schema cstruct; every other item is carried over to the result.
convert_0_1([{schema, cookie, Cookie} | Schema], Acc, Cs) ->
    convert_0_1(Schema, Acc, Cs#cstruct{cookie = Cookie});
convert_0_1([{schema, db_nodes, DbNodes} | Schema], Acc, Cs) ->
    convert_0_1(Schema, Acc, Cs#cstruct{disc_copies = DbNodes});
convert_0_1([{schema, version, Version} | Schema], Acc, Cs) ->
    convert_0_1(Schema, Acc, Cs#cstruct{version = Version});
convert_0_1([{schema, Tab, Def} | Schema], Acc, Cs) ->
    Def2 =
        case lists:keysearch(index, 1, Def) of
            {value, {index, PosList}} ->
                %% Remove the snmp "index"
                lists:keyreplace(index, 1, Def, {index, PosList -- [snmp]});
            false ->
                Def
        end,
    convert_0_1(Schema, [{schema, Tab, Def2} | Acc], Cs);
convert_0_1([Head | Schema], Acc, Cs) ->
    convert_0_1(Schema, [Head | Acc], Cs);
convert_0_1([], Acc, Cs) ->
    [schema2bup({schema, schema, Cs}) | Acc].

%% Returns the value stored for Key in Schema, or throws
%% {error, Reason} if there is no item for Key.
lookup_schema(Key, Schema) ->
    case lists:keysearch(Key, 2, Schema) of
        false ->
            throw({error, {"Cannot lookup", Key}});
        {value, {schema, Key, Val}} ->
            Val
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Backup compatibility

%% Convert internal schema items to their backup counterparts
schema2bup({schema, Tab, TableDef}) ->
    {schema, Tab, mnesia_schema:cs2list(TableDef)};
schema2bup({schema, Tab}) ->
    {schema, Tab}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Create schema on the given nodes
%% Requires that old schemas have been deleted
%% Returns ok | {error, Reason}
%%
%% An empty node list means the local node only. The node list must
%% not contain duplicates.
create_schema([]) ->
    create_schema([node()]);
create_schema(Ns) when is_list(Ns) ->
    case is_set(Ns) of
        true ->
            create_schema(Ns, mnesia_schema:ensure_no_schema(Ns));
        false ->
            {error, {combine_error, Ns}}
    end;
create_schema(Ns) ->
    {error, {badarg, Ns}}.

%% True if List is a list without duplicates.
is_set(List) when is_list(List) ->
    ordsets:is_set(lists:sort(List));
is_set(_) ->
    false.

%% Second argument is the outcome of the "no old schema" check.
create_schema(Ns, ok) ->
    %% Ensure that we access the intended Mnesia
    %% directory. This function may not be called
    %% during startup since it will cause the
    %% application_controller to get into deadlock
    case mnesia_lib:ensure_loaded(?APPLICATION) of
        ok ->
            case mnesia_monitor:get_env(schema_location) of
                ram ->
                    {error, {has_no_disc, node()}};
                _ ->
                    create_disc_schema(Ns)
            end;
        {error, Reason} ->
            {error, Reason}
    end;
create_schema(_Ns, {error, Reason}) ->
    {error, Reason};
create_schema(_Ns, Reason) ->
    {error, Reason}.

%% Creates the Mnesia directory if needed, writes an initial backup
%% holding only the schema to a temporary file and installs it as
%% fallback. The temporary file is removed whether or not the
%% installation succeeded, so a failed attempt leaves no stray file.
create_disc_schema(Ns) ->
    case mnesia_schema:opt_create_dir(true, mnesia_lib:dir()) of
        {error, What} ->
            {error, What};
        ok ->
            Mod = mnesia_backup,
            File = mnesia_lib:dir(mk_str()),
            file:delete(File),
            Res = install_initial_backup(Ns, File, Mod),
            file:delete(File),
            Res
    end.

%% Writes the initial backup to File and installs it as fallback.
%% Returns ok | {error, Reason}
install_initial_backup(Ns, File, Mod) ->
    case catch make_initial_backup(Ns, File, Mod) of
        {ok, _Res} ->
            case do_install_fallback(File, Mod) of
                ok ->
                    ok;
                {error, Reason} ->
                    {error, Reason}
            end;
        {error, Reason} ->
            {error, Reason}
    end.

%% Name of the temporary backup file: node name, the three parts of
%% the current timestamp and the suffix ".TMP".
mk_str() ->
    Now = [integer_to_list(I) || I <- tuple_to_list(now())],
    lists:concat([node()] ++ Now ++ ".TMP").

%% Writes a backup that contains nothing but the backup header and an
%% initial disc_copies schema for the nodes Ns, using the backup
%% module Mod on the media Opaque.
%%
%% Returns {ok, ReturnValueOfCommit}; throws {error, Reason} if any
%% callback fails.
make_initial_backup(Ns, Opaque, Mod) ->
    Schema = [{schema, schema, mnesia_schema:get_initial_schema(disc_copies, Ns)}],
    Opened = do_apply(Mod, open_write, [Opaque], Opaque),
    Batches = [[mnesia_log:backup_log_header()], Schema],
    Written = lists:foldl(fun(Items, Handle) ->
                                  do_apply(Mod, write, [Handle, Items], Handle)
                          end, Opened, Batches),
    {ok, do_apply(Mod, commit_write, [Written], Written)}.

%% Applies a backup callback and unwraps its {ok, Opaque} result.
%% A write of an empty item list is skipped and returns the opaque
%% value passed as the last argument. Errors and crashes in the
%% callback are rethrown as {error, Reason}.
do_apply(_, write, [_, []], Opaque) ->
    Opaque;
do_apply(Mod, What, Args, _Opaque) ->
    case catch apply(Mod, What, Args) of
        {ok, NewOpaque} ->
            NewOpaque;
        {error, Reason} ->
            throw({error, Reason});
        {'EXIT', Reason} ->
            throw({error, {'EXIT', Reason}})
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Restore

%% Restore schema and possibly other tables from a backup
%% and replicate them to the necessary nodes
%% Requires that old schemas have been deleted
%% Returns ok | {error, Reason}
install_fallback(Opaque) ->
    install_fallback(Opaque, []).

install_fallback(Opaque, Args) ->
    %% Ensure that we access the intended Mnesia
    %% directory. This function may not be called
    %% during startup since it will cause the
    %% application_controller to get into deadlock
    case mnesia_lib:ensure_loaded(?APPLICATION) of
        {error, Reason} ->
            {error, Reason};
        ok ->
            do_install_fallback(Opaque, Args)
    end.

%% The second argument is either a backup module name or a list of
%% fallback arguments; anything else is a badarg.
do_install_fallback(Opaque, Mod) when is_atom(Mod) ->
    do_install_fallback(Opaque, [{module, Mod}]);
do_install_fallback(Opaque, Args) when is_list(Args) ->
    Defaults = #fallback_args{opaque = Opaque},
    case check_fallback_args(Args, Defaults) of
        {error, Reason} ->
            {error, Reason};
        {ok, FA} ->
            do_install_fallback(FA)
    end;
do_install_fallback(_Opaque, Args) ->
    {error, {badarg, Args}}.

%% Folds the argument list into the #fallback_args{} record.
%% The first argument that cannot be interpreted stops the fold.
%%
%% Returns {ok, FallbackArgs} or {error, {badarg, Arg}}
check_fallback_args([], FA) ->
    {ok, FA};
check_fallback_args([Arg | Tail], FA) ->
    case catch check_fallback_arg_type(Arg, FA) of
        {'EXIT', _Reason} ->
            {error, {badarg, Arg}};
        Updated ->
            check_fallback_args(Tail, Updated)
    end.

%% Stores one fallback argument in the #fallback_args{} record.
%% Crashes on an unknown or malformed argument; the caller turns
%% that crash into a badarg error.
check_fallback_arg_type(Arg, FA) ->
    case Arg of
        {scope, Scope} when Scope =:= global; Scope =:= local ->
            FA#fallback_args{scope = Scope};
        {module, Mod} ->
            Checked = mnesia_monitor:do_check_type(backup_module, Mod),
            FA#fallback_args{module = Checked};
        {mnesia_dir, Dir} ->
            %% An explicit directory overrides the default one
            FA#fallback_args{mnesia_dir = Dir,
                             use_default_dir = false};
        {keep_tables, Tabs} ->
            atom_list(Tabs),
            FA#fallback_args{keep_tables = Tabs};
        {skip_tables, Tabs} ->
            atom_list(Tabs),
            FA#fallback_args{skip_tables = Tabs};
        {default_op, Op} when Op =:= keep_tables; Op =:= skip_tables ->
            FA#fallback_args{default_op = Op}
    end.

%% Returns ok if the argument is a proper list of atoms, crashes otherwise.
atom_list([]) ->
    ok;
atom_list([H | T]) when is_atom(H) ->
    atom_list(T).

%% Runs the installation in a linked master process and waits for
%% its verdict.
%%
%% Returns ok | {error, Reason}
do_install_fallback(FA) ->
    Master = spawn_link(?MODULE, install_fallback_master, [self(), FA]),
    receive
        {'EXIT', Master, Reason} -> % if appl has trapped exit
            {error, {'EXIT', Reason}};
        {Master, Outcome} ->
            case Outcome of
                {ok, _} ->
                    ok;
                {error, Reason} ->
                    {error, {"Cannot install fallback", Reason}}
            end
    end.

%% Body of the master process: iterates over the backup, distributing
%% it with restore_recs/4, reports the result to the client and exits.
install_fallback_master(ClientPid, FA) ->
    process_flag(trap_exit, true),
    Opaque = FA#fallback_args.opaque,
    Mod = FA#fallback_args.module,
    Res = (catch iterate(Mod, fun restore_recs/4, Opaque, {start, FA})),
    unlink(ClientPid),
    ClientPid ! {self(), Res},
    exit(shutdown).

%% Iterator callback used when installing a fallback. The accumulator
%% is {start, FallbackArgs} on the first call, the list of receiver
%% pids while records are being distributed, and stop once the
%% receivers have been told to swap and stop.
restore_recs(_, _, _, stop) ->
    throw({error, "restore_recs already stopped"});

restore_recs(Recs, Header, Schema, {start, FA}) ->
    %% First call: start one receiver per node and send them the schema
    Schema2 = convert_schema(Header#log_header.log_version, Schema),
    CreateList = lookup_schema(schema, Schema2),
    case catch mnesia_schema:list2cs(CreateList) of
        {'EXIT', Reason} ->
            throw({error, {"Bad schema in restore_recs", Reason}});
        Cs ->
            Ns = get_fallback_nodes(FA, Cs#cstruct.disc_copies),
            LockId = {{mnesia_table_lock, schema}, self()},
            global:set_lock(LockId, Ns, infinity),
            Args = [self(), FA],
            Pids = [spawn_link(N, ?MODULE, fallback_receiver, Args) || N <- Ns],
            send_fallback(Pids, {start, Header, Schema2}),
            Res = restore_recs(Recs, Header, Schema2, Pids),
            global:del_lock({{mnesia_table_lock, schema}, self()}, Ns),
            Res
    end;

restore_recs([], _Header, _Schema, Pids) ->
    %% End of backup: let the receivers activate the fallback and stop
    send_fallback(Pids, swap),
    send_fallback(Pids, stop),
    stop;

restore_recs(Recs, _, _, Pids) ->
    send_fallback(Pids, {records, Recs}),
    Pids.

%% Nodes that shall receive the fallback: all of Ns for global scope,
%% only the local node for local scope. Throws {error, Reason} if the
%% local node is not among Ns.
get_fallback_nodes(FA, Ns) ->
    This = node(),
    case lists:member(This, Ns) of
        false ->
            throw({error, {"No disc resident schema on local node", Ns}});
        true ->
            case FA#fallback_args.scope of
                local -> [This];
                global -> Ns
            end
    end.

%% Sends Msg to every receiver and waits for all of them to answer.
send_fallback([_ | _] = Pids, Msg) ->
    Tagged = {self(), Msg},
    lists:foreach(fun(Pid) -> Pid ! Tagged end, Pids),
    rec_answers(Pids, []).

%% Collects one answer per receiver. When all have answered: throws
%% the first error found, returns the answer if all receivers agree,
%% and throws {error, {"Different answers", Answers}} otherwise.
rec_answers([], Acc) ->
    case {lists:keysearch(error, 1, Acc), mnesia_lib:uniq(Acc)} of
        {{value, {error, Val}}, _} -> throw({error, Val});
        {_, [SameAnswer]} -> SameAnswer;
        {_, Other} -> throw({error, {"Different answers", Other}})
    end;
rec_answers(Pids, Acc) ->
    {From, Answer} =
        receive
            {'EXIT', Pid, stopped} ->
                %% Normal termination of a receiver
                {Pid, stopped};
            {'EXIT', Pid, Reason} ->
                {Pid, {error, {'EXIT', Pid, Reason}}};
            {Pid, Reply} ->
                {Pid, Reply}
        end,
    rec_answers(lists:delete(From, Pids), [Answer | Acc]).

%% True if a fallback is installed on this node.
fallback_exists() ->
    fallback_exists(fallback_bup()).

%% With a Mnesia directory in use the fallback file itself is checked;
%% otherwise the active_fallback value is consulted, defaulting to false.
fallback_exists(Fname) ->
    case mnesia_monitor:use_dir() of
        false ->
            case ?catch_val(active_fallback) of
                {'EXIT', _} -> false;
                Bool -> Bool
            end;
        true ->
            mnesia_lib:exists(Fname)
    end.

fallback_name() -> "FALLBACK.BUP".
fallback_bup() -> mnesia_lib:dir(fallback_name()).

fallback_tmp_name() -> "FALLBACK.TMP".

%% Body of the per-node receiver process started by the install master.
%% Registers itself as mnesia_fallback, refuses to run if another
%% receiver is registered or a fallback file already exists, and then
%% serves the master in fallback_receiver_loop/4.
fallback_receiver(Master, FA) ->
    process_flag(trap_exit, true),

    case catch register(mnesia_fallback, self()) of
        {'EXIT', _} ->
            local_fallback_error(Master, {already_exists, node()});
        true ->
            FA2 = check_fallback_dir(Master, FA),
            case mnesia_lib:exists(FA2#fallback_args.fallback_bup) of
                true ->
                    local_fallback_error(Master, {already_exists, node()});
                false ->
                    Tmp = FA2#fallback_args.fallback_tmp,
                    R = #restore{mode = replace,
                                 bup_module = mnesia_backup,
                                 bup_data = Tmp},
                    file:delete(Tmp),
                    case catch fallback_receiver_loop(Master, R, FA2, schema) of
                        {error, Reason} ->
                            local_fallback_error(Master, Reason);
                        Other ->
                            exit(Other)
                    end
            end
    end.

%% Reports {error, Reason} to the master, detaches from it and exits.
local_fallback_error(Master, Reason) ->
    Master ! {self(), {error, Reason}},
    unlink(Master),
    exit(Reason).

%% Fills in the directory dependent fields of the fallback arguments:
%% the Mnesia directory and the names of the fallback file and its
%% temporary counterpart. Reports an error to Master and exits if the
%% node has no disc resident schema or the directory is not usable.
check_fallback_dir(Master, FA) ->
    case mnesia:system_info(schema_location) of
        ram ->
            Reason = {has_no_disc, node()},
            local_fallback_error(Master, Reason);
        _ ->
            Dir = check_fallback_dir_arg(Master, FA),
            Bup = filename:join([Dir, fallback_name()]),
            Tmp = filename:join([Dir, fallback_tmp_name()]),
            FA#fallback_args{fallback_bup = Bup,
                             fallback_tmp = Tmp,
                             mnesia_dir = Dir}
    end.

%% Returns the directory to use. An explicit directory is accepted
%% for local scope only; combining it with global scope is an error.
check_fallback_dir_arg(Master, FA) ->
    case FA#fallback_args.use_default_dir of
        true ->
            mnesia_lib:dir();
        false when FA#fallback_args.scope =:= local ->
            Dir = FA#fallback_args.mnesia_dir,
            case catch mnesia_monitor:do_check_type(dir, Dir) of
                {'EXIT', _R} ->
                    Reason = {badarg, {dir, Dir}, node()},
                    local_fallback_error(Master, Reason);
                AbsDir ->
                    AbsDir
            end;
        false when FA#fallback_args.scope =:= global ->
            Reason = {combine_error, global, dir, node()},
            local_fallback_error(Master, Reason)
    end.

%% Serves the install master. State is schema until the header and
%% schema have been written, records while backup items are accepted,
%% and stop once the temporary file has been swapped into place.
%% Every handled request is acknowledged with {self(), ok}.
%%
%% Returns stopped after the final stop request. Any message that is
%% not valid in the current state aborts the write and throws
%% {error, Reason}, which the caller reports to the master.
fallback_receiver_loop(Master, R, FA, State) ->
    receive
        {Master, {start, Header, Schema}} when State =:= schema ->
            Dir = FA#fallback_args.mnesia_dir,
            throw_bad_res(ok, mnesia_schema:opt_create_dir(true, Dir)),
            R2 = safe_apply(R, open_write, [R#restore.bup_data]),
            R3 = safe_apply(R2, write, [R2#restore.bup_data, [Header]]),
            BupSchema = [schema2bup(S) || S <- Schema],
            R4 = safe_apply(R3, write, [R3#restore.bup_data, BupSchema]),
            Master ! {self(), ok},
            fallback_receiver_loop(Master, R4, FA, records);

        {Master, {records, Recs}} when State =:= records ->
            R2 = safe_apply(R, write, [R#restore.bup_data, Recs]),
            Master ! {self(), ok},
            fallback_receiver_loop(Master, R2, FA, records);

        {Master, swap} when State =/= schema ->
            ?eval_debug_fun({?MODULE, fallback_receiver_loop, pre_swap}, []),
            safe_apply(R, commit_write, [R#restore.bup_data]),
            Bup = FA#fallback_args.fallback_bup,
            Tmp = FA#fallback_args.fallback_tmp,
            throw_bad_res(ok, file:rename(Tmp, Bup)),
            catch mnesia_lib:set(active_fallback, true),
            ?eval_debug_fun({?MODULE, fallback_receiver_loop, post_swap}, []),
            Master ! {self(), ok},
            fallback_receiver_loop(Master, R, FA, stop);

        {Master, stop} when State =:= stop ->
            stopped;

        Msg ->
            safe_apply(R, abort_write, [R#restore.bup_data]),
            Tmp = FA#fallback_args.fallback_tmp,
            file:delete(Tmp),
            %% Thrown as an {error, Reason} pair so that it is handled
            %% like every other error of this loop
            throw({error, {"Unexpected msg fallback_receiver_loop", Msg}})
    end.

%% Returns Expected if the actual result equals it, otherwise throws
%% {error, Reason}.
throw_bad_res(Expected, Expected) -> Expected;
throw_bad_res(_Expected, {error, Actual}) -> throw({error, Actual});
throw_bad_res(_Expected, Actual) -> throw({error, Actual}).

-record(local_tab, {name,
                    storage_type,
                    open,
                    add,
                    close,
                    swap,
                    record_name,
                    opened}).

%% Starts from an installed fallback, if there is one, while holding
%% the schema lock. Exits with the failure reason if that fails.
tm_fallback_start(IgnoreFallback) ->
    mnesia_schema:lock_schema(),
    Res = do_fallback_start(fallback_exists(), IgnoreFallback),
    mnesia_schema:unlock_schema(),
    case Res of
        ok -> ok;
        {error, Reason} -> exit(Reason)
    end.

%% do_fallback_start(FallbackExists, IgnoreFallback)
%%
%% Without a fallback there is nothing to do. An ignored fallback is
%% left in place and only marked as active. Otherwise the fallback
%% backup is unpacked into temporary table files, which are swapped
%% into place once the whole backup has been read; the fallback file
%% is deleted afterwards.
%%
%% Returns ok | {error, Reason}
do_fallback_start(false, _IgnoreFallback) ->
    ok;
do_fallback_start(true, true) ->
    verbose("Ignoring fallback at startup, but leaving it active...~n", []),
    mnesia_lib:set(active_fallback, true),
    ok;
do_fallback_start(true, false) ->
    verbose("Starting from fallback...~n", []),

    BupFile = fallback_bup(),
    Mod = mnesia_backup,
    LocalTabs = ?ets_new_table(mnesia_local_tables, [set, public, {keypos, 2}]),
    case catch iterate(Mod, fun restore_tables/4, BupFile, {start, LocalTabs}) of
        {error, Reason} ->
            {error, {"Cannot start from fallback", Reason}};
        {'EXIT', Reason} ->
            {error, {"Cannot start from fallback", Reason}};
        {ok, _Res} ->
            catch dets:close(schema),
            TmpSchema = mnesia_lib:tab2tmp(schema),
            DatSchema = mnesia_lib:tab2dat(schema),
            AllLT = ?ets_match_object(LocalTabs, '_'),
            ?ets_delete_table(LocalTabs),
            case file:rename(TmpSchema, DatSchema) of
                {error, Reason} ->
                    file:delete(TmpSchema),
                    {error, {"Cannot start from fallback. Rename error.", Reason}};
                ok ->
                    %% The schema is in place; now swap every other table
                    Others = [LT || LT <- AllLT, LT#local_tab.name =/= schema],
                    lists:foreach(fun(Entry) ->
                                          (Entry#local_tab.swap)(Entry#local_tab.name, Entry)
                                  end, Others),
                    file:delete(BupFile),
                    ok
            end
    end.

%% Iterator callback used when starting from a fallback. It routes
%% every backup item to the local_tab entry of its table, if there
%% is one, and skips items of tables that have no entry.
%%
%% The accumulator is a small state machine:
%%   {start, LocalTabs}               first call; prepares the directory
%%                                    and creates the table files
%%   {new, LocalTabs}                 the table of the next item is not
%%                                    yet known
%%   {local, LocalTabs, LT}           items of LT's table are being added
%%   {not_local, LocalTabs, PrevTab}  items of PrevTab are being skipped
%%
%% The clause order matters: the first three clauses require a
%% non-empty item list, the {start, _} clause accepts any list and
%% the last clause ends the handling of an exhausted bunch.
restore_tables(All=[Rec | Recs], Header, Schema, State={local, LocalTabs, LT}) ->
    Tab = element(1, Rec),
    if
        Tab =:= LT#local_tab.name ->
            %% Same table as the previous item: add it via the table's add fun
            Key = element(2, Rec),
            (LT#local_tab.add)(Tab, Key, Rec, LT),
            restore_tables(Recs, Header, Schema, State);
        true ->
            %% Another table starts here: look it up again
            NewState = {new, LocalTabs},
            restore_tables(All, Header, Schema, NewState)
    end;
restore_tables(All=[Rec | Recs], Header, Schema, {new, LocalTabs}) ->
    Tab = element(1, Rec),
    case ?ets_lookup(LocalTabs, Tab) of
        [] ->
            %% No entry for this table: drop the item and keep skipping
            State = {not_local, LocalTabs, Tab},
            restore_tables(Recs, Header, Schema, State);
        [LT] when is_record(LT, local_tab) ->
            State = {local, LocalTabs, LT},
            case LT#local_tab.opened of
                true  -> ignore;
                false ->
                    %% First item for this table: open it and remember
                    %% that in the stored entry
                    (LT#local_tab.open)(Tab, LT),
                    ?ets_insert(LocalTabs,LT#local_tab{opened=true})
            end,
            %% The item itself is handled by the {local, ...} clause
            restore_tables(All, Header, Schema, State)
    end;
restore_tables(All=[Rec | Recs], Header, Schema, S = {not_local, LocalTabs, PrevTab}) ->
    Tab = element(1, Rec),
    if
        Tab =:= PrevTab ->
            restore_tables(Recs, Header, Schema, S);
        true ->
            State = {new, LocalTabs},
            restore_tables(All, Header, Schema, State)
    end;
restore_tables(Recs, Header, Schema, {start, LocalTabs}) ->
    %% Purge the "OLD_DIR" subdirectory and the Mnesia directory; the
    %% fallback file name is passed along to the second purge
    %% NOTE(review): presumably the second argument of purge_dir lists
    %% files to keep -- confirm against mnesia_schema
    Dir = mnesia_lib:dir(),
    OldDir = filename:join([Dir, "OLD_DIR"]),
    mnesia_schema:purge_dir(OldDir, []),
    mnesia_schema:purge_dir(Dir, [fallback_name()]),
    init_dat_files(Schema, LocalTabs),
    State = {new, LocalTabs},
    restore_tables(Recs, Header, Schema, State);
restore_tables([], _Header, _Schema, State) ->
    State.

%% Creates all necessary dat files and inserts
%% the table definitions in the schema table
%%
%% The schema is written to a temporary dets file, and LocalTabs is
%% filled with one local_tab entry per table handled by
%% create_dat_files/2 plus one entry for the schema table itself.
%% Throws {error, Reason} if the temporary schema file cannot be opened.
%%
%% The value of the final ets insert is returned.
init_dat_files(Schema, LocalTabs) ->
    TmpFile = mnesia_lib:tab2tmp(schema),
    Args = [{file, TmpFile}, {keypos, 2}, {type, set}],
    case dets:open_file(schema, Args) of % Assume schema lock
        {ok, _} ->
            create_dat_files(Schema, LocalTabs),
            ok = dets:close(schema),
            %% The schema entry carries no funs
            LocalTab = #local_tab{name         = schema,
                                  storage_type = disc_copies,
                                  open         = undefined,
                                  add          = undefined,
                                  close        = undefined,
                                  swap         = undefined,
                                  record_name  = schema,
                                  opened       = false},
            ?ets_insert(LocalTabs, LocalTab);
        {error, Reason} ->
            throw({error, {"Cannot open file", schema, Args, Reason}})
    end.

%% Walks the schema items of the backup. Every table definition is
%% stored in the temporary schema file, and for every table with a
%% known local storage type a local_tab entry is created whose funs
%% know how to open the table's temporary file, add backup items to
%% it, close it and finally swap it into place.
%%
%% disc_only_copies tables are written to a temporary dets file that
%% is renamed to the dat file. ram_copies and disc_copies tables are
%% written to a temporary log that is renamed to the dcl file, next to
%% a freshly created dcd log; a ram_copies table that never received
%% an item only gets its old files removed.
create_dat_files([{schema, schema, TabDef} | Tail], LocalTabs) ->
    ok = dets:insert(schema, {schema, schema, TabDef}),
    create_dat_files(Tail, LocalTabs);
create_dat_files([{schema, Tab, TabDef} | Tail], LocalTabs) ->
    TmpFile = mnesia_lib:tab2tmp(Tab),
    DatFile = mnesia_lib:tab2dat(Tab),
    DclFile = mnesia_lib:tab2dcl(Tab),
    DcdFile = mnesia_lib:tab2dcd(Tab),
    %% Removes the table's existing dat, dcl and dcd files
    Expunge = fun() ->
                      file:delete(DatFile),
                      file:delete(DclFile),
                      file:delete(DcdFile)
              end,

    mnesia_lib:dets_sync_close(Tab),
    file:delete(TmpFile),
    Cs = mnesia_schema:list2cs(TabDef),
    ok = dets:insert(schema, {schema, Tab, TabDef}),
    RecName = Cs#cstruct.record_name,
    Storage = mnesia_lib:cs_to_storage_type(node(), Cs),
    if
        Storage =:= unknown ->
            %% NOTE(review): the schema file is opened with {keypos, 2},
            %% so the object inserted above has key Tab, while this
            %% delete uses the key {schema, Tab} -- it looks like a
            %% no-op; confirm the intent before changing it
            ok = dets:delete(schema, {schema, Tab}),
            create_dat_files(Tail, LocalTabs);
        Storage =:= disc_only_copies ->
            Args = [{file, TmpFile}, {keypos, 2},
                    {type, mnesia_lib:disk_type(Tab, Cs#cstruct.type)}],
            Open = fun(T, LT) when T =:= LT#local_tab.name ->
                           case mnesia_lib:dets_sync_open(T, Args) of
                               {ok, _} ->
                                   ok;
                               {error, Reason} ->
                                   throw({error, {"Cannot open file", T, Args, Reason}})
                           end
                   end,
            %% A {Tab, Key} pair deletes the key; any other item is
            %% inserted, with its first element replaced by the record
            %% name when the table name differs from the record name
            Add = fun(T, Key, Rec, LT) when T =:= LT#local_tab.name ->
                          case Rec of
                              {_T, Key} ->
                                  ok = dets:delete(T, Key);
                              (Rec) when T =:= RecName ->
                                  ok = dets:insert(Tab, Rec);
                              (Rec) ->
                                  Rec2 = setelement(1, Rec, RecName),
                                  ok = dets:insert(T, Rec2)
                          end
                  end,
            Close = fun(T, LT) when T =:= LT#local_tab.name ->
                            mnesia_lib:dets_sync_close(T)
                    end,
            Swap = fun(T, LT) when T =:= LT#local_tab.name ->
                           Expunge(),
                           %% A table that never got an item is opened
                           %% and closed once before the rename below
                           case LT#local_tab.opened of
                               true ->
                                   Close(T,LT);
                               false ->
                                   Open(T,LT),
                                   Close(T,LT)
                           end,
                           case file:rename(TmpFile, DatFile) of
                               ok ->
                                   ok;
                               {error, Reason} ->
                                   mnesia_lib:fatal("Cannot rename file ~p -> ~p: ~p~n",
                                                    [TmpFile, DatFile, Reason])
                           end
                   end,
            LocalTab = #local_tab{name         = Tab,
                                  storage_type = Storage,
                                  open         = Open,
                                  add          = Add,
                                  close        = Close,
                                  swap         = Swap,
                                  record_name  = RecName,
                                  opened       = false},
            ?ets_insert(LocalTabs, LocalTab),
            create_dat_files(Tail, LocalTabs);
        Storage =:= ram_copies; Storage =:= disc_copies ->
            Open = fun(T, LT) when T =:= LT#local_tab.name ->
                           mnesia_log:open_log({?MODULE, T},
                                               mnesia_log:dcl_log_header(),
                                               TmpFile,
                                               false,
                                               false,
                                               read_write)
                   end,
            %% Items are appended to the log as write or delete operations
            Add = fun(T, Key, Rec, LT) when T =:= LT#local_tab.name ->
                          Log = {?MODULE, T},
                          case Rec of
                              {_T, Key} ->
                                  mnesia_log:append(Log, {{T, Key}, {T, Key}, delete});
                              (Rec) when T =:= RecName ->
                                  mnesia_log:append(Log, {{T, Key}, Rec, write});
                              (Rec) ->
                                  Rec2 = setelement(1, Rec, RecName),
                                  mnesia_log:append(Log, {{T, Key}, Rec2, write})
                          end
                  end,
            Close = fun(T, LT) when T =:= LT#local_tab.name ->
                            mnesia_log:close_log({?MODULE, T})
                    end,
            Swap = fun(T, LT) when T =:= LT#local_tab.name ->
                           Expunge(),
                           if
                               Storage =:= ram_copies, LT#local_tab.opened =:= false ->
                                   %% Nothing was restored for this ram table
                                   ok;
                               true ->
                                   %% Open and close a dcd log on DcdFile
                                   %% before the temporary log is renamed
                                   %% to the dcl file
                                   Log = mnesia_log:open_log(fallback_tab,
                                                             mnesia_log:dcd_log_header(),
                                                             DcdFile,
                                                             false),
                                   mnesia_log:close_log(Log),
                                   case LT#local_tab.opened of
                                       true ->
                                           Close(T,LT);
                                       false ->
                                           Open(T,LT),
                                           Close(T,LT)
                                   end,
                                   case file:rename(TmpFile, DclFile) of
                                       ok ->
                                           ok;
                                       {error, Reason} ->
                                           mnesia_lib:fatal("Cannot rename file ~p -> ~p: ~p~n",
                                                            [TmpFile, DclFile, Reason])
                                   end
                           end
                   end,
            LocalTab = #local_tab{name         = Tab,
                                  storage_type = Storage,
                                  open         = Open,
                                  add          = Add,
                                  close        = Close,
                                  swap         = Swap,
                                  record_name  = RecName,
                                  opened       = false
                                 },
            ?ets_insert(LocalTabs, LocalTab),
            create_dat_files(Tail, LocalTabs)
    end;
create_dat_files([{schema, Tab} | Tail], LocalTabs) ->
    %% Table deletion item: forget the table and its temporary file
    ?ets_delete(LocalTabs, Tab),
    %% NOTE(review): same key shape question as for unknown storage
    %% above -- with {keypos, 2} the stored key would be Tab; confirm
    ok = dets:delete(schema, {schema, Tab}),
    TmpFile = mnesia_lib:tab2tmp(Tab),
    mnesia_lib:dets_sync_close(Tab),
    file:delete(TmpFile),
    create_dat_files(Tail, LocalTabs);
create_dat_files([], _LocalTabs) ->
    ok.

%% Removes an installed fallback from all nodes that hold it.
uninstall_fallback() ->
    uninstall_fallback([{scope, global}]).

%% Removes an installed fallback. Args is a list of fallback
%% arguments as accepted by check_fallback_args/2.
%%
%% Returns the result reported by the uninstall master, or
%% {error, Reason}
uninstall_fallback(Args) ->
    case check_fallback_args(Args, #fallback_args{}) of
        {ok, FA} ->
            do_uninstall_fallback(FA);
        {error, Reason} ->
            {error, Reason}
    end.

%% Runs the uninstallation in a linked master process and waits for
%% its result.
do_uninstall_fallback(FA) ->
    %% Ensure that we access the intended Mnesia
    %% directory. This function may not be called
    %% during startup since it will cause the
    %% application_controller to get into deadlock
    case mnesia_lib:ensure_loaded(?APPLICATION) of
        ok ->
            Pid = spawn_link(?MODULE, uninstall_fallback_master, [self(), FA]),
            receive
                {'EXIT', Pid, Reason} -> % if appl has trapped exit
                    {error, {'EXIT', Reason}};
                {Pid, Res} ->
                    Res
            end;
        {error, Reason} ->
            {error, Reason}
    end.

%% Body of the uninstall master process. The set of nodes to clean is
%% derived from the schema stored in the local fallback file; any
%% failure to determine it is reported to the client.
uninstall_fallback_master(ClientPid, FA) ->
    process_flag(trap_exit, true),

    FA2 = check_fallback_dir(ClientPid, FA), % May exit
    Bup = FA2#fallback_args.fallback_bup,
    case fallback_to_schema(Bup) of
        {ok, fallback, List} ->
            Cs = mnesia_schema:list2cs(List),
            %% Note that the original FA, not FA2, is passed on from here
            case catch get_fallback_nodes(FA, Cs#cstruct.disc_copies) of
                Ns when is_list(Ns) ->
                    do_uninstall(ClientPid, Ns, FA);
                {error, Reason} ->
                    local_fallback_error(ClientPid, Reason)
            end;
        {error, Reason} ->
            local_fallback_error(ClientPid, Reason)
    end.

%% Starts one uninstall worker on every node in Ns while holding the
%% global schema table lock, and reports the combined result to the
%% client.
do_uninstall(ClientPid, Ns, FA) ->
    Args = [self(), FA],
    LockId = {{mnesia_table_lock, schema}, self()},
    global:set_lock(LockId, Ns, infinity),
    Pids = [spawn_link(N, ?MODULE, local_uninstall_fallback, Args) || N <- Ns],
    Res = do_uninstall(ClientPid, Pids, [], [], ok),
    global:del_lock({{mnesia_table_lock, schema}, self()}, Ns),
    ClientPid ! {self(), Res},
    unlink(ClientPid),
    exit(shutdown).

%% First phase: wait for every worker to report that it has started.
%% Only if all of them did are they told to go ahead; otherwise the
%% started ones are shut down and the bad nodes are reported.
%% Exit signals from the client are not handled here.
do_uninstall(ClientPid, [Pid | Pids], GoodPids, BadNodes, Res) ->
    Outcome =
        receive
            {'EXIT', Pid, ExitReason} ->
                {failed, ExitReason};
            {Pid, {error, ErrReason}} ->
                {failed, ErrReason};
            {Pid, started} ->
                started
        end,
    case Outcome of
        started ->
            do_uninstall(ClientPid, Pids, [Pid | GoodPids], BadNodes, Res);
        {failed, Why} ->
            BadNode = node(Pid),
            BadRes = {error, {"Uninstall fallback", BadNode, Why}},
            do_uninstall(ClientPid, Pids, GoodPids, [BadNode | BadNodes], BadRes)
    end;
do_uninstall(ClientPid, [], GoodPids, [], ok) ->
    Go = {self(), do_uninstall},
    lists:foreach(fun(Pid) -> Pid ! Go end, GoodPids),
    rec_uninstall(ClientPid, GoodPids, ok);
do_uninstall(_ClientPid, [], GoodPids, BadNodes, BadRes) ->
    lists:foreach(fun(Pid) -> exit(Pid, shutdown) end, GoodPids),
    {error, {node_not_running, BadNodes, BadRes}}.

%% Body of the per-node uninstall worker. It registers itself as
%% mnesia_fallback, announces that it has started and, when told to,
%% deactivates the fallback and deletes the fallback files.
local_uninstall_fallback(Master, FA) ->
    %% Don't trap exit

    register(mnesia_fallback, self()),        % May exit
    FA2 = check_fallback_dir(Master, FA), % May exit
    Master ! {self(), started},

    receive
        {Master, do_uninstall} ->
            ?eval_debug_fun({?MODULE, uninstall_fallback2, pre_delete}, []),
            catch mnesia_lib:set(active_fallback, false),
            Tmp = FA2#fallback_args.fallback_tmp,
            Bup = FA2#fallback_args.fallback_bup,
            file:delete(Tmp),
            Res =
                case fallback_exists(Bup) of
                    false -> ok;
                    true -> file:delete(Bup)
                end,
            ?eval_debug_fun({?MODULE, uninstall_fallback2, post_delete}, []),
            Master ! {self(), Res},
            unlink(Master),
            exit(normal)
    end.

%% Second round of the fallback uninstall: collects the final reply (or
%% exit signal) from every worker in order. AccRes starts out as the
%% value given by the caller and is replaced by the most recent failure.
%% When all workers have answered, the result is sent to ClientPid and
%% this process exits with reason normal.
rec_uninstall(ClientPid, [Pid | Pids], AccRes) ->
    receive
        %% {'EXIT', ClientPid, _} ->
        %% exit(shutdown);
        {'EXIT', Pid, R} ->
            Reason = {node_not_running, {node(Pid), R}},
            rec_uninstall(ClientPid, Pids, {error, Reason});
        {Pid, ok} ->
            rec_uninstall(ClientPid, Pids, AccRes);
        {Pid, BadRes} ->
            rec_uninstall(ClientPid, Pids, BadRes)
    end;
rec_uninstall(ClientPid, [], Res) ->
    ClientPid ! {self(), Res},
    unlink(ClientPid),
    exit(normal).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Backup traversal

%% Iterate over a backup and produce a new backup.
%% Fun(BackupItem, Acc) is applied for each BackupItem.
%%
%% Valid BackupItems are:
%%
%%   {schema, Tab}               Table to be deleted
%%   {schema, Tab, CreateList}   Table to be created, CreateList may be empty
%%   {schema, db_nodes, DbNodes} List of nodes, defaults to [node()]  OLD
%%   {schema, version, Version}  Schema version                       OLD
%%   {schema, cookie, Cookie}    Unique schema cookie                 OLD
%%   {Tab, Key}                  Oid for record to be deleted
%%   Record                      Record to be inserted.
%%
%% The Fun must return a tuple {BackupItems, NewAcc}
%% where BackupItems is a list of valid BackupItems and
%% NewAcc is a new accumulator value. Once BackupItems
%% that are not schema related have been returned, no more schema
%% items may be returned. The schema related items must always be
%% first in the backup.
%%
%% If TargetMod =:= read_only, no new backup will be created.
%%
%% Opening of the source media will be performed by
%% SourceMod:open_read(Source)
%%
%% Opening of the target media will be performed by
%% TargetMod:open_write(Target)
traverse_backup(Source, Target, Fun, Acc) ->
    %% Use the configured backup module for both source and target
    Mod = mnesia_monitor:get_env(backup_module),
    traverse_backup(Source, Mod, Target, Mod, Fun, Acc).

%% Runs the traversal in a separate linked process and waits for its
%% result. Returns whatever the worker reports in its iter_done message,
%% or an error tuple if an exit message from the worker arrives first.
traverse_backup(Source, SourceMod, Target, TargetMod, Fun, Acc) ->
    Args = [self(), Source, SourceMod, Target, TargetMod, Fun, Acc],
    Pid = spawn_link(?MODULE, do_traverse_backup, Args),
    receive
        {'EXIT', Pid, Reason} ->
            {error, {"Backup traversal crashed", Reason}};
        {iter_done, Pid, Res} ->
            Res
    end.

%% Worker body for traverse_backup/6. Opens the target for writing
%% (unless TargetMod is read_only), iterates over the source with
%% trav_apply/4 and finally commits or aborts the target. The result is
%% always delivered to ClientPid as {iter_done, self(), Res}.
do_traverse_backup(ClientPid, Source, SourceMod, Target, TargetMod, Fun, Acc) ->
    process_flag(trap_exit, true),
    Iter =
        if
            TargetMod =/= read_only ->
                case catch do_apply(TargetMod, open_write, [Target], Target) of
                    {error, Error} ->
                        %% Report the failure before terminating
                        unlink(ClientPid),
                        ClientPid ! {iter_done, self(), {error, Error}},
                        exit(Error);
                    Else -> Else
                end;
            true ->
                ignore
        end,
    A = {start, Fun, Acc, TargetMod, Iter},
    Res =
        case iterate(SourceMod, fun trav_apply/4, Source, A) of
            {ok, {iter, _, Acc2, _, Iter2}} when TargetMod =/= read_only ->
                case catch do_apply(TargetMod, commit_write, [Iter2], Iter2) of
                    {error, Reason} ->
                        {error, Reason};
                    _ ->
                        {ok, Acc2}
                end;
            {ok, {iter, _, Acc2, _, _}} ->
                {ok, Acc2};
            {error, Reason} when TargetMod =/= read_only->
                %% The abort is attempted with the handle from open_write
                catch do_apply(TargetMod, abort_write, [Iter], Iter),
                {error, {"Backup traversal failed", Reason}};
            {error, Reason} ->
                {error, {"Backup traversal failed", Reason}}
        end,
    unlink(ClientPid),
    ClientPid ! {iter_done, self(), Res}.

%% Iteration callback. The accumulator is tagged 'start' for the first
%% chunk and 'iter' afterwards. On 'start' the header is written to the
%% target (unless read_only) and the schema items are run through Fun
%% before the first chunk of records.
trav_apply(Recs, _Header, _Schema, {iter, Fun, Acc, Mod, Iter}) ->
    {NewRecs, Acc2} = filter_foldl(Fun, Acc, Recs),
    if
        Mod =/= read_only, NewRecs =/= [] ->
            Iter2 = do_apply(Mod, write, [Iter, NewRecs], Iter),
            {iter, Fun, Acc2, Mod, Iter2};
        true ->
            {iter, Fun, Acc2, Mod, Iter}
    end;
trav_apply(Recs, Header, Schema, {start, Fun, Acc, Mod, Iter}) ->
    Iter2 =
        if
            Mod =/= read_only ->
                do_apply(Mod, write, [Iter, [Header]], Iter);
            true ->
                Iter
        end,
    TravAcc = trav_apply(Schema, Header, Schema, {iter, Fun, Acc, Mod, Iter2}),
    trav_apply(Recs, Header, Schema, TravAcc).
+ +filter_foldl(Fun, Acc, [Head|Tail]) -> + case Fun(Head, Acc) of + {HeadItems, HeadAcc} when is_list(HeadItems) -> + {TailItems, TailAcc} = filter_foldl(Fun, HeadAcc, Tail), + {HeadItems ++ TailItems, TailAcc}; + Other -> + throw({error, {"Fun must return a list", Other}}) + end; +filter_foldl(_Fun, Acc, []) -> + {[], Acc}. + diff --git a/lib/mnesia/src/mnesia_checkpoint.erl b/lib/mnesia/src/mnesia_checkpoint.erl new file mode 100644 index 0000000000..eb8fe38908 --- /dev/null +++ b/lib/mnesia/src/mnesia_checkpoint.erl @@ -0,0 +1,1295 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 1996-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% +-module(mnesia_checkpoint). + +%% TM callback interface +-export([ + tm_add_copy/2, + tm_change_table_copy_type/3, + tm_del_copy/2, + tm_mnesia_down/1, + tm_prepare/1, + tm_retain/4, + tm_retain/5, + tm_enter_pending/1, + tm_enter_pending/3, + tm_exit_pending/1, + convert_cp_record/1 + ]). + +%% Public interface +-export([ + activate/1, + checkpoints/0, + deactivate/1, + deactivate/2, + iterate/6, + most_local_node/2, + really_retain/2, + stop/0, + stop_iteration/1, + tables_and_cookie/1 + ]). + +%% Internal +-export([ + call/2, + cast/2, + init/1, + remote_deactivate/1, + start/1 + ]). + +%% sys callback interface +-export([ + system_code_change/4, + system_continue/3, + system_terminate/4 + ]). + +-include("mnesia.hrl"). 
-import(mnesia_lib, [add/2, del/2, set/2, unset/1]).
-import(mnesia_lib, [dbg_out/2]).

%% State of a checkpoint; also the term sent between nodes when a
%% checkpoint is prepared.
-record(checkpoint_args, {name = {now(), node()},
                          allow_remote = true,
                          ram_overrides_dump = false,
                          nodes = [],
                          node = node(),
                          now = now(),
                          cookie = ?unique_cookie,
                          min = [],
                          max = [],
                          pending_tab,
                          wait_for_old, % Initially undefined then List
                          is_activated = false,
                          ignore_new = [],
                          retainers = [],
                          iterators = [],
                          supervisor,
                          pid
                         }).

%% Old record definition
-record(checkpoint, {name,
                     allow_remote,
                     ram_overrides_dump,
                     nodes,
                     node,
                     now,
                     min,
                     max,
                     pending_tab,
                     wait_for_old,
                     is_activated,
                     ignore_new,
                     retainers,
                     iterators,
                     supervisor,
                     pid
                    }).

%% One retainer per checkpointed table; 'writers' holds the nodes that
%% keep a retainer store for the table.
-record(retainer, {cp_name, tab_name, store, writers = [], really_retain = true}).

-record(iter, {tab_name, oid_tab, main_tab, retainer_tab, source, val, pid}).

-record(pending, {tid, disc_nodes = [], ram_nodes = []}).


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% TM callback functions

%% Sends 'stop' to every checkpoint listed by checkpoints/0.
stop() ->
    lists:foreach(fun(Name) -> call(Name, stop) end,
                  checkpoints()),
    ok.

%% Starts a local retainer process for the checkpoint unless a checkpoint
%% with the same name is already listed on this node.
tm_prepare(Cp) when is_record(Cp, checkpoint_args) ->
    Name = Cp#checkpoint_args.name,
    case lists:member(Name, checkpoints()) of
        false ->
            start_retainer(Cp);
        true ->
            {error, {already_exists, Name, node()}}
    end;
tm_prepare(Cp) when is_record(Cp, checkpoint) ->
    %% Node with old protocol sent an old checkpoint record
    %% and we have to convert it
    case convert_cp_record(Cp) of
        {ok, NewCp} ->
            tm_prepare(NewCp);
        {error, Reason} ->
            {error, Reason}
    end.

%% Informs every local checkpoint that Node has gone down.
tm_mnesia_down(Node) ->
    lists:foreach(fun(Name) -> cast(Name, {mnesia_down, Node}) end,
                  checkpoints()).

%% Returns pending
tm_enter_pending(Tid, DiscNs, RamNs) ->
    Pending = #pending{tid = Tid, disc_nodes = DiscNs, ram_nodes = RamNs},
    tm_enter_pending(Pending).

%% Inserts Pending into every table listed under pending_checkpoints.
tm_enter_pending(Pending) ->
    PendingTabs = val(pending_checkpoints),
    tm_enter_pending(PendingTabs, Pending).

%% Record the pending transaction in each of the given pending tables.
%% A failing insert is ignored. Always returns Pending.
tm_enter_pending(PendingTabs, Pending) ->
    lists:foreach(fun(PendingTab) ->
                          catch ?ets_insert(PendingTab, Pending)
                  end,
                  PendingTabs),
    Pending.

%% Tell every process listed under pending_checkpoint_pids that the
%% transaction Tid is no longer pending. Returns Tid.
tm_exit_pending(Tid) ->
    tm_exit_pending(val(pending_checkpoint_pids), Tid).

tm_exit_pending(CheckpointPids, Tid) ->
    Sender = self(),
    lists:foreach(fun(CheckpointPid) ->
                          CheckpointPid ! {Sender, {exit_pending, Tid}}
                  end,
                  CheckpointPids),
    Tid.

%% Store a bare #pending{} entry in Tab for each transaction id.
enter_still_pending(Tids, Tab) ->
    lists:foreach(fun(Tid) ->
                          ?ets_insert(Tab, #pending{tid = Tid})
                  end,
                  Tids).


%% Looks up checkpoints for functions in mnesia_tm.
%% Returns undefined when the commit work of Tab does not start with a
%% checkpoints entry, otherwise the old records that were handed to the
%% checkpoints.
tm_retain(Tid, Tab, Key, Op) ->
    case val({Tab, commit_work}) of
        [{checkpoints, Names} | _] ->
            tm_retain(Tid, Tab, Key, Op, Names);
        _Other ->
            undefined
    end.

%% clear_table retains the complete table contents grouped per key; every
%% other operation retains only the records stored under Key.
tm_retain(Tid, Tab, _Key, clear_table, Checkpoints) ->
    AllRecs = mnesia_lib:db_match_object(Tab, '_'),
    send_group_retain(AllRecs, Checkpoints, Tid, Tab, []),
    AllRecs;
tm_retain(Tid, Tab, Key, _Op, Checkpoints) ->
    KeyRecs = mnesia_lib:db_get(Tab, Key),
    send_retain(Checkpoints, {retain, Tid, Tab, Key, KeyRecs}),
    KeyRecs.

%% Walk the record list and emit one retain message per run of
%% consecutive records sharing the same key (element 2). Group holds the
%% current run in reverse order.
send_group_retain([Rec | Recs], Checkpoints, Tid, Tab, [Last | Earlier])
  when element(2, Rec) /= element(2, Last) ->
    GroupKey = element(2, Last),
    Group = lists:reverse([Last | Earlier]),
    send_retain(Checkpoints, {retain, Tid, Tab, GroupKey, Group}),
    send_group_retain(Recs, Checkpoints, Tid, Tab, [Rec]);
send_group_retain([Rec | Recs], Checkpoints, Tid, Tab, Group) ->
    send_group_retain(Recs, Checkpoints, Tid, Tab, [Rec | Group]);
send_group_retain([], Checkpoints, Tid, Tab, [Last | Earlier]) ->
    GroupKey = element(2, Last),
    Group = lists:reverse([Last | Earlier]),
    send_retain(Checkpoints, {retain, Tid, Tab, GroupKey, Group}),
    ok;
send_group_retain([], _Checkpoints, _Tid, _Tab, []) ->
    ok.

%% Cast Msg to every named checkpoint.
send_retain(Names, Msg) ->
    lists:foreach(fun(Name) -> cast(Name, Msg) end, Names).

%% A replica of Tab is being added on another node: let every checkpoint
%% attached to Tab know.
tm_add_copy(Tab, Node) when Node /= node() ->
    case val({Tab, commit_work}) of
        [{checkpoints, Names} | _] ->
            AddCopy = fun(Name) -> call(Name, {add_copy, Tab, Node}) end,
            map_call(AddCopy, Names, ok);
        _NoCheckpoints ->
            ok
    end.

%% The local replica of Tab is being removed: drop the table
%% subscriptions and let every checkpoint attached to Tab know.
tm_del_copy(Tab, Node) when Node == node() ->
    mnesia_subscr:unsubscribe_table(Tab),
    case val({Tab, commit_work}) of
        [{checkpoints, Names} | _] ->
            DelCopy = fun(Name) -> call(Name, {del_copy, Tab, Node}) end,
            map_call(DelCopy, Names, ok);
        _NoCheckpoints ->
            ok
    end.

%% The storage type of Tab changes from From to To: let every checkpoint
%% attached to Tab know.
tm_change_table_copy_type(Tab, From, To) ->
    case val({Tab, commit_work}) of
        [{checkpoints, Names} | _] ->
            ChangeCopy = fun(Name) -> call(Name, {change_copy, Tab, From, To}) end,
            map_call(ChangeCopy, Names, ok);
        _NoCheckpoints ->
            ok
    end.

%% Apply Fun to each checkpoint name in order. A checkpoint that no
%% longer exists is skipped; any other error replaces the accumulated
%% result, so the last error wins.
map_call(Fun, Names, Res0) ->
    Step = fun(Name, Res) ->
                   case Fun(Name) of
                       ok ->
                           Res;
                       {error, {no_exists, Name}} ->
                           Res;
                       {error, Reason} ->
                           %% BUGBUG: We may end up with some checkpoint retainers
                           %% too much in the add_copy case. How do we remove them?
                           {error, Reason}
                   end
           end,
    lists:foldl(Step, Res0, Names).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Public functions

%% Deactivate the named checkpoint on every node it reports in its state.
deactivate(Name) ->
    case call(Name, get_checkpoint) of
        {error, _} = Error ->
            Error;
        Cp ->
            deactivate(Cp#checkpoint_args.nodes, Name)
    end.

%% Ask the given nodes to deactivate the checkpoint; individual results
%% are not inspected.
deactivate(Nodes, Name) ->
    rpc:multicall(Nodes, ?MODULE, remote_deactivate, [Name]),
    ok.

remote_deactivate(Name) ->
    call(Name, deactivate).

checkpoints() ->
    val(checkpoints).

%% Returns {ok, Tabs, Cookie} where Tabs is the min tables followed by
%% the max tables of the checkpoint.
tables_and_cookie(Name) ->
    case call(Name, get_checkpoint) of
        {error, _} = Error ->
            Error;
        Cp ->
            {ok,
             Cp#checkpoint_args.min ++ Cp#checkpoint_args.max,
             Cp#checkpoint_args.cookie}
    end.

%% Picks the node whose retainer should be read for Tab in checkpoint
%% Name: the local node if it is among the writers, otherwise the first
%% writer. Returns {ok, Node} or {error, Reason}.
most_local_node(Name, Tab) ->
    case ?catch_val({Tab, {retainer, Name}}) of
        {'EXIT', _} ->
            {error, {"No retainer attached to table", [Tab, Name]}};
        R ->
            Writers = R#retainer.writers,
            LocalWriter = lists:member(node(), Writers),
            if
                LocalWriter == true ->
                    {ok, node()};
                Writers /= [] ->
                    {ok, hd(Writers)};
                true ->
                    {error, {"No retainer attached to table", [Tab, Name]}}
            end
    end.

%% Returns the really_retain flag of the retainer for Tab in checkpoint
%% Name.
really_retain(Name, Tab) ->
    R = val({Tab, {retainer, Name}}),
    R#retainer.really_retain.

%% Activate a checkpoint.
%%
%% A checkpoint is a transaction consistent state that may be used to
%% perform a distributed backup or to rollback the involved tables to
%% their old state. Backups may also be used to restore tables to
%% their old state. Args is a list of the following tuples:
%%
%% {name, Name}
%%    Name of checkpoint. Each checkpoint must have a name which
%%    is unique on the reachable nodes. The name may be reused when
%%    the checkpoint has been deactivated.
%%    By default a probably unique name is generated.
%%    Multiple checkpoints may be set on the same table.
%%
%% {allow_remote, Bool}
%%    false means that all retainers must be local. If the
%%    table does not reside locally, the checkpoint fails.
%%    true allows retainers on other nodes.
%%
%% {min, MinTabs}
%%    Minimize redundancy and only keep checkpoint info together with
%%    one replica, preferably at the local node. If any node involved
%%    in the checkpoint goes down, the checkpoint is deactivated.
%%
%% {max, MaxTabs}
%%    Maximize redundancy and keep checkpoint info together with all
%%    replicas. The checkpoint becomes more fault tolerant if the
%%    tables have several replicas. When new replicas are added, they
%%    will also get a retainer attached to them.
%%
%% {ram_overrides_dump, Bool}
%% {ram_overrides_dump, Tabs}
%%    Only applicable for ram_copies. Bool controls which versions of
%%    the records should be included in the checkpoint state.
%%    true means that the latest committed records in ram (i.e. the
%%    records that the application accesses) should be included
%%    in the checkpoint. false means that the records dumped to
%%    dat-files (the records that will be loaded at startup) should
%%    be included in the checkpoint. Tabs is a list of tables.
%%    Default is false.
%%
%% {ignore_new, TidList}
%%    Normally we wait for all pending transactions to complete
%%    before we allow iteration over the checkpoint. But in order
%%    to cope with checkpoint activation inside a transaction that
%%    currently prepares commit (mnesia_init:get_net_work_copy) we
%%    need to have the ability to ignore the enclosing transaction.
%%    We do not wait for the transactions in TidList to end. The
%%    transactions in TidList are regarded as newer than the checkpoint.

activate(Args) ->
    case args2cp(Args) of
        {ok, Cp} ->
            do_activate(Cp);
        {error, Reason} ->
            {error, Reason}
    end.

%% Turns the argument list into a #checkpoint_args{} record, validating
%% each argument with check_arg/2 and the table combination with
%% check_tables/1. An exit raised by check_arg/2 is returned as
%% {error, Reason}.
args2cp(Args) when is_list(Args)->
    case catch lists:foldl(fun check_arg/2, #checkpoint_args{}, Args) of
        {'EXIT', Reason} ->
            {error, Reason};
        Cp ->
            case check_tables(Cp) of
                {error, Reason} ->
                    {error, Reason};
                {ok, Overriders, AllTabs} ->
                    arrange_retainers(Cp, Overriders, AllTabs)
            end
    end;
args2cp(Args) ->
    {error, {badarg, Args}}.

%% Fold step used when building a #checkpoint_args{} record from the
%% argument list given to activate/1. A bad or unknown argument makes
%% the function exit with {badarg, Arg}; a name that is already in use
%% makes it exit with {already_exists, Name}.
check_arg({name, Name}, Cp) ->
    case lists:member(Name, checkpoints()) of
        true -> exit({already_exists, Name});
        false -> ok
    end,
    %% The name must be usable as part of a retainer file name
    case catch tab2retainer({foo, Name}) of
        FileName when is_list(FileName) ->
            Cp#checkpoint_args{name = Name};
        _ ->
            exit({badarg, Name})
    end;
check_arg({allow_remote, Flag}, Cp) when is_boolean(Flag) ->
    Cp#checkpoint_args{allow_remote = Flag};
check_arg({ram_overrides_dump, Overrides}, Cp)
  when is_boolean(Overrides); is_list(Overrides) ->
    Cp#checkpoint_args{ram_overrides_dump = Overrides};
check_arg({min, MinTabs}, Cp) when is_list(MinTabs) ->
    Cp#checkpoint_args{min = MinTabs};
check_arg({max, MaxTabs}, Cp) when is_list(MaxTabs) ->
    Cp#checkpoint_args{max = MaxTabs};
check_arg({ignore_new, TidList}, Cp) when is_list(TidList) ->
    Cp#checkpoint_args{ignore_new = TidList};
check_arg(Unknown, _Cp) ->
    exit({badarg, Unknown}).

%% Validates the table related arguments of a checkpoint.
%%
%% Returns {ok, Overriders, AllTabs} where AllTabs is the min tables
%% followed by the max tables and Overriders is the list of tables for
%% which ram overrides dump, or {error, {combine_error, Name, Details}}
%% when
%%   - a table is listed both as min and max,
%%   - no table is given at all, or
%%   - ram_overrides_dump names a table that is neither min nor max.
check_tables(Cp) ->
    Min = Cp#checkpoint_args.min,
    Max = Cp#checkpoint_args.max,
    AllTabs = Min ++ Max,
    DoubleTabs = [T || T <- Min, lists:member(T, Max)],
    Overriders = Cp#checkpoint_args.ram_overrides_dump,
    if
        DoubleTabs /= [] ->
            {error, {combine_error, Cp#checkpoint_args.name,
                     [{min, DoubleTabs}, {max, DoubleTabs}]}};
        Min == [], Max == [] ->
            {error, {combine_error, Cp#checkpoint_args.name,
                     [{min, Min}, {max, Max}]}};
        Overriders == false ->
            {ok, [], AllTabs};
        Overriders == true ->
            {ok, AllTabs, AllTabs};
        is_list(Overriders) ->
            %% Min and Max are known to be disjoint here, so an overrider
            %% can never be a member of both; it only has to be one of
            %% the checkpointed tables.
            case [T || T <- Overriders, not lists:member(T, AllTabs)] of
                [] ->
                    {ok, Overriders, AllTabs};
                Outsiders ->
                    {error, {combine_error, Cp#checkpoint_args.name,
                             [{ram_overrides_dump, Outsiders},
                              {min, Min},
                              {max, Max}]}}
            end
    end.

%% Creates one #retainer{} per table with its writer nodes selected and
%% stores them, together with the union of all writer nodes, in the
%% checkpoint record. An exit from select_writers/2 is returned as
%% {error, Reason}.
arrange_retainers(Cp, Overriders, AllTabs) ->
    R = #retainer{cp_name = Cp#checkpoint_args.name},
    case catch [R#retainer{tab_name = Tab,
                           writers = select_writers(Cp, Tab)}
                || Tab <- AllTabs] of
        {'EXIT', Reason} ->
            {error, Reason};
        Retainers ->
            {ok, Cp#checkpoint_args{ram_overrides_dump = Overriders,
                                    retainers = Retainers,
                                    nodes = writers(Retainers)}}
    end.

%% Selects the nodes that will keep a retainer for Tab: all eligible
%% replicas for a max table, otherwise the local node if it is eligible
%% and else the first eligible replica. Exits when no replica is
%% eligible.
select_writers(Cp, Tab) ->
    case filter_remote(Cp, val({Tab, active_replicas})) of
        [] ->
            exit({"Cannot prepare checkpoint (replica not available)",
                  [Tab, Cp#checkpoint_args.name]});
        Writers ->
            This = node(),
            case {lists:member(Tab, Cp#checkpoint_args.max),
                  lists:member(This, Writers)} of
                {true, _} -> Writers; % Max
                {false, true} -> [This];
                {false, false} -> [hd(Writers)]
            end
    end.

%% Restricts the candidate writer nodes to the local node when remote
%% retainers are not allowed.
filter_remote(Cp, Writers) when Cp#checkpoint_args.allow_remote == true ->
    Writers;
filter_remote(_Cp, Writers) ->
    This = node(),
    case lists:member(This, Writers) of
        true -> [This];
        false -> []
    end.

%% Collects the writer nodes of all retainers and passes the result
%% through mnesia_lib:uniq/1.
writers(Retainers) ->
    Fun = fun(R, Acc) -> R#retainer.writers ++ Acc end,
    Writers = lists:foldl(Fun, [], Retainers),
    mnesia_lib:uniq(Writers).

%% First phase of activation: ask mnesia_tm to prepare the checkpoint on
%% all involved nodes and check the replies.
do_activate(Cp) ->
    Name = Cp#checkpoint_args.name,
    Nodes = Cp#checkpoint_args.nodes,
    case mnesia_tm:prepare_checkpoint(Nodes, Cp) of
        {Replies, []} ->
            check_prep(Replies, Name, Nodes, Cp#checkpoint_args.ignore_new);
        {_, BadNodes} ->
            {error, {"Cannot prepare checkpoint (bad nodes)",
                     [Name, BadNodes]}}
    end.

%% Verifies the prepare replies one by one. A positive reply must carry
%% the same Name and IgnoreNew as the ones being activated (both are
%% already bound in the clause head). When all replies are ok the
%% pending transactions are collected.
check_prep([{ok, Name, IgnoreNew, _Node} | Replies], Name, Nodes, IgnoreNew) ->
    check_prep(Replies, Name, Nodes, IgnoreNew);
check_prep([{error, Reason} | _Replies], Name, _Nodes, _IgnoreNew) ->
    {error, {"Cannot prepare checkpoint (bad reply)",
             [Name, Reason]}};
check_prep([{badrpc, Reason} | _Replies], Name, _Nodes, _IgnoreNew) ->
    {error, {"Cannot prepare checkpoint (badrpc)",
             [Name, Reason]}};
check_prep([], Name, Nodes, IgnoreNew) ->
    collect_pending(Name, Nodes, IgnoreNew).

%% Second phase of activation: fetch the pending transactions from the
%% checkpoint process on every node and merge them in a temporary ets
%% bag before the checkpoint is activated.
collect_pending(Name, Nodes, IgnoreNew) ->
    case rpc:multicall(Nodes, ?MODULE, call, [Name, collect_pending]) of
        {Replies, []} ->
            case catch ?ets_new_table(mnesia_union, [bag]) of
                {'EXIT', Reason} -> %% system limit
                    Msg = "Cannot create an ets table pending union",
                    {error, {system_limit, Msg, Reason}};
                UnionTab ->
                    compute_union(Replies, Nodes, Name, UnionTab, IgnoreNew)
            end;
        {_, BadNodes} ->
            deactivate(Nodes, Name),
            {error, {"Cannot collect from pending checkpoint", Name, BadNodes}}
    end.

%% Merge the pending transactions reported by each node into UnionTab.
%% The first failing reply aborts the activation: the checkpoint is
%% deactivated on Nodes and the union table is deleted. When all replies
%% have been merged the nodes are told to activate.
compute_union([{ok, NodePending} | Replies], Nodes, Name, UnionTab, IgnoreNew) ->
    add_pending(NodePending, UnionTab),
    compute_union(Replies, Nodes, Name, UnionTab, IgnoreNew);
compute_union([{error, Why} | _Replies], Nodes, Name, UnionTab, _IgnoreNew) ->
    deactivate(Nodes, Name),
    ?ets_delete_table(UnionTab),
    {error, Why};
compute_union([{badrpc, Why} | _Replies], Nodes, Name, UnionTab, _IgnoreNew) ->
    deactivate(Nodes, Name),
    ?ets_delete_table(UnionTab),
    {error, {badrpc, Why}};
compute_union([], Nodes, Name, UnionTab, IgnoreNew) ->
    send_activate(Nodes, Nodes, Name, UnionTab, IgnoreNew).

%% Register every pending transaction under each of its disc and ram
%% nodes.
add_pending(PendingList, UnionTab) ->
    lists:foreach(fun(P) ->
                          add_pending_node(P#pending.disc_nodes, P#pending.tid, UnionTab),
                          add_pending_node(P#pending.ram_nodes, P#pending.tid, UnionTab)
                  end,
                  PendingList).

%% Insert one {Node, Tid} object per node.
add_pending_node(Nodes, Tid, UnionTab) ->
    lists:foreach(fun(Node) ->
                          ?ets_insert(UnionTab, {Node, Tid})
                  end,
                  Nodes).

%% Activate the checkpoint node by node, handing each node the
%% transactions registered for it that are not listed in IgnoreNew.
%% Returns {ok, Name, AllNodes} once every node has answered 'activated'.
%% On failure the checkpoint is deactivated on the nodes that have not
%% been contacted yet and the union table is deleted.
send_activate([], AllNodes, Name, UnionTab, _IgnoreNew) ->
    ?ets_delete_table(UnionTab),
    {ok, Name, AllNodes};
send_activate([Node | Remaining], AllNodes, Name, UnionTab, IgnoreNew) ->
    Registered = ?ets_lookup(UnionTab, Node),
    WaitFor = [Tid || {_, Tid} <- Registered,
                      not lists:member(Tid, IgnoreNew)],
    Answer = rpc:call(Node, ?MODULE, call, [Name, {activate, WaitFor}]),
    case Answer of
        activated ->
            send_activate(Remaining, AllNodes, Name, UnionTab, IgnoreNew);
        {badrpc, Why} ->
            deactivate(Remaining, Name),
            ?ets_delete_table(UnionTab),
            {error, {"Activation failed (bad node)", Name, Node, Why}};
        {error, Why} ->
            deactivate(Remaining, Name),
            ?ets_delete_table(UnionTab),
            {error, {"Activation failed", Name, Node, Why}}
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Checkpoint server

%% Asynchronous request to the local checkpoint process Name.
%% Returns {ok, Pid}, or {error, {no_exists, Name}} when no such
%% checkpoint is registered on this node.
cast(Name, Msg) ->
    case ?catch_val({checkpoint, Name}) of
        {'EXIT', _} ->
            {error, {no_exists, Name}};

        Pid when is_pid(Pid) ->
            Pid ! {self(), Msg},
            {ok, Pid}
    end.

%% Synchronous request to the local checkpoint process Name.
%% Returns the reply of the checkpoint process, {error, {no_exists, Name}}
%% when no such checkpoint is registered on this node, or
%% {error, {"Got exit", [Name, Reason]}} when the process terminates
%% before it has replied.
call(Name, Msg) ->
    case ?catch_val({checkpoint, Name}) of
        {'EXIT', _} ->
            {error, {no_exists, Name}};

        Pid when is_pid(Pid) ->
            %% The checkpoint process is always local
            Monitor = erlang:monitor(process, Pid),
            Pid ! {self(), Msg},
            Self = self(),
            receive
                {'EXIT', Pid, Reason} ->
                    %% The monitor fires as well when the process dies;
                    %% do not leave its 'DOWN' message in the mailbox.
                    erlang:demonitor(Monitor, [flush]),
                    {error, {"Got exit", [Name, Reason]}};
                {'DOWN', Monitor, _, Pid, Reason} ->
                    {error, {"Got exit", [Name, Reason]}};
                {Name, Self, Reply} ->
                    %% The process may exit right after replying (as it
                    %% does on deactivate), so a 'DOWN' message can
                    %% already be queued: flush it.
                    erlang:demonitor(Monitor, [flush]),
                    Reply
            end;
        Error ->
            Error
    end.

%% Casts Msg to the checkpoint process Name on every node in Nodes.
abcast(Nodes, Name, Msg) ->
    rpc:eval_everywhere(Nodes, ?MODULE, cast, [Name, Msg]).

%% Sends the reply of a request back to the requesting process, in the
%% {Name, ReplyTo, Reply} format that call/2 waits for.
reply(nopid, _Name, _Reply) ->
    ignore;
reply(ReplyTo, Name, Reply) ->
    ReplyTo ! {Name, ReplyTo, Reply}.

%% Starts the checkpoint process under mnesia_checkpoint_sup.
%% Returns {ok, Name, IgnoreNew, node()} or {error, Reason}
start_retainer(Cp) ->
    % Will never be restarted
    Name = Cp#checkpoint_args.name,
    case supervisor:start_child(mnesia_checkpoint_sup, [Cp]) of
        {ok, _Pid} ->
            {ok, Name, Cp#checkpoint_args.ignore_new, node()};
        {error, Reason} ->
            {error, {"Cannot create checkpoint retainer",
                     Name, node(), Reason}}
    end.

%% Start function used for the supervised child; the calling process is
%% recorded as supervisor in the checkpoint state.
start(Cp) ->
    Name = Cp#checkpoint_args.name,
    Args = [Cp#checkpoint_args{supervisor = self()}],
    mnesia_monitor:start_proc({?MODULE, Name}, ?MODULE, init, Args).

%% Entry point of the checkpoint process. Creates the table that tracks
%% pending transactions, prepares a retainer for every table of the
%% checkpoint, registers the checkpoint in the mnesia_lib key/value
%% store, acknowledges the start to the supervisor and enters
%% retainer_loop/1.
init(Cp) ->
    process_flag(trap_exit, true),
    process_flag(priority, high), %% Needed dets files might starve the system
    Name = Cp#checkpoint_args.name,
    Props = [set, public, {keypos, 2}],
    case catch ?ets_new_table(mnesia_pending_checkpoint, Props) of
        {'EXIT', Reason} -> %% system limit
            Msg = "Cannot create an ets table for pending transactions",
            Error = {error, {system_limit, Name, Msg, Reason}},
            proc_lib:init_ack(Cp#checkpoint_args.supervisor, Error);
        PendingTab ->
            Rs = [prepare_tab(Cp, R) || R <- Cp#checkpoint_args.retainers],
            Cp2 = Cp#checkpoint_args{retainers = Rs,
                                     pid = self(),
                                     pending_tab = PendingTab},
            add(pending_checkpoint_pids, self()),
            add(pending_checkpoints, PendingTab),
            set({checkpoint, Name}, self()),
            add(checkpoints, Name),
            dbg_out("Checkpoint ~p (~p) started~n", [Name, self()]),
            proc_lib:init_ack(Cp2#checkpoint_args.supervisor, {ok, self()}),
            retainer_loop(Cp2)
    end.

%% Prepares the retainer R using the current storage type of its table.
prepare_tab(Cp, R) ->
    Tab = R#retainer.tab_name,
    prepare_tab(Cp, R, val({Tab, storage_type})).

%% If the local node is one of the writers, a retainer store is created
%% and the checkpoint is registered for the table. Otherwise only the
%% retainer record, without a store, is published.
prepare_tab(Cp, R, Storage) ->
    Tab = R#retainer.tab_name,
    Name = R#retainer.cp_name,
    case lists:member(node(), R#retainer.writers) of
        true ->
            R2 = retainer_create(Cp, R, Tab, Name, Storage),
            set({Tab, {retainer, Name}}, R2),
            %% Keep checkpoint info for table_info & mnesia_session
            add({Tab, checkpoints}, Name),
            add_chkp_info(Tab, Name),
            R2;
        false ->
            set({Tab, {retainer, Name}}, R#retainer{store = undefined}),
            R
    end.

%% Adds Name to the checkpoints entry at the head of the commit work of
%% Tab, creating that entry if it is missing.
add_chkp_info(Tab, Name) ->
    case val({Tab, commit_work}) of
        [{checkpoints, OldList} | CommitList] ->
            case lists:member(Name, OldList) of
                true ->
                    ok;
                false ->
                    NewC = [{checkpoints, [Name | OldList]} | CommitList],
                    mnesia_lib:set({Tab, commit_work}, NewC)
            end;
        CommitList ->
            Chkp = {checkpoints, [Name]},
            %% OBS checkpoints needs to be first in the list!
            mnesia_lib:set({Tab, commit_work}, [Chkp | CommitList])
    end.

%% File name, in the Mnesia directory, of the dets retainer for Tab in
%% checkpoint Name.
tab2retainer({Tab, Name}) ->
    FlatName = lists:flatten(io_lib:write(Name)),
    mnesia_lib:dir(lists:concat([?MODULE, "_", Tab, "_", FlatName, ".RET"])).

%% Creates the store of a retainer: a fresh dets file for
%% disc_only_copies tables and an ets table for the other storage types.
retainer_create(_Cp, R, Tab, Name, disc_only_copies) ->
    Fname = tab2retainer({Tab, Name}),
    %% Remove any retainer file that is already there
    file:delete(Fname),
    Args = [{file, Fname}, {type, set}, {keypos, 2}, {repair, false}],
    {ok, _} = mnesia_lib:dets_sync_open({Tab, Name}, Args),
    dbg_out("Checkpoint retainer created ~p ~p~n", [Name, Tab]),
    R#retainer{store = {dets, {Tab, Name}}, really_retain = true};
retainer_create(Cp, R, Tab, Name, Storage) ->
    T = ?ets_new_table(mnesia_retainer, [set, public, {keypos, 2}]),
    Overriders = Cp#checkpoint_args.ram_overrides_dump,
    ReallyR = R#retainer.really_retain,
    ReallyCp = lists:member(Tab, Overriders),
    ReallyR2 = prepare_ram_tab(Tab, T, Storage, ReallyR, ReallyCp),
    dbg_out("Checkpoint retainer created ~p ~p~n", [Name, Tab]),
    R#retainer{store = {ets, T}, really_retain = ReallyR2}.

%% Copy the dumped table into retainer if needed
%% If the really_retain flag already has been set to false,
%% it should remain false even if we change storage type
%% while the checkpoint is activated.
%%
%% The first clause handles a ram_copies table that is still to be
%% retained and for which ram does not override the dump: the records of
%% the dcd file, if it exists, are loaded into the retainer table T and
%% the flag is returned as false.
prepare_ram_tab(Tab, T, ram_copies, true, false) ->
    Fname = mnesia_lib:tab2dcd(Tab),
    case mnesia_lib:exists(Fname) of
        true ->
            Log = mnesia_log:open_log(prepare_ram_tab,
                                      mnesia_log:dcd_log_header(),
                                      Fname, true,
                                      mnesia_monitor:get_env(auto_repair),
                                      read_only),
            %% Records with the same key are collected in one object
            Add = fun(Rec) ->
                          Key = element(2, Rec),
                          Recs =
                              case ?ets_lookup(T, Key) of
                                  [] -> [];
                                  [{_, _, Old}] -> Old
                              end,
                          ?ets_insert(T, {Tab, Key, [Rec | Recs]}),
                          continue
                  end,
            traverse_dcd(mnesia_log:chunk_log(Log, start), Log, Add),
            mnesia_log:close_log(Log);
        false ->
            ok
    end,
    false;
prepare_ram_tab(_, _, _, ReallyRetain, _) ->
    ReallyRetain.

%% Applies Fun to every record of a dcd log, chunk by chunk, skipping a
%% leading dcd log header.
traverse_dcd({Cont, [LogH | Rest]}, Log, Fun)
  when is_record(LogH, log_header),
       LogH#log_header.log_kind == dcd_log,
       LogH#log_header.log_version >= "1.0" ->
    traverse_dcd({Cont, Rest}, Log, Fun); %% BUGBUG Error handling repaired files
traverse_dcd({Cont, Recs}, Log, Fun) -> %% trashed data??
    lists:foreach(Fun, Recs),
    traverse_dcd(mnesia_log:chunk_log(Log, Cont), Log, Fun);
traverse_dcd(eof, _Log, _Fun) ->
    ok.

%% Accessors that hide whether a retainer store is an ets table or a
%% dets file.
retainer_get({ets, Store}, Key) -> ?ets_lookup(Store, Key);
retainer_get({dets, Store}, Key) -> dets:lookup(Store, Key).

retainer_put({ets, Store}, Val) -> ?ets_insert(Store, Val);
retainer_put({dets, Store}, Val) -> dets:insert(Store, Val).

retainer_first({ets, Store}) -> ?ets_first(Store);
retainer_first({dets, Store}) -> dets:first(Store).

retainer_next({ets, Store}, Key) -> ?ets_next(Store, Key);
retainer_next({dets, Store}, Key) -> dets:next(Store, Key).

%% retainer_next_slot(Tab, Pos) ->
%%     case retainer_slot(Tab, Pos) of
%%         '$end_of_table' ->
%%             '$end_of_table';
%%         [] ->
%%             retainer_next_slot(Tab, Pos + 1);
%%         Recs when is_list(Recs) ->
%%             {Pos, Recs}
%%     end.
%%
%% retainer_slot({ets, Store}, Pos) -> ?ets_next(Store, Pos);
%% retainer_slot({dets, Store}, Pos) -> dets:slot(Store, Pos).

%% Fixes or releases a table for traversal. An atom denotes an ordinary
%% table whose storage type is looked up; tagged tuples denote retainer
%% stores.
retainer_fixtable(Tab, Bool) when is_atom(Tab) ->
    mnesia_lib:db_fixtable(val({Tab, storage_type}), Tab, Bool);
retainer_fixtable({ets, Tab}, Bool) ->
    mnesia_lib:db_fixtable(ram_copies, Tab, Bool);
retainer_fixtable({dets, Tab}, Bool) ->
    mnesia_lib:db_fixtable(disc_only_copies, Tab, Bool).

%% Removes a retainer store; for dets the file is closed and deleted.
retainer_delete({ets, Store}) ->
    ?ets_delete_table(Store);
retainer_delete({dets, Store}) ->
    mnesia_lib:dets_sync_close(Store),
    Fname = tab2retainer(Store),
    file:delete(Fname).

%% Main loop of the checkpoint process.
%%
%% Every request arrives as {From, Request}. Requests issued through
%% call/2 are answered with reply/3, requests issued through cast/2 are
%% not. Most requests are guarded by wait_for_old == [] and therefore
%% stay in the mailbox until the checkpoint has been activated and all
%% old transactions have terminated.
retainer_loop(Cp) ->
    Name = Cp#checkpoint_args.name,
    receive
        {_From, {retain, Tid, Tab, Key, OldRecs}}
          when Cp#checkpoint_args.wait_for_old == [] ->
            R = val({Tab, {retainer, Name}}),
            PendingTab = Cp#checkpoint_args.pending_tab,
            case R#retainer.really_retain of
                true when PendingTab =:= undefined ->
                    %% Only the first (oldest) version of a key is kept
                    Store = R#retainer.store,
                    case retainer_get(Store, Key) of
                        [] -> retainer_put(Store, {Tab, Key, OldRecs});
                        _ -> already_retained
                    end;
                true ->
                    %% Transactions found in the pending table are not
                    %% retained
                    case ets:member(PendingTab, Tid) of
                        true -> ignore;
                        false ->
                            Store = R#retainer.store,
                            case retainer_get(Store, Key) of
                                [] -> retainer_put(Store, {Tab, Key, OldRecs});
                                _ -> already_retained
                            end
                    end;
                false ->
                    ignore
            end,
            retainer_loop(Cp);

        %% Adm
        {From, deactivate} ->
            do_stop(Cp),
            reply(From, Name, deactivated),
            unlink(From),
            exit(shutdown);

        {'EXIT', Parent, _} when Parent == Cp#checkpoint_args.supervisor ->
            %% do_stop(Cp),
            %% assume that entire Mnesia is terminating
            exit(shutdown);

        {_From, {mnesia_down, Node}} ->
            Cp2 = do_del_retainers(Cp, Node),
            retainer_loop(Cp2);
        {From, get_checkpoint} ->
            reply(From, Name, Cp),
            retainer_loop(Cp);
        {From, {add_copy, Tab, Node}} when Cp#checkpoint_args.wait_for_old == [] ->
            {Res, Cp2} = do_add_copy(Cp, Tab, Node),
            reply(From, Name, Res),
            retainer_loop(Cp2);
        {From, {del_copy, Tab, Node}} when Cp#checkpoint_args.wait_for_old == [] ->
            Cp2 = do_del_copy(Cp, Tab, Node),
            reply(From, Name, ok),
            retainer_loop(Cp2);
        %% The storage types must not reuse the variable name From: a
        %% variable occurring twice in a pattern has to match the same
        %% value, and the requesting pid never equals a storage type.
        {From, {change_copy, Tab, FromType, ToType}}
          when Cp#checkpoint_args.wait_for_old == [] ->
            Cp2 = do_change_copy(Cp, Tab, FromType, ToType),
            reply(From, Name, ok),
            retainer_loop(Cp2);
        {_From, {add_retainer, R, Node}} ->
            Cp2 = do_add_retainer(Cp, R, Node),
            retainer_loop(Cp2);
        {_From, {del_retainer, R, Node}} when Cp#checkpoint_args.wait_for_old == [] ->
            Cp2 = do_del_retainer(Cp, R, Node),
            retainer_loop(Cp2);

        %% Iteration
        {From, {iter_begin, Iter}} when Cp#checkpoint_args.wait_for_old == [] ->
            Cp2 = iter_begin(Cp, From, Iter),
            retainer_loop(Cp2);

        {From, {iter_end, Iter}} when Cp#checkpoint_args.wait_for_old == [] ->
            retainer_fixtable(Iter#iter.oid_tab, false),
            Iters = Cp#checkpoint_args.iterators -- [Iter],
            reply(From, Name, ok),
            retainer_loop(Cp#checkpoint_args{iterators = Iters});

        {_From, {exit_pending, Tid}}
          when is_list(Cp#checkpoint_args.wait_for_old) ->
            StillPending = lists:delete(Tid, Cp#checkpoint_args.wait_for_old),
            Cp2 = Cp#checkpoint_args{wait_for_old = StillPending},
            Cp3 = maybe_activate(Cp2),
            retainer_loop(Cp3);

        {From, collect_pending} ->
            PendingTab = Cp#checkpoint_args.pending_tab,
            del(pending_checkpoints, PendingTab),
            Pending = ?ets_match_object(PendingTab, '_'),
            reply(From, Name, {ok, Pending}),
            retainer_loop(Cp);

        {From, {activate, Pending}} ->
            StillPending = mnesia_recover:still_pending(Pending),
            enter_still_pending(StillPending, Cp#checkpoint_args.pending_tab),
            Cp2 = maybe_activate(Cp#checkpoint_args{wait_for_old = StillPending}),
            reply(From, Name, activated),
            retainer_loop(Cp2);

        {'EXIT', From, _Reason} ->
            Iters = [Iter || Iter <- Cp#checkpoint_args.iterators,
                             check_iter(From, Iter)],
            retainer_loop(Cp#checkpoint_args{iterators = Iters});

        {system, From, Msg} ->
            dbg_out("~p got {system, ~p, ~p}~n", [?MODULE, From, Msg]),
            sys:handle_system_msg(Msg, From, no_parent, ?MODULE, [], Cp)
    end.

%% Marks the checkpoint as activated once there are no old transactions
%% left to wait for. The reference to the pending table is dropped from
%% the state at the same time.
maybe_activate(Cp)
  when Cp#checkpoint_args.wait_for_old == [],
       Cp#checkpoint_args.is_activated == false ->
    Cp#checkpoint_args{pending_tab = undefined, is_activated = true};
maybe_activate(Cp) ->
    Cp.

%% Registers a new iterator owned by From: initiates its tables, fixes
%% the oid table and replies with {ok, Iter, self()}.
iter_begin(Cp, From, Iter) ->
    Name = Cp#checkpoint_args.name,
    R = val({Iter#iter.tab_name, {retainer, Name}}),
    Iter2 = init_tabs(R, Iter),
    Iter3 = Iter2#iter{pid = From},
    retainer_fixtable(Iter3#iter.oid_tab, true),
    Iters = [Iter3 | Cp#checkpoint_args.iterators],
    reply(From, Name, {ok, Iter3, self()}),
    Cp#checkpoint_args{iterators = Iters}.

%% Unregisters the checkpoint from the mnesia_lib key/value store,
%% removes its retainers and releases the tables fixed by its iterators.
do_stop(Cp) ->
    Name = Cp#checkpoint_args.name,
    del(pending_checkpoints, Cp#checkpoint_args.pending_tab),
    del(pending_checkpoint_pids, self()),
    del(checkpoints, Name),
    unset({checkpoint, Name}),
    lists:foreach(fun deactivate_tab/1, Cp#checkpoint_args.retainers),
    Iters = Cp#checkpoint_args.iterators,
    lists:foreach(fun(I) -> retainer_fixtable(I#iter.oid_tab, false) end, Iters).

%% Detaches the retainer R from its table. The store is only deleted
%% when it exists and the local node is one of the writers. Any
%% exception raised on the way is ignored.
deactivate_tab(R) ->
    Name = R#retainer.cp_name,
    Tab = R#retainer.tab_name,
    try
        Active = lists:member(node(), R#retainer.writers),
        case R#retainer.store of
            undefined ->
                ignore;
            Store when Active == true ->
                retainer_delete(Store);
            _ ->
                ignore
        end,
        unset({Tab, {retainer, Name}}),
        del({Tab, checkpoints}, Name), %% Keep checkpoint info for table_info & mnesia_session
        del_chkp_info(Tab, Name)
    catch _:_ -> ignore
    end.

%% Removes Name from the checkpoints entry at the head of the commit
%% work of Tab; the entry itself is removed when it becomes empty.
del_chkp_info(Tab, Name) ->
    case val({Tab, commit_work}) of
        [{checkpoints, ChkList} | Rest] ->
            case lists:delete(Name, ChkList) of
                [] ->
                    %% The only checkpoint was deleted
                    mnesia_lib:set({Tab, commit_work}, Rest);
                NewList ->
                    mnesia_lib:set({Tab, commit_work},
                                   [{checkpoints, NewList} | Rest])
            end;
        _ -> ignore
    end.

%% Removes Node as writer from every retainer of the checkpoint.
do_del_retainers(Cp, Node) ->
    Rs = [do_del_retainer2(Cp, R, Node) || R <- Cp#checkpoint_args.retainers],
    Cp#checkpoint_args{retainers = Rs, nodes = writers(Rs)}.

%% Removes Node from the writers of retainer R and stores the updated
%% retainer. When no writer is left the checkpoint is deactivated:
%% a system event is reported, the checkpoint is stopped and the
%% calling process exits with reason shutdown. When the removed node
%% is the local one the local retainer table is deactivated as well.
do_del_retainer2(Cp, R, Node) ->
    Remaining = R#retainer.writers -- [Node],
    R2 = R#retainer{writers = Remaining},
    Key = {R2#retainer.tab_name, {retainer, R2#retainer.cp_name}},
    set(Key, R2),
    case Remaining of
        [] ->
            Event = {mnesia_checkpoint_deactivated, Cp#checkpoint_args.name},
            mnesia_lib:report_system_event(Event),
            do_stop(Cp),
            exit(shutdown);
        _ when Node == node() ->
            deactivate_tab(R), % Avoids unnecessary tm_retain accesses
            %% deactivate_tab/1 unsets the key, so it is stored again
            set(Key, R2),
            R2;
        _ ->
            R2
    end.

%% Removes Node as writer of the single retainer matching R0 and
%% recomputes the set of nodes involved in the checkpoint.
do_del_retainer(Cp, R0, Node) ->
    {Found, Others} = find_retainer(R0, Cp#checkpoint_args.retainers, []),
    Updated = [do_del_retainer2(Cp, Found, Node) | Others],
    Cp#checkpoint_args{retainers = Updated, nodes = writers(Updated)}.

%% Handles removal of the local copy of Tab: the other nodes of the
%% checkpoint are told to delete the retainer before it is removed
%% locally. Only defined for the local node.
do_del_copy(Cp, Tab, ThisNode) when ThisNode == node() ->
    CpName = Cp#checkpoint_args.name,
    Remote = Cp#checkpoint_args.nodes -- [ThisNode],
    Retainer = val({Tab, {retainer, CpName}}),
    abcast(Remote, CpName, {del_retainer, Retainer, ThisNode}),
    do_del_retainer(Cp, Retainer, ThisNode).

%% Handles a new copy of Tab on a remote Node. Nothing is done unless
%% Tab is one of the max tables of the checkpoint. A node that is not
%% yet part of the checkpoint is first prepared remotely and, when the
%% schema is a max table too, receives the schema retainer before the
%% retainer of Tab.
%% Returns {ok, Cp} or {{error, Reason}, Cp}.
do_add_copy(Cp, Tab, Node) when Node /= node() ->
    case lists:member(Tab, Cp#checkpoint_args.max) of
        false ->
            {ok, Cp};
        true ->
            Name = Cp#checkpoint_args.name,
            Old = val({Tab, {retainer, Name}}),
            R = Old#retainer{writers = Old#retainer.writers ++ [Node]},
            case lists:member(Node, Cp#checkpoint_args.nodes) of
                true ->
                    send_retainer(Cp, R, Node);
                false ->
                    case tm_remote_prepare(Node, Cp) of
                        {ok, Name, _IgnoreNew, Node} ->
                            case lists:member(schema, Cp#checkpoint_args.max) of
                                true ->
                                    %% We need to send schema retainer somewhere
                                    OldSchema = val({schema, {retainer, Name}}),
                                    SchemaWriters = OldSchema#retainer.writers ++ [Node],
                                    SchemaR = OldSchema#retainer{writers = SchemaWriters},
                                    {ok, Cp1} = send_retainer(Cp, SchemaR, Node),
                                    send_retainer(Cp1, R, Node);
                                false ->
                                    send_retainer(Cp, R, Node)
                            end;
                        {badrpc, Reason} ->
                            {{error, {badrpc, Reason}}, Cp};
                        {error, Reason} ->
                            {{error, Reason}, Cp}
                    end
            end
    end.

%% Runs tm_prepare/1 of this module on Node via rpc.
tm_remote_prepare(Node, Cp) ->
    rpc:call(Node, ?MODULE, tm_prepare, [Cp]).

%% Replaces the stored retainer matching R0 with one carrying the
%% writers of R0. When Node is the local node the retainer table is
%% prepared first. The result is published and the set of nodes
%% involved in the checkpoint is recomputed.
do_add_retainer(Cp, R0, Node) ->
    Writers = R0#retainer.writers,
    {Found, Others} = find_retainer(R0, Cp#checkpoint_args.retainers, []),
    WithWriters = Found#retainer{writers = Writers},
    NewRet =
        case Node == node() of
            true -> prepare_tab(Cp, WithWriters);
            false -> WithWriters
        end,
    All = [NewRet | Others],
    set({NewRet#retainer.tab_name, {retainer, NewRet#retainer.cp_name}}, NewRet),
    Cp#checkpoint_args{retainers = All, nodes = writers(All)}.

%% Picks the retainer with the same checkpoint name and table name as
%% the first argument out of the list.
%% Returns {Match, AllOtherRetainers}. There is no clause for an
%% exhausted list, so a missing retainer raises function_clause.
find_retainer(#retainer{cp_name = CpName, tab_name = Tab},
              [#retainer{cp_name = CpName, tab_name = Tab} = Match | Tail], Seen) ->
    {Match, Tail ++ Seen};
find_retainer(Wanted, [Other | Tail], Seen) ->
    find_retainer(Wanted, Tail, [Other | Seen]).

%% Announces retainer R to the other nodes of the checkpoint and to
%% Node itself, copies the contents of the local retainer store to
%% Node and finally registers the retainer locally.
send_retainer(Cp, R, Node) ->
    CpName = Cp#checkpoint_args.name,
    Receivers = (Cp#checkpoint_args.nodes -- [Node]) -- [node()],
    Announce = {add_retainer, R, Node},
    abcast(Receivers, CpName, Announce),
    {ok, _} = rpc:call(Node, ?MODULE, cast, [CpName, Announce]),
    Store = R#retainer.store,
    send_retainer2(Node, CpName, Store, retainer_first(Store)),
    {ok, do_add_retainer(Cp, R, Node)}.

%% Walks the retainer store key by key and sends each retained entry
%% to the checkpoint process on Node.
send_retainer2(_Node, _Name, _Store, '$end_of_table') ->
    ok;
send_retainer2(Node, Name, Store, Key) ->
    [{Tab, _, Records}] = retainer_get(Store, Key),
    abcast([Node], Name, {retain, {dirty, send_retainer}, Tab, Key, Records}),
    send_retainer2(Node, Name, Store, retainer_next(Store, Key)).

%% Moves the retainer of Tab to the storage matching ToType.
%% Leaving disc_only_copies loads the dets file into the new ets table
%% and deletes the file. Entering disc_only_copies dumps the old ets
%% table into a freshly opened dets file and deletes the ets table.
%% Any other transition leaves the contents alone.
do_change_copy(Cp, Tab, FromType, ToType) ->
    CpName = Cp#checkpoint_args.name,
    OldRet = val({Tab, {retainer, CpName}}),
    NewRet = prepare_tab(Cp, OldRet, ToType),
    {_, Old} = OldRet#retainer.store,
    {_, New} = NewRet#retainer.store,

    Fname = tab2retainer({Tab, CpName}),
    case {FromType, ToType} of
        {disc_only_copies, _} ->
            mnesia_lib:dets_sync_close(Old),
            loaded = mnesia_lib:dets_to_ets(Old, New, Fname, set, no, yes),
            ok = file:delete(Fname);
        {_, disc_only_copies} ->
            TabSize = ?ets_info(Old, size),
            Props = [{file, Fname},
                     {type, set},
                     {keypos, 2},
                     {estimated_no_objects, TabSize + 256},
                     {repair, false}],
            {ok, _} = mnesia_lib:dets_sync_open(New, Props),
            ok = mnesia_dumper:raw_dump_table(New, Old),
            ?ets_delete_table(Old);
        _ ->
            ignore
    end,
    Retainers = lists:keyreplace(Tab, #retainer.tab_name,
                                 Cp#checkpoint_args.retainers, NewRet),
    Cp#checkpoint_args{retainers = Retainers, nodes = writers(Retainers)}.

%% Filter used when a linked process exits: returns false for (and
%% releases the table fixation of) an iterator owned by From, true
%% for every other iterator.
check_iter(From, #iter{pid = From} = Iter) ->
    retainer_fixtable(Iter#iter.oid_tab, false),
    false;
check_iter(_From, _Iter) ->
    true.

%% Fills in the main table, the retainer table and, depending on the
%% source of the iterator, the table that the keys are taken from.
init_tabs(R, Iter) ->
    Store = R#retainer.store,
    {Kind, _} = Store,
    Main = {Kind, Iter#iter.tab_name},
    Prepared = Iter#iter{main_tab = Main, retainer_tab = Store},
    case Iter#iter.source of
        table -> Prepared#iter{oid_tab = Main};
        retainer -> Prepared#iter{oid_tab = Store}
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Iteration
%%
%% Iterates over a table and applies Fun(ListOfRecords)
%% with a suitable amount of records, e.g. 1000 or so.
%% ListOfRecords is [] when the iteration is over.
%%
%% OidKind affects which internal table to be iterated over and
%% ValKind affects which table to pick the actual records from. Legal
%% values for OidKind and ValKind are the atom table or the atom
%% retainer.
%%
%% The iteration may either be performed over the main table (which
%% contains the latest values of the records, i.e.
%% the values that are visible to the applications) or over the
%% checkpoint retainer (which contains the values as they looked at
%% the timepoint when the checkpoint was activated).
%%
%% It is possible to iterate over the main table and pick values
%% from the retainer and vice versa.

%% Asks the checkpoint process Name to begin an iteration over Tab,
%% folds Fun over the records and always tells the checkpoint process
%% that the iteration has ended.
%% Returns {ok, Acc} or {error, Reason}.
iterate(Name, Tab, Fun, Acc, Source, Val) ->
    Request = #iter{tab_name = Tab, source = Source, val = Val},
    case call(Name, {iter_begin, Request}) of
        {error, Reason} ->
            {error, Reason};
        {ok, Iter, Pid} ->
            link(Pid), % We don't want any pending fixtable's
            Outcome = (catch iter(Fun, Acc, Iter)),
            unlink(Pid),
            call(Name, {iter_end, Iter}),
            case Outcome of
                {'EXIT', Reason} -> {error, Reason};
                {error, Reason} -> {error, Reason};
                FinalAcc -> {ok, FinalAcc}
            end
    end.

%% Starts the traversal at the first key of the oid table.
iter(Fun, Acc, Iter) ->
    iter(Fun, Acc, Iter, retainer_first(Iter#iter.oid_tab)).

%% Feeds Fun one batch of records at a time. After the last batch Fun
%% is called once more with [] to signal the end of the iteration.
iter(Fun, Acc, Iter, Key) ->
    case get_records(Iter, Key) of
        {'$end_of_table', []} ->
            Fun([], Acc);
        {'$end_of_table', LastBatch} ->
            Fun([], Fun(LastBatch, Acc));
        {NextKey, Batch} ->
            iter(Fun, Fun(Batch, Acc), Iter, NextKey)
    end.

%% Aborts an ongoing iteration by throwing {error, {stopped, Reason}}.
stop_iteration(Reason) ->
    throw({error, {stopped, Reason}}).

%% Collects the records of at most 500 keys, starting at Key.
%% Returns {KeyToContinueFrom, Records}.
get_records(Iter, Key) ->
    get_records(Iter, Key, 500, []). % 500 keys

get_records(_Iter, Key, 0, Chunks) ->
    {Key, lists:append(lists:reverse(Chunks))};
get_records(_Iter, '$end_of_table', _Left, Chunks) ->
    {'$end_of_table', lists:append(lists:reverse(Chunks))};
get_records(Iter, Key, Left, Chunks) ->
    Recs = get_val(Iter, Key),
    NextKey = retainer_next(Iter#iter.oid_tab, Key),
    get_records(Iter, NextKey, Left - 1, [Recs | Chunks]).

%% Dispatches on which value the iterator asked for.
get_val(#iter{val = latest} = Iter, Key) ->
    get_latest_val(Iter, Key);
get_val(#iter{val = checkpoint} = Iter, Key) ->
    get_checkpoint_val(Iter, Key).

%% Fetches the latest value of Key from the main table. When the keys
%% come from the retainer the result is preceded by a delete oid
%% {TabName, Key}.
get_latest_val(Iter, Key) when Iter#iter.source == table ->
    retainer_get(Iter#iter.main_tab, Key);
get_latest_val(Iter, Key) when Iter#iter.source == retainer ->
    DeleteOid = {Iter#iter.tab_name, Key},
    [DeleteOid | retainer_get(Iter#iter.main_tab, Key)].

%% Fetches the checkpointed value of Key. When the keys come from the
%% main table the value is read from the main table. When they come
%% from the retainer the retained records are returned, preceded by a
%% delete oid {TabName, Key}.
get_checkpoint_val(Iter, Key) when Iter#iter.source == table ->
    retainer_get(Iter#iter.main_tab, Key);
get_checkpoint_val(Iter, Key) when Iter#iter.source == retainer ->
    DeleteOid = {Iter#iter.tab_name, Key},
    case retainer_get(Iter#iter.retainer_tab, Key) of
        [{_, _, []}] -> [DeleteOid];
        [{_, _, Records}] -> [DeleteOid | Records]
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% System upgrade

%% sys callback: resumes the retainer loop.
system_continue(_Parent, _Debug, Cp) ->
    retainer_loop(Cp).

%% sys callback: stops the checkpoint.
system_terminate(_Reason, _Parent,_Debug, Cp) ->
    do_stop(Cp).

%% sys callback: the state needs no conversion.
system_code_change(Cp, _Module, _OldVsn, _Extra) ->
    {ok, Cp}.

%% Converts between the old #checkpoint{} record and the newer
%% #checkpoint_args{} record, in either direction.
%%
%% Old -> new: a true ram_overrides_dump becomes the list of all min
%% and max tables, false becomes [].
%% New -> old: ram_overrides_dump can only be expressed when it is []
%% or exactly min ++ max; anything else yields {error, _}.
%%
%% allow_remote is carried over from the allow_remote field of the
%% source record. It used to be filled in from the name field.
%%
%% Returns {ok, ConvertedRecord} | {error, {String, ram_overrides_dump}}.
convert_cp_record(Cp) when is_record(Cp, checkpoint) ->
    ROD =
        case Cp#checkpoint.ram_overrides_dump of
            true -> Cp#checkpoint.min ++ Cp#checkpoint.max;
            false -> []
        end,

    {ok, #checkpoint_args{name = Cp#checkpoint.name,
                          allow_remote = Cp#checkpoint.allow_remote,
                          ram_overrides_dump = ROD,
                          nodes = Cp#checkpoint.nodes,
                          node = Cp#checkpoint.node,
                          now = Cp#checkpoint.now,
                          cookie = ?unique_cookie,
                          min = Cp#checkpoint.min,
                          max = Cp#checkpoint.max,
                          pending_tab = Cp#checkpoint.pending_tab,
                          wait_for_old = Cp#checkpoint.wait_for_old,
                          is_activated = Cp#checkpoint.is_activated,
                          ignore_new = Cp#checkpoint.ignore_new,
                          retainers = Cp#checkpoint.retainers,
                          iterators = Cp#checkpoint.iterators,
                          supervisor = Cp#checkpoint.supervisor,
                          pid = Cp#checkpoint.pid
                         }};
convert_cp_record(Cp) when is_record(Cp, checkpoint_args) ->
    AllTabs = Cp#checkpoint_args.min ++ Cp#checkpoint_args.max,
    ROD = case Cp#checkpoint_args.ram_overrides_dump of
              [] ->
                  false;
              AllTabs ->
                  true;
              _ ->
                  error
          end,
    if
        ROD == error ->
            {error, {"Old node cannot handle new checkpoint protocol",
                     ram_overrides_dump}};
        true ->
            {ok, #checkpoint{name = Cp#checkpoint_args.name,
                             allow_remote = Cp#checkpoint_args.allow_remote,
                             ram_overrides_dump = ROD,
                             nodes = Cp#checkpoint_args.nodes,
                             node = Cp#checkpoint_args.node,
                             now = Cp#checkpoint_args.now,
                             min = Cp#checkpoint_args.min,
                             max = Cp#checkpoint_args.max,
                             pending_tab = Cp#checkpoint_args.pending_tab,
                             wait_for_old = Cp#checkpoint_args.wait_for_old,
                             is_activated = Cp#checkpoint_args.is_activated,
                             ignore_new = Cp#checkpoint_args.ignore_new,
                             retainers = Cp#checkpoint_args.retainers,
                             iterators = Cp#checkpoint_args.iterators,
                             supervisor = Cp#checkpoint_args.supervisor,
                             pid = Cp#checkpoint_args.pid
                            }}
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Looks up Var with ?catch_val and hands a failed lookup over to
%% mnesia_lib:other_val/2.
val(Var) ->
    case ?catch_val(Var) of
        {'EXIT', _ReASoN_} -> mnesia_lib:other_val(Var, _ReASoN_);
        _VaLuE_ -> _VaLuE_
    end.

diff --git a/lib/mnesia/src/mnesia_checkpoint_sup.erl b/lib/mnesia/src/mnesia_checkpoint_sup.erl
new file mode 100644
index 0000000000..2fe8df52f7
--- /dev/null
+++ b/lib/mnesia/src/mnesia_checkpoint_sup.erl
@@ -0,0 +1,42 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1997-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
-module(mnesia_checkpoint_sup).

-behaviour(supervisor).

-export([start/0, init/1]).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% top supervisor callback functions

%% Starts the checkpoint supervisor, locally registered under the
%% module name.
start() ->
    supervisor:start_link({local, ?MODULE}, ?MODULE, []).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% sub supervisor callback functions

%% simple_one_for_one supervisor whose children are transient workers
%% started with mnesia_checkpoint:start. No restart is tolerated
%% (intensity 0 within 24 hours).
init([]) ->
    Strategy = {simple_one_for_one, 0, timer:hours(24)}, % Trust the top supervisor
    Shutdown = mnesia_kernel_sup:supervisor_timeout(timer:seconds(3)),
    ChildSpec = {?MODULE,
                 {mnesia_checkpoint, start, []},
                 transient,
                 Shutdown,
                 worker,
                 [?MODULE, mnesia_checkpoint, supervisor]},
    {ok, {Strategy, [ChildSpec]}}.
diff --git a/lib/mnesia/src/mnesia_controller.erl b/lib/mnesia/src/mnesia_controller.erl
new file mode 100644
index 0000000000..9bc480e619
--- /dev/null
+++ b/lib/mnesia/src/mnesia_controller.erl
@@ -0,0 +1,2182 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1996-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
%% The mnesia_init process loads tables from local disc or from
%% other nodes. It also coordinates updates of the info about
%% where we can read and write tables.
%%
%% Tables may need to be loaded initially at startup of the local
%% node or when other nodes announce that they already have loaded
%% tables that we also want.
+%% +%% Initially we set the load request queue to those tables that we +%% safely can load locally, i.e. tables where we have the last +%% consistent replica and we have received mnesia_down from all +%% other nodes holding the table. Then we let the mnesia_init +%% process enter its normal working state. +%% +%% When we need to load a table we append a request to the load +%% request queue. All other requests are regarded as high priority +%% and are processed immediately (e.g. update table whereabouts). +%% We processes the load request queue as a "background" job.. + +-module(mnesia_controller). + +-behaviour(gen_server). + +%% Mnesia internal stuff +-export([ + start/0, + i_have_tab/1, + info/0, + get_info/1, + get_workers/1, + force_load_table/1, + async_dump_log/1, + sync_dump_log/1, + connect_nodes/1, + wait_for_schema_commit_lock/0, + release_schema_commit_lock/0, + create_table/1, + get_disc_copy/1, + get_cstructs/0, + sync_and_block_table_whereabouts/4, + sync_del_table_copy_whereabouts/2, + block_table/1, + unblock_table/1, + block_controller/0, + unblock_controller/0, + unannounce_add_table_copy/2, + master_nodes_updated/2, + mnesia_down/1, + add_active_replica/2, + add_active_replica/3, + add_active_replica/4, + update/1, + change_table_access_mode/1, + del_active_replica/2, + wait_for_tables/2, + get_network_copy/2, + merge_schema/0, + start_remote_sender/4, + schedule_late_disc_load/2 + ]). + +%% gen_server callbacks +-export([init/1, + handle_call/3, + handle_cast/2, + handle_info/2, + terminate/2, + code_change/3]). + +%% Module internal stuff +-export([call/1, + cast/1, + dump_and_reply/2, + load_and_reply/2, + send_and_reply/2, + wait_for_tables_init/2, + connect_nodes2/2 + ]). + +-import(mnesia_lib, [set/2, add/2]). +-import(mnesia_lib, [fatal/2, error/2, verbose/2, dbg_out/2]). + +-include("mnesia.hrl"). + +-define(SERVER_NAME, ?MODULE). 

%% Server state of the controller process.
-record(state, {supervisor,
                schema_is_merged = false,
                early_msgs = [],
                loader_pid = [],     %% Was Pid is now [{Pid,Work}|..]
                loader_queue,        %% Was list is now gb_tree
                sender_pid = [],     %% Was a pid or undef is now [{Pid,Work}|..]
                sender_queue = [],
                late_loader_queue,   %% Was list is now gb_tree
                dumper_pid,          %% Dumper or schema commit pid
                dumper_queue = [],   %% Dumper or schema commit queue
                others = [],         %% Processes that needs the copier_done msg
                dump_log_timer_ref,
                is_stopping = false
               }).

%% Backwards Comp. Sender_pid is now a list of senders..
get_senders(#state{sender_pid = Senders}) when is_list(Senders) ->
    Senders.

%% Backwards Comp. loader_pid is now a list of loaders..
get_loaders(#state{loader_pid = Loaders}) when is_list(Loaders) ->
    Loaders.

%% Reads the no_table_loaders setting. When the lookup fails the
%% setting is initialised to 1 and 1 is returned.
max_loaders() ->
    case ?catch_val(no_table_loaders) of
        {'EXIT', _} ->
            mnesia_lib:set(no_table_loaders,1),
            1;
        Configured ->
            Configured
    end.

-record(schema_commit_lock, {owner}).
-record(block_controller, {owner}).

-record(dump_log, {initiated_by,
                   opt_reply_to
                  }).

-record(net_load, {table,
                   reason,
                   opt_reply_to,
                   cstruct = unknown
                  }).

-record(send_table, {table,
                     receiver_pid,
                     remote_storage
                    }).

-record(disc_load, {table,
                    reason,
                    opt_reply_to
                   }).

-record(late_load, {table,
                    reason,
                    opt_reply_to,
                    loaders
                   }).

-record(loader_done, {worker_pid,
                      is_loaded,
                      table_name,
                      needs_announce,
                      needs_sync,
                      needs_reply,
                      reply_to,
                      reply}).

-record(sender_done, {worker_pid,
                      worker_res,
                      table_name
                     }).

-record(dumper_done, {worker_pid,
                      worker_res
                     }).

%% Looks up Var with ?catch_val and hands a failed lookup over to
%% mnesia_lib:other_val/2.
val(Var) ->
    case ?catch_val(Var) of
        {'EXIT', Why} -> mnesia_lib:other_val(Var, Why);
        Found -> Found
    end.

%% Starts the controller as a locally registered gen_server. The
%% calling process is passed on to init/1.
start() ->
    Options = [{timeout, infinity}
               %% ,{debug, [trace]}
              ],
    gen_server:start_link({local, ?SERVER_NAME}, ?MODULE, [self()], Options).

%% Requests a log dump through a call to the controller.
sync_dump_log(InitBy) ->
    call({sync_dump_log, InitBy}).

%% Requests a log dump by sending a plain message to the registered
%% controller process.
async_dump_log(InitBy) ->
    ?SERVER_NAME ! {async_dump_log, InitBy}.

%% Wait for tables to be active
%% If needed, we will wait for Mnesia to start
%% If Mnesia stops, we will wait for Mnesia to restart
%% We will wait even if the list of tables is empty
%%
%% Timeout is infinity or a non-negative integer; any other argument
%% combination gives {error, {badarg, Tabs, Timeout}}.
wait_for_tables(Tabs, Timeout)
  when is_list(Tabs), Timeout == infinity;
       is_list(Tabs), is_integer(Timeout), Timeout >= 0 ->
    do_wait_for_tables(Tabs, Timeout);
wait_for_tables(Tabs, Timeout) ->
    {error, {badarg, Tabs, Timeout}}.

%% With a zero timeout the current state is reported right away.
%% Otherwise a linked helper process does the waiting; when it exits
%% or the timeout expires the current state is reported instead.
do_wait_for_tables(Tabs, 0) ->
    reply_wait(Tabs);
do_wait_for_tables(Tabs, Timeout) ->
    Waiter = spawn_link(?MODULE, wait_for_tables_init, [self(), Tabs]),
    receive
        {?SERVER_NAME, Waiter, Result} ->
            Result;
        {'EXIT', Waiter, _} ->
            reply_wait(Tabs)
    after Timeout ->
            unlink(Waiter),
            exit(Waiter, timeout),
            reply_wait(Tabs)
    end.

%% Compares Tabs with the currently active tables.
%% Returns ok, {timeout, TabsNotActive} or, when the active tables
%% cannot be read, {error, {node_not_running, node()}}.
reply_wait(Tabs) ->
    case catch mnesia_lib:active_tables() of
        {'EXIT', _} ->
            {error, {node_not_running, node()}};
        Active when is_list(Active) ->
            case Tabs -- Active of
                [] -> ok;
                Missing -> {timeout, Missing}
            end
    end.

%% Entry point of the helper process spawned by do_wait_for_tables/2.
%% Sends the outcome to From and exits normally.
wait_for_tables_init(From, Tabs) ->
    process_flag(trap_exit, true),
    Outcome = wait_for_init(From, Tabs, whereis(?SERVER_NAME)),
    From ! {?SERVER_NAME, self(), Outcome},
    unlink(From),
    exit(normal).

%% Links to the controller and asks it to report when the tables are
%% synced.
wait_for_init(From, Tabs, Init) ->
    case catch link(Init) of
        {'EXIT', _} ->
            %% Mnesia is not started
            {error, {node_not_running, node()}};
        true when is_pid(Init) ->
            cast({sync_tabs, Tabs, self()}),
            rec_tabs(Tabs, Tabs, From, Init)
    end.

%% Tells Waiter that Tab is synced.
sync_reply(Waiter, Tab) ->
    Waiter ! {?SERVER_NAME, {tab_synced, Tab}}.

%% Waits for one tab_synced message per table, in list order. Exits
%% when the requesting process or the controller goes away.
rec_tabs([], _AllTabs, _From, Init) ->
    unlink(Init),
    ok;
rec_tabs([Tab | Remaining], AllTabs, From, Init) ->
    receive
        {?SERVER_NAME, {tab_synced, Tab}} ->
            rec_tabs(Remaining, AllTabs, From, Init);

        {'EXIT', From, _} ->
            %% This will trigger an exit signal
            %% to mnesia_init
            exit(wait_for_tables_timeout);

        {'EXIT', Init, _} ->
            %% Oops, mnesia_init stopped,
            exit(mnesia_stopped)
    end.

%% Asks the controller for its table definitions.
get_cstructs() ->
    call(get_cstructs).

%% Lets the controller process evaluate Fun.
update(Fun) ->
    call({update,Fun}).

%% Reports that mnesia went down on Node. When the controller is not
%% running the report goes straight to mnesia_monitor.
mnesia_down(Node) ->
    case cast({mnesia_down, Node}) of
        {error, _} -> mnesia_monitor:mnesia_down(?SERVER_NAME, Node);
        _Pid -> ok
    end.

%% Links to the controller and waits for the schema commit lock.
wait_for_schema_commit_lock() ->
    link(whereis(?SERVER_NAME)),
    unsafe_call(wait_for_schema_commit_lock).

block_controller() ->
    call(block_controller).

unblock_controller() ->
    cast(unblock_controller).

%% Gives the schema commit lock back and removes the link set up by
%% wait_for_schema_commit_lock/0.
release_schema_commit_lock() ->
    cast({release_schema_commit_lock, self()}),
    unlink(whereis(?SERVER_NAME)).

%% Special for preparation of add table copy
%%
%% The load is run in the calling process instead of being queued in
%% the controller: queueing may cause a deadlock between schema
%% operations and initial table loadings, which both take schema
%% locks. The caller is still registered in the controller so that it
%% gets copier_done messages when the other side goes down.
get_network_copy(Tab, Cs) ->
    call({add_other, self()}),
    Work = #net_load{table = Tab,
                     reason = {dumper,add_table_copy},
                     cstruct = Cs},
    %% I'll need this cause it's linked through the subscriber
    %% might be solved by using monitor in subscr instead.
    process_flag(trap_exit, true),
    Load = load_table_fun(Work),
    Done = (catch Load()),
    process_flag(trap_exit, false),
    call({del_other, self()}),
    case Done of
        #loader_done{is_loaded = true} ->
            Tab = Done#loader_done.table_name,
            case Done#loader_done.needs_announce of
                true -> i_have_tab(Tab);
                false -> ignore
            end,
            Done#loader_done.reply;
        #loader_done{} ->
            Done#loader_done.reply;
        Other ->
            {not_loaded, Other}
    end.

%% This function is invoked from the dumper
%%
%% There are two cases here:
%% startup ->
%%   no need for sync, since mnesia_controller not started yet
%% schema_trans ->
%%   already synced with mnesia_controller since the dumper
%%   is synchronously started from mnesia_controller
create_table(Tab) ->
    Reason = {dumper,create_table},
    {loaded, ok} = mnesia_loader:disc_load_table(Tab, Reason).

%% Loads Tab from disc on behalf of a change_table_copy_type.
get_disc_copy(Tab) ->
    disc_load_table(Tab, {dumper,change_table_copy_type}, undefined).

%% Returns ok instead of yes
%%
%% Forces Tab to be loaded. The schema and non-atom table names are
%% rejected with {error, {bad_type, Tab}}; a table without a known
%% storage type gives {error, {no_exists, Tab}}.
force_load_table(Tab) when is_atom(Tab), Tab /= schema ->
    case ?catch_val({Tab, storage_type}) of
        Storage when Storage == ram_copies;
                     Storage == disc_copies;
                     Storage == disc_only_copies ->
            do_force_load_table(Tab);
        unknown ->
            set({Tab, load_by_force}, true),
            cast({force_load_updated, Tab}),
            wait_for_tables([Tab], infinity);
        {'EXIT', _} ->
            {error, {no_exists, Tab}}
    end;
force_load_table(Tab) ->
    {error, {bad_type, Tab}}.

%% Schedules a forced late disc load unless the table already has a
%% load reason other than unknown, in which case ok is returned.
do_force_load_table(Tab) ->
    ForceLoad =
        fun() ->
                set({Tab, load_by_force}, true),
                mnesia_late_loader:async_late_disc_load(node(), [Tab], forced_by_user),
                wait_for_tables([Tab], infinity)
        end,
    case ?catch_val({Tab, load_reason}) of
        unknown -> ForceLoad();
        {'EXIT', _} -> ForceLoad();
        _ -> ok
    end.

%% Forwards changed master nodes to the controller. Changes for the
%% schema table are ignored.
master_nodes_updated(schema, _Masters) ->
    ignore;
master_nodes_updated(Tab, Masters) ->
    cast({master_nodes_updated, Tab, Masters}).

%% Schedules a late disc load of Tabs, tagged late_disc_load.
schedule_late_disc_load(Tabs, Reason) ->
    try_schedule_late_disc_load(Tabs, Reason, late_disc_load).

%% Collects the load intents of all other running nodes under a
%% global write lock and hands them to the local controller together
%% with Tabs. The whole attempt is repeated as long as some node does
%% not answer. An empty table list is only processed for the
%% schema_is_merged tag.
try_schedule_late_disc_load([], _Reason, MsgTag)
  when MsgTag /= schema_is_merged ->
    ignore;
try_schedule_late_disc_load(Tabs, Reason, MsgTag) ->
    CollectIntents =
        fun() ->
                Nodes = val({current, db_nodes}),
                mnesia:lock({global, mnesia_late_disc_load, Nodes}, write),
                case multicall(Nodes -- [node()], disc_load_intents) of
                    {Replies, []} ->
                        call({MsgTag, Tabs, Reason, Replies}),
                        done;
                    {_, BadNodes} ->
                        %% Some nodes did not respond, lets try again
                        {retry, BadNodes}
                end
        end,
    case mnesia:transaction(CollectIntents) of
        {atomic, done} ->
            done;
        {atomic, {retry, BadNodes}} ->
            verbose("Retry late_load_tables because bad nodes: ~p~n",
                    [BadNodes]),
            try_schedule_late_disc_load(Tabs, Reason, MsgTag);
        {aborted, AbortReason} ->
            fatal("Cannot late_load_tables~p: ~p~n",
                  [[Tabs, Reason, MsgTag], AbortReason])
    end.

%% Connects to the nodes Ns and merges schemas with them in a linked
%% helper process.
%% Returns {ok, ConnectedNodes} | {error, Reason}.
connect_nodes(Ns) ->
    case mnesia:system_info(is_running) of
        no ->
            {error, {node_not_running, node()}};
        yes ->
            Helper = spawn_link(?MODULE,connect_nodes2,[self(),Ns]),
            receive
                {?MODULE, Helper, ok, New} ->
                    mnesia_lib:add_list(extra_db_nodes, New),
                    {ok, New};
                {?MODULE, Helper, {aborted, {throw, Str}}, _New} when is_list(Str) ->
                    {error, {merge_schema_failed, lists:flatten(Str)}};
                {?MODULE, Helper, Failure, _New} ->
                    {error, Failure};
                {'EXIT', Helper, Reason} ->
                    {error, Reason}
            end
    end.

%% Body of the helper process started by connect_nodes/1. Announces
%% the merge, connects to Ns, merges the schema with the nodes that
%% were not already running and reports the result to Father.
connect_nodes2(Father, Ns) ->
    Before = val({current, db_nodes}),
    abcast([node()|Ns], {merging_schema, node()}),
    {NewC, OldC} = mnesia_recover:connect_nodes(Ns),
    Reached = mnesia_lib:intersect(Ns, NewC ++ OldC),
    Fresh = Reached -- Before,
    process_flag(trap_exit, true),
    Res = try_merge_schema(Fresh),
    multicall([node()|Ns], {schema_is_merged, [], late_merge, []}),
    After = val({current, db_nodes}),
    Father ! {?MODULE, self(), Res, mnesia_lib:intersect(Ns,After)},
    unlink(Father),
    ok.

%% Merge the local schema with the schema on other nodes.
%% But first we must let all processes that want to force
%% load tables wait until the schema merge is done.

merge_schema() ->
    case try_merge_schema(mnesia_lib:all_nodes()) of
        ok ->
            schema_is_merged();
        {aborted, {throw, Str}} when is_list(Str) ->
            fatal("Failed to merge schema: ~s~n", [Str]);
        Failure ->
            fatal("Failed to merge schema: ~p~n", [Failure])
    end.

%% Repeats mnesia_schema:merge_schema/0 until it reports that there is
%% nothing more to merge (returns ok) or it fails (the failure is
%% returned as is).
try_merge_schema(Nodes) ->
    case mnesia_schema:merge_schema() of
        {atomic, not_merged} ->
            %% No more nodes that we need to merge the schema with
            ok;
        {atomic, {merged, OldFriends, NewFriends}} ->
            %% Check if new nodes has been added to the schema
            Added = mnesia_lib:all_nodes() -- [node() | Nodes],
            mnesia_recover:connect_nodes(Added),

            %% Tell everybody to adopt orphan tables
            im_running(OldFriends, NewFriends),
            im_running(NewFriends, OldFriends),

            try_merge_schema(Nodes);
        {atomic, {"Cannot get cstructs", Node, Reason}} ->
            dbg_out("Cannot get cstructs, Node ~p ~p~n", [Node, Reason]),
            timer:sleep(1000), % Avoid a endless loop look alike
            try_merge_schema(Nodes);
        Failure ->
            Failure
    end.

%% Tells the controllers on OldFriends that this node is running,
%% together with the list NewFriends.
im_running(OldFriends, NewFriends) ->
    abcast(OldFriends, {im_running, node(), NewFriends}).

%% Schedules the initial safe loads under the schema_is_merged tag.
schema_is_merged() ->
    %% At this point we do not know anything about
    %% which tables that the other nodes already
    %% has loaded and therefore we let the normal
    %% processing of the loader_queue take care
    %% of it, since we at that time point will
    %% know the whereabouts. We rely on the fact
    %% that all nodes tells each other directly
    %% when they have loaded a table and are
    %% willing to share it.
    try_schedule_late_disc_load(initial_safe_loads(), initial, schema_is_merged).


%% Casts Msg to the local controller, or returns
%% {error, {node_not_running, node()}} when it is not registered.
cast(Msg) ->
    case whereis(?SERVER_NAME) of
        undefined ->
            {error, {node_not_running, node()}};
        Controller ->
            gen_server:cast(Controller, Msg)
    end.

%% Casts Msg to the controller on each of Nodes.
abcast(Nodes, Msg) ->
    gen_server:abcast(Nodes, ?SERVER_NAME, Msg).

%% Calls the local controller without linking to it.
unsafe_call(Msg) ->
    case whereis(?SERVER_NAME) of
        undefined ->
            {error, {node_not_running, node()}};
        Controller ->
            gen_server:call(Controller, Msg, infinity)
    end.

%% Calls the local controller while linked to it. An exit message from
%% the controller found in the mailbox after the call turns the result
%% into {error, {node_not_running, node()}}.
call(Msg) ->
    case whereis(?SERVER_NAME) of
        undefined ->
            {error, {node_not_running, node()}};
        Controller ->
            link(Controller),
            Reply = gen_server:call(Controller, Msg, infinity),
            unlink(Controller),

            %% We get an exit signal if server dies
            receive
                {'EXIT', Controller, _Reason} ->
                    {error, {node_not_running, node()}}
            after 0 ->
                    Reply
            end
    end.

%% Calls the controller on Node with {Func, Args, self()}. A failed
%% call is returned as {error, Error}.
remote_call(Node, Func, Args) ->
    Request = {Func, Args, self()},
    case catch gen_server:call({?MODULE, Node}, Request, infinity) of
        {'EXIT', Error} -> {error, Error};
        Reply -> Reply
    end.

%% Calls the controllers on Nodes. The node names are stripped from
%% the good replies so that the result looks like rpc:multicall's.
multicall(Nodes, Msg) ->
    {Good, Bad} = gen_server:multi_call(Nodes, ?MODULE, Msg, infinity),
    {[Reply || {_Node, Reply} <- Good], Bad}.

%%%----------------------------------------------------------------------
%%% Callback functions from gen_server
%%%----------------------------------------------------------------------

%%----------------------------------------------------------------------
%% Func: init/1
%% Returns: {ok, State}          |
%%          {ok, State, Timeout} |
%%          {stop, Reason}
%%
%% Connects to the nodes that are in the schema but were not among
%% the original nodes, starts the periodic log dump timer and the
%% dumper regulator and returns a state with empty load queues.
%%----------------------------------------------------------------------
init([Parent]) ->
    process_flag(trap_exit, true),
    mnesia_lib:verbose("~p starting: ~p~n", [?SERVER_NAME, self()]),

    %% Handshake and initialize transaction recovery
    %% for new nodes detected in the schema
    Known = mnesia_lib:all_nodes(),
    Unseen = Known -- [node() | val(original_nodes)],
    mnesia_lib:unset(original_nodes),
    mnesia_recover:connect_nodes(Unseen),

    DumpInterval = mnesia_monitor:get_env(dump_log_time_threshold),
    {ok, TimerRef} = timer:send_interval(DumpInterval,
                                         {async_dump_log, time_threshold}),
    mnesia_dumper:start_regulator(),

    EmptyQueue = gb_trees:empty(),
    {ok, #state{supervisor = Parent,
                dump_log_timer_ref = TimerRef,
                loader_queue = EmptyQueue,
                late_loader_queue = EmptyQueue}}.

%%----------------------------------------------------------------------
%% Func: handle_call/3
%% Returns: {reply, Reply, State}          |
%%          {reply, Reply, State, Timeout} |
%%          {noreply, State}               |
%%          {noreply, State, Timeout}      |
%%          {stop, Reason, Reply, State}   | (terminate/2 is called)
%%          {stop, Reason, State}            (terminate/2 is called)
%%
%% The clause order is significant: every clause placed before the
%% "buffer early messages" clause is handled even while the schema is
%% not yet merged, every clause after it is only reached once
%% schema_is_merged is true.
%%----------------------------------------------------------------------

handle_call({sync_dump_log, InitBy}, From, State) ->
    Worker = #dump_log{initiated_by = InitBy,
                       opt_reply_to = From
                      },
    State2 = add_worker(Worker, State),
    noreply(State2);

handle_call(wait_for_schema_commit_lock, From, State) ->
    Worker = #schema_commit_lock{owner = From},
    State2 = add_worker(Worker, State),
    noreply(State2);

handle_call(block_controller, From, State) ->
    Worker = #block_controller{owner = From},
    State2 = add_worker(Worker, State),
    noreply(State2);

handle_call({update,Fun}, From, State) ->
    Res = (catch Fun()),
    reply(From, Res),
    noreply(State);

handle_call(get_cstructs, From, State) ->
    Tabs = val({schema, tables}),
    Cstructs = [val({T, cstruct}) || T <- Tabs],
    Running = val({current, db_nodes}),
    reply(From, {cstructs, Cstructs, Running}),
    noreply(State);

handle_call({schema_is_merged, [], late_merge, []}, From,
            State = #state{schema_is_merged = Merged}) ->
    %% From is the gen_server {Pid, Tag} tuple. node/1 does not accept
    %% a tuple, so the node of the caller has to be taken from the pid
    %% inside it; applied to From itself the guard below could never
    %% succeed.
    CallerNode =
        case From of
            {FromPid, _Tag} when is_pid(FromPid) -> node(FromPid);
            FromPid when is_pid(FromPid) -> node(FromPid);
            _ -> undefined
        end,
    case Merged of
        {false, Node} when CallerNode /= undefined, Node == CallerNode ->
            Msgs = State#state.early_msgs,
            State1 = State#state{early_msgs = [], schema_is_merged = true},
            handle_early_msgs(lists:reverse(Msgs), State1);
        _ ->
            %% Ooops this came too early, before we have merged :-)
            %% or it came too late or from a node we don't care about
            reply(From, ignore),
            noreply(State)
    end;

handle_call({schema_is_merged, TabsR, Reason, RemoteLoaders}, From, State) ->
    State2 = late_disc_load(TabsR, Reason, RemoteLoaders, From, State),

    %% Handle early messages
    Msgs = State2#state.early_msgs,
    State3 = State2#state{early_msgs = [], schema_is_merged = true},
    handle_early_msgs(lists:reverse(Msgs), State3);

handle_call(disc_load_intents,From,State = #state{loader_queue=LQ,late_loader_queue=LLQ}) ->
    %% Replies with every table that is queued for loading, queued
    %% for late loading or already active locally.
    LQTabs = gb_trees:keys(LQ),
    LLQTabs = gb_trees:keys(LLQ),
    ActiveTabs = lists:sort(mnesia_lib:local_active_tables()),
    reply(From, {ok, node(), ordsets:union([LQTabs,LLQTabs,ActiveTabs])}),
    noreply(State);

handle_call({update_where_to_write, [add, Tab, AddNode], _From}, _Dummy, State) ->
    Current = val({current, db_nodes}),
    Res =
        case lists:member(AddNode, Current) and
            (State#state.schema_is_merged == true) of
            true ->
                mnesia_lib:add_lsort({Tab, where_to_write}, AddNode);
            false ->
                ignore
        end,
    {reply, Res, State};

handle_call({add_active_replica, [Tab, ToNode, RemoteS, AccessMode], From},
            ReplyTo, State) ->
    KnownNode = lists:member(ToNode, val({current, db_nodes})),
    Merged = State#state.schema_is_merged,
    if
        KnownNode == false ->
            reply(ReplyTo, ignore),
            noreply(State);
        Merged == true ->
            Res = case ?catch_val({Tab, cstruct}) of
                      {'EXIT', _} -> %% Tab deleted
                          deleted;
                      _ ->
                          add_active_replica(Tab, ToNode, RemoteS, AccessMode)
                  end,
            reply(ReplyTo, Res),
            noreply(State);
        true -> %% Schema is not merged
            Msg = {add_active_replica, [Tab, ToNode, RemoteS, AccessMode], From},
            Msgs = State#state.early_msgs,
            reply(ReplyTo, ignore), %% Reply ignore and add data after schema merge
            %% The reply target is buffered as undefined so that we
            %% don't reply twice
            noreply(State#state{early_msgs = [{call, Msg, undefined} | Msgs]})
    end;

handle_call({unannounce_add_table_copy, [Tab, Node], From}, ReplyTo, State) ->
    %% Here From is part of the request (the pid put there by the
    %% caller), not the gen_server reply target.
    KnownNode = lists:member(node(From), val({current, db_nodes})),
    Merged = State#state.schema_is_merged,
    if
        KnownNode == false ->
            reply(ReplyTo, ignore),
            noreply(State);
        Merged == true ->
            Res = unannounce_add_table_copy(Tab, Node),
            reply(ReplyTo, Res),
            noreply(State);
        true -> %% Schema is not merged
            Msg = {unannounce_add_table_copy, [Tab, Node], From},
            Msgs = State#state.early_msgs,
            reply(ReplyTo, ignore), %% Reply ignore and add data after schema merge
            %% Set ReplyTO to undefined so we don't reply twice
            noreply(State#state{early_msgs = [{call, Msg, undefined} | Msgs]})
    end;

handle_call({net_load, Tab, Cs}, From, State) ->
    State2 =
        case State#state.schema_is_merged of
            true ->
                Worker = #net_load{table = Tab,
                                   opt_reply_to = From,
                                   reason = {dumper,add_table_copy},
                                   cstruct = Cs
                                  },
                add_worker(Worker, State);
            false ->
                reply(From, {not_loaded, schema_not_merged}),
                State
        end,
    noreply(State2);

handle_call(Msg, From, State) when State#state.schema_is_merged /= true ->
    %% Buffer early messages
    Msgs = State#state.early_msgs,
    noreply(State#state{early_msgs = [{call, Msg, From} | Msgs]});

handle_call({late_disc_load, Tabs, Reason, RemoteLoaders}, From, State) ->
    State2 = late_disc_load(Tabs, Reason, RemoteLoaders, From, State),
    noreply(State2);

handle_call({unblock_table, Tab}, _Dummy, State) ->
    Var = {Tab, where_to_commit},
    case val(Var) of
        {blocked, List} ->
            set(Var, List); % where_to_commit
        _ ->
            ignore
    end,
    {reply, ok, State};

handle_call({block_table, [Tab], From}, _Dummy, State) ->
    case lists:member(node(From), val({current, db_nodes})) of
        true ->
            block_table(Tab);
        false ->
            ignore
    end,
    {reply, ok, State};

handle_call({check_w2r, _Node, Tab}, _From, State) ->
    {reply, val({Tab, where_to_read}), State};

handle_call({add_other, Who}, _From, State = #state{others=Others0}) ->
    Others = [Who|Others0],
    {reply, ok, State#state{others=Others}};
handle_call({del_other, Who}, _From, State = #state{others=Others0}) ->
    Others = lists:delete(Who, Others0),
    {reply, ok, State#state{others=Others}};

handle_call(Msg, _From, State) ->
    error("~p got unexpected call: ~p~n", [?SERVER_NAME, Msg]),
    noreply(State).

%% Answer the caller with 'queued' and register the tables in TabsR in the
%% late loader queue. TabsR holds table names or {Tab, Reason} tuples;
%% bare names get the Reason argument. RemoteLoaders is a list of
%% {ok, Node, Tabs} tuples (error/badrpc entries are filtered out later).
%% Returns the updated state.
late_disc_load(TabsR, Reason, RemoteLoaders, From,
               State = #state{loader_queue = LQ, late_loader_queue = LLQ}) ->
    verbose("Intend to load tables: ~p~n", [TabsR]),
    ?eval_debug_fun({?MODULE, late_disc_load},
                    [{tabs, TabsR},
                     {reason, Reason},
                     {loaders, RemoteLoaders}]),

    reply(From, queued),

    %% Keep only tables that are local, not yet readable from this node
    %% and not already waiting in the loader queue.
    LocalTabs = gb_sets:from_ordset(lists:sort(mnesia_lib:val({schema, local_tables}))),
    Collect =
        fun(TabInfo0, Acc) ->
                TabInfo = {Tab, _} =
                    case TabInfo0 of
                        {_, _} -> TabInfo0;
                        TabN -> {TabN, Reason}
                    end,
                Wanted = gb_sets:is_member(Tab, LocalTabs)
                    andalso ((?catch_val({Tab, where_to_read})) /= node())
                    andalso (not gb_trees:is_defined(Tab, LQ)),
                case Wanted of
                    true -> [TabInfo | Acc];
                    false -> Acc
                end
        end,
    Tabs = lists:foldl(Collect, [], TabsR),

    Nodes = val({current, db_nodes}),
    State#state{late_loader_queue = late_loaders(Tabs, RemoteLoaders, Nodes, LLQ)}.

%% Insert a #late_load{} entry for every table that is not yet in the late
%% loader queue. A table that no remote node intends to load is scheduled
%% for a local disc load at once.
late_loaders([{Tab, Reason} | Tabs], RemoteLoaders, Nodes, LLQ) ->
    case gb_trees:is_defined(Tab, LLQ) of
        true ->
            late_loaders(Tabs, RemoteLoaders, Nodes, LLQ);
        false ->
            LoadNodes = late_load_filter(RemoteLoaders, Tab, Nodes, []),
            case LoadNodes of
                [] -> cast({disc_load, Tab, Reason}); % cast to ourselves
                _ -> ignore
            end,
            Entry = #late_load{table = Tab, loaders = LoadNodes, reason = Reason},
            late_loaders(Tabs, RemoteLoaders, Nodes, gb_trees:insert(Tab, Entry, LLQ))
    end;
late_loaders([], _RemoteLoaders, _Nodes, LLQ) ->
    LLQ.

%% Collect the nodes whose load intent for Tab we accept.
%% RemoteLoaders entries are {ok, Node, Intents}; {error, _} and
%% {badrpc, _} entries are skipped. A node is accepted when the table is
%% read_write, not local_content, the node is still among Nodes, it
%% intends to load Tab, and it is either a master node for Tab or Tab has
%% no master nodes at all. Accepted nodes are prepended to Acc.
late_load_filter([{error, _} | RemoteLoaders], Tab, Nodes, Acc) ->
    late_load_filter(RemoteLoaders, Tab, Nodes, Acc);
late_load_filter([{badrpc, _} | RemoteLoaders], Tab, Nodes, Acc) ->
    late_load_filter(RemoteLoaders, Tab, Nodes, Acc);
late_load_filter([RL | RemoteLoaders], Tab, Nodes, Acc) ->
    {ok, Node, Intents} = RL,
    Access = val({Tab, access_mode}),
    LocalC = val({Tab, local_content}),
    StillActive = lists:member(Node, Nodes),
    RemoteIntent = lists:member(Tab, Intents),
    Candidate = {Access, LocalC, StillActive, RemoteIntent},
    Acc2 =
        case Candidate == {read_write, false, true, true} of
            true ->
                Masters = mnesia_recover:get_master_nodes(Tab),
                %% Accept the intent if the node is a master for the
                %% table or if the table has no masters; otherwise some
                %% other node is master and the intent is ignored.
                case lists:member(Node, Masters) orelse Masters == [] of
                    true -> [Node | Acc];
                    false -> Acc
                end;
            false ->
                Acc
        end,
    late_load_filter(RemoteLoaders, Tab, Nodes, Acc2);
late_load_filter([], _Tab, _Nodes, Acc) ->
    Acc.

%%----------------------------------------------------------------------
%% Func: handle_cast/2
%% Returns: {noreply, State}          |
%%          {noreply, State, Timeout} |
%%          {stop, Reason, State}            (terminate/2 is called)
%%
%% As in handle_call/3, clauses placed before the generic "buffer early
%% messages" clause are handled even before the schema is merged.
%%----------------------------------------------------------------------

%% The schema commit lock at the head of the dumper queue is released.
handle_cast({release_schema_commit_lock, _Owner}, State) ->
    case State#state.is_stopping == true of
        true ->
            {stop, shutdown, State};
        false ->
            case State#state.dumper_queue of
                [#schema_commit_lock{} | Rest] ->
                    Released = State#state{dumper_pid = undefined,
                                           dumper_queue = Rest},
                    noreply(opt_start_worker(Released));
                _ ->
                    noreply(State)
            end
    end;

%% NOTE(review): this 'if' has no fall-through branch, so the clause
%% raises if_clause when we are not stopping and the head of the dumper
%% queue is not a #block_controller{} -- confirm that this is intended.
handle_cast(unblock_controller, State) ->
    if
        State#state.is_stopping == true ->
            {stop, shutdown, State};
        is_record(hd(State#state.dumper_queue), block_controller) ->
            [_Blocker | Rest] = State#state.dumper_queue,
            Unblocked = State#state{dumper_pid = undefined,
                                    dumper_queue = Rest},
            noreply(opt_start_worker(Unblocked))
    end;

%% Mnesia went down on Node: update the whereabouts of all tables and
%% clean every queue from references to that node.
handle_cast({mnesia_down, Node}, State) ->
    maybe_log_mnesia_down(Node),
    mnesia_lib:del({current, db_nodes}, Node),
    mnesia_checkpoint:tm_mnesia_down(Node),
    Alltabs = val({schema, tables}),
    reconfigure_tables(Node, Alltabs),
    %% Done, from an external point of view
    mnesia_monitor:mnesia_down(?SERVER_NAME, Node),

    %% If we were late-merging against the node that went down,
    %% complete the merge ourselves.
    case State#state.schema_is_merged of
        {false, Node} ->
            spawn(?MODULE, call, [{schema_is_merged, [], late_merge, []}]);
        _ ->
            ignore
    end,

    %% Fix internal stuff
    LateQ = remove_loaders(Alltabs, Node, State#state.late_loader_queue),

    %% Tell running senders/loaders and the registered 'others'
    lists:foreach(fun({Pid, _}) -> Pid ! {copier_done, Node} end,
                  get_senders(State) ++ get_loaders(State)),
    lists:foreach(fun(Pid) -> Pid ! {copier_done, Node} end,
                  State#state.others),

    %% Drop queued send requests whose receiver lives on the dead node
    NewSenders = [ST || ST <- State#state.sender_queue,
                        node(ST#send_table.receiver_pid) /= Node],
    Early = remove_early_messages(State#state.early_msgs, Node),
    noreply(State#state{sender_queue = NewSenders,
                        early_msgs = Early,
                        late_loader_queue = LateQ});

handle_cast({merging_schema, Node}, State) ->
    case State#state.schema_is_merged of
        false ->
            %% This comes from dynamic connect_nodes which are made
            %% after mnesia:start() and the schema_merge.
            NewRamNode =
                (val({schema, storage_type}) == ram_copies)
                andalso (mnesia_lib:val({schema, local_tables}) == [schema]),
            case NewRamNode of
                true ->
                    %% We are a newly started ram node
                    noreply(State#state{schema_is_merged = {false, Node}});
                false ->
                    noreply(State)
            end;
        _ ->
            %% Already merging (or merged)
            noreply(State)
    end;

%% Anything else that arrives before the schema is merged is buffered.
handle_cast(Msg, State) when State#state.schema_is_merged /= true ->
    Buffered = State#state.early_msgs,
    noreply(State#state{early_msgs = [{cast, Msg} | Buffered]});

%% This must be done after schema_is_merged, otherwise adopt_orphans
%% might trigger a table load from the wrong nodes, since we would not
%% yet know which tables we can load safely first.
handle_cast({im_running, _Node, NewFriends}, State) ->
    LocalTabs = mnesia_lib:local_active_tables() -- [schema],
    Tabs = [Tab || Tab <- LocalTabs, not val({Tab, local_content})],
    Ns = mnesia_lib:intersect(NewFriends, val({current, db_nodes})),
    abcast(Ns, {adopt_orphans, node(), Tabs}),
    noreply(State);

handle_cast({disc_load, Tab, Reason}, State) ->
    noreply(add_worker(#disc_load{table = Tab, reason = Reason}, State));

handle_cast(Worker = #send_table{}, State) ->
    noreply(add_worker(Worker, State));

handle_cast({sync_tabs, Tabs, From}, State) ->
    %% User initiated wait_for_tables
    handle_sync_tabs(Tabs, From),
    noreply(State);

handle_cast({i_have_tab, Tab, Node}, State) ->
    case lists:member(Node, val({current, db_nodes})) of
        true -> noreply(node_has_tabs([Tab], Node, State));
        false -> noreply(State)
    end;

handle_cast({force_load_updated, Tab}, State) ->
    case val({Tab, active_replicas}) of
        [] ->
            %% No valid replicas
            noreply(State);
        [SomeNode | _] ->
            noreply(node_has_tabs([Tab], SomeNode, State))
    end;

handle_cast({master_nodes_updated, Tab, Masters}, State) ->
    Active = val({Tab, active_replicas}),
    Valid =
        case val({Tab, load_by_force}) of
            true -> Active;
            false when Masters == [] -> Active;
            false -> mnesia_lib:intersect(Masters, Active)
        end,
    case Valid of
        [] ->
            %% No valid replicas
            noreply(State);
        [SomeNode | _] ->
            noreply(node_has_tabs([Tab], SomeNode, State))
    end;

handle_cast({adopt_orphans, Node, Tabs}, State) ->
    State2 = node_has_tabs(Tabs, Node, State),

    %% Register the other node as up and running
    mnesia_recover:log_mnesia_up(Node),
    verbose("Logging mnesia_up ~w~n",[Node]),
    mnesia_lib:report_system_event({mnesia_up, Node}),

    %% Load orphan tables
    LocalTabs = val({schema, local_tables}) -- [schema],
    Nodes = val({current, db_nodes}),
    {LocalOrphans, RemoteMasters} =
        orphan_tables(LocalTabs, Node, Nodes, [], []),
    Reason = {adopt_orphan, node()},
    mnesia_late_loader:async_late_disc_load(node(), LocalOrphans, Reason),

    %% Ask every node to load the orphans it is master for
    LoadAtMaster =
        fun(N) ->
                RemoteOrphans = [Tab || {Tab, Ns} <- RemoteMasters,
                                        lists:member(N, Ns)],
                mnesia_late_loader:maybe_async_late_disc_load(N, RemoteOrphans, Reason)
        end,
    lists:foreach(LoadAtMaster, Nodes),
    noreply(State2);

handle_cast(Msg, State) ->
    error("~p got unexpected cast: ~p~n", [?SERVER_NAME, Msg]),
    noreply(State).

%% For each table: answer From at once if the table is readable somewhere,
%% otherwise remember From in the process dictionary under
%% {sync_tab, Tab} so it can be answered later.
handle_sync_tabs([Tab | Tabs], From) ->
    case val({Tab, where_to_read}) of
        nowhere ->
            Key = {sync_tab, Tab},
            Waiting = case get(Key) of
                          undefined -> [];
                          Pids -> Pids
                      end,
            put(Key, [From | Waiting]);
        _ ->
            sync_reply(From, Tab)
    end,
    handle_sync_tabs(Tabs, From);
handle_sync_tabs([], _From) ->
    ok.

%%----------------------------------------------------------------------
%% Func: handle_info/2
%% Returns: {noreply, State}          |
%%          {noreply, State, Timeout} |
%%          {stop, Reason, State}            (terminate/2 is called)
%%----------------------------------------------------------------------

handle_info({async_dump_log, InitBy}, State) ->
    noreply(add_worker(#dump_log{initiated_by = InitBy}, State));

%% The dumper has finished. Anything but a successful dump from the
%% current dumper process is fatal.
handle_info(#dumper_done{worker_pid=Pid, worker_res=Res}, State) ->
    if
        State#state.is_stopping == true ->
            {stop, shutdown, State};
        Res == dumped, Pid == State#state.dumper_pid ->
            [Worker | Rest] = State#state.dumper_queue,
            reply(Worker#dump_log.opt_reply_to, Res),
            Idle = State#state{dumper_pid = undefined,
                               dumper_queue = Rest},
            noreply(opt_start_worker(Idle));
        true ->
            fatal("Dumper failed: ~p~n state: ~p~n", [Res, State]),
            {stop, fatal, State}
    end;

%% A loader has finished, successfully or not.
handle_info(Done = #loader_done{worker_pid=WPid, table_name=Tab}, State0) ->
    LateQueue0 = State0#state.late_loader_queue,
    State1 = State0#state{loader_pid = lists:keydelete(WPid,1,get_loaders(State0))},

    State2 =
        case Done#loader_done.is_loaded of
            true ->
                %% Optional table announcement
                if
                    Done#loader_done.needs_announce == true,
                    Done#loader_done.needs_reply == true ->
                        i_have_tab(Tab),
                        %% Should be {dumper,add_table_copy} only
                        reply(Done#loader_done.reply_to,
                              Done#loader_done.reply);
                    Done#loader_done.needs_reply == true ->
                        %% Should be {dumper,add_table_copy} only
                        reply(Done#loader_done.reply_to,
                              Done#loader_done.reply);
                    Done#loader_done.needs_announce == true, Tab == schema ->
                        i_have_tab(Tab);
                    Done#loader_done.needs_announce == true ->
                        i_have_tab(Tab),
                        %% Local node needs to perform user_sync_tab/1
                        Ns = val({current, db_nodes}),
                        abcast(Ns, {i_have_tab, Tab, node()});
                    Tab == schema ->
                        ignore;
                    true ->
                        %% Local node needs to perform user_sync_tab/1
                        Ns = val({current, db_nodes}),
                        AlreadyKnows = val({Tab, active_replicas}),
                        abcast(Ns -- AlreadyKnows, {i_have_tab, Tab, node()})
                end,
                %% Optional user sync
                case Done#loader_done.needs_sync of
                    true -> user_sync_tab(Tab);
                    false -> ignore
                end,
                State1#state{late_loader_queue=gb_trees:delete_any(Tab, LateQueue0)};
            false ->
                %% Either the node went down or the table was not
                %% loaded remotely yet
                case Done#loader_done.needs_reply of
                    true ->
                        reply(Done#loader_done.reply_to,
                              Done#loader_done.reply);
                    false ->
                        ignore
                end,
                case ?catch_val({Tab, active_replicas}) of
                    [_|_] ->
                        %% Still available elsewhere: queue the load again
                        {value,{_,Worker}} = lists:keysearch(WPid,1,get_loaders(State0)),
                        add_loader(Tab,Worker,State1);
                    _ ->
                        State1
                end
        end,
    noreply(opt_start_worker(State2));

%% A table sender has finished; a failed send is fatal.
handle_info(#sender_done{worker_pid=Pid, worker_res=Res}, State) ->
    Senders = get_senders(State),
    {value, {Pid,_Worker}} = lists:keysearch(Pid, 1, Senders),
    if
        Res == ok ->
            Remaining = State#state{sender_pid = lists:keydelete(Pid, 1, Senders)},
            noreply(opt_start_worker(Remaining));
        true ->
            %% No need to send any message to the table receiver
            %% since it will soon get a mnesia_down anyway
            fatal("Sender failed: ~p~n state: ~p~n", [Res, State]),
            {stop, fatal, State}
    end;

%% Our supervisor exited: stop now, or as soon as the dumper is done.
handle_info({'EXIT', Pid, R}, State) when Pid == State#state.supervisor ->
    catch set(mnesia_status, stopping),
    case State#state.dumper_pid of
        undefined ->
            dbg_out("~p was ~p~n", [?SERVER_NAME, R]),
            {stop, shutdown, State};
        _ ->
            noreply(State#state{is_stopping = true})
    end;

%% The process registered as dumper exited. This is tolerated only when
%% it was holding the schema commit lock.
handle_info({'EXIT', Pid, R}, State) when Pid == State#state.dumper_pid ->
    case State#state.dumper_queue of
        [#schema_commit_lock{}|Workers] ->
            %% Schema transaction crashed or was killed
            dbg_out("WARNING: Dumper ~p exited ~p~n", [Pid, R]),
            Idle = State#state{dumper_queue = Workers, dumper_pid = undefined},
            noreply(opt_start_worker(Idle));
        _Other ->
            fatal("Dumper or schema commit crashed: ~p~n state: ~p~n", [R, State]),
            {stop, fatal, State}
    end;

%% Some other linked process exited; fatal if it was a sender or loader.
handle_info(Msg = {'EXIT', Pid, R}, State) when R /= wait_for_tables_timeout ->
    IsSender = lists:keymember(Pid, 1, get_senders(State)),
    case IsSender orelse lists:keymember(Pid, 1, get_loaders(State)) of
        true when IsSender ->
            %% No need to send any message to the table receiver
            %% since it will soon get a mnesia_down anyway
            fatal("Sender crashed: ~p~n state: ~p~n", [{Pid,R}, State]),
            {stop, fatal, State};
        true ->
            fatal("Loader crashed: ~p~n state: ~p~n", [R, State]),
            {stop, fatal, State};
        false ->
            error("~p got unexpected info: ~p~n", [?SERVER_NAME, Msg]),
            noreply(State)
    end;

%% Debug support: hand out the whole server state.
handle_info({From, get_state}, State) ->
    From ! {?SERVER_NAME, State},
    noreply(State);

%% No real need for buffering, but messages arriving before the schema
%% is merged are buffered like calls and casts.
handle_info(Msg, State) when State#state.schema_is_merged /= true ->
    Buffered = State#state.early_msgs,
    noreply(State#state{early_msgs = [{info, Msg} | Buffered]});

%% A process waiting for tables timed out; forget about it.
handle_info({'EXIT', Pid, wait_for_tables_timeout}, State) ->
    sync_tab_timeout(Pid, get()),
    noreply(State);

handle_info(Msg, State) ->
    error("~p got unexpected info: ~p~n", [?SERVER_NAME, Msg]),
    noreply(State).

%% Walk a process dictionary listing and remove Pid from every
%% {sync_tab, Tab} waiting list, erasing lists that become empty.
sync_tab_timeout(Pid, [{{sync_tab, Tab}, Pids} | Tail]) ->
    Key = {sync_tab, Tab},
    case lists:delete(Pid, Pids) of
        [] -> erase(Key);
        Remaining -> put(Key, Remaining)
    end,
    sync_tab_timeout(Pid, Tail);
sync_tab_timeout(Pid, [_ | Tail]) ->
    sync_tab_timeout(Pid, Tail);
sync_tab_timeout(_Pid, []) ->
    ok.

%% Pick the load record that has the highest load order.
%% Returns {BestLoad, RemainingQueue}, or {none, EmptyQueue} (an empty
%% gb_tree) when no loadable record is found.
pick_next(Queue) ->
    case pick_next(gb_trees:values(Queue), none, none) of
        none ->
            {none, gb_trees:empty()};
        {Tab, Worker} ->
            {Worker, gb_trees:delete(Tab, Queue)}
    end.

%% Scan the load records, carrying the best record and its load order.
pick_next([Head = #net_load{table=Tab} | Tail], Load, Order) ->
    select_best(Head, Tail, ?catch_val({Tab, load_order}), Load, Order);
pick_next([Head = #disc_load{table=Tab} | Tail], Load, Order) ->
    select_best(Head, Tail, ?catch_val({Tab, load_order}), Load, Order);
pick_next([], none, _Order) ->
    none;
pick_next([], Load, _Order) ->
    %% Element 2 of both load records is the table name
    {element(2, Load), Load}.

%% Compare a candidate against the best record found so far.
select_best(_Head, Tail, {'EXIT', _What}, Load, Order) ->
    %% The table has been deleted, drop the candidate
    pick_next(Tail, Load, Order);
select_best(Load, Tail, Order, none, none) ->
    pick_next(Tail, Load, Order);
select_best(Load, Tail, Order, _OldLoad, OldOrder) when Order > OldOrder ->
    pick_next(Tail, Load, Order);
select_best(_Load, Tail, _Order, OldLoad, OldOrder) ->
    pick_next(Tail, OldLoad, OldOrder).

%%----------------------------------------------------------------------
%% Func: terminate/2
%% Purpose: Shutdown the server
%% Returns: any (ignored by gen_server)
%%----------------------------------------------------------------------
terminate(Reason, State) ->
    mnesia_monitor:terminate_proc(?SERVER_NAME, Reason, State).

%%----------------------------------------------------------------------
%% Func: code_change/3
%% Purpose: Upgrade process when its code is to be changed
%% Returns: {ok, NewState}
%%
%% Converts a state where loader_pid is a single pid (or undefined) and
%% the loader queues are lists into the current representation: a list
%% of {Pid, Worker} and gb_trees keyed on element 2 of each record.
%%----------------------------------------------------------------------
code_change(_OldVsn, State0, _Extra) ->
    %% Loader queue
    State1 =
        case State0#state.loader_pid of
            Pids when is_list(Pids) ->
                State0;
            undefined ->
                State0#state{loader_pid = [], loader_queue = gb_trees:empty()};
            Pid when is_pid(Pid) ->
                [Loader | Rest] = State0#state.loader_queue,
                Sorted = lists:sort([{element(2, Rec), Rec} || Rec <- Rest]),
                State0#state{loader_pid = [{Pid, Loader}],
                             loader_queue = gb_trees:from_orddict(Sorted)}
        end,
    %% Late loader queue
    State =
        case State1#state.late_loader_queue of
            OldLLQ when is_list(OldLLQ) ->
                LateSorted = lists:sort([{element(2, Rec), Rec} || Rec <- OldLLQ]),
                State1#state{late_loader_queue = gb_trees:from_orddict(LateSorted)};
            _ ->
                State1
        end,
    {ok, State}.

%%%----------------------------------------------------------------------
%%% Internal functions
%%%----------------------------------------------------------------------

%% We use mnesia_down when deciding which tables to load locally, so the
%% event is logged when we are running, or when the node that went down
%% holds a copy of some local table that is not active here. Otherwise
%% 'log_later' is returned and nothing is logged.
maybe_log_mnesia_down(N) ->
    ShouldLog =
        case mnesia_lib:is_running() of
            yes ->
                true;
            _ ->
                LocalTabs = val({schema, local_tables}) -- [schema],
                lists:any(fun(Tab) -> inactive_copy_holders(Tab, N) end,
                          LocalTabs) == true
        end,
    case ShouldLog of
        true ->
            verbose("Logging mnesia_down ~w~n", [N]),
            mnesia_recover:log_mnesia_down(N),
            ok;
        false ->
            %% Unfortunately we have not loaded some common
            %% tables yet, so we cannot rely on the nodedown.
            %% BUGBUG handle this case!!!
            log_later
    end.

%% True if Node holds a copy of Tab and
%% mnesia_lib:not_active_here(Tab) says so for this node.
inactive_copy_holders(Tab, Node) ->
    Cs = val({Tab, cstruct}),
    case mnesia_lib:cs_to_storage_type(Node, Cs) of
        unknown -> false;
        _Storage -> mnesia_lib:not_active_here(Tab)
    end.

%% Classify the local tables after Node has announced itself. Returns
%% {LocalOrphans, RemoteMasters}, where LocalOrphans are tables to load
%% locally and RemoteMasters is a list of {Tab, MasterNodes} for tables
%% that should be loaded by their master nodes.
orphan_tables([Tab | Tabs], Node, Ns, Local, Remote) ->
    Cs = val({Tab, cstruct}),
    CopyHolders = mnesia_lib:copy_holders(Cs),
    RamCopyHolders = Cs#cstruct.ram_copies,
    DiscCopyHolders = CopyHolders -- RamCopyHolders,
    DiscNodes = val({schema, disc_copies}),
    LocalContent = Cs#cstruct.local_content,
    RamCopyHoldersOnDiscNodes = mnesia_lib:intersect(RamCopyHolders, DiscNodes),
    Active = val({Tab, active_replicas}),
    BeingCreated = (?catch_val({Tab, create_table}) == true),
    Read = val({Tab, where_to_read}),
    {Local2, Remote2} =
        case lists:member(Node, DiscCopyHolders) of
            _ when BeingCreated == true ->
                {Local, Remote};
            _ when Read == node() ->
                %% Already loaded
                {Local, Remote};
            true when Active == [] ->
                case DiscCopyHolders -- Ns of
                    [] ->
                        %% We're last up and the other nodes have not
                        %% loaded the table. Let's load it if we are
                        %% the smallest node.
                        case lists:min(DiscCopyHolders) == node() of
                            true ->
                                case mnesia_recover:get_master_nodes(Tab) of
                                    [] -> {[Tab | Local], Remote};
                                    Masters -> {Local, [{Tab, Masters} | Remote]}
                                end;
                            false ->
                                {Local, Remote}
                        end;
                    _ ->
                        {Local, Remote}
                end;
            false when Active == [], DiscCopyHolders == [],
                       RamCopyHoldersOnDiscNodes == [] ->
                %% Special case when all replicas reside on disc-less nodes
                {[Tab | Local], Remote};
            _ when LocalContent == true ->
                {[Tab | Local], Remote};
            _ ->
                {Local, Remote}
        end,
    orphan_tables(Tabs, Node, Ns, Local2, Remote2);
orphan_tables([], _, _, LocalOrphans, RemoteMasters) ->
    {LocalOrphans, RemoteMasters}.

%% Node announces that it has loaded Tabs. For a remote node the
%% whereabouts are updated; for the local node only waiting users are
%% answered. Returns the updated state.
node_has_tabs([Tab | Tabs], Node, State) when Node /= node() ->
    NewState =
        case catch update_whereabouts(Tab, Node, State) of
            Updated = #state{} ->
                Updated;
            {'EXIT', R} ->
                %% Was the table deleted in the meantime?
                case ?catch_val({Tab, cstruct}) of
                    {'EXIT', _} -> State;      % yes, ignore the table
                    _ -> erlang:error(R)       % no, a real error
                end
        end,
    node_has_tabs(Tabs, Node, NewState);
node_has_tabs([Tab | Tabs], Node, State) ->
    user_sync_tab(Tab),
    node_has_tabs(Tabs, Node, State);
node_has_tabs([], _Node, State) ->
    State.

%% Tab is now loaded on the remote Node; update our view of the table
%% and, if we have our own unloaded copy, possibly schedule a net load.
update_whereabouts(Tab, Node, State) ->
    Storage = val({Tab, storage_type}),
    Read = val({Tab, where_to_read}),
    LocalC = val({Tab, local_content}),
    BeingCreated = (?catch_val({Tab, create_table}) == true),
    Masters = mnesia_recover:get_master_nodes(Tab),
    ByForce = val({Tab, load_by_force}),
    %% Node is a good source if we load by force, if there are no
    %% masters, or if Node is one of the masters.
    GoGetIt = (ByForce == true)
        orelse (Masters == [])
        orelse lists:member(Node, Masters),

    dbg_out("Table ~w is loaded on ~w. s=~w, r=~w, lc=~w, f=~w, m=~w~n",
            [Tab, Node, Storage, Read, LocalC, ByForce, GoGetIt]),
    if
        LocalC == true ->
            %% Local contents, don't care about other nodes
            State;
        BeingCreated == true ->
            %% The table is currently being created;
            %% it will be handled elsewhere
            State;
        Storage == unknown, Read == nowhere ->
            %% No own copy, time to read remotely
            %% if the other node is a good node
            add_active_replica(Tab, Node),
            case GoGetIt of
                true ->
                    set({Tab, where_to_read}, Node),
                    user_sync_tab(Tab),
                    State;
                false ->
                    State
            end;
        Storage == unknown ->
            %% No own copy, continue to read remotely
            add_active_replica(Tab, Node),
            NodeST = mnesia_lib:storage_type_at_node(Node, Tab),
            ReadST = mnesia_lib:storage_type_at_node(Read, Tab),
            %% Avoid reading from disc_only_copies
            if
                NodeST == disc_only_copies ->
                    ignore;
                ReadST == disc_only_copies ->
                    mnesia_lib:set_remote_where_to_read(Tab);
                true ->
                    ignore
            end,
            user_sync_tab(Tab),
            State;
        Read == nowhere ->
            %% Own copy, go and get a copy of the table
            %% if the other node is master or if there
            %% are no masters at all
            add_active_replica(Tab, Node),
            case GoGetIt of
                true ->
                    Load = #net_load{table = Tab,
                                     reason = {active_remote, Node}},
                    add_worker(Load, State);
                false ->
                    State
            end;
        true ->
            %% We already have an own copy
            add_active_replica(Tab, Node),
            user_sync_tab(Tab),
            State
    end.

%% Compute the {Tab, Reason} list of local tables that can be loaded
%% right away. A ram_copies schema uses no mnesia_down information; a
%% disc_copies schema uses the logged mnesia_downs.
initial_safe_loads() ->
    Downs =
        case val({schema, storage_type}) of
            ram_copies ->
                [];
            disc_copies ->
                Logged = mnesia_recover:get_mnesia_downs(),
                dbg_out("mnesia_downs = ~p~n", [Logged]),
                Logged
        end,
    Tabs = val({schema, local_tables}) -- [schema],
    lists:zf(fun(T) -> last_consistent_replica(T, Downs) end, Tabs).

%% Decide whether the local copy of Tab may be loaded without waiting
%% for other nodes. Returns {true, {Tab, Reason}} or false, which is the
%% filter/map contract expected by lists:zf/2.
last_consistent_replica(Tab, Downs) ->
    Cs = val({Tab, cstruct}),
    Storage = mnesia_lib:cs_to_storage_type(node(), Cs),
    Ram = Cs#cstruct.ram_copies,
    Disc = Cs#cstruct.disc_copies,
    DiscOnly = Cs#cstruct.disc_only_copies,
    %% Remote disc resident copy holders that are not known to be down
    BetterCopies = (mnesia_lib:remote_copy_holders(Cs) -- Downs) -- Ram,
    AccessMode = Cs#cstruct.access_mode,
    Copies = mnesia_lib:copy_holders(Cs),
    Masters = mnesia_recover:get_master_nodes(Tab),
    IsMasterHere = lists:member(node(), Masters),
    LocalContent = Cs#cstruct.local_content,
    HasMasters = Masters /= [],
    RemoteMaster = HasMasters andalso (not IsMasterHere),
    LocalMaster = HasMasters andalso IsMasterHere,
    if
        Copies == [node()] ->
            %% Only one copy holder and it is local.
            %% It may also be a local contents table
            {true, {Tab, local_only}};
        LocalContent == true ->
            {true, {Tab, local_content}};
        LocalMaster == true ->
            %% We have a local master
            {true, {Tab, local_master}};
        RemoteMaster == true ->
            %% Wait for remote master copy
            false;
        Storage == ram_copies, Disc == [], DiscOnly == [] ->
            %% Nobody has a copy on disc
            {true, {Tab, ram_only}};
        Storage == ram_copies ->
            %% Some other node has a copy on disc
            false;
        AccessMode == read_only ->
            %% No one has been able to update the table,
            %% i.e. all disc resident copies are equal
            {true, {Tab, read_only}};
        BetterCopies /= [], Masters /= [node()] ->
            %% There are better copies on other nodes
            %% and we do not have the only master copy
            false;
        true ->
            {true, {Tab, initial}}
    end.

%% Node N is gone: drop it as active replica of every table and pick a
%% new remote node to read from where N was used for reading.
reconfigure_tables(N, Tabs) ->
    Reconfigure =
        fun(Tab) ->
                del_active_replica(Tab, N),
                case val({Tab, where_to_read}) of
                    N -> mnesia_lib:set_remote_where_to_read(Tab);
                    _ -> ignore
                end
        end,
    lists:foreach(Reconfigure, Tabs).

%% Remove node N as late loader of every table in the list and return
%% the updated late loader queue.
remove_loaders(Tabs, N, Loaders) ->
    lists:foldl(fun(Tab, LateQ) -> drop_loaders(Tab, N, LateQ) end,
                Loaders, Tabs).

%% Filter the buffered early messages when Node has gone down: requests
%% that concern only that node are dropped, everything else is kept in
%% the original order.
remove_early_messages([], _Node) ->
    [];
remove_early_messages([{call, {add_active_replica, [_, Node, _, _], _}, _} | Rest], Node) ->
    %% Already replied to before it was queued
    remove_early_messages(Rest, Node);
remove_early_messages([{call, {block_table, _, From}, ReplyTo} | Rest], Node)
  when node(From) == Node ->
    %% Reply so that nobody is left waiting in gen_server:call
    reply(ReplyTo, ok),
    remove_early_messages(Rest, Node);
remove_early_messages([{cast, {i_have_tab, _Tab, Node}} | Rest], Node) ->
    remove_early_messages(Rest, Node);
remove_early_messages([{cast, {adopt_orphans, Node, _Tabs}} | Rest], Node) ->
    remove_early_messages(Rest, Node);
remove_early_messages([Msg | Rest], Node) ->
    [Msg | remove_early_messages(Rest, Node)].

%% Drop Node as loader of Tab in the late load queue. If Node was the
%% last remaining loader a local disc_load is requested.
drop_loaders(Tab, Node, LLQ) ->
    case gb_trees:lookup(Tab, LLQ) of
        none ->
            LLQ;
        {value, Late = #late_load{loaders = Loaders}} ->
            %% Check if it is time to issue a disc_load request
            case Loaders of
                [Node] ->
                    Reason = {Late#late_load.reason, last_loader_down, Node},
                    cast({disc_load, Tab, Reason}); % cast to ourselves
                _ ->
                    ignore
            end,
            %% Drop the node from the list of loaders
            gb_trees:update(Tab, Late#late_load{loaders = Loaders -- [Node]}, LLQ)
    end.

%% Register Node as active replica of Tab, using the table's cstruct to
%% find out the storage type and access mode.
add_active_replica(Tab, Node) ->
    add_active_replica(Tab, Node, val({Tab, cstruct})).

add_active_replica(Tab, Node, Cs = #cstruct{access_mode = AccessMode}) ->
    Storage = mnesia_lib:schema_cs_to_storage_type(Node, Cs),
    add_active_replica(Tab, Node, Storage, AccessMode).

%% Block table primitives

%% Wrap the current where_to_commit value in a {blocked, _} tuple.
block_table(Tab) ->
    Key = {Tab, where_to_commit},
    set(Key, {blocked, val(Key)}).

unblock_table(Tab) ->
    call({unblock_table, Tab}).

%% Split a where_to_commit value into {IsBlocked, NodeList}.
is_tab_blocked({blocked, W2C}) when is_list(W2C) ->
    {true, W2C};
is_tab_blocked(W2C) when is_list(W2C) ->
    {false, W2C}.

%% Inverse of is_tab_blocked/1.
mark_blocked_tab(false, Value) ->
    Value;
mark_blocked_tab(true, Value) ->
    {blocked, Value}.

%%

%% Register Node as active replica of Tab. where_to_commit keeps its
%% blocked/unblocked status. For read_write tables the node is added to
%% where_to_commit and where_to_write; for read_only tables it is
%% removed from both.
add_active_replica(Tab, Node, Storage, AccessMode) ->
    Key = {Tab, where_to_commit},
    {Blocked, Old} = is_tab_blocked(val(Key)),
    Others = lists:keydelete(Node, 1, Old),
    case AccessMode of
        read_write ->
            Commit = lists:sort([{Node, Storage} | Others]),
            set(Key, mark_blocked_tab(Blocked, Commit)),
            mnesia_lib:add_lsort({Tab, where_to_write}, Node);
        read_only ->
            set(Key, mark_blocked_tab(Blocked, Others)),
            mnesia_lib:del({Tab, where_to_write}, Node)
    end,
    add({Tab, active_replicas}, Node).

%% Remove Node from where_to_commit (keeping the blocked status),
%% active_replicas and where_to_write of Tab.
del_active_replica(Tab, Node) ->
    Key = {Tab, where_to_commit},
    {Blocked, Old} = is_tab_blocked(val(Key)),
    Commit = lists:sort(lists:keydelete(Node, 1, Old)),
    set(Key, mark_blocked_tab(Blocked, Commit)),
    mnesia_lib:del({Tab, active_replicas}, Node),
    mnesia_lib:del({Tab, where_to_write}, Node).

%% Re-register every active replica of the table according to the
%% (possibly changed) access mode in Cs. The work is handed to update/1.
change_table_access_mode(Cs) ->
    Refresh =
        fun() ->
                Tab = Cs#cstruct.name,
                lists:foreach(fun(N) -> add_active_replica(Tab, N, Cs) end,
                              val({Tab, active_replicas}))
        end,
    update(Refresh).


%% Node To now has Tab loaded, but this must be undone.
%% This code is rpc:call'ed from the tab_copier process
%% when it has *not* released its table lock.
unannounce_add_table_copy(Tab, To) ->
    catch del_active_replica(Tab, To),
    case catch val({Tab, where_to_read}) of
        To -> mnesia_lib:set_remote_where_to_read(Tab);
        _ -> ignore
    end.

%% Answer every process waiting for Tab (stored under {sync_tab, Tab} in
%% the process dictionary). With debug level 'trace' the mnesia_event
%% process is also subscribed to table events for Tab.
user_sync_tab(Tab) ->
    case val(debug) of
        trace ->
            mnesia_subscr:subscribe(whereis(mnesia_event), {table, Tab});
        _ ->
            ignore
    end,

    case erase({sync_tab, Tab}) of
        undefined ->
            ok;
        Waiting ->
            lists:foreach(fun(Pid) -> sync_reply(Pid, Tab) end, Waiting)
    end.

%% Tab is now loaded locally: read it from this node and register this
%% node as an active replica.
i_have_tab(Tab) ->
    case val({Tab, local_content}) of
        true -> mnesia_lib:set_local_content_whereabouts(Tab);
        false -> set({Tab, where_to_read}, node())
    end,
    add_active_replica(Tab, node()).

%% Block Tab at ToNode and then announce the new active replica at
%% ToNode, first to ToNode itself and then to every other running node.
sync_and_block_table_whereabouts(Tab, ToNode, RemoteS, AccessMode) when Tab /= schema ->
    Current = val({current, db_nodes}),
    Others =
        case lists:member(ToNode, Current) of
            true -> Current -- [ToNode];
            false -> Current
        end,
    remote_call(ToNode, block_table, [Tab]),
    Args = [Tab, ToNode, RemoteS, AccessMode],
    lists:foreach(fun(Node) -> remote_call(Node, add_active_replica, Args) end,
                  [ToNode | Others]),
    ok.

%% Tell every running node, and ToNode even if it is not among them, to
%% undo the announcement of the table copy at ToNode.
sync_del_table_copy_whereabouts(Tab, ToNode) when Tab /= schema ->
    Current = val({current, db_nodes}),
    Targets =
        case lists:member(ToNode, Current) of
            true -> Current;
            false -> [ToNode | Current]
        end,
    Args = [Tab, ToNode],
    lists:foreach(fun(Node) -> remote_call(Node, unannounce_add_table_copy, Args) end,
                  Targets),
    ok.

%% Fetch the controller state for debugging, with the two loader queues
%% converted from gb_trees to lists. Returns {info, State}, or
%% {timeout, Timeout} if the server is not registered or does not answer
%% within Timeout.
get_info(Timeout) ->
    case whereis(?SERVER_NAME) of
        undefined ->
            {timeout, Timeout};
        Server ->
            Server ! {self(), get_state},
            receive
                {?SERVER_NAME, State = #state{loader_queue = LQ,
                                              late_loader_queue = LLQ}} ->
                    {info, State#state{loader_queue = gb_trees:to_list(LQ),
                                       late_loader_queue = gb_trees:to_list(LLQ)}}
            after Timeout ->
                    {timeout, Timeout}
            end
    end.

%% Fetch the currently running workers as
%% {workers, Loaders, Senders, DumperPid}, or {timeout, Timeout}.
get_workers(Timeout) ->
    case whereis(?SERVER_NAME) of
        undefined ->
            {timeout, Timeout};
        Server ->
            Server ! {self(), get_state},
            receive
                {?SERVER_NAME, State = #state{}} ->
                    {workers, get_loaders(State), get_senders(State),
                     State#state.dumper_pid}
            after Timeout ->
                    {timeout, Timeout}
            end
    end.

%% Print size information about all locally active tables.
info() ->
    Tabs = mnesia_lib:local_active_tables(),
    io:format( "---> Active tables <--- ~n", []),
    info(Tabs).

%% Print one line per table; disc_only_copies tables are measured via
%% dets, all other tables via ets.
info(Tabs) ->
    Print =
        fun(Tab) ->
                case val({Tab, storage_type}) of
                    disc_only_copies ->
                        info_format(Tab,
                                    dets:info(Tab, size),
                                    dets:info(Tab, file_size),
                                    "bytes on disc");
                    _ ->
                        info_format(Tab,
                                    ?ets_info(Tab, size),
                                    ?ets_info(Tab, memory),
                                    "words of mem")
                end
        end,
    lists:foreach(Print, Tabs).


%% Print one formatted line describing a table.
%%
%% Size and Mem normally are integers, but dets:info/2 and ets:info/2
%% return 'undefined' when the table is closed or has disappeared
%% between the listing and the lookup.  Such values are printed as-is
%% instead of crashing the caller with badarg in integer_to_list/1.
info_format(Tab, Size, Mem, Media) ->
    StrT = mnesia_lib:pad_name(atom_to_list(Tab), 15, []),
    StrS = mnesia_lib:pad_name(info_value_to_list(Size), 8, []),
    StrM = mnesia_lib:pad_name(info_value_to_list(Mem), 8, []),
    io:format("~s: with ~s records occupying ~s ~s~n",
              [StrT, StrS, StrM, Media]).

%% Render an info value as a flat string; integers are rendered exactly
%% as integer_to_list/1 would, anything else with ~w.
info_value_to_list(Int) when is_integer(Int) ->
    integer_to_list(Int);
info_value_to_list(Other) ->
    lists:flatten(io_lib:format("~w", [Other])).

%% Handle early arrived messages.
%% Each message is dispatched to the matching gen_server callback; a
%% {reply, ...} result for a queued call is delivered with reply/2.
%% Processing stops at the first {stop, ...} result.
handle_early_msgs([Msg | Msgs], State) ->
    %% The messages are in reverse order
    case handle_early_msg(Msg, State) of
%%      {stop, Reason, Reply, State2} ->  % Will not happen according to dialyzer
%%          {stop, Reason, Reply, State2};
        {stop, _Reason, _State2} = Stop ->
            Stop;
        {noreply, State2} ->
            handle_early_msgs(Msgs, State2);
        {reply, Reply, State2} ->
            {call, _Call, From} = Msg,
            reply(From, Reply),
            handle_early_msgs(Msgs, State2)
    end;
handle_early_msgs([], State) ->
    noreply(State).

%% Dispatch one queued message to the callback it was destined for.
handle_early_msg({call, Msg, From}, State) ->
    handle_call(Msg, From, State);
handle_early_msg({cast, Msg}, State) ->
    handle_cast(Msg, State);
handle_early_msg({info, Msg}, State) ->
    handle_info(Msg, State).

noreply(State) ->
    {noreply, State}.

%% Send Reply to ReplyTo unless it is 'undefined'; always returns Reply.
reply(undefined, Reply) ->
    Reply;
reply(ReplyTo, Reply) ->
    gen_server:reply(ReplyTo, Reply),
    Reply.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Worker management

%% Enqueue Worker in the queue matching its record type and try to
%% start a worker.  Returns new State.
add_worker(Worker = #dump_log{initiated_by = InitBy, opt_reply_to = ReplyTo}, State) ->
    Queue = State#state.dumper_queue,
    AlreadyQueued = lists:keymember(InitBy, #dump_log.initiated_by, Queue),
    case AlreadyQueued of
        true when ReplyTo == undefined ->
            %% The same threshold has been exceeded again,
            %% before we have had the possibility to
            %% process the older one.
            Event = {mnesia_overload, {dump_log, InitBy}},
            mnesia_lib:report_system_event(Event);
        _ ->
            ignore
    end,
    opt_start_worker(State#state{dumper_queue = Queue ++ [Worker]});
add_worker(Worker = #schema_commit_lock{}, State) ->
    Queue = State#state.dumper_queue,
    opt_start_worker(State#state{dumper_queue = Queue ++ [Worker]});
add_worker(Worker = #net_load{}, State) ->
    opt_start_worker(add_loader(Worker#net_load.table, Worker, State));
add_worker(Worker = #send_table{}, State) ->
    Queue = State#state.sender_queue,
    opt_start_worker(State#state{sender_queue = Queue ++ [Worker]});
add_worker(Worker = #disc_load{}, State) ->
    opt_start_worker(add_loader(Worker#disc_load.table, Worker, State));
%% Block controller should be used for upgrading mnesia.
%% It is put at the head of the dumper queue, ahead of queued work.
add_worker(Worker = #block_controller{}, State) ->
    Queue = State#state.dumper_queue,
    opt_start_worker(State#state{dumper_queue = [Worker | Queue]}).

%% Queue a load request for Tab unless one is already queued.
add_loader(Tab, Worker, State = #state{loader_queue = LQ0}) ->
    case gb_trees:is_defined(Tab, LQ0) of
        true ->
            State;
        false ->
            State#state{loader_queue = gb_trees:insert(Tab, Worker, LQ0)}
    end.

%% Optionally start a worker
%%
%% Dumpers and loaders may run simultaneously
%% but neither of them may run during schema commit.
%% Loaders may not start if a schema commit is enqueued.
opt_start_worker(#state{is_stopping = true} = State) ->
    State;
opt_start_worker(State) ->
    %% Prioritize dumper and schema commit
    %% by checking them first
    case State#state.dumper_queue of
        [Worker | _Rest] when State#state.dumper_pid == undefined ->
            %% Great, a worker in queue and neither
            %% a schema transaction is being
            %% committed and nor a dumper is running

            %% Start worker but keep him in the queue
            if
                is_record(Worker, schema_commit_lock) ->
                    ReplyTo = Worker#schema_commit_lock.owner,
                    reply(ReplyTo, granted),
                    {Owner, _Tag} = ReplyTo,
                    opt_start_loader(State#state{dumper_pid = Owner});

                is_record(Worker, dump_log) ->
                    Pid = spawn_link(?MODULE, dump_and_reply, [self(), Worker]),

                    %% If the worker was a dumper we may
                    %% possibly be able to start a loader
                    %% or sender
                    Dumping = State#state{dumper_pid = Pid},
                    opt_start_loader(opt_start_sender(Dumping));

                is_record(Worker, block_controller) ->
                    %% The controller is only handed over when no
                    %% sender and no loader is running
                    case {get_senders(State), get_loaders(State)} of
                        {[], []} ->
                            ReplyTo = Worker#block_controller.owner,
                            reply(ReplyTo, granted),
                            {Owner, _Tag} = ReplyTo,
                            State#state{dumper_pid = Owner};
                        _ ->
                            State
                    end
            end;
        _ ->
            %% Bad luck, try with a loader or sender instead
            opt_start_loader(opt_start_sender(State))
    end.

%% Try to start every queued sender.
%%
%% Senders that must wait are handed back by opt_start_sender2/4 in an
%% accumulator built by consing, i.e. in reverse arrival order.  The
%% list is reversed here so that sender_queue stays in arrival order
%% (new senders are appended at the tail) instead of flipping on every
%% pass.
opt_start_sender(State) ->
    case State#state.sender_queue of
        [] ->
            State;    %% No need
        SenderQ ->
            {Senders, KeptRev} = opt_start_sender2(SenderQ, get_senders(State),
                                                   [], get_loaders(State)),
            State#state{sender_pid = Senders,
                        sender_queue = lists:reverse(KeptRev)}
    end.

%% Walk the sender queue.
%% Returns {Pids, Kept}: Pids is the running-sender list extended with
%% the {Pid, Sender} pairs started here, Kept the senders that have to
%% wait (accumulated by consing, hence in reverse order of the input).
opt_start_sender2([], Pids, Kept, _LoaderQ) ->
    {Pids, Kept};
opt_start_sender2([Sender | Rest], Pids, Kept, LoaderQ) ->
    Tab = Sender#send_table.table,
    Active = val({Tab, active_replicas}),
    HaveCopy = lists:member(node(), Active),
    StillLoading = lists:any(fun({_Pid, Loader}) ->
                                     Tab == element(#net_load.table, Loader)
                             end, LoaderQ),
    case {HaveCopy, StillLoading} of
        {true, true} ->
            %% I'm currently finishing loading the table let him wait
            opt_start_sender2(Rest, Pids, [Sender | Kept], LoaderQ);
        {true, false} ->
            %% Start the sender; it moves from the queue to the
            %% list of running senders
            Pid = spawn_link(?MODULE, send_and_reply, [self(), Sender]),
            opt_start_sender2(Rest, [{Pid, Sender} | Pids], Kept, LoaderQ);
        {false, _} ->
            verbose("Send table failed ~p not active on this node ~n", [Tab]),
            Sender#send_table.receiver_pid ! {copier_done, node()},
            opt_start_sender2(Rest, Pids, Kept, LoaderQ)
    end.

%% Start the next queued loader, provided that the loader queue is not
%% empty, fewer than max_loaders() loaders are running and no schema
%% commit lock is waiting in the dumper queue.
opt_start_loader(State = #state{loader_queue = LoaderQ}) ->
    Running = get_loaders(State),
    Limit = max_loaders(),
    case gb_trees:is_empty(LoaderQ) of
        true ->
            State;
        _ when length(Running) >= Limit ->
            State;
        false ->
            case lists:keymember(schema_commit_lock, 1, State#state.dumper_queue) of
                true ->
                    %% Bad luck, we must wait for the schema commit
                    State;
                false ->
                    case pick_next(LoaderQ) of
                        {none, Rest} ->
                            State#state{loader_queue = Rest};
                        {Worker, Rest} ->
                            case already_loading(Worker, get_loaders(State)) of
                                true ->
                                    %% Drop the duplicate and try the next one
                                    opt_start_loader(State#state{loader_queue = Rest});
                                false ->
                                    %% Start the loader; it moves from the
                                    %% queue to the list of running loaders
                                    Pid = load_and_reply(self(), Worker),
                                    Loaders = [{Pid, Worker} | get_loaders(State)],
                                    State#state{loader_pid = Loaders,
                                                loader_queue = Rest}
                            end
                    end
            end
    end.

%% True when one of the running Loaders already handles the table of
%% the given load request.
already_loading(#net_load{table = Tab}, Loaders) ->
    already_loading2(Tab, Loaders);
already_loading(#disc_load{table = Tab}, Loaders) ->
    already_loading2(Tab, Loaders).

%% True when any {Pid, Worker} entry holds a net_load or disc_load
%% request for exactly Tab; entries of any other shape are skipped.
already_loading2(Tab, Loaders) ->
    lists:any(fun({_, #net_load{table = T}}) when T =:= Tab -> true;
                 ({_, #disc_load{table = T}}) when T =:= Tab -> true;
                 (_) -> false
              end, Loaders).

%% Ask the controller on Node to send Tab to Receiver.
start_remote_sender(Node, Tab, Receiver, Storage) ->
    Request = #send_table{table = Tab,
                          receiver_pid = Receiver,
                          remote_storage = Storage},
    gen_server:cast({?SERVER_NAME, Node}, Request).

%% Body of a dumper worker: dump the log, report the result to ReplyTo
%% as a #dumper_done{} message and terminate normally.
dump_and_reply(ReplyTo, Worker) ->
    %% No trap_exit, die intentionally instead
    InitBy = Worker#dump_log.initiated_by,
    Res = mnesia_dumper:opt_dump_log(InitBy),
    Done = #dumper_done{worker_pid = self(),
                        worker_res = Res},
    ReplyTo ! Done,
    unlink(ReplyTo),
    exit(normal).

%% Body of a sender worker: send the table, report the result to
%% ReplyTo as a #sender_done{} message and terminate normally.
send_and_reply(ReplyTo, Worker) ->
    %% No trap_exit, die intentionally instead
    Res = mnesia_loader:send_table(Worker#send_table.receiver_pid,
                                   Worker#send_table.table,
                                   Worker#send_table.remote_storage),
    Done = #sender_done{worker_pid = self(),
                        worker_res = Res},
    ReplyTo ! Done,
    unlink(ReplyTo),
    exit(normal).

%% Spawn a linked loader process.  The decision on how to load is taken
%% here, in the calling process, by load_table_fun/1; the spawned
%% process traps exits, runs the load and sends the resulting
%% #loader_done{} (stamped with its own pid) to ReplyTo.
load_and_reply(ReplyTo, Worker) ->
    Load = load_table_fun(Worker),
    spawn_link(fun() ->
                       process_flag(trap_exit, true),
                       Done = Load(),
                       ReplyTo ! Done#loader_done{worker_pid = self()},
                       unlink(ReplyTo),
                       exit(normal)
               end).

%% Now it is time to load the table
%% but first we must check if it still is necessary.
%%
%% Returns a fun of arity 0 which performs the load (if any) and
%% returns a #loader_done{} record describing the outcome.
load_table_fun(#net_load{cstruct=Cs, table=Tab, reason=Reason, opt_reply_to=ReplyTo}) ->
    LocalContent = val({Tab, local_content}),
    AccessMode = val({Tab, access_mode}),
    ReadNode = val({Tab, where_to_read}),
    Active = filter_active(Tab),
    Done = #loader_done{is_loaded = true,
                        table_name = Tab,
                        needs_announce = false,
                        needs_sync = false,
                        needs_reply = (ReplyTo /= undefined),
                        reply_to = ReplyTo,
                        reply = {loaded, ok}
                       },
    if
        ReadNode == node() ->
            %% Already loaded locally
            fun() -> Done end;
        LocalContent == true ->
            fun() ->
                    Res = mnesia_loader:disc_load_table(Tab, load_local_content),
                    Done#loader_done{reply = Res,
                                     needs_announce = true,
                                     needs_sync = true}
            end;
        AccessMode == read_only, Reason /= {dumper,add_table_copy} ->
            fun() -> disc_load_table(Tab, Reason, ReplyTo) end;
        true ->
            fun() ->
                    %% Either we cannot read the table yet
                    %% or someone is moving a replica between
                    %% two nodes
                    case mnesia_loader:net_load_table(Tab, Reason, Active, Cs) of
                        {loaded, ok} = Res ->
                            Done#loader_done{needs_sync = true,
                                             reply = Res};
                        {not_loaded, _} = Res ->
                            Done#loader_done{is_loaded = false,
                                             reply = Res}
                    end
            end
    end;
load_table_fun(#disc_load{table=Tab, reason=Reason, opt_reply_to=ReplyTo}) ->
    ReadNode = val({Tab, where_to_read}),
    Active = filter_active(Tab),
    Done = #loader_done{is_loaded = true,
                        table_name = Tab,
                        needs_announce = false,
                        needs_sync = false,
                        needs_reply = false
                       },
    if
        Active == [], ReadNode == nowhere ->
            %% Not loaded anywhere, lets load it from disc
            fun() -> disc_load_table(Tab, Reason, ReplyTo) end;
        ReadNode == nowhere ->
            %% Already loaded on other node, lets get it
            Cs = val({Tab, cstruct}),
            fun() ->
                    case mnesia_loader:net_load_table(Tab, Reason, Active, Cs) of
                        {loaded, ok} ->
                            Done#loader_done{needs_sync = true};
                        {not_loaded, storage_unknown} ->
                            Done#loader_done{is_loaded = false};
                        {not_loaded, ErrReason} ->
                            Done#loader_done{is_loaded = false,
                                             reply = {not_loaded,ErrReason}}
                    end
            end;
        true ->
            %% Already readable, do not worry be happy
            fun() -> Done end
    end.

%% Load Tab from disc and describe the outcome as a #loader_done{}.
%% A failed load is reported back when somebody waits for a reply
%% (ReplyTo /= undefined); otherwise it is fatal.
disc_load_table(Tab, Reason, ReplyTo) ->
    Done = #loader_done{is_loaded = true,
                        table_name = Tab,
                        needs_announce = false,
                        needs_sync = false,
                        needs_reply = ReplyTo /= undefined,
                        reply_to = ReplyTo,
                        reply = {loaded, ok}
                       },
    case mnesia_loader:disc_load_table(Tab, Reason) of
        {loaded, ok} = Res ->
            Done#loader_done{needs_announce = true,
                             needs_sync = true,
                             reply = Res};
        Res when ReplyTo /= undefined ->
            Done#loader_done{is_loaded = false,
                             reply = Res};
        Res ->
            fatal("Cannot load table ~p from disc: ~p~n", [Tab, Res])
    end.

%% Return the nodes Tab may be loaded from, preferred nodes first.
filter_active(Tab) ->
    ByForce = val({Tab, load_by_force}),
    Active = val({Tab, active_replicas}),
    Masters = mnesia_recover:get_master_nodes(Tab),
    Ns = do_filter_active(ByForce, Active, Masters),
    %% Reorder the nodes so that we load from the fastest first:
    %% nodes with the same kind of storage as ours (disc_only_copies
    %% or not) are considered good, the rest worse
    LocalStorage = ?catch_val({Tab, storage_type}),
    DiscOnly = val({Tab, disc_only_copies}),
    Good =
        case LocalStorage of
            disc_only_copies -> mnesia_lib:intersect(Ns, DiscOnly);
            _ -> Ns -- DiscOnly
        end,
    Worse = Ns -- Good,
    %% Pick a random node of the fastest
    case Good of
        [] ->
            Worse;
        _ ->
            R = erlang:phash(node(), length(Good)+1),
            random(R-1, Good, Worse)
    end.

%% Move the first N elements of the list, one by one, onto the front of
%% Acc and return the remaining elements followed by Acc.
random(N, [H|R], Acc) when N > 0 ->
    random(N-1, R, [H|Acc]);
random(0, L, Acc) ->
    L ++ Acc.

%% Restrict Active to the master nodes, unless loading by force or no
%% master nodes are defined.
do_filter_active(true, Active, _Masters) ->
    Active;
do_filter_active(false, Active, []) ->
    Active;
do_filter_active(false, Active, Masters) ->
    mnesia_lib:intersect(Active, Masters).


diff --git a/lib/mnesia/src/mnesia_dumper.erl b/lib/mnesia/src/mnesia_dumper.erl
new file mode 100644
index 0000000000..f669d009c6
--- /dev/null
+++ b/lib/mnesia/src/mnesia_dumper.erl
@@ -0,0 +1,1218 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1996-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
-module(mnesia_dumper).

%% The InitBy arg may be one of the following:
%% scan_decisions     Initial scan for decisions
%% startup	      Initial dump during startup
%% schema_prepare     Dump initiated during schema transaction preparation
%% schema_update      Dump initiated during schema transaction commit
%% fast_schema_update A schema_update, but ignores the log file
%% user		      Dump initiated by user
%% write_threshold    Automatic dump caused by too many log writes
%% time_threshold     Automatic dump caused by timeout

%% Public interface
-export([
	 get_log_writes/0,
	 incr_log_writes/0,
	 raw_dump_table/2,
	 raw_named_dump_table/2,
	 start_regulator/0,
	 opt_dump_log/1,
	 update/3
	]).

 %% Internal stuff
-export([regulator_init/1]).

-include("mnesia.hrl").
-include_lib("kernel/include/file.hrl").

-import(mnesia_lib, [fatal/2, dbg_out/2]).

-define(REGULATOR_NAME, mnesia_dumper_load_regulator).
-define(DumpToEtsMultiplier, 4).

%% Total number of log writes: the accumulated trans_log_writes counter
%% plus the writes consumed from the current threshold window.
get_log_writes() ->
    Max = mnesia_monitor:get_env(dump_log_write_threshold),
    Prev = mnesia_lib:read_counter(trans_log_writes),
    Left = mnesia_lib:read_counter(trans_log_writes_left),
    Prev + (Max - Left).

%% Count one log write; when the write budget is used up the counters
%% are adjusted and a dump is requested.
incr_log_writes() ->
    case mnesia_lib:incr_counter(trans_log_writes_left, -1) of
        Left when Left > 0 ->
            ignore;
        _ ->
            adjust_log_writes(true)
    end.

%% Reset the write budget and fold the consumed writes into the
%% trans_log_writes counter.  The work is guarded by a node-local
%% global lock; when the lock is taken by somebody else nothing is
%% done.  With DoCast == true an asynchronous write_threshold dump is
%% requested from the controller first.
adjust_log_writes(DoCast) ->
    Token = {mnesia_adjust_log_writes, self()},
    LockNodes = [node()],
    case global:set_lock(Token, LockNodes, 1) of
        false ->
            ignore; %% Somebody else is sending a dump request
        true ->
            case DoCast of
                false ->
                    ignore;
                true ->
                    mnesia_controller:async_dump_log(write_threshold)
            end,
            Max = mnesia_monitor:get_env(dump_log_write_threshold),
            Left = mnesia_lib:read_counter(trans_log_writes_left),
            %% Don't care if we lost a few writes
            mnesia_lib:set_counter(trans_log_writes_left, Max),
            mnesia_lib:incr_counter(trans_log_writes, Max - Left),
            global:del_lock(Token, [node()])
    end.

%% Returns 'ok' or exits
opt_dump_log(InitBy) ->
    Regulator =
        case whereis(?REGULATOR_NAME) of
            undefined ->
                nopid;
            Pid when is_pid(Pid) ->
                Pid
        end,
    perform_dump(InitBy, Regulator).

%% Scan for decisions
perform_dump(scan_decisions = InitBy, Regulator) ->
    ?eval_debug_fun({?MODULE, perform_dump}, [InitBy]),

    dbg_out("Transaction log dump initiated by ~w~n", [InitBy]),
    scan_decisions(mnesia_log:previous_log_file(), InitBy, Regulator),
    scan_decisions(mnesia_log:latest_log_file(), InitBy, Regulator);

%% Propagate the log into the DAT-files
perform_dump(InitBy, Regulator) ->
    ?eval_debug_fun({?MODULE, perform_dump}, [InitBy]),
    LogState = mnesia_log:prepare_log_dump(InitBy),
    dbg_out("Transaction log dump initiated by ~w: ~w~n",
            [InitBy, LogState]),
    adjust_log_writes(false),
    case LogState of
        already_dumped ->
            mnesia_recover:allow_garb(),
            dumped;
        {needs_dump, Diff} ->
            InPlace = mnesia_monitor:get_env(dump_log_update_in_place),
            Cont = mnesia_log:init_log_dump(),
            mnesia_recover:sync(),
            case catch do_perform_dump(Cont, InPlace, InitBy, Regulator, undefined) of
                ok ->
                    ?eval_debug_fun({?MODULE, post_dump}, [InitBy]),
                    case mnesia_monitor:use_dir() of
                        true ->
                            mnesia_recover:dump_decision_tab();
                        false ->
                            mnesia_log:purge_some_logs()
                    end,
                    mnesia_recover:allow_garb(),
                    %% And now to the crucial point...
                    mnesia_log:confirm_log_dump(Diff);
                {error, Reason} ->
                    {error, Reason};
                {'EXIT', {Desc, Reason}} ->
                    case mnesia_monitor:get_env(auto_repair) of
                        true ->
                            mnesia_lib:important(Desc, Reason),
                            %% Ignore rest of the log
                            mnesia_log:confirm_log_dump(Diff);
                        false ->
                            fatal(Desc, Reason)
                    end
            end;
        {error, Reason} ->
            {error, {"Cannot prepare log dump", Reason}}
    end.

%% Scan the log file Fname (if it exists) for transaction decisions.
%% The file is opened read_only under the log name previous_log and
%% closed again regardless of the outcome.
scan_decisions(Fname, InitBy, Regulator) ->
    case mnesia_lib:exists(Fname) of
        false ->
            ok;
        true ->
            Header = mnesia_log:trans_log_header(),
            Name = previous_log,
            mnesia_log:open_log(Name, Header, Fname, true,
                                mnesia_monitor:get_env(auto_repair), read_only),
            Res = (catch do_perform_dump(start, false, InitBy, Regulator, undefined)),
            mnesia_log:close_log(Name),
            case Res of
                ok -> ok;
                {'EXIT', Reason} -> {error, Reason}
            end
    end.

%% Read the log chunk by chunk and insert the records of each chunk.
%% The log version found in a log header is threaded through to the
%% following chunks.  Files are closed at eof as well as on failure;
%% a failure makes the process exit with a {Format, Args} reason.
do_perform_dump(Cont, InPlace, InitBy, Regulator, OldVersion) ->
    case mnesia_log:chunk_log(Cont) of
        {NextCont, Recs} ->
            case catch insert_recs(Recs, InPlace, InitBy, Regulator, OldVersion) of
                {'EXIT', R} ->
                    Reason = {"Transaction log dump error: ~p~n", [R]},
                    close_files(InPlace, {error, Reason}, InitBy),
                    exit(Reason);
                Version ->
                    do_perform_dump(NextCont, InPlace, InitBy, Regulator, Version)
            end;
        eof ->
            close_files(InPlace, ok, InitBy),
            erase(mnesia_dumper_dets),
            ok
    end.

%% Insert the records one by one, calling regulate/1 before each of
%% them.  Returns the log version in effect after the last record.
insert_recs([Rec | Recs], InPlace, InitBy, Regulator, LogV) ->
    regulate(Regulator),
    NextV =
        case insert_rec(Rec, InPlace, InitBy, LogV) of
            LogH when is_record(LogH, log_header) ->
                LogH#log_header.log_version;
            _ ->
                LogV
        end,
    insert_recs(Recs, InPlace, InitBy, Regulator, NextV);

insert_recs([], _InPlace, _InitBy, _Regulator, Version) ->
    Version.

%% Handle one record read from the transaction log.
%%
%% When scanning for decisions, commit records and log headers are
%% skipped and everything else is noted as a log decision.  Otherwise
%% commit records are applied or undone depending on the outcome of
%% the transaction, log headers are validated and returned, and all
%% other records are ignored.
insert_rec(Rec, _InPlace, scan_decisions, _LogV) ->
    if
        is_record(Rec, commit); is_record(Rec, log_header) ->
            ignore;
        true ->
            mnesia_recover:note_log_decision(Rec, scan_decisions)
    end;
insert_rec(Rec, InPlace, InitBy, LogV) when is_record(Rec, commit) ->
    %% Determine the Outcome of the transaction and recover it
    Decision = Rec#commit.decision,
    case mnesia_recover:wait_for_decision(Decision, InitBy) of
        {Tid, committed} ->
            do_insert_rec(Tid, Rec, InPlace, InitBy, LogV);
        {Tid, aborted} ->
            mnesia_schema:undo_prepare_commit(Tid, Rec)
    end;
insert_rec(H, _InPlace, _InitBy, _LogV) when is_record(H, log_header) ->
    CurrentVersion = mnesia_log:version(),
    Version = H#log_header.log_version,
    if
        H#log_header.log_kind /= trans_log ->
            exit({"Bad kind of transaction log", H});
        Version == CurrentVersion;
        Version == "4.2";
        Version == "4.1";
        Version == "4.0" ->
            ok;
        true ->
            fatal("Bad version of transaction log: ~p~n", [H])
    end,
    H;

insert_rec(_Rec, _InPlace, _InitBy, _LogV) ->
    ok.

%% Apply the operations of a committed transaction: schema operations
%% first (opening the schema files unless the schema is ram_copies),
%% then disc_copies operations and, at startup only, disc_only_copies
%% operations.
do_insert_rec(Tid, Rec, InPlace, InitBy, LogV) ->
    case Rec#commit.schema_ops of
        [] ->
            ignore;
        SchemaOps ->
            case val({schema, storage_type}) of
                ram_copies ->
                    ok;
                Storage ->
                    true = open_files(schema, Storage, InPlace, InitBy)
            end,
            insert_ops(Tid, schema_ops, SchemaOps, InPlace, InitBy, LogV)
    end,
    DiscOps = Rec#commit.disc_copies,
    insert_ops(Tid, disc_copies, DiscOps, InPlace, InitBy, LogV),
    case InitBy of
        startup ->
            DiscOnlyOps = Rec#commit.disc_only_copies,
            insert_ops(Tid, disc_only_copies, DiscOnlyOps, InPlace, InitBy, LogV);
        _ ->
            ignore
    end.


%% Perform the schema operations of a transaction and release the
%% schema commit lock afterwards.  Without operations there is nothing
%% to do and 'dumped' is returned.
update(_Tid, [], _DumperMode) ->
    dumped;
update(Tid, SchemaOps, DumperMode) ->
    UseDir = mnesia_monitor:use_dir(),
    Result = perform_update(Tid, SchemaOps, DumperMode, UseDir),
    mnesia_controller:release_schema_commit_lock(),
    Result.

perform_update(_Tid, _SchemaOps, mandatory, true) ->
    %% Force a dump of the transaction log in order to let the
    %% dumper perform needed updates

    InitBy = schema_update,
    ?eval_debug_fun({?MODULE, dump_schema_op}, [InitBy]),
    opt_dump_log(InitBy);
perform_update(Tid, SchemaOps, _DumperMode, _UseDir) ->
    %% No need for a full transaction log dump.
    %% Ignore the log file and perform only
    %% the corresponding updates.

    InitBy = fast_schema_update,
    InPlace = mnesia_monitor:get_env(dump_log_update_in_place),
    ?eval_debug_fun({?MODULE, dump_schema_op}, [InitBy]),
    case catch insert_ops(Tid, schema_ops, SchemaOps, InPlace, InitBy,
                          mnesia_log:version()) of
        {'EXIT', Reason} ->
            Error = {error, {"Schema update error", Reason}},
            close_files(InPlace, Error, InitBy),
            fatal("Schema update error ~p ~p", [Reason, SchemaOps]);
        _ ->
            ?eval_debug_fun({?MODULE, post_dump}, [InitBy]),
            close_files(InPlace, ok, InitBy),
            ok
    end.

%% Apply a list of operations.
%% For log versions >= "4.3" the operations are applied in list order
%% and 'ok' is returned.  For older versions they are applied from the
%% last to the first, and the result of the first operation in the
%% list (the one applied last) is returned.
insert_ops(_Tid, _Storage, [], _InPlace, _InitBy, _Ver) ->
    ok;
insert_ops(Tid, Storage, Ops, InPlace, InitBy, Ver) when Ver >= "4.3" ->
    lists:foreach(fun(Op) ->
                          insert_op(Tid, Storage, Op, InPlace, InitBy)
                  end, Ops);
insert_ops(Tid, Storage, Ops, InPlace, InitBy, Ver) when Ver < "4.3" ->
    lists:foldr(fun(Op, _Prev) ->
                        insert_op(Tid, Storage, Op, InPlace, InitBy)
                end, ok, Ops).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Normal ops

%% Propagate one operation to disc: disc_copies tables other than the
%% schema get the operation appended to their log, everything else is
%% written straight to dets.  Nothing is done when open_files/4
%% returns false.
disc_insert(_Tid, Storage, Tab, Key, Val, Op, InPlace, InitBy) ->
    case open_files(Tab, Storage, InPlace, InitBy) of
        false ->
            ignore;
        true when Storage == disc_copies, Tab /= schema ->
            mnesia_log:append({?MODULE,Tab}, {{Tab, Key}, Val, Op}),
            ok;
        true ->
            dets_insert(Op, Tab, Key, Val)
    end.

%% To fix update_counter so that it behaves better,
%% i.e. if nothing has changed in tab except update_counter
%% trust that the value in the dets file is correct.
%% Otherwise we will get a double increment.
%% This is perfect but update_counter is a dirty op.

dets_insert(Op, Tab, Key, Val) ->
    case Op of
        write ->
            dets_updated(Tab, Key),
            ok = dets:insert(Tab, Val);
        delete ->
            dets_updated(Tab, Key),
            ok = dets:delete(Tab, Key);
        update_counter ->
            %% Only replay the increment when the key has been touched
            %% (or the table cleared) earlier in this dump
            case dets_incr_counter(Tab, Key) of
                true ->
                    {RecName, Incr} = Val,
                    case catch dets:update_counter(Tab, Key, Incr) of
                        CounterVal when is_integer(CounterVal) ->
                            ok;
                        _ when Incr < 0 ->
                            ok = dets:insert(Tab, {RecName, Key, 0});
                        _ ->
                            ok = dets:insert(Tab, {RecName, Key, Incr})
                    end;
                false ->
                    ok
            end;
        delete_object ->
            dets_updated(Tab, Key),
            ok = dets:delete_object(Tab, Val);
        clear_table ->
            dets_cleared(Tab),
            ok = dets:match_delete(Tab, '_')
    end.

%% Remember, in the process dictionary entry mnesia_dumper_dets, that
%% Key of Tab has been written during this dump.  A table already
%% marked as cleared is left as it is.
dets_updated(Tab, Key) ->
    case get(mnesia_dumper_dets) of
        undefined ->
            Fresh = gb_trees:insert(Tab, gb_sets:singleton(Key), gb_trees:empty()),
            put(mnesia_dumper_dets, Fresh);
        Tree ->
            case gb_trees:lookup(Tab, Tree) of
                {value, cleared} ->
                    ignore;
                {value, Keys} ->
                    Updated = gb_trees:update(Tab, gb_sets:add(Key, Keys), Tree),
                    put(mnesia_dumper_dets, Updated);
                none ->
                    Added = gb_trees:insert(Tab, gb_sets:singleton(Key), Tree),
                    put(mnesia_dumper_dets, Added)
            end
    end.

%% True when Tab has been cleared or Key of Tab has been written
%% earlier during this dump.
dets_incr_counter(Tab, Key) ->
    case get(mnesia_dumper_dets) of
        undefined ->
            false;
        Tree ->
            case gb_trees:lookup(Tab, Tree) of
                {value, cleared} -> true;
                {value, Keys} -> gb_sets:is_member(Key, Keys);
                none -> false
            end
    end.

%% Mark Tab as cleared in the process dictionary entry
%% mnesia_dumper_dets, replacing any set of written keys.
dets_cleared(Tab) ->
    case get(mnesia_dumper_dets) of
        undefined ->
            Fresh = gb_trees:insert(Tab, cleared, gb_trees:empty()),
            put(mnesia_dumper_dets, Fresh);
        Tree ->
            case gb_trees:lookup(Tab, Tree) of
                {value, cleared} ->
                    ignore;
                _ ->
                    Marked = gb_trees:enter(Tab, cleared, Tree),
                    put(mnesia_dumper_dets, Marked)
            end
    end.

%% Apply one operation for every value in a list of values, or for a
%% single value.  At startup the operation only goes to disc.  In all
%% other cases the operation is applied via mnesia_tm (followed by the
%% snmp update); for disc_copies it is written to disc first.  Tables
%% with unknown storage are ignored.
insert(Tid, Storage, Tab, Key, [Val | Tail], Op, InPlace, InitBy) ->
    insert(Tid, Storage, Tab, Key, Val, Op, InPlace, InitBy),
    insert(Tid, Storage, Tab, Key, Tail, Op, InPlace, InitBy);

insert(_Tid, _Storage, _Tab, _Key, [], _Op, _InPlace, _InitBy) ->
    ok;

insert(Tid, Storage, Tab, Key, Val, Op, InPlace, InitBy) ->
    Item = {{Tab, Key}, Val, Op},
    case InitBy of
        startup ->
            disc_insert(Tid, Storage, Tab, Key, Val, Op, InPlace, InitBy);

        _ when Storage == ram_copies; Storage == disc_only_copies ->
            mnesia_tm:do_update_op(Tid, Storage, Item),
            Snmp = mnesia_tm:prepare_snmp(Tab, Key, [Item]),
            mnesia_tm:do_snmp(Tid, Snmp);

        _ when Storage == disc_copies ->
            disc_insert(Tid, Storage, Tab, Key, Val, Op, InPlace, InitBy),
            mnesia_tm:do_update_op(Tid, Storage, Item),
            Snmp = mnesia_tm:prepare_snmp(Tab, Key, [Item]),
            mnesia_tm:do_snmp(Tid, Snmp);

        _ when Storage == unknown ->
            ignore
    end.

%% Delete the disc files of Tab, if a mnesia directory is in use.
%% disc_only_copies tables and the schema have their dets file closed
%% and deleted; other tables get their log closed (when opened by the
%% dumper) and both log files deleted.  Results of file:delete/1 are
%% ignored.  Finally the dumper's process dictionary entry for Tab is
%% erased.
disc_delete_table(Tab, Storage) ->
    case mnesia_monitor:use_dir() of
        false ->
            ignore;
        true ->
            if
                Storage == disc_only_copies; Tab == schema ->
                    mnesia_monitor:unsafe_close_dets(Tab),
                    file:delete(mnesia_lib:tab2dat(Tab));
                true ->
                    DclFile = mnesia_lib:tab2dcl(Tab),
                    case get({?MODULE,Tab}) of
                        {opened_dumper, dcl} ->
                            del_opened_tab(Tab),
                            mnesia_log:unsafe_close_log(Tab);
                        _ ->
                            ok
                    end,
                    file:delete(DclFile),
                    file:delete(mnesia_lib:tab2dcd(Tab)),
                    ok
            end,
            erase({?MODULE, Tab})
    end.
+ +disc_delete_indecies(_Tab, _Cs, Storage) when Storage /= disc_only_copies -> + ignore; +disc_delete_indecies(Tab, Cs, disc_only_copies) -> + Indecies = Cs#cstruct.index, + mnesia_index:del_transient(Tab, Indecies, disc_only_copies). + +insert_op(Tid, Storage, {{Tab, Key}, Val, Op}, InPlace, InitBy) -> + %% Propagate to disc only + disc_insert(Tid, Storage, Tab, Key, Val, Op, InPlace, InitBy); + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% NOTE that all operations below will only +%% be performed if the dump is initiated by +%% startup or fast_schema_update +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +insert_op(_Tid, schema_ops, _OP, _InPlace, Initby) + when Initby /= startup, + Initby /= fast_schema_update, + Initby /= schema_update -> + ignore; + +insert_op(Tid, _, {op, rec, Storage, Item}, InPlace, InitBy) -> + {{Tab, Key}, ValList, Op} = Item, + insert(Tid, Storage, Tab, Key, ValList, Op, InPlace, InitBy); + +insert_op(Tid, _, {op, change_table_copy_type, N, FromS, ToS, TabDef}, InPlace, InitBy) -> + Cs = mnesia_schema:list2cs(TabDef), + Val = mnesia_schema:insert_cstruct(Tid, Cs, true), % Update ram only + {schema, Tab, _} = Val, + case lists:member(N, val({current, db_nodes})) of + true when InitBy /= startup -> + mnesia_controller:add_active_replica(Tab, N, Cs); + _ -> + ignore + end, + if + N == node() -> + Dmp = mnesia_lib:tab2dmp(Tab), + Dat = mnesia_lib:tab2dat(Tab), + Dcd = mnesia_lib:tab2dcd(Tab), + Dcl = mnesia_lib:tab2dcl(Tab), + case {FromS, ToS} of + {ram_copies, disc_copies} when Tab == schema -> + ok = ensure_rename(Dmp, Dat); + {ram_copies, disc_copies} -> + file:delete(Dcl), + ok = ensure_rename(Dmp, Dcd); + {disc_copies, ram_copies} when Tab == schema -> + mnesia_lib:set(use_dir, false), + mnesia_monitor:unsafe_close_dets(Tab), + file:delete(Dat); + {disc_copies, ram_copies} -> + file:delete(Dcl), + file:delete(Dcd); + {ram_copies, disc_only_copies} -> + ok = ensure_rename(Dmp, Dat), + true = open_files(Tab, disc_only_copies, InPlace, InitBy), + 
%% ram_delete_table must be done before init_indecies, + %% it uses info which is reset in init_indecies, + %% it doesn't matter, because init_indecies don't use + %% the ram replica of the table when creating the disc + %% index; Could be improved :) + mnesia_schema:ram_delete_table(Tab, FromS), + PosList = Cs#cstruct.index, + mnesia_index:init_indecies(Tab, disc_only_copies, PosList); + {disc_only_copies, ram_copies} -> + mnesia_monitor:unsafe_close_dets(Tab), + disc_delete_indecies(Tab, Cs, disc_only_copies), + case InitBy of + startup -> + ignore; + _ -> + mnesia_controller:get_disc_copy(Tab) + end, + disc_delete_table(Tab, disc_only_copies); + {disc_copies, disc_only_copies} -> + ok = ensure_rename(Dmp, Dat), + true = open_files(Tab, disc_only_copies, InPlace, InitBy), + mnesia_schema:ram_delete_table(Tab, FromS), + PosList = Cs#cstruct.index, + mnesia_index:init_indecies(Tab, disc_only_copies, PosList), + file:delete(Dcl), + file:delete(Dcd); + {disc_only_copies, disc_copies} -> + mnesia_monitor:unsafe_close_dets(Tab), + disc_delete_indecies(Tab, Cs, disc_only_copies), + case InitBy of + startup -> + ignore; + _ -> + mnesia_log:ets2dcd(Tab), + mnesia_controller:get_disc_copy(Tab), + disc_delete_table(Tab, disc_only_copies) + end + end; + true -> + ignore + end, + S = val({schema, storage_type}), + disc_insert(Tid, S, schema, Tab, Val, write, InPlace, InitBy); + +insert_op(Tid, _, {op, transform, _Fun, TabDef}, InPlace, InitBy) -> + Cs = mnesia_schema:list2cs(TabDef), + case mnesia_lib:cs_to_storage_type(node(), Cs) of + disc_copies -> + open_dcl(Cs#cstruct.name); + _ -> + ignore + end, + insert_cstruct(Tid, Cs, true, InPlace, InitBy); + +%%% Operations below this are handled without using the logg. 
+ +insert_op(Tid, _, {op, restore_recreate, TabDef}, InPlace, InitBy) -> + Cs = mnesia_schema:list2cs(TabDef), + Tab = Cs#cstruct.name, + Type = Cs#cstruct.type, + Storage = mnesia_lib:cs_to_storage_type(node(), Cs), + %% Delete all possibly existing files and tables + disc_delete_table(Tab, Storage), + disc_delete_indecies(Tab, Cs, Storage), + case InitBy of + startup -> + ignore; + _ -> + case ?catch_val({Tab, cstruct}) of + {'EXIT', _} -> ignore; + _ -> + mnesia_schema:ram_delete_table(Tab, Storage), + mnesia_checkpoint:tm_del_copy(Tab, node()) + end + end, + %% And create new ones.. + if + (InitBy == startup) or (Storage == unknown) -> + ignore; + Storage == ram_copies -> + Args = [{keypos, 2}, public, named_table, Type], + mnesia_monitor:mktab(Tab, Args); + Storage == disc_copies -> + Args = [{keypos, 2}, public, named_table, Type], + mnesia_monitor:mktab(Tab, Args), + File = mnesia_lib:tab2dcd(Tab), + FArg = [{file, File}, {name, {mnesia,create}}, + {repair, false}, {mode, read_write}], + {ok, Log} = mnesia_monitor:open_log(FArg), + mnesia_monitor:unsafe_close_log(Log); + Storage == disc_only_copies -> + File = mnesia_lib:tab2dat(Tab), + file:delete(File), + Args = [{file, mnesia_lib:tab2dat(Tab)}, + {type, mnesia_lib:disk_type(Tab, Type)}, + {keypos, 2}, + {repair, mnesia_monitor:get_env(auto_repair)}], + mnesia_monitor:open_dets(Tab, Args) + end, + insert_op(Tid, ignore, {op, create_table, TabDef}, InPlace, InitBy); + +insert_op(Tid, _, {op, create_table, TabDef}, InPlace, InitBy) -> + Cs = mnesia_schema:list2cs(TabDef), + insert_cstruct(Tid, Cs, false, InPlace, InitBy), + Tab = Cs#cstruct.name, + Storage = mnesia_lib:cs_to_storage_type(node(), Cs), + case InitBy of + startup -> + case Storage of + unknown -> + ignore; + ram_copies -> + ignore; + disc_copies -> + Dcd = mnesia_lib:tab2dcd(Tab), + case mnesia_lib:exists(Dcd) of + true -> ignore; + false -> + mnesia_log:open_log(temp, + mnesia_log:dcl_log_header(), + Dcd, + false, + false, + read_write), + 
mnesia_log:unsafe_close_log(temp) + end; + _ -> + Args = [{file, mnesia_lib:tab2dat(Tab)}, + {type, mnesia_lib:disk_type(Tab, Cs#cstruct.type)}, + {keypos, 2}, + {repair, mnesia_monitor:get_env(auto_repair)}], + case mnesia_monitor:open_dets(Tab, Args) of + {ok, _} -> + mnesia_monitor:unsafe_close_dets(Tab); + {error, Error} -> + exit({"Failed to create dets table", Error}) + end + end; + _ -> + Copies = mnesia_lib:copy_holders(Cs), + Active = mnesia_lib:intersect(Copies, val({current, db_nodes})), + [mnesia_controller:add_active_replica(Tab, N, Cs) || N <- Active], + + case Storage of + unknown -> + mnesia_lib:unset({Tab, create_table}), + case Cs#cstruct.local_content of + true -> + ignore; + false -> + mnesia_lib:set_remote_where_to_read(Tab) + end; + _ -> + case Cs#cstruct.local_content of + true -> + mnesia_lib:set_local_content_whereabouts(Tab); + false -> + mnesia_lib:set({Tab, where_to_read}, node()) + end, + case Storage of + ram_copies -> + ignore; + _ -> + %% Indecies are still created by loader + disc_delete_indecies(Tab, Cs, Storage) + %% disc_delete_table(Tab, Storage) + end, + + %% Update whereabouts and create table + mnesia_controller:create_table(Tab), + mnesia_lib:unset({Tab, create_table}) + end + end; + +insert_op(_Tid, _, {op, dump_table, Size, TabDef}, _InPlace, _InitBy) -> + case Size of + unknown -> + ignore; + _ -> + Cs = mnesia_schema:list2cs(TabDef), + Tab = Cs#cstruct.name, + Dmp = mnesia_lib:tab2dmp(Tab), + Dat = mnesia_lib:tab2dcd(Tab), + case Size of + 0 -> + %% Assume that table files already are closed + file:delete(Dmp), + file:delete(Dat); + _ -> + ok = ensure_rename(Dmp, Dat) + end + end; + +insert_op(Tid, _, {op, delete_table, TabDef}, InPlace, InitBy) -> + Cs = mnesia_schema:list2cs(TabDef), + Tab = Cs#cstruct.name, + case mnesia_lib:cs_to_storage_type(node(), Cs) of + unknown -> + ignore; + Storage -> + disc_delete_table(Tab, Storage), + disc_delete_indecies(Tab, Cs, Storage), + case InitBy of + startup -> + ignore; + _ -> + 
mnesia_schema:ram_delete_table(Tab, Storage), + mnesia_checkpoint:tm_del_copy(Tab, node()) + end + end, + delete_cstruct(Tid, Cs, InPlace, InitBy); + +insert_op(Tid, _, {op, clear_table, TabDef}, InPlace, InitBy) -> + Cs = mnesia_schema:list2cs(TabDef), + Tab = Cs#cstruct.name, + case mnesia_lib:cs_to_storage_type(node(), Cs) of + unknown -> + ignore; + Storage -> + Oid = '_', %%val({Tab, wild_pattern}), + if Storage == disc_copies -> + open_dcl(Cs#cstruct.name); + true -> + ignore + end, + %% Need to catch this, it crashes on ram_copies if + %% the op comes before table is loaded at startup. + catch insert(Tid, Storage, Tab, '_', Oid, clear_table, InPlace, InitBy) + end; + +insert_op(Tid, _, {op, merge_schema, TabDef}, InPlace, InitBy) -> + Cs = mnesia_schema:list2cs(TabDef), + case Cs#cstruct.name of + schema -> + %% If we bootstrap an empty (diskless) mnesia from another node + %% we might have changed the storage_type of schema. + %% I think this is a good place to do it. + Update = fun(NS = {Node,Storage}) -> + case mnesia_lib:cs_to_storage_type(Node, Cs) of + Storage -> NS; + disc_copies when Node == node() -> + Dir = mnesia_lib:dir(), + ok = mnesia_schema:opt_create_dir(true, Dir), + mnesia_schema:purge_dir(Dir, []), + mnesia_log:purge_all_logs(), + + mnesia_lib:set(use_dir, true), + mnesia_log:init(), + Ns = val({current, db_nodes}), + F = fun(U) -> mnesia_recover:log_mnesia_up(U) end, + lists:foreach(F, Ns), + raw_named_dump_table(schema, dat), + temp_set_master_nodes(), + {Node,disc_copies}; + CSstorage -> + {Node,CSstorage} + end + end, + + W2C0 = val({schema, where_to_commit}), + W2C = case W2C0 of + {blocked, List} -> + {blocked,lists:map(Update,List)}; + List -> + lists:map(Update,List) + end, + if W2C == W2C0 -> ignore; + true -> mnesia_lib:set({schema, where_to_commit}, W2C) + end; + _ -> + ignore + end, + insert_cstruct(Tid, Cs, false, InPlace, InitBy); + +insert_op(Tid, _, {op, del_table_copy, Storage, Node, TabDef}, InPlace, InitBy) -> + Cs = 
mnesia_schema:list2cs(TabDef), + Tab = Cs#cstruct.name, + if + Tab == schema, Storage == ram_copies -> + insert_cstruct(Tid, Cs, true, InPlace, InitBy); + Tab /= schema -> + mnesia_controller:del_active_replica(Tab, Node), + mnesia_lib:del({Tab, Storage}, Node), + if + Node == node() -> + case Cs#cstruct.local_content of + true -> mnesia_lib:set({Tab, where_to_read}, nowhere); + false -> mnesia_lib:set_remote_where_to_read(Tab) + end, + mnesia_lib:del({schema, local_tables}, Tab), + mnesia_lib:set({Tab, storage_type}, unknown), + insert_cstruct(Tid, Cs, true, InPlace, InitBy), + disc_delete_table(Tab, Storage), + disc_delete_indecies(Tab, Cs, Storage), + mnesia_schema:ram_delete_table(Tab, Storage), + mnesia_checkpoint:tm_del_copy(Tab, Node); + true -> + case val({Tab, where_to_read}) of + Node -> + mnesia_lib:set_remote_where_to_read(Tab); + _ -> + ignore + end, + insert_cstruct(Tid, Cs, true, InPlace, InitBy) + end + end; + +insert_op(Tid, _, {op, add_table_copy, _Storage, _Node, TabDef}, InPlace, InitBy) -> + %% During prepare commit, the files was created + %% and the replica was announced + Cs = mnesia_schema:list2cs(TabDef), + insert_cstruct(Tid, Cs, true, InPlace, InitBy); + +insert_op(Tid, _, {op, add_snmp, _Us, TabDef}, InPlace, InitBy) -> + Cs = mnesia_schema:list2cs(TabDef), + insert_cstruct(Tid, Cs, true, InPlace, InitBy); + +insert_op(Tid, _, {op, del_snmp, TabDef}, InPlace, InitBy) -> + Cs = mnesia_schema:list2cs(TabDef), + Tab = Cs#cstruct.name, + Storage = mnesia_lib:cs_to_storage_type(node(), Cs), + if + InitBy /= startup, + Storage /= unknown -> + case ?catch_val({Tab, {index, snmp}}) of + {'EXIT', _} -> + ignore; + Stab -> + mnesia_snmp_hook:delete_table(Tab, Stab), + mnesia_lib:unset({Tab, {index, snmp}}) + end; + true -> + ignore + end, + insert_cstruct(Tid, Cs, true, InPlace, InitBy); + +insert_op(Tid, _, {op, add_index, Pos, TabDef}, InPlace, InitBy) -> + Cs = mnesia_schema:list2cs(TabDef), + Tab = insert_cstruct(Tid, Cs, true, InPlace, 
InitBy), + Storage = mnesia_lib:cs_to_storage_type(node(), Cs), + case InitBy of + startup when Storage == disc_only_copies -> + true = open_files(Tab, Storage, InPlace, InitBy), + mnesia_index:init_indecies(Tab, Storage, [Pos]); + startup -> + ignore; + _ -> + mnesia_index:init_indecies(Tab, Storage, [Pos]) + end; + +insert_op(Tid, _, {op, del_index, Pos, TabDef}, InPlace, InitBy) -> + Cs = mnesia_schema:list2cs(TabDef), + Tab = Cs#cstruct.name, + Storage = mnesia_lib:cs_to_storage_type(node(), Cs), + case InitBy of + startup when Storage == disc_only_copies -> + mnesia_index:del_index_table(Tab, Storage, Pos); + startup -> + ignore; + _ -> + mnesia_index:del_index_table(Tab, Storage, Pos) + end, + insert_cstruct(Tid, Cs, true, InPlace, InitBy); + +insert_op(Tid, _, {op, change_table_access_mode,TabDef, _OldAccess, _Access}, InPlace, InitBy) -> + Cs = mnesia_schema:list2cs(TabDef), + case InitBy of + startup -> ignore; + _ -> mnesia_controller:change_table_access_mode(Cs) + end, + insert_cstruct(Tid, Cs, true, InPlace, InitBy); + +insert_op(Tid, _, {op, change_table_load_order, TabDef, _OldLevel, _Level}, InPlace, InitBy) -> + Cs = mnesia_schema:list2cs(TabDef), + insert_cstruct(Tid, Cs, true, InPlace, InitBy); + +insert_op(Tid, _, {op, delete_property, TabDef, PropKey}, InPlace, InitBy) -> + Cs = mnesia_schema:list2cs(TabDef), + Tab = Cs#cstruct.name, + mnesia_lib:unset({Tab, user_property, PropKey}), + insert_cstruct(Tid, Cs, true, InPlace, InitBy); + +insert_op(Tid, _, {op, write_property, TabDef, _Prop}, InPlace, InitBy) -> + Cs = mnesia_schema:list2cs(TabDef), + insert_cstruct(Tid, Cs, true, InPlace, InitBy); + +insert_op(Tid, _, {op, change_table_frag, _Change, TabDef}, InPlace, InitBy) -> + Cs = mnesia_schema:list2cs(TabDef), + insert_cstruct(Tid, Cs, true, InPlace, InitBy). 

%% open_files(Tab, Storage, UpdateInPlace, InitBy) -> boolean()
%%
%% Makes sure the disc resource behind Tab is opened by the dumper and
%% tells the caller whether it is open (true) or not (false).
%% The per-table state lives in the process dictionary under the key
%% {?MODULE, Tab}:
%%   undefined          - the table has not been touched yet
%%   {opened_dumper, _} - the dumper has already opened it
%%   already_dumped     - set by open_disc_copies/2 after mnesia_log:ets2dcd/1
%% Tables whose storage is unknown or ram_copies are never opened.
open_files(Tab, Storage, UpdateInPlace, InitBy)
  when Storage /= unknown, Storage /= ram_copies ->
    case get({?MODULE, Tab}) of
        {opened_dumper, _} ->
            true;
        already_dumped ->
            false;
        undefined ->
            case ?catch_val({Tab, setorbag}) of
                {'EXIT', _} ->
                    %% The table type cannot be looked up; nothing to open.
                    false;
                _SetOrBag when Storage == disc_copies, Tab /= schema ->
                    open_disc_copies(Tab, InitBy);
                SetOrBag ->
                    %% prepare_open/2 must run before the dets file is opened.
                    DetsFile = prepare_open(Tab, UpdateInPlace),
                    DetsArgs = [{file, DetsFile},
                                {keypos, 2},
                                {repair, mnesia_monitor:get_env(auto_repair)},
                                {type, mnesia_lib:disk_type(Tab, SetOrBag)}],
                    {ok, _} = mnesia_monitor:open_dets(Tab, DetsArgs),
                    put({?MODULE, Tab}, {opened_dumper, dat}),
                    true
            end
    end;
open_files(_Tab, _Storage, _UpdateInPlace, _InitBy) ->
    false.

%% open_disc_copies(Tab, InitBy) -> boolean()
%%
%% Chooses between opening the dcl log of a disc_copies table (returns
%% true) and calling mnesia_log:ets2dcd/1 for it (returns false).
%% ets2dcd is chosen only when the dcl file exists, we are not
%% initiated by startup, and either the dcd file info cannot be read or
%% DcdSize =< DclSize * Multiplier. Multiplier is the dc_dump_limit
%% value when it can be looked up, otherwise ?DumpToEtsMultiplier.
open_disc_copies(Tab, InitBy) ->
    DclFile = mnesia_lib:tab2dcl(Tab),
    ShouldDumpEts =
        case file:read_file_info(DclFile) of
            {error, enoent} ->
                false;
            {ok, DclInfo} ->
                DcdFile = mnesia_lib:tab2dcd(Tab),
                case file:read_file_info(DcdFile) of
                    {ok, DcdInfo} ->
                        Multiplier =
                            case ?catch_val(dc_dump_limit) of
                                {'EXIT', _} -> ?DumpToEtsMultiplier;
                                Limit -> Limit
                            end,
                        DcdInfo#file_info.size =<
                            (DclInfo#file_info.size * Multiplier);
                    {error, Reason} ->
                        mnesia_lib:dbg_out("File ~p info_error ~p ~n",
                                           [DcdFile, Reason]),
                        true
                end
        end,
    case (ShouldDumpEts == false) orelse (InitBy == startup) of
        true ->
            mnesia_log:open_log({?MODULE, Tab},
                                mnesia_log:dcl_log_header(),
                                DclFile,
                                mnesia_lib:exists(DclFile),
                                mnesia_monitor:get_env(auto_repair),
                                read_write),
            put({?MODULE, Tab}, {opened_dumper, dcl}),
            true;
        false ->
            mnesia_log:ets2dcd(Tab),
            put({?MODULE, Tab}, already_dumped),
            false
    end.

%% Always opens the dcl file for writing, overriding the already_dumped
%% mechanism; used for schema transactions.
open_dcl(Tab) ->
    case get({?MODULE, Tab}) of
        {opened_dumper, _} ->
            true;
        _UndefinedOrAlreadyDumped ->
            DclFile = mnesia_lib:tab2dcl(Tab),
            mnesia_log:open_log({?MODULE, Tab},
                                mnesia_log:dcl_log_header(),
                                DclFile,
                                mnesia_lib:exists(DclFile),
                                mnesia_monitor:get_env(auto_repair),
                                read_write),
            put({?MODULE, Tab}, {opened_dumper, dcl}),
            true
    end.

%% prepare_open(Tab, UpdateInPlace) -> FileName
%%
%% Returns the name of the dets file the dumper shall work on: the dat
%% file itself when updating in place, otherwise a tmp copy of it.
%% A failed copy is reported through fatal/2.
prepare_open(Tab, UpdateInPlace) ->
    DatFile = mnesia_lib:tab2dat(Tab),
    case UpdateInPlace of
        false ->
            TmpFile = mnesia_lib:tab2tmp(Tab),
            case catch mnesia_lib:copy_file(DatFile, TmpFile) of
                ok ->
                    TmpFile;
                CopyError ->
                    fatal("Cannot copy dets file ~p to ~p: ~p~n",
                          [DatFile, TmpFile, CopyError])
            end;
        true ->
            DatFile
    end.

%% Forgets the dumper state for Tab without closing anything.
del_opened_tab(Tab) ->
    erase({?MODULE, Tab}).

%% close_files(UpdateInPlace, Outcome, InitBy) -> ok
%%
%% Walks a snapshot of the process dictionary and closes every table
%% that open_files/4, open_disc_copies/2 or open_dcl/1 registered there.
close_files(UpdateInPlace, Outcome, InitBy) ->
    close_files(UpdateInPlace, Outcome, InitBy, get()).

close_files(InPlace, Outcome, InitBy, DictEntries) ->
    CloseEntry =
        fun({{?MODULE, Tab}, already_dumped}) ->
                erase({?MODULE, Tab});
           ({{?MODULE, Tab}, {opened_dumper, Type}}) ->
                erase({?MODULE, Tab}),
                case val({Tab, storage_type}) of
                    disc_only_copies when InitBy /= startup ->
                        ignore;
                    disc_copies when Tab /= schema ->
                        mnesia_log:close_log({?MODULE, Tab});
                    Storage ->
                        do_close(InPlace, Outcome, Tab, Type, Storage)
                end;
           (_OtherEntry) ->
                %% Not one of the dumper's entries; leave it alone.
                ok
        end,
    lists:foreach(CloseEntry, DictEntries).

%% do_close(InPlace, Outcome, Tab, Type, Storage)
%%
%% If storage is unknown during close, clean up the files. This can
%% happen if the timing is right and a dirty_write conflicts with
%% schema operations.
do_close(_InPlace, _Outcome, Tab, dcl, unknown) ->
    mnesia_log:close_log({?MODULE, Tab}),
    file:delete(mnesia_lib:tab2dcl(Tab));
do_close(_InPlace, _Outcome, Tab, dcl, _Storage) ->
    %% To be safe, can it happen?
    mnesia_log:close_log({?MODULE, Tab});
do_close(InPlace, Outcome, Tab, dat, Storage) ->
    mnesia_monitor:close_dets(Tab),
    case {InPlace, Storage, Outcome} of
        {true, unknown, _} ->
            file:delete(mnesia_lib:tab2dat(Tab));
        {true, _, _} ->
            %% Update in place
            ok;
        {_, Known, ok} when Known /= unknown ->
            %% Success: swap tmp files with dat files
            DatFile = mnesia_lib:tab2dat(Tab),
            ok = file:rename(mnesia_lib:tab2tmp(Tab), DatFile);
        _ ->
            file:delete(mnesia_lib:tab2tmp(Tab))
    end.


%% ensure_rename(From, To) -> ok | {error, Reason}
%%
%% Renames From to To. A missing From is accepted as long as To is
%% already in place.
ensure_rename(From, To) ->
    case mnesia_lib:exists(From) of
        false ->
            case mnesia_lib:exists(To) of
                false ->
                    {error, {rename_failed, From, To}};
                true ->
                    ok
            end;
        true ->
            file:rename(From, To)
    end.

%% Inserts the cstruct via mnesia_schema and writes the resulting
%% schema record through disc_insert/8. Returns the table name.
insert_cstruct(Tid, Cs, KeepWhereabouts, InPlace, InitBy) ->
    {schema, Tab, _} = SchemaRec =
        mnesia_schema:insert_cstruct(Tid, Cs, KeepWhereabouts),
    SchemaStorage = val({schema, storage_type}),
    disc_insert(Tid, SchemaStorage, schema, Tab, SchemaRec, write,
                InPlace, InitBy),
    Tab.

%% Deletes the cstruct via mnesia_schema and removes the schema record
%% through disc_insert/8. Returns the table name.
delete_cstruct(Tid, Cs, InPlace, InitBy) ->
    {schema, Tab, _} = SchemaRec = mnesia_schema:delete_cstruct(Tid, Cs),
    SchemaStorage = val({schema, storage_type}),
    disc_insert(Tid, SchemaStorage, schema, Tab, SchemaRec, delete,
                InPlace, InitBy),
    Tab.


%% Logs, for every local table, all its replica holders except this
%% node as master nodes.
temp_set_master_nodes() ->
    LocalTabs = val({schema, local_tables}),
    MastersOf =
        fun(Tab) ->
                Holders = val({Tab, disc_copies}) ++
                    val({Tab, ram_copies}) ++
                    val({Tab, disc_only_copies}),
                {Tab, Holders -- [node()]}
        end,
    Masters = lists:map(MastersOf, LocalTabs),
    %% UseDir = false since we don't want to remember these
    %% master nodes and we are running (really soon anyway) since we
    %% want this to be known during table loading.
    mnesia_recover:log_master_nodes(Masters, false, yes),
    ok.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Raw dump of table. Dumper must have unique access to the ets table.

%% raw_named_dump_table(Tab, Ftype) -> ok
%%
%% Dumps the ets table Tab to a dets file. Ftype (dat | dmp) selects
%% the name of the resulting file. The dump is written to the tmp file
%% first and renamed into place only when it has succeeded.
%%
%% Exits with {has_no_disc, Node} when Mnesia runs without a directory,
%% and with a descriptive tuple when the file cannot be opened or the
%% dump fails. A dump failure is either an exception inside
%% raw_dump_table/2 or an {error, Reason} returned by it; in both cases
%% the table is unfixed, the dets file closed, the tmp file removed and
%% the table lock released before exiting.
raw_named_dump_table(Tab, Ftype) ->
    case mnesia_monitor:use_dir() of
        true ->
            mnesia_lib:lock_table(Tab),
            TmpFname = mnesia_lib:tab2tmp(Tab),
            Fname =
                case Ftype of
                    dat -> mnesia_lib:tab2dat(Tab);
                    dmp -> mnesia_lib:tab2dmp(Tab)
                end,
            %% Leftovers from earlier dumps must not be mixed in.
            file:delete(TmpFname),
            file:delete(Fname),
            TabSize = ?ets_info(Tab, size),
            TabRef = Tab,
            DiskType = mnesia_lib:disk_type(Tab),
            Args = [{file, TmpFname},
                    {keypos, 2},
                    %% {ram_file, true},
                    {estimated_no_objects, TabSize + 256},
                    {repair, mnesia_monitor:get_env(auto_repair)},
                    {type, DiskType}],
            case mnesia_lib:dets_sync_open(TabRef, Args) of
                {ok, TabRef} ->
                    Storage = ram_copies,
                    mnesia_lib:db_fixtable(Storage, Tab, true),
                    DumpResult = (catch raw_dump_table(TabRef, Tab)),
                    %% Always release the fix and the dets file,
                    %% whatever the outcome of the dump was.
                    mnesia_lib:db_fixtable(Storage, Tab, false),
                    mnesia_lib:dets_sync_close(Tab),
                    case DumpResult of
                        ok ->
                            mnesia_lib:unlock_table(Tab),
                            ok = file:rename(TmpFname, Fname);
                        Failure ->
                            DumpReason =
                                case Failure of
                                    {'EXIT', ExitReason} -> ExitReason;
                                    %% dets:from_ets/2 reports failures
                                    %% as {error, Reason}.
                                    {error, DetsReason} -> DetsReason
                                end,
                            file:delete(TmpFname),
                            mnesia_lib:unlock_table(Tab),
                            exit({"Dump of table to disc failed", DumpReason})
                    end;
                {error, Reason} ->
                    mnesia_lib:unlock_table(Tab),
                    exit({"Open of file before dump to disc failed", Reason})
            end;
        false ->
            exit({has_no_disc, node()})
    end.

%% Copies every object of the ets table into the opened dets table.
raw_dump_table(DetsRef, EtsRef) ->
    dets:from_ets(DetsRef, EtsRef).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Load regulator
%%
%% This is a poor man's substitute for a fair scheduler algorithm
%% in the Erlang emulator. The mnesia_dumper process performs many
%% costly BIF invocations and must pay for this. But since the
%% Emulator does not handle this properly we must compensate for
%% this with some form of load regulation of ourselves in order to
%% not steal all computation power in the Erlang Emulator and make
%% other processes starve. Hopefully this is a temporary solution.

%% start_regulator() -> pid() | nopid
%%
%% Starts the load regulator process when the dump_log_load_regulation
%% parameter is true; returns nopid when it is false. A failed start is
%% reported through fatal/2.
start_regulator() ->
    case mnesia_monitor:get_env(dump_log_load_regulation) of
        false ->
            nopid;
        true ->
            N = ?REGULATOR_NAME,
            case mnesia_monitor:start_proc(N, ?MODULE, regulator_init, [self()]) of
                {ok, Pid} ->
                    Pid;
                {error, Reason} ->
                    %% Both arguments need a directive of their own;
                    %% "~n" is a newline and consumes no argument.
                    fatal("Failed to start ~p: ~p~n", [N, Reason])
            end
    end.

%% Entry point of the regulator process: lowers its own priority,
%% registers its name, acknowledges the start to Parent and enters the
%% service loop.
regulator_init(Parent) ->
    %% No need for trapping exits.
    %% Using low priority causes the regulation
    process_flag(priority, low),
    register(?REGULATOR_NAME, self()),
    proc_lib:init_ack(Parent, {ok, self()}),
    regulator_loop().

%% Answers every {regulate, From} request and terminates normally on
%% {stop, From} after having acknowledged it.
regulator_loop() ->
    receive
        {regulate, From} ->
            From ! {regulated, self()},
            regulator_loop();
        {stop, From} ->
            From ! {stopped, self()},
            exit(normal)
    end.

%% regulate(RegulatorPid | nopid) -> ok
%%
%% Waits for one round trip to the regulator process. A no-op when
%% regulation is disabled (nopid).
regulate(nopid) ->
    ok;
regulate(RegulatorPid) ->
    RegulatorPid ! {regulate, self()},
    receive
        {regulated, RegulatorPid} -> ok
    end.

%% Looks up Var via ?catch_val and delegates to mnesia_lib:other_val/2
%% when the lookup fails.
val(Var) ->
    case ?catch_val(Var) of
        {'EXIT', Reason} -> mnesia_lib:other_val(Var, Reason);
        Value -> Value
    end.
diff --git a/lib/mnesia/src/mnesia_event.erl b/lib/mnesia/src/mnesia_event.erl new file mode 100644 index 0000000000..ec6b99ecaa --- /dev/null +++ b/lib/mnesia/src/mnesia_event.erl @@ -0,0 +1,260 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1997-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
-module(mnesia_event).

-behaviour(gen_event).
%-behaviour(mnesia_event).

%% gen_event callback interface
-export([init/1,
         handle_event/2,
         handle_call/2,
         handle_info/2,
         terminate/2,
         code_change/3]).

%% Handler state:
%%   nodes       - nodes for which mnesia_up has been seen and no
%%                 mnesia_down since
%%   dumped_core - only dump fatal core once
%%   args        - the arguments given to init/1
-record(state, {nodes = [],
                dumped_core = false,
                args}).

%%%----------------------------------------------------------------
%%% Callback functions from gen_event
%%%----------------------------------------------------------------

%%-----------------------------------------------------------------
%% init(Args) ->
%%     {ok, State} | Error
%%-----------------------------------------------------------------

init(Args) ->
    {ok, #state{args = Args}}.

%%-----------------------------------------------------------------
%% handle_event(Event, State) ->
%%     {ok, NewState} | remove_handler |
%%     {swap_handler, Args1, State1, Mod2, Args2}
%%-----------------------------------------------------------------

handle_event(Event, State) ->
    handle_any_event(Event, State).

%%-----------------------------------------------------------------
%% handle_info(Msg, State) ->
%%     {ok, NewState} | remove_handler |
%%     {swap_handler, Args1, State1, Mod2, Args2}
%%-----------------------------------------------------------------

%% Plain messages are treated exactly like events. The state returned
%% by handle_any_event/2 must be kept, otherwise changes to the node
%% list and the dumped_core flag are lost.
handle_info(Msg, State) ->
    {ok, NewState} = handle_any_event(Msg, State),
    {ok, NewState}.

%%-----------------------------------------------------------------
%% handle_call(Event, State) ->
%%     {ok, Reply, NewState} | {remove_handler, Reply} |
%%     {swap_handler, Reply, Args1, State1, Mod2, Args2}
%%-----------------------------------------------------------------

%% Calls are handled like events and always answered with ok.
handle_call(Msg, State) ->
    Reply = ok,
    {ok, NewState} = handle_any_event(Msg, State),
    {ok, Reply, NewState}.

%%-----------------------------------------------------------------
%% terminate(Reason, State) ->
%%     AnyVal
%%-----------------------------------------------------------------

terminate(_Reason, _State) ->
    ok.

%%----------------------------------------------------------------------
%% Func: code_change/3
%% Purpose: Upgrade process when its code is to be changed
%% Returns: {ok, NewState}
%%----------------------------------------------------------------------
code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

%%-----------------------------------------------------------------
%% Internal functions
%%-----------------------------------------------------------------

%% Dispatches on the event envelope. Anything that is neither a system
%% event nor a table event is reported as an error.
handle_any_event({mnesia_system_event, SysEvent}, State) ->
    handle_system_event(SysEvent, State);
handle_any_event({mnesia_table_event, TabEvent}, State) ->
    handle_table_event(TabEvent, State);
handle_any_event(Unexpected, State) ->
    report_error("~p got unexpected event: ~p~n", [?MODULE, Unexpected]),
    {ok, State}.

%% Table events are only logged.
handle_table_event({Oper, Record, TransId}, State) ->
    report_info("~p performed by ~p on record:~n\t~p~n",
                [Oper, TransId, Record]),
    {ok, State}.

%% handle_system_event(Event, State) -> {ok, NewState}
%%
%% Keeps track of the nodes where Mnesia is up, logs informational,
%% warning and error events, and reports fatal events through
%% report_fatal/4.
handle_system_event({mnesia_checkpoint_activated, _Checkpoint}, State) ->
    {ok, State};

handle_system_event({mnesia_checkpoint_deactivated, _Checkpoint}, State) ->
    {ok, State};

handle_system_event({mnesia_up, Node}, State) ->
    {ok, State#state{nodes = [Node | State#state.nodes]}};

handle_system_event({mnesia_down, Node}, State) ->
    case mnesia:system_info(fallback_activated) of
        false ->
            ok;
        true ->
            case mnesia_monitor:get_env(fallback_error_function) of
                {mnesia, lkill} ->
                    FatalMsg =
                        "A fallback is installed and Mnesia must be restarted. "
                        "Forcing shutdown after mnesia_down from ~p...~n",
                    report_fatal(FatalMsg, [Node], nocore,
                                 State#state.dumped_core),
                    mnesia:lkill(),
                    exit(fatal);
                {UserMod, UserFunc} ->
                    WarnMsg =
                        "Warning: A fallback is installed and "
                        "Mnesia got mnesia_down from ~p. ~n",
                    report_info(WarnMsg, [Node]),
                    case catch UserMod:UserFunc(Node) of
                        {'EXIT', {undef, _Stack}} ->
                            %% Backward compatibility
                            UserMod:UserFunc();
                        {'EXIT', Reason} ->
                            exit(Reason);
                        _ ->
                            ok
                    end
            end
    end,
    %% Reached unless one of the branches above has exited.
    {ok, State#state{nodes = lists:delete(Node, State#state.nodes)}};

handle_system_event({mnesia_overload, Details}, State) ->
    report_warning("Mnesia is overloaded: ~p~n", [Details]),
    {ok, State};

handle_system_event({mnesia_info, Format, Args}, State) ->
    report_info(Format, Args),
    {ok, State};

handle_system_event({mnesia_warning, Format, Args}, State) ->
    report_warning(Format, Args),
    {ok, State};

handle_system_event({mnesia_error, Format, Args}, State) ->
    report_error(Format, Args),
    {ok, State};

handle_system_event({mnesia_fatal, Format, Args, BinaryCore}, State) ->
    report_fatal(Format, Args, BinaryCore, State#state.dumped_core),
    {ok, State#state{dumped_core = true}};

handle_system_event({inconsistent_database, Reason, Node}, State) ->
    report_error("mnesia_event got {inconsistent_database, ~w, ~w}~n",
                 [Reason, Node]),
    {ok, State};

handle_system_event({mnesia_user, Event}, State) ->
    report_info("User event: ~p~n", [Event]),
    {ok, State};

handle_system_event(Unexpected, State) ->
    report_error("mnesia_event got unexpected system event: ~p~n",
                 [Unexpected]),
    {ok, State}.

%% Prints an informational message, prefixed with the node name, on the
%% global logger mnesia_global_logger when one is registered and on the
%% default io device otherwise.
report_info(Format0, Args0) ->
    PrefixedFormat = "Mnesia(~p): " ++ Format0,
    PrefixedArgs = [node() | Args0],
    case global:whereis_name(mnesia_global_logger) of
        undefined ->
            io:format(PrefixedFormat, PrefixedArgs);
        Logger ->
            io:format(Logger, PrefixedFormat, PrefixedArgs)
    end.

%% Sends a warning to error_logger (warning_msg/2 when it is exported,
%% format/2 otherwise) and echoes it on the global logger
%% mnesia_global_logger when one is registered.
report_warning(Format0, Args0) ->
    PrefixedFormat = "Mnesia(~p): ** WARNING ** " ++ Format0,
    PrefixedArgs = [node() | Args0],
    case erlang:function_exported(error_logger, warning_msg, 2) of
        false ->
            error_logger:format(PrefixedFormat, PrefixedArgs);
        true ->
            error_logger:warning_msg(PrefixedFormat, PrefixedArgs)
    end,
    case global:whereis_name(mnesia_global_logger) of
        undefined ->
            ok;
        Logger ->
            io:format(Logger, PrefixedFormat, PrefixedArgs)
    end.

%% Sends an error report to error_logger and echoes it on the global
%% logger mnesia_global_logger when one is registered.
report_error(Format0, Args0) ->
    PrefixedFormat = "Mnesia(~p): ** ERROR ** " ++ Format0,
    PrefixedArgs = [node() | Args0],
    error_logger:format(PrefixedFormat, PrefixedArgs),
    case global:whereis_name(mnesia_global_logger) of
        undefined ->
            ok;
        Logger ->
            io:format(Logger, PrefixedFormat, PrefixedArgs)
    end.

%% report_fatal(Format, Args, BinaryCore, CoreDumped)
%%
%% Reports a fatal error. The core is written to file only when no core
%% has been dumped before, BinaryCore really is a binary, and either a
%% core_dir is configured or Mnesia uses a directory. Otherwise the
%% core is ignored and only the error is reported.
report_fatal(Format, Args, BinaryCore, CoreDumped) ->
    UseDir = mnesia_monitor:use_dir(),
    CoreDir = mnesia_monitor:get_env(core_dir),
    HasTargetDir = is_list(CoreDir) orelse (UseDir == true),
    case HasTargetDir andalso (CoreDumped == false)
        andalso is_binary(BinaryCore) of
        true ->
            core_file(CoreDir, BinaryCore, Format, Args);
        false ->
            report_error("(ignoring core) ** FATAL ** " ++ Format, Args)
    end.

%% Writes BinaryCore to a file named MnesiaCore.<Node>_<A>_<B>_<C>,
%% where A, B and C are the elements of now(). The file is placed in
%% CoreDir when that is a list and in the current directory otherwise.
%% The fatal error itself is reported together with the outcome of the
%% write.
core_file(CoreDir, BinaryCore, Format, Args) ->
    %% Integers = tuple_to_list(date()) ++ tuple_to_list(time()),
    Stamp = tuple_to_list(now()),
    Suffix = lists:flatmap(fun(I) when I < 10 -> ["_0", I];
                              (I) -> ["_", I]
                           end, Stamp),
    BaseName = lists:concat(["MnesiaCore.", node() | Suffix]),
    CoreFile =
        case is_list(CoreDir) of
            true -> filename:absname(BaseName, CoreDir);
            false -> filename:absname(BaseName)
        end,
    case file:write_file(CoreFile, BinaryCore) of
        ok ->
            report_error("(core dumped to file: ~p)~n ** FATAL ** " ++ Format,
                         [CoreFile | Args]);
        {error, Reason} ->
            report_error("(could not write core file: ~p)~n ** FATAL ** " ++ Format,
                         [Reason | Args])
    end.
+ + + diff --git a/lib/mnesia/src/mnesia_frag.erl b/lib/mnesia/src/mnesia_frag.erl new file mode 100644 index 0000000000..a2958ab461 --- /dev/null +++ b/lib/mnesia/src/mnesia_frag.erl @@ -0,0 +1,1361 @@ +%%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 1998-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%%% +%%%---------------------------------------------------------------------- +%%% Purpose : Support tables so large that they need +%%% to be divided into several fragments. +%%%---------------------------------------------------------------------- + +%header_doc_include + +-module(mnesia_frag). + +%% Callback functions when accessed within an activity +-export([ + lock/4, + write/5, delete/5, delete_object/5, + read/5, match_object/5, all_keys/4, + select/5,select/6,select_cont/3, + index_match_object/6, index_read/6, + foldl/6, foldr/6, table_info/4, + first/3, next/4, prev/4, last/3, + clear_table/4 + ]). + +%header_doc_include + +%% -behaviour(mnesia_access). + +-export([ + change_table_frag/2, + remove_node/2, + expand_cstruct/1, + lookup_frag_hash/1, + lookup_foreigners/1, + frag_names/1, + set_frag_hash/2, + local_select/4, + remote_select/4 + ]). + +-include("mnesia.hrl"). + +-define(OLD_HASH_MOD, mnesia_frag_old_hash). +-define(DEFAULT_HASH_MOD, mnesia_frag_hash). +%%-define(DEFAULT_HASH_MOD, ?OLD_HASH_MOD). 
%% BUGBUG: New should be default

%% Record holding the fragmentation properties of a table; its fields
%% are read via lookup_frag_hash/1 and ?catch_val({Tab, frag_hash}).
-record(frag_state,
        {foreign_key,
         n_fragments,
         hash_module,
         hash_state}).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Access functions

%impl_doc_include

%% Callback functions which provide transparent
%% access of fragmented tables from any activity
%% access context.

%% A table lock is taken on every fragment of the table; the result is
%% the duplicate-free concatenation of what the individual lock calls
%% return. All other lock items go straight to mnesia.
lock(ActivityId, Opaque, {table, Tab}, LockKind) ->
    case frag_names(Tab) of
        [Tab] ->
            mnesia:lock(ActivityId, Opaque, {table, Tab}, LockKind);
        Frags ->
            LockFrag = fun(Frag) ->
                               mnesia:lock(ActivityId, Opaque,
                                           {table, Frag}, LockKind)
                       end,
            mnesia_lib:uniq(lists:append(lists:map(LockFrag, Frags)))
    end;

lock(ActivityId, Opaque, LockItem, LockKind) ->
    mnesia:lock(ActivityId, Opaque, LockItem, LockKind).

%% Writes Rec into the fragment selected for the record.
write(ActivityId, Opaque, Tab, Rec, LockKind) ->
    mnesia:write(ActivityId, Opaque, record_to_frag_name(Tab, Rec),
                 Rec, LockKind).

%% Deletes Key from the fragment selected for the key.
delete(ActivityId, Opaque, Tab, Key, LockKind) ->
    mnesia:delete(ActivityId, Opaque, key_to_frag_name(Tab, Key),
                  Key, LockKind).

%% Deletes Rec from the fragment selected for the record.
delete_object(ActivityId, Opaque, Tab, Rec, LockKind) ->
    mnesia:delete_object(ActivityId, Opaque, record_to_frag_name(Tab, Rec),
                         Rec, LockKind).

%% Reads Key from the fragment selected for the key.
read(ActivityId, Opaque, Tab, Key, LockKind) ->
    mnesia:read(ActivityId, Opaque, key_to_frag_name(Tab, Key),
                Key, LockKind).

%% A match_object is a select that returns whole objects.
match_object(ActivityId, Opaque, Tab, HeadPat, LockKind) ->
    select(ActivityId, Opaque, Tab, [{HeadPat, [], ['$_']}], LockKind).

select(ActivityId, Opaque, Tab, MatchSpec, LockKind) ->
    do_select(ActivityId, Opaque, Tab, MatchSpec, LockKind).


%% Select with a limit; continue with select_cont/3.
select(ActivityId, Opaque, Tab, MatchSpec, Limit, LockKind) ->
    init_select(ActivityId, Opaque, Tab, MatchSpec, Limit, LockKind).


%% The keys of all fragments, concatenated in fragment order.
all_keys(ActivityId, Opaque, Tab, LockKind) ->
    KeysOf = fun(Frag) ->
                     mnesia:all_keys(ActivityId, Opaque, Frag, LockKind)
             end,
    lists:append(lists:map(KeysOf, frag_names(Tab))).

%% Clears every fragment of the table. Always returns ok.
clear_table(ActivityId, Opaque, Tab, Obj) ->
    lists:foreach(fun(Frag) ->
                          mnesia:clear_table(ActivityId, Opaque, Frag, Obj)
                  end,
                  frag_names(Tab)),
    ok.

%% Runs the index match in every fragment and concatenates the hits in
%% fragment order.
index_match_object(ActivityId, Opaque, Tab, Pat, Attr, LockKind) ->
    MatchIn = fun(Frag) ->
                      mnesia:index_match_object(ActivityId, Opaque, Frag,
                                                Pat, Attr, LockKind)
              end,
    lists:append(lists:map(MatchIn, frag_names(Tab))).

%% Runs the index read in every fragment and concatenates the hits in
%% fragment order.
index_read(ActivityId, Opaque, Tab, Key, Attr, LockKind) ->
    ReadIn = fun(Frag) ->
                     mnesia:index_read(ActivityId, Opaque, Frag,
                                       Key, Attr, LockKind)
             end,
    lists:append(lists:map(ReadIn, frag_names(Tab))).

%% Folds over the fragments from the first to the last, threading the
%% accumulator through an mnesia:foldl of each fragment.
foldl(ActivityId, Opaque, Fun, Acc, Tab, LockKind) ->
    FoldFrag = fun(Frag, AccIn) ->
                       mnesia:foldl(ActivityId, Opaque, Fun, AccIn,
                                    Frag, LockKind)
               end,
    lists:foldl(FoldFrag, Acc, frag_names(Tab)).

%% Folds over the fragments from the last to the first, threading the
%% accumulator through an mnesia:foldr of each fragment.
foldr(ActivityId, Opaque, Fun, Acc, Tab, LockKind) ->
    FoldFrag = fun(Frag, AccIn) ->
                       mnesia:foldr(ActivityId, Opaque, Fun, AccIn,
                                    Frag, LockKind)
               end,
    lists:foldr(FoldFrag, Acc, frag_names(Tab)).

%% With {Tab, Key} the fragment selected for Key is passed on as the
%% fragment to ask; with a plain Tab the table itself is passed on.
table_info(ActivityId, Opaque, {Tab, Key}, Item) ->
    table_info2(ActivityId, Opaque, Tab, key_to_frag_name(Tab, Key), Item);
table_info(ActivityId, Opaque, Tab, Item) ->
    table_info2(ActivityId, Opaque, Tab, Tab, Item).

%% table_info2(ActivityId, Opaque, Tab, Frag, Item)
%%
%% Answers the fragmentation related items from the properties of Tab.
%% size and memory are sums over all fragments. Every other item is
%% forwarded to mnesia:table_info/4 for the fragment Frag.
table_info2(ActivityId, Opaque, Tab, Frag, Item) ->
    case Item of
        size ->
            SumFun = fun({_, Size}, Acc) -> Acc + Size end,
            lists:foldl(SumFun, 0, frag_size(ActivityId, Opaque, Tab));
        memory ->
            SumFun = fun({_, Size}, Acc) -> Acc + Size end,
            lists:foldl(SumFun, 0, frag_memory(ActivityId, Opaque, Tab));
        base_table ->
            lookup_prop(Tab, base_table);
        node_pool ->
            lookup_prop(Tab, node_pool);
        n_fragments ->
            FH = lookup_frag_hash(Tab),
            FH#frag_state.n_fragments;
        foreign_key ->
            FH = lookup_frag_hash(Tab),
            FH#frag_state.foreign_key;
        foreigners ->
            lookup_foreigners(Tab);
        n_ram_copies ->
            length(val({Tab, ram_copies}));
        n_disc_copies ->
            length(val({Tab, disc_copies}));
        n_disc_only_copies ->
            length(val({Tab, disc_only_copies}));

        frag_names ->
            frag_names(Tab);
        frag_dist ->
            frag_dist(Tab);
        frag_size ->
            frag_size(ActivityId, Opaque, Tab);
        frag_memory ->
            frag_memory(ActivityId, Opaque, Tab);
        _ ->
            mnesia:table_info(ActivityId, Opaque, Frag, Item)
    end.

%% first(ActivityId, Opaque, Tab) -> Key | '$end_of_table'
%%
%% The first key of the first non-empty fragment. Tables without a
%% frag_hash property are handed over to mnesia:first/3 unchanged.
first(ActivityId, Opaque, Tab) ->
    case ?catch_val({Tab, frag_hash}) of
        {'EXIT', _} ->
            mnesia:first(ActivityId, Opaque, Tab);
        FH ->
            %% Fragment number 1 is the base table itself.
            FirstFrag = Tab,
            case mnesia:first(ActivityId, Opaque, FirstFrag) of
                '$end_of_table' ->
                    search_first(ActivityId, Opaque, Tab, 1, FH);
                Next ->
                    Next
            end
    end.

%% Continues the search for a first key in the fragments following
%% fragment number N. The guard must be a strict "<": fragment N + 1 is
%% the one that is read, and it only exists while N < n_fragments.
search_first(ActivityId, Opaque, Tab, N, FH) when N < FH#frag_state.n_fragments ->
    NextN = N + 1,
    NextFrag = n_to_frag_name(Tab, NextN),
    case mnesia:first(ActivityId, Opaque, NextFrag) of
        '$end_of_table' ->
            search_first(ActivityId, Opaque, Tab, NextN, FH);
        Next ->
            Next
    end;
search_first(_ActivityId, _Opaque, _Tab, _N, _FH) ->
    '$end_of_table'.

%% last(ActivityId, Opaque, Tab) -> Key | '$end_of_table'
%%
%% The last key of the last non-empty fragment. Tables without a
%% frag_hash property are handed over to mnesia:last/3 unchanged.
last(ActivityId, Opaque, Tab) ->
    case ?catch_val({Tab, frag_hash}) of
        {'EXIT', _} ->
            mnesia:last(ActivityId, Opaque, Tab);
        FH ->
            LastN = FH#frag_state.n_fragments,
            search_last(ActivityId, Opaque, Tab, LastN, FH)
    end.

%% Looks for a last key in fragment number N and, while the fragments
%% turn out to be empty, in the ones before it.
search_last(ActivityId, Opaque, Tab, N, FH) when N >= 1 ->
    case mnesia:last(ActivityId, Opaque, n_to_frag_name(Tab, N)) of
        '$end_of_table' ->
            search_last(ActivityId, Opaque, Tab, N - 1, FH);
        LastKey ->
            LastKey
    end;
search_last(_ActivityId, _Opaque, _Tab, _N, _FH) ->
    '$end_of_table'.

%% prev(ActivityId, Opaque, Tab, Key) -> PrevKey | '$end_of_table'
%%
%% Steps backwards from Key within its own fragment and moves on to the
%% preceding fragments when that fragment is exhausted. Tables without
%% a frag_hash property are handed over to mnesia:prev/4 unchanged.
prev(ActivityId, Opaque, Tab, Key) ->
    case ?catch_val({Tab, frag_hash}) of
        {'EXIT', _} ->
            mnesia:prev(ActivityId, Opaque, Tab, Key);
        FH ->
            FragNo = key_to_n(FH, Key),
            OwnFrag = n_to_frag_name(Tab, FragNo),
            case mnesia:prev(ActivityId, Opaque, OwnFrag, Key) of
                '$end_of_table' ->
                    search_prev(ActivityId, Opaque, Tab, FragNo);
                PrevKey ->
                    PrevKey
            end
    end.

%% Returns the last key of the closest non-empty fragment before
%% fragment number N.
search_prev(ActivityId, Opaque, Tab, N) when N > 1 ->
    Before = N - 1,
    case mnesia:last(ActivityId, Opaque, n_to_frag_name(Tab, Before)) of
        '$end_of_table' ->
            search_prev(ActivityId, Opaque, Tab, Before);
        PrevKey ->
            PrevKey
    end;
search_prev(_ActivityId, _Opaque, _Tab, _N) ->
    '$end_of_table'.

%% next(ActivityId, Opaque, Tab, Key) -> NextKey | '$end_of_table'
%%
%% Steps forwards from Key within its own fragment and moves on to the
%% following fragments when that fragment is exhausted. Tables without
%% a frag_hash property are handed over to mnesia:next/4 unchanged.
next(ActivityId, Opaque, Tab, Key) ->
    case ?catch_val({Tab, frag_hash}) of
        {'EXIT', _} ->
            mnesia:next(ActivityId, Opaque, Tab, Key);
        FH ->
            FragNo = key_to_n(FH, Key),
            OwnFrag = n_to_frag_name(Tab, FragNo),
            case mnesia:next(ActivityId, Opaque, OwnFrag, Key) of
                '$end_of_table' ->
                    search_next(ActivityId, Opaque, Tab, FragNo, FH);
                NextKey ->
                    NextKey
            end
    end.

%% Returns the first key of the closest non-empty fragment after
%% fragment number N.
search_next(ActivityId, Opaque, Tab, N, FH) when N < FH#frag_state.n_fragments ->
    After = N + 1,
    case mnesia:first(ActivityId, Opaque, n_to_frag_name(Tab, After)) of
        '$end_of_table' ->
            search_next(ActivityId, Opaque, Tab, After, FH);
        NextKey ->
            NextKey
    end;
search_next(_ActivityId, _Opaque, _Tab, _N, _FH) ->
    '$end_of_table'.

%impl_doc_include

%% A list of {Fragment, Size} tuples, one per fragment.
frag_size(ActivityId, Opaque, Tab) ->
    SizeOf = fun(Frag) ->
                     {Frag, remote_table_info(ActivityId, Opaque, Frag, size)}
             end,
    lists:map(SizeOf, frag_names(Tab)).

%% A list of {Fragment, Memory} tuples, one per fragment.
frag_memory(ActivityId, Opaque, Tab) ->
    MemoryOf = fun(Frag) ->
                       {Frag, remote_table_info(ActivityId, Opaque, Frag, memory)}
               end,
    lists:map(MemoryOf, frag_names(Tab)).

%% Asks the node the table is read from for the table info item.
%% Aborts with {no_exists, Tab, Item} when the rpc fails.
remote_table_info(ActivityId, Opaque, Tab, Item) ->
    ReadNode = val({Tab, where_to_read}),
    case rpc:call(ReadNode, mnesia, table_info,
                  [ActivityId, Opaque, Tab, Item]) of
        {badrpc, _} ->
            mnesia:abort({no_exists, Tab, Item});
        Info ->
            Info
    end.

%% init_select(Tid, Opaque, Tab, Pat, Limit, LockKind)
%%
%% Starts a limited select. All fragments selected by verify_numbers/2
%% are locked up front, but only the first one is searched right away;
%% the remaining ones are stored in the continuation and searched by
%% select_cont/3. Tables without a frag_hash property are handed over
%% to mnesia:select/6 unchanged.
init_select(Tid, Opaque, Tab, Pat, Limit, LockKind) ->
    case ?catch_val({Tab, frag_hash}) of
        {'EXIT', _} ->
            mnesia:select(Tid, Opaque, Tab, Pat, Limit, LockKind);
        FH ->
            FragNumbers = verify_numbers(FH, Pat),
            LockFrag =
                fun(FragNo) ->
                        Frag = n_to_frag_name(Tab, FragNo),
                        ReadNode = val({Frag, where_to_read}),
                        Storage = mnesia_lib:storage_type_at_node(ReadNode, Frag),
                        mnesia:lock(Tid, Opaque, {table, Frag}, LockKind),
                        {Frag, ReadNode, Storage}
                end,
            [{FirstFrag, Node, Type} | Pending] = [LockFrag(No) || No <- FragNumbers],
            InitFun = fun(FixedSpec) ->
                              mnesia:dirty_sel_init(Node, FirstFrag, FixedSpec,
                                                    Limit, Type)
                      end,
            Res = mnesia:fun_select(Tid, Opaque, FirstFrag, Pat, LockKind,
                                    FirstFrag, InitFun, Limit, Node, Type),
            frag_sel_cont(Res, Pending, {Pat, LockKind, Limit})
    end.

%% select_cont(Tid, Ts, Continuation)
%%
%% Continues a select started by init_select/6. A frag_cont
%% continuation carries the continuation of the fragment being searched,
%% the fragments not yet searched and the original select arguments.
select_cont(_Tid, _Ts, {frag_cont, '$end_of_table', [], _Args}) ->
    '$end_of_table';
select_cont(Tid, Ts, {frag_cont, '$end_of_table', [{Frag, Node, Type} | Pending], Args}) ->
    %% The current fragment is exhausted; start on the next one.
    {Spec, LockKind, Limit} = Args,
    InitFun = fun(FixedSpec) ->
                      mnesia:dirty_sel_init(Node, Frag, FixedSpec, Limit, Type)
              end,
    Res = mnesia:fun_select(Tid, Ts, Frag, Spec, LockKind,
                            Frag, InitFun, Limit, Node, Type),
    frag_sel_cont(Res, Pending, Args);
select_cont(Tid, Ts, {frag_cont, Cont, Pending, Args}) ->
    frag_sel_cont(mnesia:select_cont(Tid, Ts, Cont), Pending, Args);
select_cont(Tid, Ts, Other) ->
    %% Not a fragmented table
    mnesia:select_cont(Tid, Ts, Other).

%% Wraps the result of a select on one fragment into a frag_cont
%% continuation. The select as a whole ends only when the fragment is
%% exhausted and no fragments remain.
frag_sel_cont('$end_of_table', [], _Args) ->
    '$end_of_table';
frag_sel_cont('$end_of_table', Pending, Args) ->
    {[], {frag_cont, '$end_of_table', Pending, Args}};
frag_sel_cont({Recs, Cont}, Pending, Args) ->
    {Recs, {frag_cont, Cont, Pending, Args}}.
%% Select without limit. Unfragmented tables are delegated to
%% mnesia:select/5. For a fragmented table every fragment picked by
%% verify_numbers/2 is locked and then:
%%  - if all of them are read on this node, they are selected one by one
%%    with mnesia:dirty_select/2 and the results appended;
%%  - otherwise a linked helper process (local_select/4) asks the remote
%%    nodes while this process selects from the local fragments, and
%%    local_collect/5 combines the answers.
%% The actual selection is wrapped in a fun handed to mnesia:fun_select/7.
do_select(ActivityId, Opaque, Tab, MatchSpec, LockKind) ->
    case ?catch_val({Tab, frag_hash}) of
        {'EXIT', _} ->
            mnesia:select(ActivityId, Opaque, Tab, MatchSpec, LockKind);
        FH ->
            FragNumbers = verify_numbers(FH,MatchSpec),
            LockFrag =
                fun(FragNo) ->
                        Frag = n_to_frag_name(Tab, FragNo),
                        ReadNode = val({Frag, where_to_read}),
                        mnesia:lock(ActivityId, Opaque, {table, Frag}, LockKind),
                        {Frag, ReadNode}
                end,
            NameNodes = lists:map(LockFrag, FragNumbers),
            SelectAllFun =
                fun(PatchedMatchSpec) ->
                        lists:append([mnesia:dirty_select(Frag, PatchedMatchSpec)
                                      || {Frag, _ReadNode} <- NameNodes])
                end,
            case [{Frag, ReadNode} || {Frag, ReadNode} <- NameNodes, ReadNode /= node()] of
                [] ->
                    %% All fragments are local
                    mnesia:fun_select(ActivityId, Opaque, Tab, MatchSpec,
                                      none, '_', SelectAllFun);
                RemoteNameNodes ->
                    Type = val({Tab,setorbag}),
                    SelectFun =
                        fun(PatchedMatchSpec) ->
                                Ref = make_ref(),
                                Pid = spawn_link(?MODULE, local_select,
                                                 [self(), Ref, RemoteNameNodes,
                                                  PatchedMatchSpec]),
                                PerFrag =
                                    [mnesia:dirty_select(Frag, PatchedMatchSpec)
                                     || {Frag, ReadNode} <- NameNodes,
                                        ReadNode == node()],
                                LocalMatch =
                                    case Type of
                                        ordered_set -> lists:merge(PerFrag);
                                        _ -> lists:append(PerFrag)
                                    end,
                                OldSelectFun =
                                    fun() -> SelectAllFun(PatchedMatchSpec) end,
                                local_collect(Ref, Pid, Type, LocalMatch, OldSelectFun)
                        end,
                    mnesia:fun_select(ActivityId, Opaque, Tab, MatchSpec,
                                      none, '_', SelectFun)
            end
    end.
%% Asks the hash module of FH which fragment numbers MatchSpec may
%% touch and checks that each one is an integer in 1..n_fragments.
%% Returns the list unchanged when it is valid, otherwise aborts and
%% reports the offending entries. If the filtering itself fails, the
%% caught 'EXIT' term is what gets reported.
verify_numbers(FH,MatchSpec) ->
    HashState = FH#frag_state.hash_state,
    FragNumbers =
        case FH#frag_state.hash_module of
            HashMod when HashMod == ?DEFAULT_HASH_MOD ->
                ?DEFAULT_HASH_MOD:match_spec_to_frag_numbers(HashState, MatchSpec);
            HashMod ->
                HashMod:match_spec_to_frag_numbers(HashState, MatchSpec)
        end,
    MaxN = FH#frag_state.n_fragments,
    IsBad = fun(F) when is_integer(F), F >= 1, F =< MaxN -> false;
               (_F) -> true
            end,
    case catch lists:filter(IsBad, FragNumbers) of
        [] ->
            FragNumbers;
        BadFrags ->
            mnesia:abort({"match_spec_to_frag_numbers: Fragment numbers out of range",
                          BadFrags, {range, 1, MaxN}})
    end.

%% Body of the helper process spawned by do_select/5. Runs
%% remote_select/4 on every distinct remote node with rpc:multicall/4,
%% reports one summary message {local_select, Ref, ok | {error, Reason}}
%% to ReplyTo, then unlinks from ReplyTo and exits normally.
local_select(ReplyTo, Ref, RemoteNameNodes, MatchSpec) ->
    RemoteNodes = mnesia_lib:uniq([Node || {_Name, Node} <- RemoteNameNodes]),
    Args = [ReplyTo, Ref, RemoteNameNodes, MatchSpec],
    {Replies, BadNodes} = rpc:multicall(RemoteNodes, ?MODULE, remote_select, Args),
    Outcome =
        case mnesia_lib:uniq(Replies) -- [ok] of
            [] when BadNodes == [] ->
                ok;
            _ when BadNodes /= [] ->
                {error, {node_not_running, hd(BadNodes)}};
            [{badrpc, {'EXIT', Reason}} | _] ->
                {error, Reason};
            [Reason | _] ->
                {error, Reason}
        end,
    ReplyTo ! {local_select, Ref, Outcome},
    unlink(ReplyTo),
    exit(normal).

%% Entry point called over rpc by local_select/4.
remote_select(ReplyTo, Ref, NameNodes, MatchSpec) ->
    do_remote_select(ReplyTo, Ref, NameNodes, MatchSpec).

%% For every {Name, Node} pair that names the executing node, selects
%% from fragment Name and sends {remote_select, Ref, Node, Result} to
%% ReplyTo, where Result is {ok, Records} or whatever the catch
%% produced. Pairs that belong to other nodes are skipped.
do_remote_select(ReplyTo, Ref, [{Name, Node} | Rest], MatchSpec) when Node == node() ->
    Res = (catch {ok, mnesia:dirty_select(Name, MatchSpec)}),
    ReplyTo ! {remote_select, Ref, Node, Res},
    do_remote_select(ReplyTo, Ref, Rest, MatchSpec);
do_remote_select(ReplyTo, Ref, [{_Name, _Node} | Rest], MatchSpec) ->
    do_remote_select(ReplyTo, Ref, Rest, MatchSpec);
do_remote_select(_ReplyTo, _Ref, [], _MatchSpec) ->
    ok.
%% Waits for the summary from the helper process Pid (or for its exit
%% signal, which is only delivered as a message if the caller traps
%% exits) and then drains the per-node answers with remote_collect/5.
local_collect(Ref, Pid, Type, LocalMatch, OldSelectFun) ->
    receive
        {local_select, Ref, LocalRes} ->
            remote_collect(Ref, Type, LocalRes, LocalMatch, OldSelectFun);
        {'EXIT', Pid, Reason} ->
            remote_collect(Ref, Type, {error, Reason}, [], OldSelectFun)
    end.

%% Drains all {remote_select, Ref, ...} messages that are already in the
%% mailbox (the receive uses a zero timeout).
%%  - While the state is ok, successful answers are merged into Acc
%%    (lists:merge/2 for ordered_set, prepended otherwise). A failed
%%    answer switches the state to {error, {node_not_running, Node}}.
%%  - In the error state the remaining answers are discarded and the
%%    function finally aborts with the recorded reason.
%% OldSelectFun is only passed along; nothing in this function calls it.
remote_collect(Ref, Type, LocalRes = ok, Acc, OldSelectFun) ->
    receive
        {remote_select, Ref, _Node, {ok, RemoteMatch}} ->
            Merged =
                case Type of
                    ordered_set -> lists:merge(RemoteMatch, Acc);
                    _ -> RemoteMatch ++ Acc
                end,
            remote_collect(Ref, Type, LocalRes, Merged, OldSelectFun);
        {remote_select, Ref, Node, _Failure} ->
            remote_collect(Ref, Type, {error, {node_not_running, Node}}, [], OldSelectFun)
    after 0 ->
            Acc
    end;
remote_collect(Ref, Type, LocalRes = {error, Reason}, _Acc, OldSelectFun) ->
    receive
        {remote_select, Ref, _Node, _RemoteRes} ->
            remote_collect(Ref, Type, LocalRes, [], OldSelectFun)
    after 0 ->
            mnesia:abort(Reason)
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Returns a list of cstructs

%% Shorthand for expand_cstruct(Cs, create).
expand_cstruct(Cs) ->
    expand_cstruct(Cs, create).
%% Validates the frag_properties of Cs and expands Cs into the list of
%% cstructs for the base table and its fragments. Mode is create or
%% activate. The replica counts n_ram_copies/n_disc_copies/
%% n_disc_only_copies default to 0; when all three are 0 the defaults
%% computed by pick_props/3 are used instead.
expand_cstruct(Cs, Mode) ->
    Tab = Cs#cstruct.name,
    Props = Cs#cstruct.frag_properties,
    mnesia_schema:verify({alt, [nil, list]}, mnesia_lib:etype(Props),
                         {badarg, Tab, Props}),

    %% Verify keys
    ValidKeys = [foreign_key, n_fragments, node_pool,
                 n_ram_copies, n_disc_copies, n_disc_only_copies,
                 hash_module, hash_state],
    Keys = mnesia_schema:check_keys(Tab, Props, ValidKeys),
    mnesia_schema:check_duplicates(Tab, Keys),

    %% Pick fragmentation props
    ForeignKey = mnesia_schema:pick(Tab, foreign_key, Props, undefined),
    {ForeignKey2, N, Pool, DefaultNR, DefaultND, DefaultNDO} =
        pick_props(Tab, Cs, ForeignKey),

    %% Verify node_pool: must be a list that contains atoms only
    BadPool = {bad_type, Tab, {node_pool, Pool}},
    mnesia_schema:verify(list, mnesia_lib:etype(Pool), BadPool),
    mnesia_schema:verify([], [P || P <- Pool, not is_atom(P)], BadPool),

    NR = mnesia_schema:pick(Tab, n_ram_copies, Props, 0),
    ND = mnesia_schema:pick(Tab, n_disc_copies, Props, 0),
    NDO = mnesia_schema:pick(Tab, n_disc_only_copies, Props, 0),

    %% Each replica count must be a non-negative integer
    mnesia_schema:verify(true, is_integer(NR) andalso NR >= 0,
                         {bad_type, Tab, {n_ram_copies, NR}}),
    mnesia_schema:verify(true, is_integer(ND) andalso ND >= 0,
                         {bad_type, Tab, {n_disc_copies, ND}}),
    mnesia_schema:verify(true, is_integer(NDO) andalso NDO >= 0,
                         {bad_type, Tab, {n_disc_only_copies, NDO}}),

    %% Verify n_fragments
    Cs2 = verify_n_fragments(N, Cs, Mode),

    %% Verify hash callback
    HashMod = mnesia_schema:pick(Tab, hash_module, Props, ?DEFAULT_HASH_MOD),
    HashState = mnesia_schema:pick(Tab, hash_state, Props, undefined),
    HashState2 = HashMod:init_state(Tab, HashState), %% BUGBUG: Catch?

    FH = #frag_state{foreign_key = ForeignKey2,
                     n_fragments = 1,
                     hash_module = HashMod,
                     hash_state = HashState2},
    {UseNR, UseND, UseNDO} =
        if
            NR == 0, ND == 0, NDO == 0 ->
                {DefaultNR, DefaultND, DefaultNDO};
            true ->
                {NR, ND, NDO}
        end,
    do_expand_cstruct(Cs2, FH, N, Pool, UseNR, UseND, UseNDO, Mode).

%% Rejects tables with local_content or snmp settings, resets the
%% frag_properties of Cs to just {base_table, Tab} and lets
%% expand_frag_cstructs/9 generate the cstructs.
do_expand_cstruct(Cs, FH, N, Pool, NR, ND, NDO, Mode) ->
    Tab = Cs#cstruct.name,

    LocalContent = Cs#cstruct.local_content,
    mnesia_schema:verify(false, LocalContent,
                         {combine_error, Tab, {local_content, LocalContent}}),

    Snmp = Cs#cstruct.snmp,
    mnesia_schema:verify([], Snmp,
                         {combine_error, Tab, {snmp, Snmp}}),

    %% Add empty fragments
    CommonProps = [{base_table, Tab}],
    Cs2 = Cs#cstruct{frag_properties = lists:sort(CommonProps)},
    expand_frag_cstructs(N, NR, ND, NDO, Cs2, Pool, Pool, FH, Mode).

%% N must be an integer >= 1, otherwise the call aborts with bad_type.
%%  - create:   returns Cs with all three replica lists emptied
%%  - activate: additionally requires N to be 1 and returns Cs as is
verify_n_fragments(N, Cs, Mode) when is_integer(N), N >= 1 ->
    case Mode of
        create ->
            Cs#cstruct{ram_copies = [],
                       disc_copies = [],
                       disc_only_copies = []};
        activate ->
            mnesia_schema:verify(1, N,
                                 {combine_error, Cs#cstruct.name, {n_fragments, N}}),
            Cs
    end;
verify_n_fragments(N, Cs, _Mode) ->
    mnesia:abort({bad_type, Cs#cstruct.name, {n_fragments, N}}).
%% Returns {ForeignKey, NFragments, NodePool, DefaultNR, DefaultND,
%% DefaultNDO} for table Tab.
%%
%% With a foreign key {ForeignTab, Attr}: n_fragments and node_pool
%% default to, and must equal, those of ForeignTab; ForeignTab must not
%% be Tab itself and must not have a foreign key of its own. Attr is
%% translated to its position in the attribute list, and the default
%% replica counts are the lengths of ForeignTab's replica lists.
%%
%% Without a foreign key: one fragment, all db_nodes as pool and one
%% ram copy are the defaults.
%%
%% Any other foreign key value aborts with bad_type.
pick_props(Tab, Cs, {ForeignTab, Attr}) ->
    mnesia_schema:verify(true, ForeignTab /= Tab,
                         {combine_error, Tab, {ForeignTab, Attr}}),
    Props = Cs#cstruct.frag_properties,
    Attrs = Cs#cstruct.attributes,

    ForeignKey = lookup_prop(ForeignTab, foreign_key),
    ForeignN = lookup_prop(ForeignTab, n_fragments),
    ForeignPool = lookup_prop(ForeignTab, node_pool),
    N = mnesia_schema:pick(Tab, n_fragments, Props, ForeignN),
    Pool = mnesia_schema:pick(Tab, node_pool, Props, ForeignPool),

    mnesia_schema:verify(ForeignN, N,
                         {combine_error, Tab, {n_fragments, N},
                          ForeignTab, {n_fragments, ForeignN}}),

    mnesia_schema:verify(ForeignPool, Pool,
                         {combine_error, Tab, {node_pool, Pool},
                          ForeignTab, {node_pool, ForeignPool}}),

    mnesia_schema:verify(undefined, ForeignKey,
                         {combine_error, Tab,
                          "Multiple levels of foreign_key dependencies",
                          {ForeignTab, Attr}, ForeignKey}),

    Key = {ForeignTab, mnesia_schema:attr_to_pos(Attr, Attrs)},
    DefaultNR = length(val({ForeignTab, ram_copies})),
    DefaultND = length(val({ForeignTab, disc_copies})),
    DefaultNDO = length(val({ForeignTab, disc_only_copies})),
    {Key, N, Pool, DefaultNR, DefaultND, DefaultNDO};
pick_props(Tab, Cs, undefined) ->
    Props = Cs#cstruct.frag_properties,
    AllNodes = mnesia:system_info(db_nodes),
    N = mnesia_schema:pick(Tab, n_fragments, Props, 1),
    Pool = mnesia_schema:pick(Tab, node_pool, Props, AllNodes),
    {undefined, N, Pool, 1, 0, 0};
pick_props(Tab, _Cs, BadKey) ->
    mnesia:abort({bad_type, Tab, {foreign_key, BadKey}}).
%% Builds the cstructs for fragments N, N-1, ..., 1 (in that order).
%% For every fragment above 1 (create mode only) replica nodes are taken
%% from the front of Dist, the distribution is rearranged, and the hash
%% state is advanced with adjust_before_split/1. Fragment 1 is the base
%% table and receives the complete set of frag_properties.
expand_frag_cstructs(N, NR, ND, NDO, CommonCs, Dist, Pool, FH, create = Mode)
  when N > 1 ->
    FragCs = CommonCs#cstruct{name = n_to_frag_name(CommonCs#cstruct.name, N)},
    {PlacedCs, RevUsed, Unused} = set_frag_nodes(NR, ND, NDO, FragCs, Dist, []),
    NextDist = rearrange_dist(FragCs, lists:reverse(RevUsed), Unused, Pool),
    %% Adjusts backwards, but it doesn't matter.
    {NextFH, _FromFrags, _AdditionalWriteFrags} = adjust_before_split(FH),
    Remaining = expand_frag_cstructs(N - 1, NR, ND, NDO, CommonCs,
                                     NextDist, Pool, NextFH, Mode),
    [PlacedCs | Remaining];
expand_frag_cstructs(1, NR, ND, NDO, CommonCs, Dist, Pool, FH, Mode) ->
    HashProps = [{foreign_key, FH#frag_state.foreign_key},
                 {hash_module, FH#frag_state.hash_module},
                 {hash_state, FH#frag_state.hash_state},
                 {n_fragments, FH#frag_state.n_fragments},
                 {node_pool, Pool}],
    BaseProps = CommonCs#cstruct.frag_properties ++ HashProps,
    BaseCs = CommonCs#cstruct{frag_properties = lists:sort(BaseProps)},
    case Mode of
        activate ->
            [BaseCs];
        create ->
            {PlacedBaseCs, _, _} = set_frag_nodes(NR, ND, NDO, BaseCs, Dist, []),
            [PlacedBaseCs]
    end.

%% Takes NR + ND + NDO entries from the front of the distribution list
%% and registers them in Cs, first as ram copies, then as disc copies
%% and finally as disc only copies. Returns {Cs2, UsedEntries, Rest}
%% where UsedEntries holds the consumed entries with incremented counts,
%% most recently used first. Aborts if the list runs out before all
%% copies have been placed.
set_frag_nodes(NR, ND, NDO, Cs, [Head | Tail], Acc) when NR > 0 ->
    {Cs2, Used} = set_frag_node(Cs, #cstruct.ram_copies, Head),
    set_frag_nodes(NR - 1, ND, NDO, Cs2, Tail, [Used | Acc]);
set_frag_nodes(NR, ND, NDO, Cs, [Head | Tail], Acc) when ND > 0 ->
    {Cs2, Used} = set_frag_node(Cs, #cstruct.disc_copies, Head),
    set_frag_nodes(NR, ND - 1, NDO, Cs2, Tail, [Used | Acc]);
set_frag_nodes(NR, ND, NDO, Cs, [Head | Tail], Acc) when NDO > 0 ->
    {Cs2, Used} = set_frag_node(Cs, #cstruct.disc_only_copies, Head),
    set_frag_nodes(NR, ND, NDO - 1, Cs2, Tail, [Used | Acc]);
set_frag_nodes(0, 0, 0, Cs, RestDist, ModDist) ->
    {Cs, ModDist, RestDist};
set_frag_nodes(_, _, _, Cs, [], _) ->
    mnesia:abort({combine_error, Cs#cstruct.name, "Too few nodes in node_pool"}).
%% Adds one replica to the cstruct Cs.
%%   Pos  - record index of the replica list to extend
%%          (#cstruct.ram_copies, #cstruct.disc_copies, ...)
%%   Head - a distribution entry: either a bare node name or a
%%          {Node, Count} tuple with a non-negative integer Count
%% The node must be a member of val({current, db_nodes}). Returns
%% {Cs2, {Node, Count + 1}}; a bare node counts as Count = 0.
%% Anything else aborts with bad_type.
set_frag_node(Cs, Pos, Head) ->
    Ns = element(Pos, Cs),
    {Node, Count2} =
        case Head of
            {N, Count} when is_atom(N), is_integer(Count), Count >= 0 ->
                {N, Count + 1};
            N when is_atom(N) ->
                {N, 1};
            BadNode ->
                mnesia:abort({bad_type, Cs#cstruct.name, BadNode})
        end,
    mnesia_schema:verify(true,
                         lists:member(Node, val({current,db_nodes})),
                         {not_active, Cs#cstruct.name, Node}),
    Cs2 = setelement(Pos, Cs, [Node | Ns]),
    {Cs2, {Node, Count2}}.

%% Re-inserts every {Node, Count} entry of the second argument into the
%% distribution list Dist at the position chosen by insert_dist/5.
rearrange_dist(Cs, [{Node, Count} | ModDist], Dist, Pool) ->
    Dist2 = insert_dist(Cs, Node, Count, Dist, Pool),
    rearrange_dist(Cs, ModDist, Dist2, Pool);
rearrange_dist(_Cs, [], Dist, _) ->
    Dist.

%% Inserts {Node, Count} in front of the first entry that node_diff/5
%% ranks it below, or at the end if there is none. Bare node names met
%% on the way are treated as {Node2, 0}. Malformed entries, or a
%% distribution that is not a list, abort with bad_type.
insert_dist(Cs, Node, Count, [Head | Tail], Pool) ->
    case Head of
        {Node2, Count2} when is_atom(Node2), is_integer(Count2), Count2 >= 0 ->
            case node_diff(Node, Count, Node2, Count2, Pool) of
                less ->
                    [{Node, Count}, Head | Tail];
                greater ->
                    [Head | insert_dist(Cs, Node, Count, Tail, Pool)]
            end;
        Node2 when is_atom(Node2) ->
            insert_dist(Cs, Node, Count, [{Node2, 0} | Tail], Pool);
        BadNode ->
            mnesia:abort({bad_type, Cs#cstruct.name, BadNode})
    end;
insert_dist(_Cs, Node, Count, [], _Pool) ->
    [{Node, Count}];
insert_dist(_Cs, _Node, _Count, Dist, _Pool) ->
    mnesia:abort({bad_type, Dist}).

%% Orders two distribution entries: primarily on their counts and, for
%% equal counts, on the position of the nodes in Pool. Returns less or
%% greater.
node_diff(_Node, Count, _Node2, Count2, _Pool) when Count < Count2 ->
    less;
node_diff(Node, Count, Node2, Count2, Pool) when Count == Count2 ->
    Pos = list_pos(Node, Pool, 1),
    Pos2 = list_pos(Node2, Pool, 1),
    if
        Pos < Pos2 ->
            less;
        true ->
            %% Pos >= Pos2. The positions are equal when the same node
            %% occurs twice in the distribution (e.g. a node_pool with a
            %% duplicated entry). That case used to fall through the 'if'
            %% and crash with if_clause; now the entry is ranked after
            %% its twin.
            greater
    end;
node_diff(_Node, Count, _Node2, Count2, _Pool) when Count > Count2 ->
    greater.

%% Returns position of element in list, counting from Pos.
%% Crashes with function_clause if the element is not in the list.
list_pos(H, [H | _T], Pos) ->
    Pos;
list_pos(E, [_H | T], Pos) ->
    list_pos(E, T, Pos + 1).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Switch function for changing of table fragmentation
%%
%% Returns a list of lists of schema ops

%% Dispatches on the requested change; an unrecognised change aborts
%% with {bad_type, Tab, Change}.
change_table_frag(Tab, Change) ->
    case Change of
        {activate, FragProps} -> make_activate(Tab, FragProps);
        deactivate -> make_deactivate(Tab);
        {add_frag, SortedNodes} -> make_multi_add_frag(Tab, SortedNodes);
        del_frag -> make_multi_del_frag(Tab);
        {add_node, Node} -> make_multi_add_node(Tab, Node);
        {del_node, Node} -> make_multi_del_node(Tab, Node);
        _ -> mnesia:abort({bad_type, Tab, Change})
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Turn a normal table into a fragmented table
%%
%% The storage type must be the same on all nodes

%% Produces the schema op that stores Props as frag_properties of Tab.
%% Aborts with already_exists if Tab already has frag_properties.
make_activate(Tab, Props) ->
    Cs = mnesia_schema:incr_version(val({Tab, cstruct})),
    mnesia_schema:ensure_active(Cs),
    case Cs#cstruct.frag_properties of
        [] ->
            [Expanded] = expand_cstruct(Cs#cstruct{frag_properties = Props}, activate),
            TabDef = mnesia_schema:cs2list(Expanded),
            [[{op, change_table_frag, activate, TabDef}]];
        BadProps ->
            mnesia:abort({already_exists, Tab, {frag_properties, BadProps}})
    end.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Turn a table into a normal defragmented table

%% Produces the schema op that clears the frag_properties of Tab.
%% Only allowed for a base table that has a single fragment and that no
%% other table refers to through a foreign key.
make_deactivate(Tab) ->
    Cs = mnesia_schema:incr_version(val({Tab, cstruct})),
    mnesia_schema:ensure_active(Cs),
    Foreigners = lookup_foreigners(Tab),
    BaseTab = lookup_prop(Tab, base_table),
    FH = lookup_frag_hash(Tab),
    if
        BaseTab /= Tab ->
            mnesia:abort({combine_error, Tab, "Not a base table"});
        Foreigners /= [] ->
            mnesia:abort({combine_error, Tab, "Too many foreigners", Foreigners});
        FH#frag_state.n_fragments > 1 ->
            mnesia:abort({combine_error, Tab, "Too many fragments"});
        true ->
            TabDef = mnesia_schema:cs2list(Cs#cstruct{frag_properties = []}),
            [[{op, change_table_frag, deactivate, TabDef}]]
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Add a fragment to a fragmented table and fill it with half of
%% the records from one of the old fragments

%% Returns one list of ops for Tab followed by one list of ops for each
%% table that uses Tab as its foreign table. SortedNs must be a list.
make_multi_add_frag(Tab, SortedNs) when is_list(SortedNs) ->
    verify_multi(Tab),
    OwnOps = make_add_frag(Tab, SortedNs),

    %% Propagate to foreigners
    ForeignOps = [make_add_frag(T, SortedNs) || T <- lookup_foreigners(Tab)],
    [OwnOps | ForeignOps];
make_multi_add_frag(Tab, SortedNs) ->
    mnesia:abort({bad_type, Tab, SortedNs}).

%% Ensures that Tab has no foreign key of its own: tables with a foreign
%% key may only be changed through the table they refer to.
verify_multi(Tab) ->
    FH = lookup_frag_hash(Tab),
    ForeignKey = FH#frag_state.foreign_key,
    mnesia_schema:verify(undefined, ForeignKey,
                         {combine_error, Tab,
                          "Op only allowed via foreign table",
                          {foreign_key, ForeignKey}}).
%% Write locks Tab and the fragments listed in FragIndecies and returns
%% a tuple of size N where element I holds the name of fragment I for
%% every listed index and 'undefined' for all others. When DoNotLockN is
%% true, fragment N gets its name entered in the tuple but is not locked.
make_frag_names_and_acquire_locks(Tab, N, FragIndecies, DoNotLockN) ->
    mnesia_schema:get_tid_ts_and_lock(Tab, write),
    Register =
        fun(Index, Names) ->
                Name = n_to_frag_name(Tab, Index),
                if
                    DoNotLockN == true, Index == N ->
                        ok;
                    true ->
                        mnesia_schema:get_tid_ts_and_lock(Name, write)
                end,
                setelement(Index, Names, Name)
        end,
    Empty = erlang:make_tuple(N, undefined),
    lists:foldl(Register, Empty, FragIndecies).

%% Produces the ops that add one fragment to Tab: the op that updates
%% the base table, the op that creates the new fragment (with the same
%% number of replicas of each storage type as the base table, placed on
%% the first nodes of SortedNs) and the record ops that move records
%% into their new fragments.
make_add_frag(Tab, SortedNs) ->
    Cs = mnesia_schema:incr_version(val({Tab, cstruct})),
    mnesia_schema:ensure_active(Cs),
    FH = lookup_frag_hash(Tab),
    {FH2, FromIndecies, WriteIndecies} = adjust_before_split(FH),
    N = FH2#frag_state.n_fragments,
    FragNames = make_frag_names_and_acquire_locks(Tab, N, WriteIndecies, true),
    NewFrag = element(N, FragNames),

    NewCs = Cs#cstruct{name = NewFrag,
                       frag_properties = [{base_table, Tab}],
                       ram_copies = [],
                       disc_copies = [],
                       disc_only_copies = []},
    {PlacedCs, _, _} = set_frag_nodes(length(Cs#cstruct.ram_copies),
                                      length(Cs#cstruct.disc_copies),
                                      length(Cs#cstruct.disc_only_copies),
                                      NewCs, SortedNs, []),
    [NewOp] = mnesia_schema:make_create_table(PlacedCs),

    SplitOps = split(Tab, FH2, FromIndecies, FragNames, []),

    TabDef = mnesia_schema:cs2list(replace_frag_hash(Cs, FH2)),
    BaseOp = {op, change_table_frag, {add_frag, SortedNs}, TabDef},

    [BaseOp, NewOp | SplitOps].

%% Rewrites the frag_properties of Cs from FH: n_fragments, hash_module
%% and hash_state are replaced, next_n_to_split and n_doubles are
%% dropped, everything else is kept.
replace_frag_hash(Cs, FH) when is_record(FH, frag_state) ->
    Rewrite =
        fun({n_fragments, _}) ->
                {true, {n_fragments, FH#frag_state.n_fragments}};
           ({hash_module, _}) ->
                {true, {hash_module, FH#frag_state.hash_module}};
           ({hash_state, _}) ->
                {true, {hash_state, FH#frag_state.hash_state}};
           ({next_n_to_split, _}) ->
                false;
           ({n_doubles, _}) ->
                false;
           (_Other) ->
                true
        end,
    Cs#cstruct{frag_properties = lists:zf(Rewrite, Cs#cstruct.frag_properties)}.
%% Adjust table info before split
%%
%% Lets the hash module add a fragment and returns
%% {FH2, FromFrags, WriteFrags}: FH2 carries the incremented fragment
%% count and the new hash state, FromFrags are the sorted fragment
%% numbers to read records from, and WriteFrags is FromFrags merged with
%% the additional fragments to write to. Aborts unless every number is
%% an integer in 1..NewCount.
adjust_before_split(FH) ->
    HashState = FH#frag_state.hash_state,
    {HashState2, FromFrags, AdditionalWriteFrags} =
        case FH#frag_state.hash_module of
            HashMod when HashMod == ?DEFAULT_HASH_MOD ->
                ?DEFAULT_HASH_MOD:add_frag(HashState);
            HashMod ->
                HashMod:add_frag(HashState)
        end,
    NewCount = FH#frag_state.n_fragments + 1,
    SortedFrom = (catch lists:sort(FromFrags)),
    WriteFrags = (catch lists:merge(SortedFrom, lists:sort(AdditionalWriteFrags))),
    IsBad = fun(F) when is_integer(F), F >= 1, F =< NewCount -> false;
               (_F) -> true
            end,
    case catch lists:filter(IsBad, WriteFrags) of
        [] ->
            {FH#frag_state{n_fragments = NewCount, hash_state = HashState2},
             SortedFrom,
             WriteFrags};
        BadFrags ->
            mnesia:abort({"add_frag: Fragment numbers out of range",
                          BadFrags, {range, 1, NewCount}})
    end.

%% Reads all records of each fragment listed in the third argument and
%% accumulates the ops that do_split/5 generates for them.
split(Tab, FH, [SplitN | SplitNs], FragNames, Ops) ->
    SplitFrag = element(SplitN, FragNames),
    Pat = mnesia:table_info(SplitFrag, wild_pattern),
    {_Mod, Tid, Ts} = mnesia_schema:get_tid_ts_and_lock(Tab, none),
    Recs = mnesia:match_object(Tid, Ts, SplitFrag, Pat, read),
    split(Tab, FH, SplitNs, FragNames, do_split(FH, SplitN, FragNames, Recs, Ops));
split(_Tab, _FH, [], _FragNames, Ops) ->
    Ops.

%% Perform the split of the table
%%
%% Every record whose hash key now maps to another fragment than OldN
%% yields a write op for the new fragment and a delete op for the old
%% one, both prepended to Ops. Aborts if the target fragment has no name
%% in FragNames, which means that it was not locked.
do_split(FH, OldN, FragNames, [Rec | Recs], Ops) ->
    HashKey = element(key_pos(FH), Rec),
    case key_to_n(FH, HashKey) of
        SameN when SameN == OldN ->
            %% Keep record in the same fragment. No need to move it.
            do_split(FH, OldN, FragNames, Recs, Ops);
        NewN ->
            case element(NewN, FragNames) of
                undefined ->
                    %% Tried to move record to fragment that not is locked
                    mnesia:abort({"add_frag: Fragment not locked", NewN});
                NewFrag ->
                    Key = element(2, Rec),
                    NewOid = {NewFrag, Key},
                    OldOid = {element(OldN, FragNames), Key},
                    MoveOps = [{op, rec, unknown, {NewOid, [Rec], write}},
                               {op, rec, unknown, {OldOid, [OldOid], delete}}],
                    do_split(FH, OldN, FragNames, Recs, MoveOps ++ Ops)
            end
    end;
do_split(_FH, _OldN, _FragNames, [], Ops) ->
    Ops.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Delete a fragment from a fragmented table
%% and merge its records with an other fragment

%% Returns one list of ops for Tab followed by one list of ops for each
%% table that uses Tab as its foreign table.
make_multi_del_frag(Tab) ->
    verify_multi(Tab),
    OwnOps = make_del_frag(Tab),

    %% Propagate to foreigners
    ForeignOps = [make_del_frag(T) || T <- lookup_foreigners(Tab)],
    [OwnOps | ForeignOps].

%% Produces the ops that remove the highest numbered fragment of Tab:
%% the op that updates the base table, the op that deletes the fragment
%% table and the record ops that move its records elsewhere. Aborts with
%% {no_exists, Tab} if only one fragment is left.
make_del_frag(Tab) ->
    FH = lookup_frag_hash(Tab),
    case FH#frag_state.n_fragments of
        N when N > 1 ->
            Cs = mnesia_schema:incr_version(val({Tab, cstruct})),
            mnesia_schema:ensure_active(Cs),
            {FH2, FromIndecies, WriteIndecies} = adjust_before_merge(FH),
            FragNames = make_frag_names_and_acquire_locks(Tab, N, WriteIndecies, false),

            MergeOps = merge(Tab, FH2, FromIndecies, FragNames, []),
            LastFrag = element(N, FragNames),
            [LastOp] = mnesia_schema:make_delete_table(LastFrag, single_frag),
            TabDef = mnesia_schema:cs2list(replace_frag_hash(Cs, FH2)),
            BaseOp = {op, change_table_frag, del_frag, TabDef},
            [BaseOp, LastOp | MergeOps];
        _ ->
            %% Cannot remove the last fragment
            mnesia:abort({no_exists, Tab})
    end.
%% Adjust tab info before merge
%%
%% Lets the hash module delete a fragment and returns
%% {FH2, FromFrags, WriteFrags}: FH2 carries the decremented fragment
%% count and the new hash state, FromFrags are the sorted fragment
%% numbers to read records from, and WriteFrags is FromFrags merged with
%% the additional fragments to write to. Aborts unless every number is
%% an integer in 1..CurrentCount and the last fragment is one of the
%% fragments to read from.
adjust_before_merge(FH) ->
    HashState = FH#frag_state.hash_state,
    {HashState2, FromFrags, AdditionalWriteFrags} =
        case FH#frag_state.hash_module of
            HashMod when HashMod == ?DEFAULT_HASH_MOD ->
                ?DEFAULT_HASH_MOD:del_frag(HashState);
            HashMod ->
                HashMod:del_frag(HashState)
        end,
    LastN = FH#frag_state.n_fragments,
    SortedFrom = (catch lists:sort(FromFrags)),
    WriteFrags = (catch lists:merge(SortedFrom, lists:sort(AdditionalWriteFrags))),
    IsBad = fun(F) when is_integer(F), F >= 1, F =< LastN -> false;
               (_F) -> true
            end,
    case catch lists:filter(IsBad, WriteFrags) of
        [] ->
            case lists:member(LastN, SortedFrom) of
                true ->
                    {FH#frag_state{n_fragments = LastN - 1, hash_state = HashState2},
                     SortedFrom,
                     WriteFrags};
                false ->
                    mnesia:abort({"del_frag: Last fragment number not included", LastN})
            end;
        BadFrags ->
            mnesia:abort({"del_frag: Fragment numbers out of range",
                          BadFrags, {range, 1, LastN}})
    end.

%% Reads all records of each fragment listed in the third argument and
%% accumulates the ops that do_merge/5 generates for them.
merge(Tab, FH, [FromN | FromNs], FragNames, Ops) ->
    FromFrag = element(FromN, FragNames),
    Pat = mnesia:table_info(FromFrag, wild_pattern),
    {_Mod, Tid, Ts} = mnesia_schema:get_tid_ts_and_lock(Tab, none),
    Recs = mnesia:match_object(Tid, Ts, FromFrag, Pat, read),
    merge(Tab, FH, FromNs, FragNames, do_merge(FH, FromN, FragNames, Recs, Ops));
merge(_Tab, _FH, [], _FragNames, Ops) ->
    Ops.

%% Perform the merge of the table
%%
%% FH is the hash state after the deletion, so the fragment that is
%% being deleted has number n_fragments + 1. Records that map to a
%% different fragment than the one they were read from yield a write op
%% for the new fragment. A delete op for the old fragment is added too,
%% unless the old fragment is the one being deleted.
do_merge(FH, OldN, FragNames, [Rec | Recs], Ops) ->
    DeletedN = FH#frag_state.n_fragments + 1,
    HashKey = element(key_pos(FH), Rec),
    case key_to_n(FH, HashKey) of
        NewN when NewN == DeletedN ->
            %% Tried to leave a record in the fragment that is to be deleted
            mnesia:abort({"del_frag: Fragment number out of range",
                          NewN, {range, 1, DeletedN}});
        NewN when NewN == OldN ->
            %% Keep record in the same fragment. No need to move it.
            do_merge(FH, OldN, FragNames, Recs, Ops);
        NewN when OldN == DeletedN ->
            %% Move record from the fragment that is to be deleted
            %% No need to create a delete op for each record.
            case element(NewN, FragNames) of
                undefined ->
                    %% Tried to move record to fragment that not is locked
                    mnesia:abort({"del_frag: Fragment not locked", NewN});
                NewFrag ->
                    NewOid = {NewFrag, element(2, Rec)},
                    WriteOp = {op, rec, unknown, {NewOid, [Rec], write}},
                    do_merge(FH, OldN, FragNames, Recs, [WriteOp | Ops])
            end;
        NewN ->
            case element(NewN, FragNames) of
                undefined ->
                    %% Tried to move record to fragment that not is locked
                    mnesia:abort({"del_frag: Fragment not locked", NewN});
                NewFrag ->
                    Key = element(2, Rec),
                    NewOid = {NewFrag, Key},
                    OldOid = {element(OldN, FragNames), Key},
                    MoveOps = [{op, rec, unknown, {NewOid, [Rec], write}},
                               {op, rec, unknown, {OldOid, [OldOid], delete}}],
                    do_merge(FH, OldN, FragNames, Recs, MoveOps ++ Ops)
            end
    end;
do_merge(_FH, _OldN, _FragNames, [], Ops) ->
    Ops.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Add a node to the node pool of a fragmented table

%% Returns one list of ops for Tab followed by one list of ops for each
%% table that uses Tab as its foreign table.
make_multi_add_node(Tab, Node) ->
    verify_multi(Tab),
    OwnOps = make_add_node(Tab, Node),

    %% Propagate to foreigners
    ForeignOps = [make_add_node(T, Node) || T <- lookup_foreigners(Tab)],
    [OwnOps | ForeignOps].
%% Produces the op that appends Node to the node_pool property of Tab.
%% Aborts with already_exists if Node is in the pool already, and with
%% bad_type if Node is not an atom.
make_add_node(Tab, Node) when is_atom(Node) ->
    Pool = lookup_prop(Tab, node_pool),
    case lists:member(Node, Pool) of
        true ->
            mnesia:abort({already_exists, Tab, Node});
        false ->
            Cs = mnesia_schema:incr_version(val({Tab, cstruct})),
            NewPool = Pool ++ [Node],
            NewProps = lists:keyreplace(node_pool, 1, Cs#cstruct.frag_properties,
                                        {node_pool, NewPool}),
            TabDef = mnesia_schema:cs2list(Cs#cstruct{frag_properties = NewProps}),
            [{op, change_table_frag, {add_node, Node}, TabDef}]
    end;
make_add_node(Tab, Node) ->
    mnesia:abort({bad_type, Tab, Node}).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Delete a node from the node pool of a fragmented table

%% Returns one list of ops for Tab followed by one list of ops for each
%% table that uses Tab as its foreign table.
make_multi_del_node(Tab, Node) ->
    verify_multi(Tab),
    OwnOps = make_del_node(Tab, Node),

    %% Propagate to foreigners
    ForeignOps = [make_del_node(T, Node) || T <- lookup_foreigners(Tab)],
    [OwnOps | ForeignOps].

%% Produces the op that removes Node from the node_pool property of Tab.
%% Aborts with no_exists if Node is not in the pool, and with bad_type
%% if Node is not an atom.
make_del_node(Tab, Node) when is_atom(Node) ->
    Cs = mnesia_schema:incr_version(val({Tab, cstruct})),
    mnesia_schema:ensure_active(Cs),
    Pool = lookup_prop(Tab, node_pool),
    case lists:member(Node, Pool) of
        false ->
            mnesia:abort({no_exists, Tab, Node});
        true ->
            NewPool = Pool -- [Node],
            NewProps = lists:keyreplace(node_pool, 1, Cs#cstruct.frag_properties,
                                        {node_pool, NewPool}),
            TabDef = mnesia_schema:cs2list(Cs#cstruct{frag_properties = NewProps}),
            [{op, change_table_frag, {del_node, Node}, TabDef}]
    end;
make_del_node(Tab, Node) ->
    mnesia:abort({bad_type, Tab, Node}).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Special case used to remove all references to a node during
%% mnesia:del_table_copy(schema, Node)

%% Returns {Cs2, Changed}. Cs2 has Node removed from its node_pool
%% property and Changed is true when the table of Cs has a frag_hash
%% entry and Node was a member of its node pool. Otherwise Cs is
%% returned untouched together with false.
remove_node(Node, Cs) ->
    Tab = Cs#cstruct.name,
    case ?catch_val({Tab, frag_hash}) of
        {'EXIT', _} ->
            {Cs, false};
        _ ->
            Pool = lookup_prop(Tab, node_pool),
            case lists:member(Node, Pool) of
                false ->
                    {Cs, false};
                true ->
                    NewProps = lists:keyreplace(node_pool, 1,
                                                Cs#cstruct.frag_properties,
                                                {node_pool, Pool -- [Node]}),
                    {Cs#cstruct{frag_properties = NewProps}, true}
            end
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Helpers

%% Looks up Var with ?catch_val and hands failures over to
%% mnesia_lib:other_val/2.
val(Var) ->
    case ?catch_val(Var) of
        {'EXIT', Reason} -> mnesia_lib:other_val(Var, Reason);
        Value -> Value
    end.

%% Stores the frag_state derived from Props under {Tab, frag_hash}, or
%% removes that entry when Props do not describe a base table.
set_frag_hash(Tab, Props) ->
    case props_to_frag_hash(Tab, Props) of
        no_hash ->
            mnesia_lib:unset({Tab, frag_hash});
        FH when is_record(FH, frag_state) ->
            mnesia_lib:set({Tab, frag_hash}, FH)
    end.

%% Builds a #frag_state{} from the frag_properties of Tab. Returns
%% no_hash when Props is empty or when its base_table is not Tab itself.
props_to_frag_hash(_Tab, []) ->
    no_hash;
props_to_frag_hash(Tab, Props) ->
    case mnesia_schema:pick(Tab, base_table, Props, undefined) of
        T when T == Tab ->
            Foreign = mnesia_schema:pick(Tab, foreign_key, Props, must),
            N = mnesia_schema:pick(Tab, n_fragments, Props, must),
            {HashMod, HashState} =
                case mnesia_schema:pick(Tab, hash_module, Props, undefined) of
                    undefined ->
                        %% Old style. Kept for backwards compatibility.
                        Split = mnesia_schema:pick(Tab, next_n_to_split, Props, must),
                        Doubles = mnesia_schema:pick(Tab, n_doubles, Props, must),
                        OldFH = {frag_hash, Foreign, N, Split, Doubles},
                        {?OLD_HASH_MOD, ?OLD_HASH_MOD:init_state(Tab, OldFH)};
                    Mod ->
                        {Mod, mnesia_schema:pick(Tab, hash_state, Props, must)}
                end,
            #frag_state{foreign_key = Foreign,
                        n_fragments = N,
                        hash_module = HashMod,
                        hash_state = HashState};
        _ ->
            no_hash
    end.
%% Returns the value of property Prop in the frag_properties of Tab.
%% Aborts with no_exists if there is no such property.
lookup_prop(Tab, Prop) ->
    Props = val({Tab, frag_properties}),
    case lists:keysearch(Prop, 1, Props) of
        false ->
            mnesia:abort({no_exists, Tab, Prop, {frag_properties, Props}});
        {value, {Prop, Val}} ->
            Val
    end.

%% Returns the #frag_state{} stored under {Tab, frag_hash}. An old
%% style {frag_hash, ...} tuple is converted on the fly. Aborts with
%% no_exists if there is no entry.
lookup_frag_hash(Tab) ->
    case ?catch_val({Tab, frag_hash}) of
        FH when is_record(FH, frag_state) ->
            FH;
        {frag_hash, K, N, _S, _D} = OldFH ->
            %% Old style. Kept for backwards compatibility.
            #frag_state{foreign_key = K,
                        n_fragments = N,
                        hash_module = ?OLD_HASH_MOD,
                        hash_state = ?OLD_HASH_MOD:init_state(Tab, OldFH)};
        {'EXIT', _} ->
            mnesia:abort({no_exists, Tab, frag_properties, frag_hash})
    end.

%% Returns a list of tables
%%
%% The tables returned are those whose frag_hash entry in mnesia_gvar
%% has a foreign key of the form {Tab, _}.
lookup_foreigners(Tab) ->
    %% First field in HashPat is either frag_hash or frag_state
    HashPat = {'_', {Tab, '_'}, '_', '_', '_'},
    Matches = ?ets_match(mnesia_gvar, {{'$1', frag_hash}, HashPat}),
    [T || [T] <- Matches].

%% Returns name of fragment table
%%
%% The fragment is the one that record Rec belongs to. Tab itself is
%% returned when Tab has no frag_hash entry.
record_to_frag_name(Tab, Rec) ->
    case ?catch_val({Tab, frag_hash}) of
        {'EXIT', _} ->
            Tab;
        FH ->
            Key = element(key_pos(FH), Rec),
            n_to_frag_name(Tab, key_to_n(FH, Key))
    end.

%% Position in the record of the field that is hashed: 2 (the key) for
%% tables without a foreign key, otherwise the position stored in the
%% foreign key.
key_pos(FH) ->
    case FH#frag_state.foreign_key of
        {_ForeignTab, Pos} ->
            Pos;
        undefined ->
            2
    end.

%% Returns name of fragment table
%%
%% The first argument is either a table name or a {BaseTab, ForeignKey}
%% tuple; it is passed unchanged to key_to_frag_number/2.
key_to_frag_name({BaseTab, _} = Tab, Key) ->
    n_to_frag_name(BaseTab, key_to_frag_number(Tab, Key));
key_to_frag_name(Tab, Key) ->
    n_to_frag_name(Tab, key_to_frag_number(Tab, Key)).

%% Returns name of fragment table
%%
%% Fragment 1 is the base table itself; fragment N is named
%% <Tab>_frag<N>. Aborts with bad_type unless Tab is an atom and N an
%% integer.
n_to_frag_name(Tab, 1) ->
    Tab;
n_to_frag_name(Tab, N) when is_atom(Tab), is_integer(N) ->
    Suffix = "_frag" ++ integer_to_list(N),
    list_to_atom(atom_to_list(Tab) ++ Suffix);
n_to_frag_name(Tab, N) ->
    mnesia:abort({bad_type, Tab, N}).
%% Returns fragment number
%%
%% With a {Tab, ForeignKey} tuple the number is computed from ForeignKey
%% and Tab must have a foreign key defined, otherwise the call aborts.
%% With a plain table name the number is computed from Key; tables
%% without a frag_hash entry always yield 1.
key_to_frag_number({Tab, ForeignKey}, _Key) ->
    FH = val({Tab, frag_hash}),
    case FH#frag_state.foreign_key of
        undefined ->
            mnesia:abort({combine_error, Tab, frag_properties,
                          {foreign_key, undefined}});
        {_ForeignTab, _Pos} ->
            key_to_n(FH, ForeignKey)
    end;
key_to_frag_number(Tab, Key) ->
    case ?catch_val({Tab, frag_hash}) of
        {'EXIT', _} ->
            1;
        FH ->
            key_to_n(FH, Key)
    end.

%% Returns fragment number
%%
%% The number is whatever the hash module of FH computes for Key. The
%% call aborts unless that is an integer in 1..n_fragments.
key_to_n(FH, Key) ->
    HashState = FH#frag_state.hash_state,
    FragNo =
        case FH#frag_state.hash_module of
            HashMod when HashMod == ?DEFAULT_HASH_MOD ->
                ?DEFAULT_HASH_MOD:key_to_frag_number(HashState, Key);
            HashMod ->
                HashMod:key_to_frag_number(HashState, Key)
        end,
    if
        is_integer(FragNo), FragNo >= 1, FragNo =< FH#frag_state.n_fragments ->
            FragNo;
        true ->
            mnesia:abort({"key_to_frag_number: Fragment number out of range",
                          FragNo, {range, 1, FH#frag_state.n_fragments}})
    end.

%% Returns a list of fragment table names
%%
%% The names are ordered by fragment number. A table without a
%% frag_hash entry yields [Tab].
frag_names(Tab) ->
    case ?catch_val({Tab, frag_hash}) of
        {'EXIT', _} ->
            [Tab];
        FH ->
            frag_names(Tab, FH#frag_state.n_fragments, [])
    end.

%% Prepends the names of fragments N, N-1, ..., 1 to Acc.
frag_names(Tab, 1, Acc) ->
    [Tab | Acc];
frag_names(Tab, N, Acc) ->
    frag_names(Tab, N - 1, [n_to_frag_name(Tab, N) | Acc]).

%% Returns a list of {Node, FragCount} tuples
%% sorted on FragCounts
frag_dist(Tab) ->
    Pool = lookup_prop(Tab, node_pool),
    Initial = [{good, Node, 0} || Node <- Pool],
    Counted = count_frag(frag_names(Tab), Initial),
    sort_dist(Counted).

%% For every fragment, increments the count of each node that holds a
%% ram, disc or disc only copy of it.
count_frag(Frags, Dist) ->
    CountOne =
        fun(Frag, Acc0) ->
                Acc1 = incr_nodes(val({Frag, ram_copies}), Acc0),
                Acc2 = incr_nodes(val({Frag, disc_copies}), Acc1),
                incr_nodes(val({Frag, disc_only_copies}), Acc2)
        end,
    lists:foldl(CountOne, Dist, Frags).

%% Increments the count of every node in the list.
incr_nodes(Nodes, Dist) ->
    lists:foldl(fun incr_node/2, Dist, Nodes).
%% Increments the count of Node in a list of {Kind, Node, Count}
%% entries. A node that is not present is appended as {bad, Node, 1}.
incr_node(Node, [{Kind, Node, Count} | Rest]) ->
    [{Kind, Node, Count + 1} | Rest];
incr_node(Node, [Other | Rest]) ->
    [Other | incr_node(Node, Rest)];
incr_node(Node, []) ->
    [{bad, Node, 1}].

%% Sorts a list of {Kind, Node, Count} entries into a list of
%% {Node, Count} tuples in increasing count order. Entries grouped under
%% the tag 'bad' come last, since an atom sorts after every number in
%% lists:keysort/2.
sort_dist(Dist) ->
    Groups = deep_dist(Dist, []),
    shallow_dist(lists:keysort(1, Groups)).

%% Groups the entries: each step collects the entries that belong with
%% the first one (see pick_count/3) into a {Tag, Entries} pair.
deep_dist([First | Rest], Deep) ->
    {Kind, _Node, Count} = First,
    {Tag, Same, Other} = pick_count(Kind, Count, [First | Rest]),
    deep_dist(Other, [{Tag, Same} | Deep]);
deep_dist([], Deep) ->
    Deep.

%% Splits the list into {Tag, Same, Other}.
%%  - Kind == bad: every entry goes to Same and Tag is bad.
%%  - otherwise:   Same gets the non-bad entries whose count equals
%%                 Count, Other gets the rest, and Tag is Count.
%% Entries in Same are reduced to {Node, Count}; the relative order of
%% the entries is kept in both lists.
pick_count(Kind, Count, [{Kind2, Node2, Count2} = Entry | Rest]) ->
    {_, Same, Other} = pick_count(Kind, Count, Rest),
    Picked = {Node2, Count2},
    if
        Kind == bad ->
            {bad, [Picked | Same], Other};
        Kind2 == bad ->
            {Count, Same, [Entry | Other]};
        Count == Count2 ->
            {Count, [Picked | Same], Other};
        true ->
            {Count, Same, [Entry | Other]}
    end;
pick_count(_Kind, Count, []) ->
    {Count, [], []}.

%% Concatenates the entry lists of all {Tag, Entries} groups.
shallow_dist(Deep) ->
    lists:flatmap(fun({_Tag, Entries}) -> Entries end, Deep).

%% ------------------------------------------------------------------
%% Patch header of the next file in the scraped diff, kept verbatim:
%%
%% diff --git a/lib/mnesia/src/mnesia_frag_hash.erl b/lib/mnesia/src/mnesia_frag_hash.erl
%% new file mode 100644
%% index 0000000000..610ba2535c
%% --- /dev/null
%% +++ b/lib/mnesia/src/mnesia_frag_hash.erl
%% @@ -0,0 +1,151 @@
%% ------------------------------------------------------------------
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2002-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
%%%----------------------------------------------------------------------
%%% Purpose : Implements hashing functionality for fragmented tables
%%%----------------------------------------------------------------------

%header_doc_include
-module(mnesia_frag_hash).

%% Fragmented Table Hashing callback functions
-export([
         init_state/2,
         add_frag/1,
         del_frag/1,
         key_to_frag_number/2,
         match_spec_to_frag_numbers/2
        ]).

%header_doc_include
%%-behaviour(mnesia_frag_hash).

%impl_doc_include
%% n_fragments     - current number of fragments
%% next_n_to_split - number of the fragment that the next add_frag splits
%% n_doubles       - how many times the fragment count has doubled
%% function        - phash | phash2, the erlang hash BIF to use
-record(hash_state,
        {n_fragments,
         next_n_to_split,
         n_doubles,
         function}).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Initial state of a table with a single fragment, hashed with phash2.
init_state(_Tab, undefined) ->
    #hash_state{function = phash2,
                n_doubles = 0,
                next_n_to_split = 1,
                n_fragments = 1}.

%% Upgrades a state tuple that lacks the 'function' field; such states
%% are given function = phash.
convert_old_state({hash_state, Count, Split, Doubles}) ->
    #hash_state{function = phash,
                n_doubles = Doubles,
                next_n_to_split = Split,
                n_fragments = Count}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Adds one fragment.  Returns {NewState, [FragToSplit], [NewFragNumber]}.
%% When the fragment just split was the last one of the current round,
%% the round counter is bumped and splitting restarts at fragment 1.
add_frag(#hash_state{next_n_to_split = Split,
                     n_doubles = Doubles,
                     n_fragments = Count} = Old) ->
    Next = Split + 1,
    Added = Count + 1,
    Grown = Old#hash_state{n_fragments = Added},
    New =
        case power2(Doubles) + 1 == Next of
            true ->
                Grown#hash_state{n_doubles = Doubles + 1,
                                 next_n_to_split = 1};
            false ->
                Grown#hash_state{next_n_to_split = Next}
        end,
    {New, [Split], [Added]};
add_frag(Legacy) ->
    add_frag(convert_old_state(Legacy)).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Removes the last fragment.
%% Returns {NewState, [RemovedFragNumber], [FragToMergeInto]}.
del_frag(#hash_state{next_n_to_split = Split,
                     n_doubles = Doubles,
                     n_fragments = Count} = Old) ->
    Shrunk = Old#hash_state{n_fragments = Count - 1},
    case Split - 1 of
        Prev when Prev < 1 ->
            %% Step back into the previous doubling round.
            Halved = Doubles - 1,
            Target = power2(Halved),
            {Shrunk#hash_state{next_n_to_split = Target,
                               n_doubles = Halved},
             [Count], [Target]};
        Prev ->
            {Shrunk#hash_state{next_n_to_split = Prev}, [Count], [Prev]}
    end;
del_frag(Legacy) ->
    del_frag(convert_old_state(Legacy)).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Hashes Key over 2^n_doubles buckets; a bucket below next_n_to_split
%% has already been split in this round, so the key is rehashed over
%% twice as many buckets.  phash results are used as they are, phash2
%% results are incremented by one.
key_to_frag_number(#hash_state{function = phash,
                               next_n_to_split = Split,
                               n_doubles = Doubles}, Key) ->
    case erlang:phash(Key, power2(Doubles)) of
        Bucket when Bucket < Split ->
            erlang:phash(Key, power2(Doubles + 1));
        Bucket ->
            Bucket
    end;
key_to_frag_number(#hash_state{function = phash2,
                               next_n_to_split = Split,
                               n_doubles = Doubles}, Key) ->
    case erlang:phash2(Key, power2(Doubles)) + 1 of
        Bucket when Bucket < Split ->
            erlang:phash2(Key, power2(Doubles + 1)) + 1;
        Bucket ->
            Bucket
    end;
key_to_frag_number(Legacy, Key) ->
    key_to_frag_number(convert_old_state(Legacy), Key).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Returns the fragment numbers a match specification has to visit.
%% Only a single-clause spec whose head is a tuple of more than two
%% elements with a variable-free key (element 2) narrows the search to
%% one fragment; anything else visits all fragments.
match_spec_to_frag_numbers(#hash_state{n_fragments = N} = State,
                           [{HeadPat, _, _}])
  when is_tuple(HeadPat), tuple_size(HeadPat) > 2 ->
    KeyPat = element(2, HeadPat),
    case has_var(KeyPat) of
        false -> [key_to_frag_number(State, KeyPat)];
        true -> lists:seq(1, N)
    end;
match_spec_to_frag_numbers(#hash_state{n_fragments = N}, _MatchSpec) ->
    lists:seq(1, N);
match_spec_to_frag_numbers(Legacy, MatchSpec) ->
    match_spec_to_frag_numbers(convert_old_state(Legacy), MatchSpec).

power2(Y) ->
    1 bsl Y. % trunc(math:pow(2, Y)).

%impl_doc_include

has_var(Pat) ->
    mnesia:has_var(Pat).
diff --git a/lib/mnesia/src/mnesia_frag_old_hash.erl b/lib/mnesia/src/mnesia_frag_old_hash.erl
new file mode 100644
index 0000000000..817bb54eb1
--- /dev/null
+++ b/lib/mnesia/src/mnesia_frag_old_hash.erl
@@ -0,0 +1,132 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2002-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
%%%----------------------------------------------------------------------
%%% Purpose : Implements hashing functionality for fragmented tables
%%%----------------------------------------------------------------------

-module(mnesia_frag_old_hash).
%%-behaviour(mnesia_frag_hash).

%% erlang:hash/2 is used on purpose below; silence the deprecation warning.
-compile({nowarn_deprecated_function, {erlang,hash,2}}).

%% Hashing callback functions
-export([
         init_state/2,
         add_frag/1,
         del_frag/1,
         key_to_frag_number/2,
         match_spec_to_frag_numbers/2
        ]).

%% n_fragments     - current number of fragments
%% next_n_to_split - number of the fragment that the next add_frag splits
%% n_doubles       - how many times the fragment count has doubled
-record(old_hash_state,
        {n_fragments,
         next_n_to_split,
         n_doubles}).

%% Old style. Kept for backwards compatibility.
-record(frag_hash,
        {foreign_key,
         n_fragments,
         next_n_to_split,
         n_doubles}).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Builds the callback state: either a fresh single-fragment state, or
%% one converted from an old-style #frag_hash{} record (its foreign_key
%% field is not carried over).
init_state(_Tab, InitialState) when InitialState == undefined ->
    #old_hash_state{n_fragments = 1,
                    next_n_to_split = 1,
                    n_doubles = 0};
init_state(_Tab, FH) when is_record(FH, frag_hash) ->
    %% Old style. Kept for backwards compatibility.
    #old_hash_state{n_fragments = FH#frag_hash.n_fragments,
                    next_n_to_split = FH#frag_hash.next_n_to_split,
                    n_doubles = FH#frag_hash.n_doubles}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Adds one fragment.  Returns {NewState, [FragToSplit], [NewFragNumber]}.
add_frag(State) when is_record(State, old_hash_state) ->
    SplitN = State#old_hash_state.next_n_to_split,
    P = SplitN + 1,
    L = State#old_hash_state.n_doubles,
    NewN = State#old_hash_state.n_fragments + 1,
    State2 = case trunc(math:pow(2, L)) + 1 of
                 P2 when P2 == P ->
                     %% The round is complete: start a new doubling
                     %% round at fragment 1.
                     State#old_hash_state{n_fragments = NewN,
                                          next_n_to_split = 1,
                                          n_doubles = L + 1};
                 _ ->
                     State#old_hash_state{n_fragments = NewN,
                                          next_n_to_split = P}
             end,
    {State2, [SplitN], [NewN]}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Removes the last fragment.
%% Returns {NewState, [RemovedFragNumber], [FragToMergeInto]}.
del_frag(State) when is_record(State, old_hash_state) ->
    P = State#old_hash_state.next_n_to_split - 1,
    L = State#old_hash_state.n_doubles,
    N = State#old_hash_state.n_fragments,
    if
        P < 1 ->
            %% Step back into the previous doubling round.
            L2 = L - 1,
            MergeN = trunc(math:pow(2, L2)),
            State2 = State#old_hash_state{n_fragments = N - 1,
                                          next_n_to_split = MergeN,
                                          n_doubles = L2},
            {State2, [N], [MergeN]};
        true ->
            MergeN = P,
            State2 = State#old_hash_state{n_fragments = N - 1,
                                          next_n_to_split = MergeN},
            {State2, [N], [MergeN]}
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Hashes Key over 2^n_doubles buckets with erlang:hash/2; a result below
%% next_n_to_split is rehashed over twice as many buckets.
key_to_frag_number(State, Key) when is_record(State, old_hash_state) ->
    L = State#old_hash_state.n_doubles,
    A = erlang:hash(Key, trunc(math:pow(2, L))),
    P = State#old_hash_state.next_n_to_split,
    if
        A < P ->
            erlang:hash(Key, trunc(math:pow(2, L + 1)));
        true ->
            A
    end.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Returns the fragment numbers a match specification has to visit.
%% Only a single-clause spec whose head is a tuple of more than two
%% elements with a variable-free key (element 2) narrows the search to
%% one fragment; anything else visits all fragments.
match_spec_to_frag_numbers(State, MatchSpec) when is_record(State, old_hash_state) ->
    case MatchSpec of
        [{HeadPat, _, _}] when is_tuple(HeadPat), tuple_size(HeadPat) > 2 ->
            KeyPat = element(2, HeadPat),
            case has_var(KeyPat) of
                false ->
                    [key_to_frag_number(State, KeyPat)];
                true ->
                    lists:seq(1, State#old_hash_state.n_fragments)
            end;
        _ ->
            lists:seq(1, State#old_hash_state.n_fragments)
    end.

%% Thin wrapper around mnesia:has_var/1.
has_var(Pat) ->
    mnesia:has_var(Pat).
diff --git a/lib/mnesia/src/mnesia_index.erl b/lib/mnesia/src/mnesia_index.erl
new file mode 100644
index 0000000000..4e6e8a997c
--- /dev/null
+++ b/lib/mnesia/src/mnesia_index.erl
@@ -0,0 +1,384 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1996-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
%% Purpose: Handles index functionality in mnesia

-module(mnesia_index).
-export([read/5,
         add_index/5,
         delete_index/3,
         del_object_index/5,
         clear_index/4,
         dirty_match_object/3,
         dirty_select/3,
         dirty_read/3,
         dirty_read2/3,

         db_put/2,
         db_get/2,
         db_match_erase/2,
         get_index_table/2,
         get_index_table/3,

         tab2filename/2,
         tab2tmp_filename/2,
         init_index/2,
         init_indecies/3,
         del_transient/2,
         del_transient/3,
         del_index_table/3]).

-import(mnesia_lib, [verbose/2]).
-include("mnesia.hrl").
%% setorbag - table type of the indexed table (the code below
%%            distinguishes bag from everything else)
%% pos_list - list of {Pos, IndexTable}, one per indexed attribute
-record(index, {setorbag, pos_list}).

%% Looks up a mnesia global variable; a failed lookup is delegated to
%% mnesia_lib:other_val/2.
val(Var) ->
    case ?catch_val(Var) of
        {'EXIT', _ReASoN_} -> mnesia_lib:other_val(Var, _ReASoN_);
        _VaLuE_ -> _VaLuE_
    end.

%% Read an object list through its index table.
%% We assume that table Tab has an index on attribute number Pos.

read(Tid, Store, Tab, IxKey, Pos) ->
    ResList = mnesia_locker:ixrlock(Tid, Store, Tab, IxKey, Pos),
    %% Remove all tuples which don't include Ixkey, happens when Tab is a bag
    case val({Tab, setorbag}) of
        bag ->
            mnesia_lib:key_search_all(IxKey, Pos, ResList);
        _ ->
            ResList
    end.

%% Adds index entries for Obj (stored under Key in Tab) to every index
%% in Index.  Old is the list of records previously stored under Key, or
%% undefined to have them fetched from Tab.
add_index(Index, Tab, Key, Obj, Old) ->
    add_index2(Index#index.pos_list, Index#index.setorbag, Tab, Key, Obj, Old).

%% bag: just add the new entry.  Other types: first erase the entries of
%% the records being replaced, then add the new one.
add_index2([{Pos, Ixt} |Tail], bag, Tab, K, Obj, OldRecs) ->
    db_put(Ixt, {element(Pos, Obj), K}),
    add_index2(Tail, bag, Tab, K, Obj, OldRecs);
add_index2([{Pos, Ixt} |Tail], Type, Tab, K, Obj, OldRecs) ->
    %% Remove old tuples in index if Tab is updated
    case OldRecs of
        undefined ->
            Old = mnesia_lib:db_get(Tab, K),
            del_ixes(Ixt, Old, Pos, K);
        Old ->
            del_ixes(Ixt, Old, Pos, K)
    end,
    db_put(Ixt, {element(Pos, Obj), K}),
    add_index2(Tail, Type, Tab, K, Obj, OldRecs);
add_index2([], _, _Tab, _K, _Obj, _) -> ok.

%% Removes, from every index, the entries of all records currently
%% stored under K in Tab.
delete_index(Index, Tab, K) ->
    delete_index2(Index#index.pos_list, Tab, K).

delete_index2([{Pos, Ixt} | Tail], Tab, K) ->
    DelObjs = mnesia_lib:db_get(Tab, K),
    del_ixes(Ixt, DelObjs, Pos, K),
    delete_index2(Tail, Tab, K);
delete_index2([], _Tab, _K) -> ok.


%% Erases the {IndexKey, Key} entry of each given record from Ixt.
del_ixes(_Ixt, [], _Pos, _L) -> ok;
del_ixes(Ixt, [Obj | Tail], Pos, Key) ->
    db_match_erase(Ixt, {element(Pos, Obj), Key}),
    del_ixes(Ixt, Tail, Pos, Key).

%% Removes the index entries that belong to one specific object.
del_object_index(Index, Tab, K, Obj, Old) ->
    del_object_index2(Index#index.pos_list, Index#index.setorbag, Tab, K, Obj, Old).
%% Walks the {Pos, IndexTable} list and drops the entry for Obj from each
%% index.  Bag tables need the extra check done by del_object_bag/6; for
%% any other type the index tuple is removed directly.
del_object_index2([], _SoB, _Tab, _K, _Obj, _Old) ->
    ok;
del_object_index2([{Pos, Ixt} | Rest], bag, Tab, K, Obj, Old) ->
    del_object_bag(Tab, K, Obj, Pos, Ixt, Old),
    del_object_index2(Rest, bag, Tab, K, Obj, Old);
del_object_index2([{Pos, Ixt} | Rest], SoB, Tab, K, Obj, Old) ->
    del_ixes(Ixt, [Obj], Pos, K),
    del_object_index2(Rest, SoB, Tab, K, Obj, Old).

%% With Old =:= undefined the candidates are read from Tab: all records
%% under Key that carry the same index key as Obj.
%% If Tab type is bag we need remove index identifier if Tab
%% contains less than 2 elements.
del_object_bag(Tab, Key, Obj, Pos, Ixt, undefined) ->
    Wanted = element(Pos, Obj),
    %% Keep this as a comprehension filter: written as a guard test, a
    %% failing element/2 counts as 'false' instead of raising.
    Same = [Rec || Rec <- mnesia_lib:db_get(Tab, Key),
                   element(Pos, Rec) =:= Wanted],
    del_object_bag(Tab, Key, Obj, Pos, Ixt, Same);
del_object_bag(_Tab, Key, Obj, Pos, Ixt, []) ->
    del_ixes(Ixt, [Obj], Pos, Key);
del_object_bag(_Tab, Key, Obj, Pos, Ixt, [_]) ->
    del_ixes(Ixt, [Obj], Pos, Key);
del_object_bag(_Tab, _Key, _Obj, _Pos, _Ixt, _Old) ->
    ok.

%% Erases everything matching the pattern Obj from every index table.
clear_index(Index, Tab, K, Obj) ->
    clear_index2(Index#index.pos_list, Tab, K, Obj).

clear_index2(PosList, _Tab, _K, Obj) ->
    lists:foreach(fun({_Pos, Ixt}) -> db_match_erase(Ixt, Obj) end,
                  PosList).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Matches Pat against Tab.  The index on Pos is only consulted when the
%% key position of the pattern is the wildcard '_'.
dirty_match_object(Tab, Pat, Pos) ->
    %% Assume that we are on the node where the replica is
    case element(2, Pat) of
        '_' ->
            merge(realkeys(Tab, Pos, element(Pos, Pat)), Tab, Pat, []);
        _Bound ->
            mnesia_lib:db_match_object(Tab, Pat)
    end.

%% For each {IxKey, RealKey} index entry, matches Pat with its key
%% position bound to RealKey; the hits are prepended to Ack.
merge(IxEntries, Tab, Pat, Ack) ->
    %% Assume that we are on the node where the replica is
    lists:foldl(fun({_IxKey, RealKey}, Found) ->
                        Bound = setelement(2, Pat, RealKey),
                        mnesia_lib:db_match_object(Tab, Bound) ++ Found
                end, Ack, IxEntries).

realkeys(Tab, Pos, IxKey) ->
    db_get(get_index_table(Tab, Pos), IxKey). % a list on the form [{IxKey, RealKey1} , ....
%% Index-assisted select: fetches every record whose primary key is
%% listed in the index under element(Pos, Spec).
dirty_select(Tab, Spec, Pos) ->
    %% Assume that we are on the node where the replica is
    %% Returns the records without applying the match spec
    %% The actual filtering is handled by the caller
    IxKey = element(Pos, Spec),
    RealKeys = realkeys(Tab, Pos, IxKey),
    StorageType = val({Tab, storage_type}),
    lists:append([mnesia_lib:db_get(StorageType, Tab, Key) || {_,Key} <- RealKeys]).

%% Reads via the index on Pos through mnesia:dirty_rpc/4, which ends up
%% calling dirty_read2/3 of this module.
dirty_read(Tab, IxKey, Pos) ->
    ResList = mnesia:dirty_rpc(Tab, ?MODULE, dirty_read2,
                               [Tab, IxKey, Pos]),
    case val({Tab, setorbag}) of
        bag ->
            %% Remove all tuples which don't include Ixkey
            mnesia_lib:key_search_all(IxKey, Pos, ResList);
        _ ->
            ResList
    end.

%% Executes the index lookup: collects the primary keys stored under
%% IxKey and reads the records of each of them.
dirty_read2(Tab, IxKey, Pos) ->
    Ix = get_index_table(Tab, Pos),
    Keys = db_match(Ix, {IxKey, '$1'}),
    r_keys(Keys, Tab, []).

%% Keys arrive as match results, i.e. one-element lists [[Key], ...].
r_keys([[H]|T],Tab,Ack) ->
    V = mnesia_lib:db_get(Tab, H),
    r_keys(T, Tab, V ++ Ack);
r_keys([], _, Ack) ->
    Ack.


%%%%%%% Creation, Init and deletion routines for index tables
%% We can have several indexes on the same table
%% this can be a fairly costly operation if table is *very* large

%% File name of the index on attribute Pos: <Tab path>_<Pos>.DAT
tab2filename(Tab, Pos) ->
    mnesia_lib:dir(Tab) ++ "_" ++ integer_to_list(Pos) ++ ".DAT".

%% Same as tab2filename/2 but with the extension .TMP
tab2tmp_filename(Tab, Pos) ->
    mnesia_lib:dir(Tab) ++ "_" ++ integer_to_list(Pos) ++ ".TMP".

%% Builds every index listed in {Tab, index} for the given storage type.
init_index(Tab, Storage) ->
    PosList = val({Tab, index}),
    init_indecies(Tab, Storage, PosList).

%% disc_only_copies tables get dets index tables; ram_copies and
%% disc_copies tables get ets index tables.
init_indecies(Tab, Storage, PosList) ->
    case Storage of
        unknown ->
            ignore;
        disc_only_copies ->
            init_disc_index(Tab, PosList);
        ram_copies ->
            make_ram_index(Tab, PosList);
        disc_copies ->
            make_ram_index(Tab, PosList)
    end.

%% works for both ram and disc indexes

del_index_table(_, unknown, _) ->
    ignore;
del_index_table(Tab, Storage, Pos) ->
    delete_transient_index(Tab, Pos, Storage),
    mnesia_lib:del({Tab, index}, Pos).

%% Deletes all index tables of Tab without touching {Tab, index}.
del_transient(Tab, Storage) ->
    PosList = val({Tab, index}),
    del_transient(Tab, PosList, Storage).
%% Deletes the index table of each listed position.
del_transient(_, [], _) -> done;
del_transient(Tab, [Pos | Tail], Storage) ->
    delete_transient_index(Tab, Pos, Storage),
    del_transient(Tab, Tail, Storage).

%% Removes one index table together with its bookkeeping: the table (and
%% its file, for disc_only_copies), the commit_work entry and the
%% {Tab, {index, Pos}} variable.
delete_transient_index(Tab, Pos, disc_only_copies) ->
    Tag = {Tab, index, Pos},
    mnesia_monitor:unsafe_close_dets(Tag),
    %% The result is ignored on purpose: the file may already be gone.
    file:delete(tab2filename(Tab, Pos)),
    del_index_info(Tab, Pos), %% Uses val(..)
    mnesia_lib:unset({Tab, {index, Pos}});

delete_transient_index(Tab, Pos, _Storage) ->
    Ixt = val({Tab, {index, Pos}}),
    ?ets_delete_table(Ixt),
    del_index_info(Tab, Pos),
    mnesia_lib:unset({Tab, {index, Pos}}).

%%%%% misc functions for the index create/init/delete functions above

%% Rebuilds the dets index of every position in the list from the
%% contents of a disc_only_copies table.  Any existing index file is
%% deleted first and the index is recreated from scratch.
init_disc_index(_Tab, []) ->
    done;
init_disc_index(Tab, [Pos | Tail]) when is_integer(Pos) ->
    Fn = tab2filename(Tab, Pos),
    IxTag = {Tab, index, Pos},
    file:delete(Fn),
    Args = [{file, Fn}, {keypos, 1}, {type, bag}],
    mnesia_monitor:open_dets(IxTag, Args),
    Storage = disc_only_copies,
    %% Size the chunks from a sample: the records under the first key
    %% stand in for a typical entry, aiming at roughly 4000 bytes of
    %% data per chunk.
    Key = mnesia_lib:db_first(Storage, Tab),
    Recs = mnesia_lib:db_get(Storage, Tab, Key),
    BinSize = byte_size(term_to_binary(Recs)),
    KeysPerChunk = (4000 div BinSize) + 1,
    Init = {start, KeysPerChunk},
    %% Keep the table fixed while it is traversed.
    mnesia_lib:db_fixtable(Storage, Tab, true),
    ok = dets:init_table(IxTag, create_fun(Init, Tab, Pos)),
    mnesia_lib:db_fixtable(Storage, Tab, false),
    mnesia_lib:set({Tab, {index, Pos}}, IxTag),
    add_index_info(Tab, val({Tab, setorbag}), {Pos, {dets, IxTag}}),
    init_disc_index(Tab, Tail).

%% Input function for dets:init_table/2.  Cont is {start, KeysPerChunk}
%% on the first call, afterwards the continuation returned by the
%% previous chunk.  Each chunk of records is turned into
%% {IndexKey, PrimaryKey} tuples.
create_fun(Cont, Tab, Pos) ->
    fun(read) ->
            Data =
                case Cont of
                    {start, KeysPerChunk} ->
                        mnesia_lib:db_init_chunk(disc_only_copies, Tab, KeysPerChunk);
                    '$end_of_table' ->
                        '$end_of_table';
                    _Else ->
                        mnesia_lib:db_chunk(disc_only_copies, Cont)
                end,
            case Data of
                '$end_of_table' ->
                    end_of_input;
                {Recs, Next} ->
                    IdxElems = [{element(Pos, Obj), element(2, Obj)} || Obj <- Recs],
                    {IdxElems, create_fun(Next, Tab, Pos)}
            end;
       (close) ->
            ok
    end.
%% Creates an ets index for each listed position.
make_ram_index(_, []) ->
    done;
make_ram_index(Tab, [Pos | Tail]) ->
    add_ram_index(Tab, Pos),
    make_ram_index(Tab, Tail).

%% Builds the ets index on attribute Pos by folding over the whole
%% table; each record contributes one {IndexKey, PrimaryKey} entry.
%% The pseudo position 'snmp' has no index table and is skipped.
add_ram_index(Tab, Pos) when is_integer(Pos) ->
    verbose("Creating index for ~w ~n", [Tab]),
    Index = mnesia_monitor:mktab(mnesia_index, [bag, public]),
    Insert = fun(Rec, _Acc) ->
                     true = ?ets_insert(Index, {element(Pos, Rec), element(2, Rec)})
             end,
    %% Keep the table fixed while it is traversed.
    mnesia_lib:db_fixtable(ram_copies, Tab, true),
    true = ets:foldl(Insert, true, Tab),
    mnesia_lib:db_fixtable(ram_copies, Tab, false),
    mnesia_lib:set({Tab, {index, Pos}}, Index),
    add_index_info(Tab, val({Tab, setorbag}), {Pos, {ram, Index}});
add_ram_index(_Tab, snmp) ->
    ok.

%% Registers IxElem = {Pos, IndexTable} in the #index{} entry of the
%% table's commit_work list, creating that entry if it is missing.
add_index_info(Tab, Type, IxElem) ->
    Commit = val({Tab, commit_work}),
    case lists:keysearch(index, 1, Commit) of
        false ->
            Index = #index{setorbag = Type,
                           pos_list = [IxElem]},
            %% Check later if mnesia_tm is sensitive about the order
            mnesia_lib:set({Tab, commit_work},
                           mnesia_lib:sort_commit([Index | Commit]));
        {value, Old} ->
            %% We could check for consistency here
            Index = Old#index{pos_list = [IxElem | Old#index.pos_list]},
            NewC = lists:keyreplace(index, 1, Commit, Index),
            mnesia_lib:set({Tab, commit_work},
                           mnesia_lib:sort_commit(NewC))
    end.

%% Removes position Pos from the #index{} entry in commit_work; the
%% whole entry is dropped once its position list becomes empty.
del_index_info(Tab, Pos) ->
    Commit = val({Tab, commit_work}),
    case lists:keysearch(index, 1, Commit) of
        false ->
            %% Something is wrong ignore
            skip;
        {value, Old} ->
            case lists:keydelete(Pos, 1, Old#index.pos_list) of
                [] ->
                    NewC = lists:keydelete(index, 1, Commit),
                    mnesia_lib:set({Tab, commit_work},
                                   mnesia_lib:sort_commit(NewC));
                New ->
                    Index = Old#index{pos_list = New},
                    NewC = lists:keyreplace(index, 1, Commit, Index),
                    mnesia_lib:set({Tab, commit_work},
                                   mnesia_lib:sort_commit(NewC))
            end
    end.

%% Inserts V into an index table; the tag selects ets or dets.
db_put({ram, Ixt}, V) ->
    true = ?ets_insert(Ixt, V);
db_put({dets, Ixt}, V) ->
    ok = dets:insert(Ixt, V).
%% Looks up K in an index table; the tag selects ets or dets.
db_get({ram, Ixt}, K) ->
    ?ets_lookup(Ixt, K);
db_get({dets, Ixt}, K) ->
    dets:lookup(Ixt, K).

%% Deletes all entries matching Pat from an index table.
db_match_erase({ram, Ixt}, Pat) ->
    true = ?ets_match_delete(Ixt, Pat);
db_match_erase({dets, Ixt}, Pat) ->
    ok = dets:match_delete(Ixt, Pat).

%% Returns the variable bindings of all entries matching Pat.
db_match({ram, Ixt}, Pat) ->
    ?ets_match(Ixt, Pat);
db_match({dets, Ixt}, Pat) ->
    dets:match(Ixt, Pat).

%% Returns the tagged index table handle ({ram, T} | {dets, T}) for the
%% index on Pos, using the table's storage type on this node.
get_index_table(Tab, Pos) ->
    get_index_table(Tab, val({Tab, storage_type}), Pos).

get_index_table(Tab, ram_copies, Pos) ->
    {ram, val({Tab, {index, Pos}})};
get_index_table(Tab, disc_copies, Pos) ->
    {ram, val({Tab, {index, Pos}})};
get_index_table(Tab, disc_only_copies, Pos) ->
    {dets, val({Tab, {index, Pos}})};
get_index_table(_Tab, unknown, _Pos) ->
    unknown.

diff --git a/lib/mnesia/src/mnesia_kernel_sup.erl b/lib/mnesia/src/mnesia_kernel_sup.erl
new file mode 100644
index 0000000000..08f6129fc0
--- /dev/null
+++ b/lib/mnesia/src/mnesia_kernel_sup.erl
@@ -0,0 +1,65 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1997-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
-module(mnesia_kernel_sup).

-behaviour(supervisor).

-export([start/0, init/1, supervisor_timeout/1]).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% top supervisor callback functions

%% Starts the supervisor, locally registered as mnesia_kernel_sup.
start() ->
    supervisor:start_link({local, mnesia_kernel_sup}, ?MODULE, []).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% sub supervisor callback functions

%% Supervisor callback.  The children are listed in start order.  The
%% restart intensity is 0, so no restarts are performed at this level.
init([]) ->
    ProcLib = [mnesia_monitor, proc_lib],
    Flags = {one_for_all, 0, timer:hours(24)}, % Trust the top supervisor
    Workers = [worker_spec(mnesia_monitor, timer:seconds(3), [gen_server]),
               worker_spec(mnesia_subscr, timer:seconds(3), [gen_server]),
               worker_spec(mnesia_locker, timer:seconds(3), ProcLib),
               worker_spec(mnesia_recover, timer:minutes(3), [gen_server]),
               worker_spec(mnesia_tm, timer:seconds(30), ProcLib),
               supervisor_spec(mnesia_checkpoint_sup),
               supervisor_spec(mnesia_snmp_sup),
               worker_spec(mnesia_controller, timer:seconds(3), [gen_server]),
               worker_spec(mnesia_late_loader, timer:seconds(3), ProcLib)
              ],
    {ok, {Flags, Workers}}.

%% Child spec of a permanent worker started with Name:start().
%% KillAfter is the shutdown timeout in milliseconds.
worker_spec(Name, KillAfter, Modules) ->
    KA = supervisor_timeout(KillAfter),
    {Name, {Name, start, []}, permanent, KA, worker, [Name] ++ Modules}.

%% Child spec of a permanent sub-supervisor started with Name:start().
supervisor_spec(Name) ->
    {Name, {Name, start, []}, permanent, infinity, supervisor,
     [Name, supervisor]}.

%% With debug_shutdown defined every shutdown timeout becomes 24 hours;
%% otherwise the requested timeout is used unchanged.
-ifdef(debug_shutdown).
supervisor_timeout(_KillAfter) -> timer:hours(24).
-else.
supervisor_timeout(KillAfter) -> KillAfter.
-endif.


diff --git a/lib/mnesia/src/mnesia_late_loader.erl b/lib/mnesia/src/mnesia_late_loader.erl
new file mode 100644
index 0000000000..d09de3ca66
--- /dev/null
+++ b/lib/mnesia/src/mnesia_late_loader.erl
@@ -0,0 +1,108 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1998-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
-module(mnesia_late_loader).

-export([
         async_late_disc_load/3,
         maybe_async_late_disc_load/3,
         init/1,
         start/0
        ]).

%% sys callback functions
-export([
         system_continue/3,
         system_terminate/4,
         system_code_change/4
        ]).

-define(SERVER_NAME, ?MODULE).

-record(state, {supervisor}).

%% Asks the late loader on Node to schedule a late disc load of Tabs.
%% Fire and forget: the message is sent to the registered name on Node
%% and any failure of the send is swallowed by the catch.
async_late_disc_load(_, [], _) -> ok;
async_late_disc_load(Node, Tabs, Reason) ->
    Msg = {async_late_disc_load, Tabs, Reason},
    catch ({?SERVER_NAME, Node} ! {self(), Msg}).

%% As async_late_disc_load/3, but the receiving loop only loads tables
%% whose master node list is empty or contains the receiving node.
maybe_async_late_disc_load(_, [], _) -> ok;
maybe_async_late_disc_load(Node, Tabs, Reason) ->
    Msg = {maybe_async_late_disc_load, Tabs, Reason},
    catch ({?SERVER_NAME, Node} ! {self(), Msg}).

start() ->
    mnesia_monitor:start_proc(?SERVER_NAME, ?MODULE, init, [self()]).

%% Process entry point: registers the process, merges the schema while
%% linked to mnesia_controller, marks mnesia as running, acknowledges
%% the start to Parent and enters the message loop.
init(Parent) ->
    %% Trap exit omitted intentionally
    register(?SERVER_NAME, self()),
    link(whereis(mnesia_controller)),  %% We may not hang
    mnesia_controller:merge_schema(),
    unlink(whereis(mnesia_controller)),
    mnesia_lib:set(mnesia_status, running),
    proc_lib:init_ack(Parent, {ok, self()}),
    loop(#state{supervisor = Parent}).
%% Message loop.  Requests arrive as {From, Request}; From is not used
%% and no reply is sent.
loop(State) ->
    receive
        {_From, {async_late_disc_load, Tabs, Reason}} ->
            mnesia_controller:schedule_late_disc_load(Tabs, Reason),
            loop(State);

        {_From, {maybe_async_late_disc_load, Tabs, Reason}} ->
            %% Keep a table if it has no master nodes, or if this node
            %% is one of its master nodes.
            CheckMaster =
                fun(Tab, Good) ->
                        case mnesia_recover:get_master_nodes(Tab) of
                            [] -> [Tab|Good];
                            Masters ->
                                case lists:member(node(),Masters) of
                                    true -> [Tab|Good];
                                    false -> Good
                                end
                        end
                end,
            GoodTabs = lists:foldl(CheckMaster, [], Tabs),
            mnesia_controller:schedule_late_disc_load(GoodTabs, Reason),
            loop(State);

        {system, From, Msg} ->
            mnesia_lib:dbg_out("~p got {system, ~p, ~p}~n",
                               [?SERVER_NAME, From, Msg]),
            Parent = State#state.supervisor,
            sys:handle_system_msg(Msg, From, Parent, ?MODULE, [], State);

        Msg ->
            %% Log and drop anything else so it cannot pile up in the
            %% mailbox.
            mnesia_lib:error("~p got unexpected message: ~p~n",
                             [?SERVER_NAME, Msg]),
            loop(State)
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% System upgrade

%% sys callback: resume the message loop.
system_continue(_Parent, _Debug, State) ->
    loop(State).

%% sys callback: terminate with the given reason.
system_terminate(Reason, _Parent, _Debug, _State) ->
    exit(Reason).

%% sys callback: the state needs no conversion.
system_code_change(State, _Module, _OldVsn, _Extra) ->
    {ok, State}.
diff --git a/lib/mnesia/src/mnesia_lib.erl b/lib/mnesia/src/mnesia_lib.erl
new file mode 100644
index 0000000000..dba808e66e
--- /dev/null
+++ b/lib/mnesia/src/mnesia_lib.erl
@@ -0,0 +1,1306 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1996-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
%% This module contains all sorts of various functions which don't fit
%% anywhere else. Basically everything is exported.

-module(mnesia_lib).

-include("mnesia.hrl").
-include_lib("kernel/include/file.hrl").

-export([core_file/0]).

-export([
         active_tables/0,
         add/2,
         add_list/2,
         add_lsort/2,
         all_nodes/0,
%%       catch_val/1,
         copy_file/2,
         copy_holders/1,
         coredump/0,
         coredump/1,
         create_counter/1,
         cs_to_nodes/1,
         cs_to_storage_type/2,
         dets_to_ets/6,
         db_chunk/2,
         db_init_chunk/1,
         db_init_chunk/2,
         db_init_chunk/3,
         db_erase/2,
         db_erase/3,
         db_erase_tab/1,
         db_erase_tab/2,
         db_first/1,
         db_first/2,
         db_last/1,
         db_last/2,
         db_fixtable/3,
         db_get/2,
         db_get/3,
         db_match_erase/2,
         db_match_erase/3,
         db_match_object/2,
         db_match_object/3,
         db_next_key/2,
         db_next_key/3,
         db_prev_key/2,
         db_prev_key/3,
         db_put/2,
         db_put/3,
         db_select/2,
         db_select/3,
         db_select_init/4,
         db_select_cont/3,
         db_slot/2,
         db_slot/3,
         db_update_counter/3,
         db_update_counter/4,
         dbg_out/2,
         del/2,
         dets_sync_close/1,
         dets_sync_open/2,
         dets_sync_open/3,
         dir/0,
         dir/1,
         dir_info/0,
         dirty_rpc_error_tag/1,
         dist_coredump/0,
         disk_type/1,
         disk_type/2,
         elems/2,
         ensure_loaded/1,
         error/2,
         error_desc/1,
         etype/1,
         exists/1,
         fatal/2,
         get_node_number/0,
         fix_error/1,
         important/2,
         incr_counter/1,
         incr_counter/2,
         intersect/2,
         is_running/0,
         is_running/1,
         is_running_remote/0,
         is_string/1,
         key_search_delete/3,
         key_search_all/3,
         last_error/0,
         local_active_tables/0,
         lock_table/1,
         mkcore/1,
         not_active_here/1,
         other_val/2,
         pad_name/3,
         random_time/2,
         read_counter/1,
         readable_indecies/1,
         remote_copy_holders/1,
         report_fatal/2,
         report_system_event/1,
         running_nodes/0,
         running_nodes/1,
         schema_cs_to_storage_type/2,
         search_delete/2,
         set/2,
         set_counter/2,
         set_local_content_whereabouts/1,
         set_remote_where_to_read/1,
         set_remote_where_to_read/2,
         show/1,
         show/2,
         sort_commit/1,
         storage_type_at_node/2,
         tab2dat/1,
         tab2dmp/1,
         tab2tmp/1,
         tab2dcd/1,
         tab2dcl/1,
         to_list/1,
         union/2,
         uniq/1,
         unlock_table/1,
         unset/1,
         %% update_counter/2,
         val/1,
         vcore/0,
         vcore/1,
         verbose/2,
         view/0,
         view/1,
         view/2,
         warning/2,

         is_debug_compiled/0,
         activate_debug_fun/5,
         deactivate_debug_fun/3,
         eval_debug_fun/4,
         scratch_debug_fun/0
        ]).


%% Removes every occurrence of Obj from List.
%% Returns {Obj | none, Remaining}; Remaining is in reverse order.
search_delete(Obj, List) ->
    search_delete(Obj, List, [], none).
search_delete(Obj, List, Ack0, Res0) ->
    lists:foldl(fun(Elem, {_Res, Ack}) when Elem =:= Obj -> {Obj, Ack};
                   (Elem, {Res, Ack}) -> {Res, [Elem | Ack]}
                end, {Res0, Ack0}, List).

%% Removes every tuple whose element Pos equals Key.
%% Returns {LastRemovedTuple | none, Remaining}; Remaining is in reverse
%% order.
key_search_delete(Key, Pos, TupleList) ->
    key_search_delete(Key, Pos, TupleList, none, []).
key_search_delete(Key, Pos, TupleList, Obj0, Ack0) ->
    lists:foldl(fun(Elem, {_Obj, Ack}) when element(Pos, Elem) == Key ->
                        {Elem, Ack};
                   (Elem, {Obj, Ack}) ->
                        {Obj, [Elem | Ack]}
                end, {Obj0, Ack0}, TupleList).

%% Returns, in reverse order, every tuple whose element Pos equals Key.
key_search_all(Key, Pos, TupleList) ->
    key_search_all(Key, Pos, TupleList, []).
key_search_all(Key, N, TupleList, Ack0) ->
    lists:foldl(fun(Elem, Ack) when element(N, Elem) == Key -> [Elem | Ack];
                   (_Elem, Ack) -> Ack
                end, Ack0, TupleList).

%% Elements of L2 that are also present in L1, in L2 order.
intersect(L1, L2) ->
    Missing = L2 -- L1,
    L2 -- Missing.

%% Element number I of every tuple in the list.
elems(I, Tuples) ->
    [element(I, Tuple) || Tuple <- Tuples].

%% sort_commit sees to it that checkpoint info is always first in the
%% commit_work structure; the other info doesn't need to be sorted.
sort_commit(List) ->
    sort_commit2(List, []).

%% The entries in front of the first {checkpoints, _} tuple are moved,
%% reversed, behind the rest of the list.
sort_commit2(List, Acc) ->
    NotCheckpoints = fun({checkpoints, _}) -> false;
                        (_) -> true
                     end,
    {Before, FromCheckpoints} = lists:splitwith(NotCheckpoints, List),
    FromCheckpoints ++ lists:reverse(Before, Acc).
%% true if the list holds nothing but integers in the range 0..255.
is_string([H | T]) when is_integer(H), 0 =< H, H < 256 ->
    is_string(T);
is_string([_ | _]) ->
    false;
is_string([]) ->
    true.

%%%

%% The elements of L1 that are not in L2, followed by L2.
union(L1, L2) ->
    [Elem || Elem <- L1, not lists:member(Elem, L2)] ++ L2.

%% Removes duplicates; the result is in descending sort order.
uniq([]) ->
    [];
uniq(List) ->
    [First | Rest] = lists:sort(List),
    uniq1(First, Rest, []).

%% Prev is the most recently seen element, not yet pushed on Ack.
uniq1(Prev, [Prev | Rest], Ack) ->
    uniq1(Prev, Rest, Ack);
uniq1(Prev, [Next | Rest], Ack) ->
    uniq1(Next, Rest, [Prev | Ack]);
uniq1(Prev, [], Ack) ->
    [Prev | Ack].

%% Lists are returned as they are, atoms as their name.
to_list(X) when is_list(X) -> X;
to_list(X) -> atom_to_list(X).

%% db_nodes and extra_db_nodes combined, without duplicates.
all_nodes() ->
    Known = mnesia:system_info(db_nodes),
    Extra = mnesia:system_info(extra_db_nodes),
    mnesia_lib:uniq(Known ++ Extra).

running_nodes() ->
    running_nodes(all_nodes()).

%% Asks every node in Ns and keeps those that report mnesia as running.
running_nodes(Ns) ->
    {Replies, _BadNs} = rpc:multicall(Ns, ?MODULE, is_running_remote, []),
    [Node || {true, Node} <- Replies].

%% Target of the multicall in running_nodes/1.
is_running_remote() ->
    {is_running() == yes, node()}.

%% Status of mnesia on another node; an unreachable node counts as 'no'.
is_running(Node) when is_atom(Node) ->
    case rpc:call(Node, ?MODULE, is_running, []) of
        {badrpc, _} -> no;
        Status -> Status
    end.

%% Status of mnesia on this node: yes | no | starting | stopping.
is_running() ->
    case ?catch_val(mnesia_status) of
        running -> yes;
        starting -> starting;
        stopping -> stopping;
        {'EXIT', _} -> no
    end.

show(X) ->
    show(X, []).
show(F, A) ->
    io:format(user, F, A).


%% Copies the characters of the name and then pads with blanks up to a
%% total of Len characters before appending Tail.
pad_name([Char | Chars], Len, Tail) ->
    [Char | pad_name(Chars, Len - 1, Tail)];
pad_name([], Len, Tail) when Len =< 0 ->
    Tail;
pad_name([], Len, Tail) ->
    [$\s | pad_name([], Len - 1, Tail)].

%% Some utility functions .....
active_here(Tab) ->
    val({Tab, where_to_read}) == node().

not_active_here(Tab) ->
    not active_here(Tab).

%% true if the file can be opened for reading.
exists(Fname) ->
    case file:open(Fname, [raw,read]) of
        {ok, Fd} ->
            file:close(Fd),
            true;
        _ ->
            false
    end.

dir() -> mnesia_monitor:get_env(dir).

dir(Fname) ->
    filename:join([dir(), to_list(Fname)]).

tab2dat(Tab) ->  %% DETS files
    dir(lists:concat([Tab, ".DAT"])).
%% The functions below build <mnesia dir>/<Tab><Extension>.
tab2tmp(Tab) ->
    dir(lists:concat([Tab, ".TMP"])).

tab2dmp(Tab) ->  %% Dumped ets tables
    dir(lists:concat([Tab, ".DMP"])).

tab2dcd(Tab) ->  %% Disc copies data
    dir(lists:concat([Tab, ".DCD"])).

tab2dcl(Tab) ->  %% Disc copies log
    dir(lists:concat([Tab, ".DCL"])).

%% Storage type of Tab at Node according to the global variables.
%% Returns disc_copies | ram_copies | disc_only_copies | unknown.
storage_type_at_node(Node, Tab) ->
    search_key(Node, [{disc_copies, val({Tab, disc_copies})},
                      {ram_copies, val({Tab, ram_copies})},
                      {disc_only_copies, val({Tab, disc_only_copies})}]).

%% Same as storage_type_at_node/2, but reads the node lists from a
%% #cstruct{} record.
cs_to_storage_type(Node, Cs) ->
    search_key(Node, [{disc_copies, Cs#cstruct.disc_copies},
                      {ram_copies, Cs#cstruct.ram_copies},
                      {disc_only_copies, Cs#cstruct.disc_only_copies}]).

%% As cs_to_storage_type/2, except that an unknown storage type of the
%% schema table is reported as ram_copies.
schema_cs_to_storage_type(Node, Cs) ->
    case cs_to_storage_type(Node, Cs) of
        unknown when Cs#cstruct.name == schema -> ram_copies;
        Other -> Other
    end.


%% Returns the Val of the first {Val, List} pair whose List contains
%% Key, or unknown if no list does.
search_key(Key, [{Val, List} | Tail]) ->
    case lists:member(Key, List) of
        true -> Val;
        false -> search_key(Key, Tail)
    end;
search_key(_Key, []) ->
    unknown.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% oops, we've got some global variables here :-)

%% They are
%%
%%   {Tab, setorbag}, -> set | bag
%%   {Tab, storage_type}       -> disc_copies |ram_copies | unknown (**)
%%   {Tab, disc_copies}        -> node list  (from schema)
%%   {Tab, ram_copies}, -> node list  (from schema)
%%   {Tab, arity}, -> number
%%   {Tab, attributes}, -> atom list
%%   {Tab, wild_pattern}, -> record tuple with '_'s
%%   {Tab, {index, Pos}}       -> ets table
%%   {Tab, index}              -> integer list
%%   {Tab, cstruct}            -> cstruct  structure
%%

%%   The following fields are dynamic according to
%%   the current node/table situation

%%   {Tab, where_to_write}      -> node list
%%   {Tab, where_to_read}       -> node | nowhere
%%
%%   {schema, tables}                    -> tab list
%%   {schema, local_tables}              -> tab list  (**)
%%
%%   {current, db_nodes}                  -> node list
%%
%%   dir                                  -> directory path (**)
%%   mnesia_status                        -> starting | running | stopping (**)
%%   (**) == (Different on all nodes)
%%
%% Reads a global variable; a failed lookup is delegated to other_val/2.
val(Var) ->
    case ?catch_val(Var) of
        {'EXIT', Reason} -> mnesia_lib:other_val(Var, Reason);
        Value -> Value
    end.

%% Stores a global variable in mnesia_gvar.
set(Var, Val) ->
    ?ets_insert(mnesia_gvar, {Var, Val}).

%% Removes a global variable from mnesia_gvar.
unset(Var) ->
    ?ets_delete(mnesia_gvar, Var).

%% Fallback values for variables that could not be read.
%% Everything but the three whereabouts variables ends up in pr_other/2.
other_val({_, where_to_read}, _Other) ->
    nowhere;
other_val({_, where_to_write}, _Other) ->
    [];
other_val({_, active_replicas}, _Other) ->
    [];
other_val(Var, Other) ->
    pr_other(Var, Other).

-spec(pr_other/2 :: (_,_) -> no_return()).

%% Logs the failed lookup and terminates the caller: exit/1 when the
%% failure came from ets:lookup_element, erlang:error/1 otherwise.
pr_other(Var, Other) ->
    Why =
        case is_running() of
            no -> {node_not_running, node()};
            _ -> {no_exists, Var}
        end,
    RegName = process_info(self(), registered_name),
    verbose("~p (~p) val(mnesia_gvar, ~w) -> ~p ~p ~n",
            [self(), RegName, Var, Other, Why]),
    case Other of
        {badarg, [{ets, lookup_element, _}|_]} ->
            exit(Why);
        _ ->
            erlang:error(Why)
    end.

%% Some functions for list valued variables

%% Puts Val first in the list, removing one earlier occurrence.
add(Var, Val) ->
    Old = val(Var),
    set(Var, [Val | lists:delete(Val, Old)]).

add_list(Var, List) ->
    Old = val(Var),
    set(Var, union(Old, List)).

del(Var, Val) ->
    Old = val(Var),
    set(Var, lists:delete(Val, Old)).

%% LSort -> [node()| Sorted] == Locker sorted

%% The local node is always kept first; other values are inserted
%% into the ordered remainder.
add_lsort(Var, Val) when node() == Val ->
    Old = val(Var),
    set(Var, [Val | lists:delete(Val, Old)]);
add_lsort(Var, Val) ->
    case val(Var) of
        [Head | Rest] when Head == node() ->
            set(Var, [Head | lsort_add(Val, Rest)]);
        Other ->
            set(Var, lsort_add(Val, Other))
    end.

lsort_add(Val, Sorted) ->
    case ordsets:is_element(Val, Sorted) of
        true -> Sorted;
        false -> ordsets:add_element(Val, Sorted)
    end.

%% This function is needed due to the fact
%% that the application_controller enters
%% a deadlock now and then. ac is implemented
%% as a rather naive server.
ensure_loaded(Appl) ->
    case application_controller:get_loaded(Appl) of
        {true, _} ->
            ok;
        false ->
            load_result(Appl, application:load(Appl))
    end.
%% Local tables that are currently read from this node.
local_active_tables() ->
    [Tab || Tab <- val({schema, local_tables}), active_here(Tab)].

%% All tables that can currently be read from some node.
active_tables() ->
    [Tab || Tab <- val({schema, tables}),
            val({Tab, where_to_read}) =/= nowhere].

%% Coarse classification of a term.
etype(X) when is_integer(X) -> integer;
etype([]) -> nil;
etype(X) when is_list(X) -> list;
etype(X) when is_tuple(X) -> tuple;
etype(X) when is_atom(X) -> atom;
etype(_) -> othertype.

remote_copy_holders(Cs) ->
    copy_holders(Cs) -- [node()].

%% For local_content tables only the local node (if it holds a copy)
%% counts as a holder.
copy_holders(Cs) when Cs#cstruct.local_content == false ->
    cs_to_nodes(Cs);
copy_holders(Cs) when Cs#cstruct.local_content == true ->
    Me = node(),
    case lists:member(Me, cs_to_nodes(Cs)) of
        true -> [Me];
        false -> []
    end.


set_remote_where_to_read(Tab) ->
    set_remote_where_to_read(Tab, []).

%% Selects the node to read Tab from among the active replicas
%% (restricted to master nodes when such are defined) that are current
%% db_nodes and not in Ignore. Nodes that do not hold a disc_only_copies
%% replica are preferred.
set_remote_where_to_read(Tab, Ignore) ->
    Active = val({Tab, active_replicas}),
    Valid =
        case mnesia_recover:get_master_nodes(Tab) of
            [] -> Active;
            Masters -> mnesia_lib:intersect(Masters, Active)
        end,
    Available = mnesia_lib:intersect(val({current, db_nodes}), Valid -- Ignore),
    DiscOnlyC = val({Tab, disc_only_copies}),
    Reader =
        case {Available -- DiscOnlyC, Available} of
            {[Preferred | _], _} -> Preferred;
            {[], [Any | _]} -> Any;
            {[], []} -> nowhere
        end,
    set({Tab, where_to_read}, Reader).

%%% Local only
set_local_content_whereabouts(Tab) ->
    Me = node(),
    add({schema, local_tables}, Tab),
    add({Tab, active_replicas}, Me),
    set({Tab, where_to_write}, [Me]),
    set({Tab, where_to_read}, Me).

%%% counter routines

%% Counters live in the mnesia_stats table and start at 0.
create_counter(Name) ->
    set_counter(Name, 0).

set_counter(Name, Val) ->
    ?ets_insert(mnesia_stats, {Name, Val}).

incr_counter(Name) ->
    ?ets_update_counter(mnesia_stats, Name, 1).

incr_counter(Name, I) ->
    ?ets_update_counter(mnesia_stats, Name, I).
%% update_counter(Name, Val) ->
%%     ?ets_update_counter(mnesia_stats, Name, Val).

%% Current value of a counter in mnesia_stats.
read_counter(Name) ->
    ?ets_lookup_element(mnesia_stats, Name, 2).

%% All nodes holding a replica, disc_only_copies first.
cs_to_nodes(Cs) ->
    DiscOnly = Cs#cstruct.disc_only_copies,
    Disc = Cs#cstruct.disc_copies,
    Ram = Cs#cstruct.ram_copies,
    DiscOnly ++ Disc ++ Ram.

%% Requests a core dump on every node and returns the replies.
dist_coredump() ->
    dist_coredump(all_nodes()).
dist_coredump(Ns) ->
    {Replies, _} = rpc:multicall(Ns, ?MODULE, coredump, []),
    Replies.

%% Writes a core dump to a fresh file and returns the file name.
%% The result of the write itself is not checked.
coredump() ->
    coredump({crashinfo, {"user initiated~n", []}}).
coredump(CrashInfo) ->
    Core = mkcore(CrashInfo),
    Out = core_file(),
    important("Writing Mnesia core to file: ~p...~p~n", [Out, CrashInfo]),
    file:write_file(Out, Core),
    Out.

%% Builds "MnesiaCore.<node>_<date and time fields>", where each field
%% below 10 gets a leading zero. The name is made absolute relative to
%% core_dir when that is configured as a string, else relative to cwd.
core_file() ->
    Fields = tuple_to_list(date()) ++ tuple_to_list(time()),
    Pad = fun(I) when I < 10 -> ["_0", I];
             (I) -> ["_", I]
          end,
    Stamp = lists:append([Pad(I) || I <- Fields]),
    case mnesia_monitor:get_env(core_dir) of
        Dir when is_list(Dir) ->
            Name = lists:concat(["MnesiaCore.", node()] ++ Stamp),
            filename:absname(Name, Dir);
        _ ->
            Name = lists:concat(["MnesiaCore.", node()] ++ Stamp),
            filename:absname(Name)
    end.
%% Collects a snapshot of the node as a list of {Item, Info} pairs
%% (headed by CrashInfo) and returns it as an external-format binary.
%% Every probe is wrapped in catch so that a failing one ends up in
%% the dump instead of aborting it.
mkcore(CrashInfo) ->
%   dbg_out("Making a Mnesia core dump...~p~n", [CrashInfo]),
    Nodes = [node() | nodes()],
    %%TidLocks = (catch ets:tab2list(mnesia_tid_locks)),
    HeldLocks = (catch mnesia:system_info(held_locks)),
    Core =
        [CrashInfo,

         %% Runtime system
         {time, {date(), time()}},
         {self, catch process_info(self())},
         {nodes, catch rpc:multicall(Nodes, ?MODULE, get_node_number, [])},
         {applications, catch lists:sort(application:loaded_applications())},
         {flags, catch init:get_arguments()},
         {code_path, catch code:get_path()},
         {code_loaded, catch lists:sort(code:all_loaded())},
         {etsinfo, catch ets_info(ets:all())},

         %% Mnesia tables
         {version, catch mnesia:system_info(version)},
         {schema, catch ets:tab2list(schema)},
         {gvar, catch ets:tab2list(mnesia_gvar)},
         {master_nodes, catch mnesia_recover:get_master_node_info()},

         %% Processes
         {processes, catch procs()},
         {relatives, catch relatives()},
         {workers, catch workers(mnesia_controller:get_workers(2000))},
         {locking_procs, catch locking_procs(HeldLocks)},

         %% Locks, loading and transactions
         {held_locks, HeldLocks},
         {lock_queue, catch mnesia:system_info(lock_queue)},
         {load_info, catch mnesia_controller:get_info(2000)},
         {trans_info, catch mnesia_tm:get_info(2000)},

         %% Files
         {schema_file, catch file:read_file(tab2dat(schema))},
         {dir_info, catch dir_info()},
         {logfile, catch {ok, read_log_files()}}
        ],
    term_to_binary(Core).

%% One {Pid, SelectedInfo} pair per process on the node.
procs() ->
    [{P, (catch lists:zf(fun proc_info/1, process_info(P)))}
     || P <- processes()].

%% Selects the values of the process_info items worth dumping.
proc_info({registered_name, Val}) -> {true, Val};
proc_info({message_queue_len, Val}) -> {true, Val};
proc_info({status, Val}) -> {true, Val};
proc_info({current_function, Val}) -> {true, Val};
proc_info(_) -> false.

%% Called through rpc by mkcore/1.
get_node_number() ->
    {node(), self()}.

%% Contents (or the read failure) of every Mnesia log file.
read_log_files() ->
    [{F, catch file:read_file(F)} || F <- mnesia_log:log_files()].
%% File info for the cwd, the Mnesia directory and each file in it.
%% A failing directory listing is included as the single last element.
dir_info() ->
    {ok, Cwd} = file:get_cwd(),
    Dir = dir(),
    Header = [{cwd, Cwd, file:read_file_info(Cwd)},
              {mnesia_dir, Dir, file:read_file_info(Dir)}],
    Listing =
        case file:list_dir(Dir) of
            {ok, Files} ->
                [{mnesia_file, F, catch file:read_file_info(dir(F))} || F <- Files];
            Other ->
                [Other]
        end,
    Header ++ Listing.

%% ets:info/1 of every table, with list results turned into tuples.
ets_info(Tabs) ->
    [{table, T, mk_info_tuple(ets:info(T))} || T <- Tabs].

mk_info_tuple(T) when is_list(T) ->
    list_to_tuple(T);
mk_info_tuple(T) -> T.

%% Process info for each name in mnesia:ms() that is registered.
relatives() ->
    Info = fun(Name) ->
                   case whereis(Name) of
                       undefined -> false;
                       Pid -> {true, {Name, Pid, catch process_info(Pid)}}
                   end
           end,
    lists:zf(Info, mnesia:ms()).

%% Process info for the senders, loaders and dumper reported by
%% mnesia_controller. Entries whose pid is 'undefined' are dropped.
workers({workers, Loaders, Senders, Dumper}) ->
    Info = fun({undefined, {send_table, _Tab, _Receiver, _St}}) ->
                   false;
              ({Pid, {send_table, Tab, _Receiver, _St}}) ->
                   {true, {Pid, Tab, catch process_info(Pid)}};
              ({Pid, What}) when is_pid(Pid) ->
                   {true, {Pid, What, catch process_info(Pid)}};
              ({_Name, undefined}) ->
                   false;
              ({Name, Pid}) ->
                   {true, {Name, Pid, catch process_info(Pid)}}
           end,
    SInfo = lists:zf(Info, Senders),
    LInfo = lists:zf(Info, Loaders),
    DInfo = lists:zf(Info, [{dumper, Dumper}]),
    [{senders, SInfo}, {loader, LInfo} | DInfo].

%% Process info for the local processes owning any of the given locks.
%% The transaction id is taken from the third element of each lock.
locking_procs(LockList) when is_list(LockList) ->
    Tids = uniq([element(3, Lock) || Lock <- LockList]),
    Info = fun(Tid) ->
                   Pid = Tid#tid.pid,
                   case node(Pid) == node() of
                       true -> {true, {Pid, catch process_info(Pid)}};
                       _ -> false
                   end
           end,
    lists:zf(Info, Tids).

%% Shows a freshly made core dump without writing it to file.
view() ->
    Bin = mkcore({crashinfo, {"view only~n", []}}),
    vcore(Bin).

%% Displays a Mnesia file on the tty. The file may be repaired.
%% The viewer is chosen from the file name: known suffixes first,
%% then the "MnesiaCore." prefix.
view(File) ->
    IsDat = suffix([".DAT", ".RET", ".DMP", ".TMP"], File),
    IsLog = (not IsDat) andalso suffix([".LOG", ".BUP", ".ETS"], File),
    IsCore = (not IsDat) andalso (not IsLog)
        andalso lists:prefix("MnesiaCore.", File),
    if
        IsDat -> view(File, dat);
        IsLog -> view(File, log);
        IsCore -> view(File, core);
        true -> {error, "Unknown file name"}
    end.
%% Dispatches to the viewer for the given kind of file.
view(File, dat) ->
    dets:view(File);
view(File, log) ->
    mnesia_log:view(File);
view(File, core) ->
    vcore(File).

%% True when File ends with any of the given suffixes.
suffix(Suffixes, File) ->
    Fun = fun(S) -> lists:suffix(S, File) end,
    lists:any(Fun, Suffixes).

%% View a core file

%% Shows the last (in sort order) core file of this node found in the
%% current directory. Returns nocore when there is no such file and the
%% error from file:list_dir/1 when the directory cannot be listed.
vcore() ->
    Prefix = lists:concat(["MnesiaCore.", node()]),
    Filter = fun(F) -> lists:prefix(Prefix, F) end,
    {ok, Cwd} = file:get_cwd(),
    case file:list_dir(Cwd) of
        {ok, Files}->
            CoreFiles = lists:sort(lists:zf(Filter, Files)),
            show("Mnesia core files: ~p~n", [CoreFiles]),
            case CoreFiles of
                [] ->
                    %% lists:last/1 fails on an empty list; report the
                    %% absence the same way vcore/1 reports an unreadable file.
                    nocore;
                _ ->
                    vcore(lists:last(CoreFiles))
            end;
        Error ->
            Error
    end.

%% Shows a core given either as a binary or as a file name.
%% NOTE(review): binary_to_term/1 is applied to the file contents as is;
%% only view core files that come from a trusted source.
vcore(Bin) when is_binary(Bin) ->
    Core = binary_to_term(Bin),
    Fun = fun({Item, Info}) ->
                  show("***** ~p *****~n", [Item]),
                  case catch vcore_elem({Item, Info}) of
                      {'EXIT', Reason} ->
                          show("{'EXIT', ~p}~n", [Reason]);
                      _ -> ok
                  end
          end,
    lists:foreach(Fun, Core);

vcore(File) ->
    show("~n***** Mnesia core: ~p *****~n", [File]),
    case file:read_file(File) of
        {ok, Bin} ->
            vcore(Bin);
        _ ->
            nocore
    end.

%% Shows one item of a core. File-valued items are written to a
%% temporary file under /tmp, viewed, and then deleted.
vcore_elem({schema_file, {ok, B}}) ->
    Fname = "/tmp/schema.DAT",
    file:write_file(Fname, B),
    dets:view(Fname),
    file:delete(Fname);

vcore_elem({logfile, {ok, BinList}}) ->
    Fun = fun({F, Info}) ->
                  show("----- logfile: ~p -----~n", [F]),
                  case Info of
                      {ok, B} ->
                          Fname = "/tmp/mnesia_vcore_elem.TMP",
                          file:write_file(Fname, B),
                          mnesia_log:view(Fname),
                          file:delete(Fname);
                      _ ->
                          show("~p~n", [Info])
                  end
          end,
    lists:foreach(Fun, BinList);

vcore_elem({crashinfo, {Format, Args}}) ->
    show(Format, Args);
vcore_elem({gvar, L}) ->
    show("~p~n", [lists:sort(L)]);
vcore_elem({transactions, Info}) ->
    mnesia_tm:display_info(user, Info);

vcore_elem({_Item, Info}) ->
    show("~p~n", [Info]).
%% Remembers the error as last_error and reduces it to its reason.
%% An 'EXIT' whose location is in a module with a name starting with
%% "mne" is saved in the report buffer and reported as badarg.
fix_error(X) ->
    set(last_error, X), %% for debugabililty
    case X of
        {aborted, Reason} -> Reason;
        {abort, Reason} -> Reason;
        Y when is_atom(Y) -> Y;
        {'EXIT', {_Reason, {Mod, _, _}}} when is_atom(Mod) ->
            save(X),
            case atom_to_list(Mod) of
                [$m, $n, $e|_] -> badarg;
                _ -> X
            end;
        _ -> X
    end.

last_error() ->
    val(last_error).

%% The following is a list of possible mnesia errors and what they
%% actually mean

error_desc(nested_transaction) -> "Nested transactions are not allowed";
error_desc(badarg) -> "Bad or invalid argument, possibly bad type";
error_desc(no_transaction) -> "Operation not allowed outside transactions";
error_desc(combine_error) -> "Table options were ilegally combined";
error_desc(bad_index) -> "Index already exists or was out of bounds";
error_desc(already_exists) -> "Some schema option we try to set is already on";
error_desc(index_exists)-> "Some ops can not be performed on tabs with index";
error_desc(no_exists)-> "Tried to perform op on non-existing (non alive) item";
error_desc(system_limit) -> "Some system_limit was exhausted";
%% The adjacent literals are concatenated by the compiler, so each
%% one has to carry its own separating space.
error_desc(mnesia_down) -> "A transaction involving objects at some remote "
                           "node which died while transaction was executing "
                           "*and* object(s) are no longer available elsewhere "
                           "in the network";
error_desc(not_a_db_node) -> "A node which is non existant in "
                              "the schema was mentioned";
error_desc(bad_type) -> "Bad type on some provided arguments";
error_desc(node_not_running) -> "Node not running";
error_desc(truncated_binary_file) -> "Truncated binary in file";
error_desc(active) -> "Some delete ops require that "
                      "all active objects are removed";
error_desc(illegal) -> "Operation not supported on object";
error_desc({'EXIT', Reason}) ->
    error_desc(Reason);
error_desc({error, Reason}) ->
    error_desc(Reason);
error_desc({aborted, Reason}) ->
    error_desc(Reason);
%% For other tuples only the first element is described.
error_desc(Reason) when is_tuple(Reason), size(Reason) > 0 ->
    setelement(1, Reason, error_desc(element(1, Reason)));
error_desc(Reason) ->
    Reason.

dirty_rpc_error_tag(Reason) ->
    case Reason of
        {'EXIT', _} -> badarg;
        no_variable -> badarg;
        _ -> no_exists
    end.

%% Reports a fatal event together with a core dump, waits, and then
%% kills the local Mnesia and exits the caller.
fatal(Format, Args) ->
    catch set(mnesia_status, stopping),
    Core = mkcore({crashinfo, {Format, Args}}),
    report_fatal(Format, Args, Core),
    timer:sleep(10000), % Enough to write the core dump to disc?
    mnesia:lkill(),
    exit(fatal).

report_fatal(Format, Args) ->
    report_fatal(Format, Args, nocore).

report_fatal(Format, Args, Core) ->
    report_system_event({mnesia_fatal, Format, Args, Core}),
    catch exit(whereis(mnesia_monitor), fatal).

%% We sleep longer and longer the more we try
%% Made some testing and came up with the following constants
%%
%% Returns Retries*Retries plus a random number in 1..MaxIntv, where
%% MaxIntv approaches 500 as Retries grows. The process is seeded on
%% first use. MaxIntv is 0 when Retries is 0; random:uniform/1 needs
%% an argument of at least 1, so the interval is clamped.
random_time(Retries, _Counter0) ->
%    UpperLimit = 2000,
%    MaxIntv = trunc(UpperLimit * (1-(4/((Retries*Retries)+4)))),
    UpperLimit = 500,
    Dup = Retries * Retries,
    MaxIntv = trunc(UpperLimit * (1-(50/((Dup)+50)))),

    case get(random_seed) of
        undefined ->
            {X, Y, Z} = erlang:now(), %% time()
            random:seed(X, Y, Z);
        _ ->
            ok
    end,
    Time = Dup + random:uniform(erlang:max(1, MaxIntv)),
    %%	dbg_out("---random_test rs ~w max ~w val ~w---~n", [Retries, MaxIntv, Time]),
    Time.

%% Sends the event to the mnesia_event handler and to all subscribers.
report_system_event(Event0) ->
    Event = {mnesia_system_event, Event0},
    report_system_event(catch_notify(Event), Event),
    case ?catch_val(subscribers) of
        {'EXIT', _} -> ignore;
        Pids -> lists:foreach(fun(Pid) -> Pid ! Event end, Pids)
    end,
    ok.

%% Returns an 'EXIT' tuple instead of notifying when mnesia_event
%% is not registered.
catch_notify(Event) ->
    case whereis(mnesia_event) of
        undefined ->
            {'EXIT', {badarg, {mnesia_event, Event}}};
        Pid ->
            gen_event:notify(Pid, Event)
    end.
%% Second stage of report_system_event/1. When the notification failed,
%% a temporary event manager is started to deliver the event and is
%% stopped again afterwards.
report_system_event({'EXIT', Reason}, Event) ->
    Mod = mnesia_monitor:get_env(event_module),
    case mnesia_sup:start_event() of
        {ok, Pid} ->
            link(Pid),
            gen_event:call(mnesia_event, Mod, Event, infinity),
            unlink(Pid),

            %% We get an exit signal if server dies
            receive
                {'EXIT', Pid, _Reason} ->
                    {error, {node_not_running, node()}}
            after 0 ->
                    gen_event:stop(mnesia_event),
                    ok
            end;

        Error ->
            Msg = "Mnesia(~p): Cannot report event ~p: ~p (~p)~n",
            error_logger:format(Msg, [node(), Event, Reason, Error])
    end;
report_system_event(_Res, _Event) ->
    ignore.

%% important messages are reported regardless of debug level
important(Format, Args) ->
    save({Format, Args}),
    report_system_event({mnesia_info, Format, Args}).

%% Warning messages are reported regardless of debug level
warning(Format, Args) ->
    save({Format, Args}),
    report_system_event({mnesia_warning, Format, Args}).

%% error messages are reported regardless of debug level
error(Format, Args) ->
    save({Format, Args}),
    report_system_event({mnesia_error, Format, Args}).

%% verbose messages are only saved when the debug level is none;
%% at verbose, debug and trace they are reported like important ones
verbose(Format, Args) ->
    case mnesia_monitor:get_env(debug) of
        none ->
            save({Format, Args});
        Level when Level == verbose; Level == debug; Level == trace ->
            important(Format, Args)
    end.

%% debug messages are dropped when the debug level is none, only saved
%% when it is verbose, and otherwise reported as mnesia_info events
dbg_out(Format, Args) ->
    case mnesia_monitor:get_env(debug) of
        none -> ignore;
        verbose -> save({Format, Args});
        _ -> report_system_event({mnesia_info, Format, Args})
    end.

%% Keep the most recent debug print outs in mnesia_gvar.
%% Failures (e.g. a missing table) are silently ignored.
save(DbgInfo) ->
    catch save2(DbgInfo).

%% The print outs are stored under {'$$$_report', Pos}; the position
%% after 30 is 0, so older entries are overwritten.
save2(DbgInfo) ->
    PosKey = {'$$$_report', current_pos},
    Prev =
        case ?ets_lookup_element(mnesia_gvar, PosKey, 2) of
            30 -> -1;
            I -> I
        end,
    Next = Prev + 1,
    set({'$$$_report', current_pos}, Next),
    set({'$$$_report', Next}, {date(), time(), DbgInfo}).
%% Copies the file From to To in chunks of 8000 bytes.
%% Returns ok, or {error, Reason} when a file cannot be opened or a
%% read or write fails. Both files are closed before returning.
copy_file(From, To) ->
    case file:open(From, [raw, binary, read]) of
        {ok, F} ->
            case file:open(To, [raw, binary, write]) of
                {ok, T} ->
                    Res = copy_file_loop(F, T, 8000),
                    file:close(F),
                    file:close(T),
                    Res;
                {error, Reason} ->
                    %% Do not leak the already opened source file
                    file:close(F),
                    {error, Reason}
            end;
        {error, Reason} ->
            {error, Reason}
    end.

%% Stops at end of file or at the first failing read or write.
copy_file_loop(F, T, ChunkSize) ->
    case file:read(F, ChunkSize) of
        {ok, Bin} ->
            %% A failed write must not be reported as a successful copy
            case file:write(T, Bin) of
                ok ->
                    copy_file_loop(F, T, ChunkSize);
                {error, Reason} ->
                    {error, Reason}
            end;
        eof ->
            ok;
        {error, Reason} ->
            {error, Reason}
    end.


%%%%%%%%%%%%
%% versions of all the lowlevel db funcs that determine whether we
%% shall go to disc or ram to do the actual operation.

%% ram_copies and disc_copies tables are accessed through ets,
%% disc_only_copies tables through dets.
db_get(Tab, Key) ->
    db_get(val({Tab, storage_type}), Tab, Key).
db_get(ram_copies, Tab, Key) -> ?ets_lookup(Tab, Key);
db_get(disc_copies, Tab, Key) -> ?ets_lookup(Tab, Key);
db_get(disc_only_copies, Tab, Key) -> dets:lookup(Tab, Key).

%% Starts a chunked traversal of all objects, 1000 per chunk by default.
db_init_chunk(Tab) ->
    db_init_chunk(val({Tab, storage_type}), Tab, 1000).
db_init_chunk(Tab, N) ->
    db_init_chunk(val({Tab, storage_type}), Tab, N).

db_init_chunk(disc_only_copies, Tab, N) ->
    dets:select(Tab, [{'_', [], ['$_']}], N);
db_init_chunk(_, Tab, N) ->
    ets:select(Tab, [{'_', [], ['$_']}], N).

%% Continues a traversal started with db_init_chunk.
db_chunk(disc_only_copies, State) ->
    dets:select(State);
db_chunk(_, State) ->
    ets:select(State).

db_put(Tab, Val) ->
    db_put(val({Tab, storage_type}), Tab, Val).

db_put(ram_copies, Tab, Val) -> ?ets_insert(Tab, Val), ok;
db_put(disc_copies, Tab, Val) -> ?ets_insert(Tab, Val), ok;
db_put(disc_only_copies, Tab, Val) -> dets:insert(Tab, Val).

%% The table is fixed during the match. A failure is caught so that
%% the table is released before the caller is exited with the reason.
db_match_object(Tab, Pat) ->
    db_match_object(val({Tab, storage_type}), Tab, Pat).
db_match_object(Storage, Tab, Pat) ->
    db_fixtable(Storage, Tab, true),
    Res = catch_match_object(Storage, Tab, Pat),
    db_fixtable(Storage, Tab, false),
    case Res of
        {'EXIT', Reason} -> exit(Reason);
        _ -> Res
    end.
%% match_object on the proper backend with failures caught.
catch_match_object(disc_only_copies, Tab, Pat) ->
    catch dets:match_object(Tab, Pat);
catch_match_object(_, Tab, Pat) ->
    catch ets:match_object(Tab, Pat).

db_select(Tab, Pat) ->
    db_select(val({Tab, storage_type}), Tab, Pat).

%% The table is fixed while selecting. A failure is caught so that the
%% table is released before the caller is exited with the reason.
db_select(Storage, Tab, Pat) ->
    db_fixtable(Storage, Tab, true),
    Outcome = catch_select(Storage, Tab, Pat),
    db_fixtable(Storage, Tab, false),
    case Outcome of
        {'EXIT', Reason} -> exit(Reason);
        _ -> Outcome
    end.

catch_select(disc_only_copies, Tab, Pat) ->
    catch dets:select(Tab, Pat);
catch_select(_, Tab, Pat) ->
    catch ets:select(Tab, Pat).

%% Starts a select that returns at most Limit objects at a time.
db_select_init(disc_only_copies, Tab, Pat, Limit) ->
    dets:select(Tab, Pat, Limit);
db_select_init(_, Tab, Pat, Limit) ->
    ets:select(Tab, Pat, Limit).

%% Continues a limited select; the continuation is repaired with the
%% match specification before it is used.
db_select_cont(disc_only_copies, Cont0, Ms) ->
    dets:select(dets:repair_continuation(Cont0, Ms));
db_select_cont(_, Cont0, Ms) ->
    ets:select(ets:repair_continuation(Cont0, Ms)).

%% Accepts both storage types and the backend names ets and dets.
db_fixtable(Storage, Tab, Bool)
  when Storage == ets; Storage == ram_copies; Storage == disc_copies ->
    ets:safe_fixtable(Tab, Bool);
db_fixtable(Storage, Tab, Bool)
  when Storage == dets; Storage == disc_only_copies ->
    dets:safe_fixtable(Tab, Bool).

db_erase(Tab, Key) ->
    db_erase(val({Tab, storage_type}), Tab, Key).
db_erase(disc_only_copies, Tab, Key) ->
    dets:delete(Tab, Key);
db_erase(Storage, Tab, Key)
  when Storage == ram_copies; Storage == disc_copies ->
    ?ets_delete(Tab, Key),
    ok.

db_match_erase(Tab, Pat) ->
    db_match_erase(val({Tab, storage_type}), Tab, Pat).
db_match_erase(disc_only_copies, Tab, Pat) ->
    dets:match_delete(Tab, Pat);
db_match_erase(Storage, Tab, Pat)
  when Storage == ram_copies; Storage == disc_copies ->
    ?ets_match_delete(Tab, Pat),
    ok.

db_first(Tab) ->
    db_first(val({Tab, storage_type}), Tab).
db_first(disc_only_copies, Tab) ->
    dets:first(Tab);
db_first(Storage, Tab) when Storage == ram_copies; Storage == disc_copies ->
    ?ets_first(Tab).

db_next_key(Tab, Key) ->
    db_next_key(val({Tab, storage_type}), Tab, Key).
db_next_key(disc_only_copies, Tab, Key) ->
    dets:next(Tab, Key);
db_next_key(Storage, Tab, Key) when Storage == ram_copies; Storage == disc_copies ->
    ?ets_next(Tab, Key).

%% Dets don't have order, so "last" is the same as "first" there
db_last(Tab) ->
    db_last(val({Tab, storage_type}), Tab).
db_last(disc_only_copies, Tab) ->
    dets:first(Tab);
db_last(Storage, Tab) when Storage == ram_copies; Storage == disc_copies ->
    ?ets_last(Tab).

%% Dets don't have order, so "prev" is the same as "next" there
db_prev_key(Tab, Key) ->
    db_prev_key(val({Tab, storage_type}), Tab, Key).
db_prev_key(disc_only_copies, Tab, Key) ->
    dets:next(Tab, Key);
db_prev_key(Storage, Tab, Key) when Storage == ram_copies; Storage == disc_copies ->
    ?ets_prev(Tab, Key).

db_slot(Tab, Pos) ->
    db_slot(val({Tab, storage_type}), Tab, Pos).
db_slot(disc_only_copies, Tab, Pos) ->
    dets:slot(Tab, Pos);
db_slot(Storage, Tab, Pos) when Storage == ram_copies; Storage == disc_copies ->
    ?ets_slot(Tab, Pos).

db_update_counter(Tab, C, Val) ->
    db_update_counter(val({Tab, storage_type}), Tab, C, Val).
db_update_counter(disc_only_copies, Tab, C, Val) ->
    dets:update_counter(Tab, C, Val);
db_update_counter(Storage, Tab, C, Val)
  when Storage == ram_copies; Storage == disc_copies ->
    ?ets_update_counter(Tab, C, Val).

%% Deletes the ets table; nothing is done for disc_only_copies.
db_erase_tab(Tab) ->
    db_erase_tab(val({Tab, storage_type}), Tab).
db_erase_tab(disc_only_copies, _Tab) ->
    ignore;
db_erase_tab(Storage, Tab) when Storage == ram_copies; Storage == disc_copies ->
    ?ets_delete_table(Tab).
%% assuming that Tab is a valid ets-table
%%
%% Opens File as the dets table Tabname, copies its objects into the
%% ets table Tab and closes it again. Returns 'loaded' when the copy
%% returned Tab, otherwise whatever the open or the copy returned.
%% Lock (yes | no) selects whether the table lock is taken around it.
dets_to_ets(Tabname, Tab, File, Type, Rep, Lock) ->
    {Open, Close} = mkfuns(Lock),
    DetsArgs = [{file, File}, {type, disk_type(Tab, Type)},
                {keypos, 2}, {repair, Rep}],
    case Open(Tabname, DetsArgs) of
        {ok, Tabname} ->
            Copied = dets:to_ets(Tabname, Tab),
            Close(Tabname),
            trav_ret(Copied, Tab);
        Other ->
            Other
    end.

trav_ret(Tabname, Tabname) -> loaded;
trav_ret(Other, _Tabname) -> Other.

%% Open and close funs, with (yes) or without (no) table locking.
mkfuns(yes) ->
    {fun dets_sync_open/2, fun dets_sync_close/1};
mkfuns(no) ->
    {fun(Tab, Args) -> dets:open_file(Tab, Args) end,
     fun(Tab) -> dets:close(Tab) end}.

%% Table type to use on disc: ordered_set tables are stored as set.
disk_type(Tab) ->
    disk_type(Tab, val({Tab, setorbag})).

disk_type(_Tab, ordered_set) ->
    set;
disk_type(_, Type) ->
    Type.

%% Opens File as dets table Ref, with the type taken from Tab.
dets_sync_open(Tab, Ref, File) ->
    Repair = mnesia_monitor:get_env(auto_repair),
    Args = [{file, File},
            {keypos, 2},
            {repair, Repair},
            {type, disk_type(Tab)}],
    dets_sync_open(Ref, Args).

%% Node-local global lock with the calling process as requester.
lock_table(Tab) ->
    global:set_lock({{mnesia_table_lock, Tab}, self()}, [node()], infinity).
%    dbg_out("dets_sync_open: ~p ~p~n", [T, self()]),

unlock_table(Tab) ->
    global:del_lock({{mnesia_table_lock, Tab}, self()}, [node()]).
%    dbg_out("unlock_table: ~p ~p~n", [T, self()]),

%% Takes the table lock and opens the dets table. The lock is kept on
%% success and released again when the open fails.
dets_sync_open(Tab, Args) ->
    lock_table(Tab),
    case dets:open_file(Tab, Args) of
        {ok, Tab} ->
            {ok, Tab};
        Other ->
            dets_sync_close(Tab),
            Other
    end.

%% Closes the dets table (ignoring failures) and releases the lock.
dets_sync_close(Tab) ->
    catch dets:close(Tab),
    unlock_table(Tab),
    ok.

readable_indecies(Tab) ->
    val({Tab, index}).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Managing conditional debug functions
%%
%% The main idea with the debug_fun's is to allow test programs
%% to control the internal behaviour of Mnesia. This is needed
%% to make the test programs independent of system load, swapping
%% and other circumstances that may affect the behaviour of Mnesia.
%%
%% First, calls to ?eval_debug_fun should be inserted at well
%% defined places in Mnesia's code. E.g. in critical situations
%% of startup, transaction commit, backups etc.
%%
%% Then compile Mnesia with the compiler option 'debug'.
%%
%% In test programs ?activate_debug_fun should be called
%% in order to bind a fun to the debug identifier stated
%% in the call to ?eval_debug_fun.
%%
%% If eval_debug_fun finds that the fun is activated it
%% invokes the fun as NewContext = Fun(PreviousContext, EvalContext)
%% and replaces the PreviousContext with the NewContext.
%% The initial context of a debug_fun is given as argument to
%% activate_debug_fun.

-define(DEBUG_TAB, mnesia_debug).
-record(debug_info, {id, function, context, file, line}).

%% (Re)creates an empty debug table keyed on the id field.
scratch_debug_fun() ->
    dbg_out("scratch_debug_fun(): ~p~n", [?DEBUG_TAB]),
    (catch ?ets_delete_table(?DEBUG_TAB)),
    ?ets_new_table(?DEBUG_TAB, [set, public, named_table, {keypos, 2}]).

%% Binds Fun and its initial context to the identifier FunId.
activate_debug_fun(FunId, Fun, InitialContext, File, Line) ->
    update_debug_info(#debug_info{id = FunId,
                                  function = Fun,
                                  context = InitialContext,
                                  file = File,
                                  line = Line}).

%% Stores the record; the table is created first if the insert fails.
update_debug_info(Info) ->
    case catch ?ets_insert(?DEBUG_TAB, Info) of
        {'EXIT', _} ->
            scratch_debug_fun(),
            ?ets_insert(?DEBUG_TAB, Info);
        _ ->
            ok
    end,
    dbg_out("update_debug_info(~p)~n", [Info]),
    ok.

%% Removes the binding of FunId; a missing table is ignored.
deactivate_debug_fun(FunId, _File, _Line) ->
    catch ?ets_delete(?DEBUG_TAB, FunId),
    ok.
%% Runs the fun bound to FunId, if any, as Fun(OldContext, EvalContext).
%% The returned context is stored when it differs from the old one and
%% the stored record is still the one that was read before the call.
%% Nothing happens when no fun is bound or the debug table is missing.
eval_debug_fun(FunId, EvalContext, EvalFile, EvalLine) ->
    case catch ?ets_lookup(?DEBUG_TAB, FunId) of
        [] ->
            ok;
        [Info] ->
            OldContext = Info#debug_info.context,
            EvalAt = filename:basename(EvalFile),
            ActivatedAt = filename:basename(Info#debug_info.file),
            dbg_out("~s(~p): ~w "
                    "activated in ~s(~p)~n "
                    "eval_debug_fun(~w, ~w)~n",
                    [EvalAt, EvalLine, Info#debug_info.id,
                     ActivatedAt, Info#debug_info.line,
                     OldContext, EvalContext]),
            Fun = Info#debug_info.function,
            NewContext = Fun(OldContext, EvalContext),

            case catch ?ets_lookup(?DEBUG_TAB, FunId) of
                [Info] when NewContext /= OldContext ->
                    update_debug_info(Info#debug_info{context = NewContext});
                _ ->
                    ok
            end;
        {'EXIT', _} ->
            ok
    end.

%% Tells whether the module was compiled with the 'debug' macro defined.
-ifdef(debug).
    is_debug_compiled() -> true.
-else.
    is_debug_compiled() -> false.
-endif.


diff --git a/lib/mnesia/src/mnesia_loader.erl b/lib/mnesia/src/mnesia_loader.erl
new file mode 100644
index 0000000000..77c317abc5
--- /dev/null
+++ b/lib/mnesia/src/mnesia_loader.erl
@@ -0,0 +1,828 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1998-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
%%% Purpose : Loads tables from local disc or from remote node

-module(mnesia_loader).

%% Mnesia internal stuff
-export([disc_load_table/2,
         net_load_table/4,
         send_table/3]).

-export([old_node_init_table/6]).
%% Spawned old node protocol conversion hack
-export([spawned_receiver/8]).    %% Spawned lock taking process

-import(mnesia_lib, [set/2, fatal/2, verbose/2, dbg_out/2]).

-include("mnesia.hrl").

%% Reads a global variable; a failed lookup is delegated to
%% mnesia_lib:other_val/2.
val(Var) ->
    case ?catch_val(Var) of
        {'EXIT', Why} -> mnesia_lib:other_val(Var, Why);
        Found -> Found
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Load a table from local disc

disc_load_table(Tab, Reason) ->
    Storage = val({Tab, storage_type}),
    Type = val({Tab, setorbag}),
    dbg_out("Getting table ~p (~p) from disc: ~p~n",
            [Tab, Storage, Reason]),
    ?eval_debug_fun({?MODULE, do_get_disc_copy},
                    [{tab, Tab},
                     {reason, Reason},
                     {storage, Storage},
                     {type, Type}]),
    do_get_disc_copy2(Tab, Reason, Storage, Type).

%% One clause per local storage type. When the reason is {dumper, _}
%% the table resources already exist and only the bookkeeping
%% (index, snmp, load_node, load_reason) is done.
do_get_disc_copy2(Tab, _Reason, unknown, _Type) ->
    verbose("Local table copy of ~p has recently been deleted, ignored.~n",
            [Tab]),
    {loaded, ok};  %% ?
do_get_disc_copy2(Tab, Reason, disc_copies = Storage, Type) ->
    %% NOW we create the actual table
    Repair = mnesia_monitor:get_env(auto_repair),
    Args = [{keypos, 2}, public, named_table, Type],
    case Reason of
        {dumper, _} -> %% Resources allready allocated
            ignore;
        _ ->
            mnesia_monitor:mktab(Tab, Args),
            Count = mnesia_log:dcd2ets(Tab, Repair),
            %% Dump the table again when the value returned by dcd2ets
            %% times four exceeds the number of objects in the table
            case ets:info(Tab, size) of
                X when X < Count * 4 ->
                    ok = mnesia_log:ets2dcd(Tab);
                _ ->
                    ignore
            end
    end,
    mnesia_index:init_index(Tab, Storage),
    snmpify(Tab, Storage),
    set({Tab, load_node}, node()),
    set({Tab, load_reason}, Reason),
    {loaded, ok};

do_get_disc_copy2(Tab, Reason, ram_copies = Storage, Type) ->
    Args = [{keypos, 2}, public, named_table, Type],
    case Reason of
        {dumper, _} -> %% Resources allready allocated
            ignore;
        _ ->
            mnesia_monitor:mktab(Tab, Args),
            Fname = mnesia_lib:tab2dcd(Tab),
            Datname = mnesia_lib:tab2dat(Tab),
            Repair = mnesia_monitor:get_env(auto_repair),
            %% When a directory is used, initial contents are taken from
            %% the .DCD file or else from the .DAT file, if present
            case mnesia_monitor:use_dir() of
                true ->
                    case mnesia_lib:exists(Fname) of
                        true ->
                            mnesia_log:dcd2ets(Tab, Repair);
                        false ->
                            case mnesia_lib:exists(Datname) of
                                true ->
                                    mnesia_lib:dets_to_ets(Tab, Tab, Datname,
                                                           Type, Repair, no);
                                false ->
                                    false
                            end
                    end;
                false ->
                    false
            end
    end,
    mnesia_index:init_index(Tab, Storage),
    snmpify(Tab, Storage),
    set({Tab, load_node}, node()),
    set({Tab, load_reason}, Reason),
    {loaded, ok};

do_get_disc_copy2(Tab, Reason, disc_only_copies = Storage, Type) ->
    Args = [{file, mnesia_lib:tab2dat(Tab)},
            {type, mnesia_lib:disk_type(Tab, Type)},
            {keypos, 2},
            {repair, mnesia_monitor:get_env(auto_repair)}],
    case Reason of
        {dumper, _} ->
            mnesia_index:init_index(Tab, Storage),
            snmpify(Tab, Storage),
            set({Tab, load_node}, node()),
            set({Tab, load_reason}, Reason),
            {loaded, ok};
        _ ->
            case mnesia_monitor:open_dets(Tab, Args) of
                {ok, _} ->
                    mnesia_index:init_index(Tab, Storage),
                    snmpify(Tab, Storage),
                    set({Tab, load_node}, node()),
                    set({Tab, load_reason}, Reason),
                    {loaded, ok};
                {error, Error} ->
                    {not_loaded, {"Failed to create dets table", Error}}
            end
    end.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Load a table from a remote node
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%% Receiver Sender
%% -------- ------
%% Grab schema lock on table
%% Determine table size
%% Create empty pre-grown table
%% Grab read lock on table
%% Let receiver subscribe on updates done on sender node
%% Disable rehashing of table
%% Release read lock on table
%% Send table to receiver in chunks
%%
%% Grab read lock on table
%% Block dirty updates
%% Update whereabouts
%%
%% Cancel the update subscription
%% Process the subscription events
%% Optionally dump to disc
%% Unblock dirty updates
%% Release read lock on table
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

-define(MAX_TRANSFER_SIZE, 7500).
-define(MAX_RAM_FILE_SIZE, 1000000).
-define(MAX_RAM_TRANSFERS, (?MAX_RAM_FILE_SIZE div ?MAX_TRANSFER_SIZE) + 1).
-define(MAX_NOPACKETS, 20).

%% The supplied cstruct is only used when the table is loaded because
%% of add_table_copy; otherwise the stored one is used.
net_load_table(Tab, {dumper,add_table_copy} = Reason, Ns, Cs) ->
    try_net_load_table(Tab, Reason, Ns, Cs);
net_load_table(Tab, Reason, Ns, _Cs) ->
    try_net_load_table(Tab, Reason, Ns, val({Tab, cstruct})).

%% Gives up when there are no nodes left to try, otherwise attempts a
%% copy using the local storage type given by the cstruct.
try_net_load_table(Tab, _Reason, [], _Cs) ->
    verbose("Copy failed. No active replicas of ~p are available.~n", [Tab]),
    {not_loaded, none_active};
try_net_load_table(Tab, Reason, Ns, Cs) ->
    LocalStorage = mnesia_lib:cs_to_storage_type(node(), Cs),
    do_get_network_copy(Tab, Reason, Ns, LocalStorage, Cs).
%% Fetch a copy of Tab from the first node in Ns.
%%
%% A storage type of `unknown' means there is nothing to load and
%% {not_loaded, storage_unknown} is returned. Otherwise the head of Ns
%% is used if it is a member of {current, db_nodes}; a node that is not
%% is skipped. The result of init_receiver/5 decides what happens next:
%%   ok         -> load_node/load_reason are stored, {loaded, ok}
%%   {error, _} -> {not_loaded, Err}; only matched for dumper reasons
%%   restart    -> retry with Node moved to the end of the list
%%   down       -> retry with the remaining nodes only
do_get_network_copy(Tab, _Reason, _Ns, unknown, _Cs) ->
    verbose("Local table copy of ~p has recently been deleted, ignored.~n", [Tab]),
    {not_loaded, storage_unknown};
do_get_network_copy(Tab, Reason, Ns, Storage, Cs) ->
    [Node | Tail] = Ns,
    case lists:member(Node,val({current, db_nodes})) of
        true ->
            dbg_out("Getting table ~p (~p) from node ~p: ~p~n",
                    [Tab, Storage, Node, Reason]),
            ?eval_debug_fun({?MODULE, do_get_network_copy},
                            [{tab, Tab}, {reason, Reason},
                             {nodes, Ns}, {storage, Storage}]),
            case init_receiver(Node, Tab, Storage, Cs, Reason) of
                ok ->
                    set({Tab, load_node}, Node),
                    set({Tab, load_reason}, Reason),
                    mnesia_controller:i_have_tab(Tab),
                    dbg_out("Table ~p copied from ~p to ~p~n", [Tab, Node, node()]),
                    {loaded, ok};
                Err = {error, _} when element(1, Reason) == dumper ->
                    {not_loaded,Err};
                restart ->
                    try_net_load_table(Tab, Reason, Tail ++ [Node], Cs);
                down ->
                    try_net_load_table(Tab, Reason, Tail, Cs)
            end;
        false ->
            try_net_load_table(Tab, Reason, Tail, Cs)
    end.

%% Create the snmp index for Tab if the table has snmp settings.
snmpify(Tab, Storage) ->
    do_snmpify(Tab, val({Tab, snmp}), Storage).

%% An empty snmp setting means there is nothing to do (`ignore').
%% Otherwise the value returned by mnesia_snmp_hook:create_table/3 is
%% stored under {Tab, {index, snmp}}.
do_snmpify(_Tab, [], _Storage) ->
    ignore;
do_snmpify(Tab, Us, Storage) ->
    Snmp = mnesia_snmp_hook:create_table(Us, Tab, Storage),
    set({Tab, {index, snmp}}, Snmp).

%% Start the receiver
%%
%% First clause ({dumper, add_table_copy}): the sender is started and
%% the table is received directly in the calling process; anything
%% other than the {SenderPid, TabSize, DetsData} triple returned by
%% start_remote_sender/3 is passed back unchanged.
%%
%% Second clause (all other reasons): the load runs inside a
%% transaction, in a linked process started with spawned_receiver/8.
%% Returns whatever the receiver reported, {error, Result} (dumper
%% reasons only), `restart' or `down'.
init_receiver(Node, Tab, Storage, Cs, Reas={dumper,add_table_copy}) ->
    case start_remote_sender(Node, Tab, Storage) of
        {SenderPid, TabSize, DetsData} ->
            start_receiver(Tab,Storage,Cs,SenderPid,TabSize,DetsData,Reas);
        Else ->
            Else
    end;
init_receiver(Node, Tab,Storage,Cs,Reason) ->
    %% Grab a schema lock to avoid deadlock between table_loader and schema_commit dumping.
    %% Both may grab table locks in different order.
    Load =
        fun() ->
                {_,Tid,Ts} = get(mnesia_activity_state),
                mnesia_locker:rlock(Tid, Ts#tidstore.store, {schema, Tab}),
                %% Check that table still exists
                Active = val({Tab, active_replicas}),
                %% Check that we haven't loaded it already
                case val({Tab,where_to_read}) == node() of
                    true -> ok;
                    _ ->
                        %% And that sender still got a copy
                        %% (something might have happened while
                        %% we were waiting for the lock)
                        true = lists:member(Node, Active),
                        {SenderPid, TabSize, DetsData} =
                            start_remote_sender(Node,Tab,Storage),
                        Init = table_init_fun(SenderPid),
                        Args = [self(),Tab,Storage,Cs,SenderPid,
                                TabSize,DetsData,Init],
                        Pid = spawn_link(?MODULE, spawned_receiver, Args),
                        %% Remember the real loader in the process dictionary
                        put(mnesia_real_loader, Pid),
                        wait_on_load_complete(Pid)
                end
        end,
    Res =
        %% NOTE(review): the second argument (20) is presumably the
        %% retry limit of mnesia:transaction/2 -- confirm.
        case mnesia:transaction(Load, 20) of
            {atomic, {error,Result}} when
                  element(1,Reason) == dumper ->
                {error,Result};
            {atomic, {error,Result}} ->
                fatal("Cannot create table ~p: ~p~n",
                      [[Tab, Storage], Result]);
            {atomic, Result} -> Result;
            {aborted, nomore} -> restart;
            {aborted, _Reas} ->
                verbose("Receiver failed on ~p from ~p:~nReason: ~p~n",
                        [Tab,Node,_Reas]),
                down %% either this node or sender is dying
        end,
    unlink(whereis(mnesia_tm)), %% Avoid late unlink from tm
    Res.

%% Ask mnesia_controller to start a sender for Tab on Node and wait for
%% its first message. Returns {SenderPid, TabSize, DetsData}, where
%% DetsData is `false' when the sender did not include any. If
%% {copier_done, Node} arrives instead, the local copy is torn down
%% with down/2 and its result (`down') is returned.
start_remote_sender(Node,Tab,Storage) ->
    mnesia_controller:start_remote_sender(Node, Tab, self(), Storage),
    put(mnesia_table_sender_node, {Tab, Node}),
    receive
        {SenderPid, {first, TabSize}} ->
            {SenderPid, TabSize, false};
        {SenderPid, {first, TabSize, DetsData}} ->
            {SenderPid, TabSize, DetsData};
        %% Protocol conversion hack
        {copier_done, Node} ->
            verbose("Sender of table ~p crashed on node ~p ~n", [Tab, Node]),
            down(Tab, Storage)
    end.

%% Build the fun handed to the table initialisation as its data
%% source. When called with `read' it asks the sender for more data and
%% waits for it with get_data/2. If the sender's node needs protocol
%% conversion, the process that created the fun is told which process
%% is actually reading ({actual_tabrec, self()}) and the creator is
%% used as the receiver address towards the sender.
table_init_fun(SenderPid) ->
    PConv = mnesia_monitor:needs_protocol_conversion(node(SenderPid)),
    MeMyselfAndI = self(),
    fun(read) ->
            Receiver =
                if
                    PConv == true ->
                        MeMyselfAndI ! {actual_tabrec, self()},
                        MeMyselfAndI; %% Old mnesia
                    PConv == false -> self()
                end,
            SenderPid ! {Receiver, more},
            get_data(SenderPid, Receiver)
    end.

%% Add_table_copy gets its own locks.
%% Receives the table in the calling process. On {error, _} the sender
%% is notified with {copier_done, node()} before the error is returned.
start_receiver(Tab,Storage,Cs,SenderPid,TabSize,DetsData,{dumper,add_table_copy}) ->
    Init = table_init_fun(SenderPid),
    case do_init_table(Tab,Storage,Cs,SenderPid,TabSize,DetsData,self(), Init) of
        Err = {error, _} ->
            SenderPid ! {copier_done, node()},
            Err;
        Else ->
            Else
    end.

%% Body of the linked receiver process started by init_receiver/5.
%% Traps exits, receives the table, reports {self(), Done} to ReplyTo
%% and then unlinks from ReplyTo and mnesia_controller before exiting
%% normally.
spawned_receiver(ReplyTo,Tab,Storage,Cs, SenderPid,TabSize,DetsData, Init) ->
    process_flag(trap_exit, true),
    Done = do_init_table(Tab,Storage,Cs,
                         SenderPid,TabSize,DetsData,
                         ReplyTo, Init),
    ReplyTo ! {self(),Done},
    unlink(ReplyTo),
    unlink(whereis(mnesia_controller)),
    exit(normal).

%% Wait for the result of the receiver process Pid. An 'EXIT' from Pid
%% makes the caller exit with the same reason. Every other message is
%% forwarded to Pid.
wait_on_load_complete(Pid) ->
    receive
        {Pid, Res} ->
            Res;
        {'EXIT', Pid, Reason} ->
            exit(Reason);
        Else ->
            Pid ! Else,
            wait_on_load_complete(Pid)
    end.

%% Create the local table, block it in mnesia_tm and fill it from the
%% sender. A failing create_table/4 result is returned unchanged. If
%% init_table/6 returns anything but `ok' the copy is torn down with
%% down/2.
do_init_table(Tab,Storage,Cs,SenderPid,
              TabSize,DetsInfo,OrigTabRec,Init) ->
    case create_table(Tab, TabSize, Storage, Cs) of
        {Storage,Tab} ->
            %% Debug info
            Node = node(SenderPid),
            put(mnesia_table_receiver, {Tab, Node, SenderPid}),
            mnesia_tm:block_tab(Tab),
            PConv = mnesia_monitor:needs_protocol_conversion(Node),

            case init_table(Tab,Storage,Init,PConv,DetsInfo,SenderPid) of
                ok ->
                    tab_receiver(Node,Tab,Storage,Cs,PConv,OrigTabRec);
                Reason ->
                    Msg = "[d]ets:init table failed",
                    verbose("~s: ~p: ~p~n", [Msg, Tab, Reason]),
                    down(Tab, Storage)
            end;
        Error ->
            Error
    end.
%% Create the empty local table that will receive the copy.
%%
%% disc_only_copies: a dets table is opened on the temporary file
%% (mnesia_lib:tab2tmp/1), sized for at least 256 objects, while the
%% table is locked with mnesia_lib:lock_table/1.
%% ram_copies / disc_copies: a public named ets table is created.
%% Returns {Storage, Tab} on success, otherwise the failing result of
%% the underlying create/open call.
create_table(Tab, TabSize, Storage, Cs) ->
    if
        Storage == disc_only_copies ->
            mnesia_lib:lock_table(Tab),
            Tmp = mnesia_lib:tab2tmp(Tab),
            Size = lists:max([TabSize, 256]),
            Args = [{file, Tmp},
                    {keypos, 2},
%%                  {ram_file, true},
                    {estimated_no_objects, Size},
                    {repair, mnesia_monitor:get_env(auto_repair)},
                    {type, mnesia_lib:disk_type(Tab, Cs#cstruct.type)}],
            %% Remove any leftover temporary file before opening
            file:delete(Tmp),
            case mnesia_lib:dets_sync_open(Tab, Args) of
                {ok, _} ->
                    mnesia_lib:unlock_table(Tab),
                    {Storage, Tab};
                Else ->
                    mnesia_lib:unlock_table(Tab),
                    Else
            end;
        (Storage == ram_copies) or (Storage == disc_copies) ->
            Args = [{keypos, 2}, public, named_table, Cs#cstruct.type],
            case mnesia_monitor:unsafe_mktab(Tab, Args) of
                Tab ->
                    {Storage, Tab};
                Else ->
                    Else
            end
    end.

%% Receive loop that runs after the table initialisation has been
%% started. PConv is `false' for the current protocol; for senders that
%% need protocol conversion it starts as `true' and becomes the pid
%% announced in {actual_tabrec, Pid}, to which the sender's messages
%% are then forwarded. The loop ends with finish_copy/6 once the sender
%% reports no_more, or with down/2 when the sender or the init process
%% fails.
tab_receiver(Node, Tab, Storage, Cs, PConv, OrigTabRec) ->
    receive
        {SenderPid, {no_more, DatBin}} when PConv == false ->
            finish_copy(Storage,Tab,Cs,SenderPid,DatBin,OrigTabRec);

        %% Protocol conversion hack
        {SenderPid, {no_more, DatBin}} when is_pid(PConv) ->
            PConv ! {SenderPid, no_more},
            receive
                {old_init_table_complete, ok} ->
                    finish_copy(Storage, Tab, Cs, SenderPid, DatBin,OrigTabRec);
                {old_init_table_complete, Reason} ->
                    Msg = "OLD: [d]ets:init table failed",
                    verbose("~s: ~p: ~p~n", [Msg, Tab, Reason]),
                    down(Tab, Storage)
            end;

        %% The init fun tells us which process is really reading
        {actual_tabrec, Pid} ->
            tab_receiver(Node, Tab, Storage, Cs, Pid,OrigTabRec);

        {SenderPid, {more, [Recs]}} when is_pid(PConv) ->
            PConv ! {SenderPid, {more, Recs}}, %% Forward Msg to OldNodes
            tab_receiver(Node, Tab, Storage, Cs, PConv,OrigTabRec);

        {'EXIT', PConv, Reason} -> %% [d]ets:init process crashed
            Msg = "Receiver crashed",
            verbose("~s: ~p: ~p~n", [Msg, Tab, Reason]),
            down(Tab, Storage);

        %% Protocol conversion hack
        {copier_done, Node} ->
            verbose("Sender of table ~p crashed on node ~p ~n", [Tab, Node]),
            down(Tab, Storage);

        {'EXIT', Pid, Reason} ->
            handle_exit(Pid, Reason),
            tab_receiver(Node, Tab, Storage, Cs, PConv,OrigTabRec)
    end.

%% Continuation fun returned together with each batch of records:
%% `read' waits for the next batch from Pid, `close' is a no-op.
make_table_fun(Pid, TabRec) ->
    fun(close) ->
            ok;
       (read) ->
            get_data(Pid, TabRec)
    end.

%% Wait for the next message from the sender Pid.
%%   {more, Recs} -> acknowledge with {TabRec, more} and return
%%                   {Recs, NextFun}
%%   no_more      -> end_of_input
%%   {copier_done, Node} from the sender's node is returned as is;
%%                   from any other node it is ignored
%% 'EXIT' messages from Pid are passed to handle_exit/2.
get_data(Pid, TabRec) ->
    receive
        {Pid, {more, Recs}} ->
            Pid ! {TabRec, more},
            {Recs, make_table_fun(Pid,TabRec)};
        {Pid, no_more} ->
            end_of_input;
        {copier_done, Node} ->
            case node(Pid) of
                Node ->
                    {copier_done, Node};
                _ ->
                    get_data(Pid, TabRec)
            end;
        {'EXIT', Pid, Reason} ->
            handle_exit(Pid, Reason),
            get_data(Pid, TabRec)
    end.

%% Fill the local table using Fun as the data source.
%%
%% Clause 1 (disc_only_copies, no protocol conversion): the bchunk
%% format is only used when the sender's DetsInfo carries the same
%% erlang:system_info(version) as this node and dets reports the format
%% as compatible. In every other case where DetsInfo is not `false' the
%% sender is told to fall back with {old_protocol, Tab}.
%% Clause 2 (other storage types, no protocol conversion): uses
%% ets:init_table/2 and returns `ok' or the exit reason.
%% Clause 3 (protocol conversion needed): the initialisation runs in a
%% linked process (old_node_init_table/6) and `ok' is returned
%% immediately.
init_table(Tab, disc_only_copies, Fun, false, DetsInfo,Sender) ->
    ErtsVer = erlang:system_info(version),
    case DetsInfo of
        {ErtsVer, DetsData} ->
            Res = (catch dets:is_compatible_bchunk_format(Tab, DetsData)),
            case Res of
                {'EXIT',{undef,[{dets,_,_}|_]}} ->
                    Sender ! {self(), {old_protocol, Tab}},
                    dets:init_table(Tab, Fun); %% Old dets version
                {'EXIT', What} ->
                    exit(What);
                false ->
                    Sender ! {self(), {old_protocol, Tab}},
                    dets:init_table(Tab, Fun); %% Old dets version
                true ->
                    dets:init_table(Tab, Fun, [{format, bchunk}])
            end;
        Old when Old /= false ->
            Sender ! {self(), {old_protocol, Tab}},
            dets:init_table(Tab, Fun); %% Old dets version
        _ ->
            dets:init_table(Tab, Fun)
    end;
init_table(Tab, _, Fun, false, _DetsInfo,_) ->
    case catch ets:init_table(Tab, Fun) of
        true ->
            ok;
        {'EXIT', Else} -> Else
    end;
init_table(Tab, Storage, Fun, true, _DetsInfo, Sender) -> %% Old Nodes
    spawn_link(?MODULE, old_node_init_table,
               [Tab, Storage, Fun, self(), false, Sender]),
    ok.

%% Runs init_table/6 without protocol conversion in its own process and
%% reports the result to TabReceiver as {old_init_table_complete, Res}.
old_node_init_table(Tab, Storage, Fun, TabReceiver, DetsInfo,Sender) ->
    Res = init_table(Tab, Storage, Fun, false, DetsInfo,Sender),
    TabReceiver ! {old_init_table_complete, Res},
    unlink(TabReceiver),
    ok.

%% Receiver side completion: apply the subscription events that are
%% already in the mailbox, let handle_last/3 finish the storage
%% specific work, initialise indexes and snmp, acknowledge to the
%% sender and unblock the table. A failing handle_last/3 tears the
%% copy down with down/2.
finish_copy(Storage,Tab,Cs,SenderPid,DatBin,OrigTabRec) ->
    TabRef = {Storage, Tab},
    subscr_receiver(TabRef, Cs#cstruct.record_name),
    case handle_last(TabRef, Cs#cstruct.type, DatBin) of
        ok ->
            mnesia_index:init_index(Tab, Storage),
            snmpify(Tab, Storage),
            %% OrigTabRec must not be the spawned tab-receiver
            %% due to old protocol.
            SenderPid ! {OrigTabRec, no_more},
            mnesia_tm:unblock_tab(Tab),
            ok;
        {error, Reason} ->
            Msg = "Failed to handle last",
            verbose("~s: ~p: ~p~n", [Msg, Tab, Reason]),
            down(Tab, Storage)
    end.

%% Drain the mnesia_table_event messages currently in the mailbox
%% (`after 0') and apply each of them to the local copy. When the
%% record name differs from the table name, element 1 of the event
%% value is replaced by RecName before it is applied.
subscr_receiver(TabRef = {_, Tab}, RecName) ->
    receive
        {mnesia_table_event, {Op, Val, _Tid}} ->
            if
                Tab == RecName ->
                    handle_event(TabRef, Op, Val);
                true ->
                    handle_event(TabRef, Op, setelement(1, Val, RecName))
            end,
            subscr_receiver(TabRef, RecName);

        {'EXIT', Pid, Reason} ->
            handle_exit(Pid, Reason),
            subscr_receiver(TabRef, RecName)
    after 0 ->
            ok
    end.

%% Apply one subscription event to the local copy.
handle_event(TabRef, write, Rec) ->
    db_put(TabRef, Rec);
handle_event(TabRef, delete, {_Tab, Key}) ->
    db_erase(TabRef, Key);
handle_event(TabRef, delete_object, OldRec) ->
    db_match_erase(TabRef, OldRec);
handle_event(TabRef, clear_table, {_Tab, _Key}) ->
    db_match_erase(TabRef, '_').
%% Storage specific last step of a table copy on the receiving node.
%% The third argument is `nobin' unless the sender shipped the contents
%% of its file as a binary.

%% disc_copies: dump the loaded ets table with mnesia_log:ets2dcd/1 and
%% remove an old .DAT file if one exists. The dump result is returned.
handle_last({disc_copies, Tab}, _Type, nobin) ->
    DumpRes = mnesia_log:ets2dcd(Tab),
    OldDat = mnesia_lib:tab2dat(Tab),
    case mnesia_lib:exists(OldDat) of
        false ->
            ok;
        true ->
            %% Remove old .DAT files.
            file:delete(OldDat)
    end,
    DumpRes;

%% disc_only_copies: close the temporary dets table, move its file to
%% the real .DAT name and reopen it through mnesia_monitor.
handle_last({disc_only_copies, Tab}, Type, nobin) ->
    mnesia_lib:dets_sync_close(Tab),
    TmpFile = mnesia_lib:tab2tmp(Tab),
    DatFile = mnesia_lib:tab2dat(Tab),
    case file:rename(TmpFile, DatFile) of
        {error, Reason} ->
            {error, {"Cannot swap tmp files", Tab, Reason}};
        ok ->
            DetsArgs = [{file, mnesia_lib:tab2dat(Tab)},
                        {type, mnesia_lib:disk_type(Tab, Type)},
                        {keypos, 2},
                        {repair, mnesia_monitor:get_env(auto_repair)}],
            mnesia_monitor:open_dets(Tab, DetsArgs),
            ok
    end;

%% ram_copies: nothing to do without a binary. With one, and only when
%% a mnesia directory is in use, the binary is written to the temporary
%% file and renamed to the .DCD file while the table is locked.
handle_last({ram_copies, _Tab}, _Type, nobin) ->
    ok;
handle_last({ram_copies, Tab}, _Type, DatBin) ->
    case mnesia_monitor:use_dir() of
        false ->
            ok;
        true ->
            mnesia_lib:lock_table(Tab),
            TmpFile = mnesia_lib:tab2tmp(Tab),
            ok = file:write_file(TmpFile, DatBin),
            ok = file:rename(TmpFile, mnesia_lib:tab2dcd(Tab)),
            mnesia_lib:unlock_table(Tab),
            ok
    end.

%% Abandon a half finished copy: delete the partial local table,
%% withdraw this node as a copy holder, unblock the table and throw
%% away pending subscription events. Always returns `down'.
down(Tab, Storage) ->
    case Storage of
        disc_only_copies ->
            Tmp = mnesia_lib:tab2tmp(Tab),
            mnesia_lib:dets_sync_close(Tab),
            file:delete(Tmp);
        EtsBacked when EtsBacked == ram_copies; EtsBacked == disc_copies ->
            catch ?ets_delete_table(Tab)
    end,
    ThisNode = node(),
    mnesia_checkpoint:tm_del_copy(Tab, ThisNode),
    mnesia_controller:sync_del_table_copy_whereabouts(Tab, node()),
    mnesia_tm:unblock_tab(Tab),
    flush_subcrs(),
    down.

%% Discard all mnesia_table_event messages currently in the mailbox.
%% 'EXIT' messages found on the way are passed to handle_exit/2.
flush_subcrs() ->
    receive
        {mnesia_table_event, _Event} ->
            flush_subcrs();
        {'EXIT', From, Why} ->
            handle_exit(From, Why),
            flush_subcrs()
    after 0 ->
            done
    end.

%% Delete Key from the local copy; ets for ram_copies and disc_copies,
%% dets for disc_only_copies.
db_erase({Storage, Tab}, Key)
  when Storage == ram_copies; Storage == disc_copies ->
    true = ?ets_delete(Tab, Key);
db_erase({disc_only_copies, Tab}, Key) ->
    ok = dets:delete(Tab, Key).
%% Delete every object matching Pat from the local copy; ets for
%% ram_copies and disc_copies, dets for disc_only_copies.
db_match_erase({ram_copies, Tab} , Pat) ->
    true = ?ets_match_delete(Tab, Pat);
db_match_erase({disc_copies, Tab} , Pat) ->
    true = ?ets_match_delete(Tab, Pat);
db_match_erase({disc_only_copies, Tab}, Pat) ->
    ok = dets:match_delete(Tab, Pat).

%% Insert Val into the local copy; ets for ram_copies and disc_copies,
%% dets for disc_only_copies.
db_put({ram_copies, Tab}, Val) ->
    true = ?ets_insert(Tab, Val);
db_put({disc_copies, Tab}, Val) ->
    true = ?ets_insert(Tab, Val);
db_put({disc_only_copies, Tab}, Val) ->
    ok = dets:insert(Tab, Val).

%% The code below executes at the remote site where the data is;
%% it runs in a special copier process.

%% Estimate how many keys fit in one transfer: the external term size
%% of the records stored under the first key is used as the sample and
%% ?MAX_TRANSFER_SIZE is divided by it (plus one).
calc_nokeys(Storage, Tab) ->
    %% Calculate #keys per transfer
    Key = mnesia_lib:db_first(Storage, Tab),
    Recs = mnesia_lib:db_get(Storage, Tab, Key),
    BinSize = size(term_to_binary(Recs)),
    (?MAX_TRANSFER_SIZE div BinSize) + 1.

%% Send Tab to the receiver process Pid, whose storage type is RemoteS.
%%
%% Returns {error, {no_exists, Tab}} when the local storage type is
%% missing or `unknown'. Otherwise the receiver first gets a
%% {first, ...} message with the table size; dets bchunk data is only
%% offered when both sides use disc_only_copies, dets reports a bchunk
%% format and no protocol conversion is needed. The transfer itself
%% (prepare_copy, send_more, finish_copy) runs under `catch'.
%% Returns `ok' when the copy completed or the receiver died, and
%% {error, Reason} for anything else.
send_table(Pid, Tab, RemoteS) ->
    case ?catch_val({Tab, storage_type}) of
        {'EXIT', _} ->
            {error, {no_exists, Tab}};
        unknown ->
            {error, {no_exists, Tab}};
        Storage ->
            %% Send first
            TabSize = mnesia:table_info(Tab, size),
            Pconvert = mnesia_monitor:needs_protocol_conversion(node(Pid)),
            KeysPerTransfer = calc_nokeys(Storage, Tab),
            ChunkData = dets:info(Tab, bchunk_format),

            UseDetsChunk =
                Storage == RemoteS andalso
                Storage == disc_only_copies andalso
                ChunkData /= undefined andalso
                Pconvert == false,
            if
                UseDetsChunk == true ->
                    DetsInfo = erlang:system_info(version),
                    Pid ! {self(), {first, TabSize, {DetsInfo, ChunkData}}};
                true ->
                    Pid ! {self(), {first, TabSize}}
            end,

            %% Debug info
            put(mnesia_table_sender, {Tab, node(Pid), Pid}),
            {Init, Chunk} = reader_funcs(UseDetsChunk, Tab, Storage, KeysPerTransfer),

            SendIt = fun() ->
                             prepare_copy(Pid, Tab, Storage),
                             send_more(Pid, 1, Chunk, Init(), Tab, Pconvert),
                             finish_copy(Pid, Tab, Storage, RemoteS)
                     end,

            case catch SendIt() of
                %% Thrown by send_more/6: the table is still fixed and
                %% the subscription still active, so clean up here.
                receiver_died ->
                    cleanup_tab_copier(Pid, Storage, Tab),
                    unlink(whereis(mnesia_tm)),
                    ok;
                %% Returned through the transaction in finish_copy/4,
                %% which has already called cleanup_tab_copier/3.
                {_, receiver_died} ->
                    unlink(whereis(mnesia_tm)),
                    ok;
                {atomic, no_more} ->
                    unlink(whereis(mnesia_tm)),
                    ok;
                Reason ->
                    cleanup_tab_copier(Pid, Storage, Tab),
                    unlink(whereis(mnesia_tm)),
                    {error, Reason}
            end
    end.

%% Sender side preparation, done in one transaction: take a table write
%% lock, subscribe the receiver Pid to table events, add the receiver's
%% node to where_to_write and fix the table. Exits with
%% {tab_copier_prepare, Tab, Reason} if the transaction aborts.
prepare_copy(Pid, Tab, Storage) ->
    Trans =
        fun() ->
                mnesia:write_lock_table(Tab),
                mnesia_subscr:subscribe(Pid, {table, Tab}),
                update_where_to_write(Tab, node(Pid)),
                mnesia_lib:db_fixtable(Storage, Tab, true),
                ok
        end,
    case mnesia:transaction(Trans) of
        {atomic, ok} ->
            ok;
        {aborted, Reason} ->
            exit({tab_copier_prepare, Tab, Reason})
    end.

%% Tell the current db_nodes, plus Node itself if it is not among them,
%% to add Node to where_to_write for Tab. Does nothing (`ignore') for
%% read_only tables.
update_where_to_write(Tab, Node) ->
    case val({Tab, access_mode}) of
        read_only ->
            ignore;
        read_write ->
            Current = val({current, db_nodes}),
            Ns =
                case lists:member(Node, Current) of
                    true -> Current;
                    false -> [Node | Current]
                end,
            update_where_to_write(Ns, Tab, Node)
    end.

%% Issue the request on one node at a time with rpc:call/4; the result
%% of each call is ignored.
update_where_to_write([], _, _) ->
    ok;
update_where_to_write([H|T], Tab, AddNode) ->
    rpc:call(H, mnesia_controller, call,
             [{update_where_to_write, [add, Tab, AddNode], self()}]),
    update_where_to_write(T, Tab, AddNode).
%% Sender loop: wait for the receiver to ask for more data and answer
%% with a burst of packets.
%%
%% A `more' request is served by send_packet/5, starting its counter at
%% N - 1. An integer result means the whole table has been sent and
%% ends the loop (returning that integer minus one); any other result
%% is the remaining data state, and the loop continues with the counter
%% set to ?MAX_NOPACKETS. An {old_protocol, Tab} message restarts the
%% transfer from the beginning with the plain chunk reader. If the
%% receiver's node reports copier_done, `receiver_died' is thrown.
send_more(Pid, N, Chunk, DataState, Tab, OldNode) ->
    receive
        {Requester, more} ->
            Outcome = send_packet(N - 1, Requester, Chunk, DataState, OldNode),
            if
                is_integer(Outcome) ->
                    Outcome - 1;
                true ->
                    send_more(Requester, ?MAX_NOPACKETS, Chunk, Outcome, Tab, OldNode)
            end;
        {_Requester, {old_protocol, Tab}} ->
            Storage = val({Tab, storage_type}),
            KeysPerTransfer = calc_nokeys(Storage, Tab),
            {Restart, PlainChunk} =
                reader_funcs(false, Tab, Storage, KeysPerTransfer),
            send_more(Pid, 1, PlainChunk, Restart(), Tab, OldNode);
        {copier_done, Node} when Node == node(Pid) ->
            verbose("Receiver of table ~p crashed on ~p (more)~n", [Tab, Node]),
            throw(receiver_died)
    end.

%% Return {InitFun, ChunkFun} for reading the table. InitFun/0 yields
%% the first chunk and ChunkFun/1 the next one from a continuation.
%% With UseDetsChunk the dets bchunk reader is used, otherwise the
%% generic mnesia_lib chunk reader.
reader_funcs(UseDetsChunk, Tab, Storage, KeysPerTransfer) ->
    case UseDetsChunk of
        true ->
            First = fun() -> dets_bchunk(Tab, start) end,
            Next = fun(Cont) -> dets_bchunk(Tab, Cont) end,
            {First, Next};
        false ->
            First = fun() ->
                            mnesia_lib:db_init_chunk(Storage, Tab, KeysPerTransfer)
                    end,
            Next = fun(Cont) -> mnesia_lib:db_chunk(Storage, Cont) end,
            {First, Next}
    end.

%% dets:bchunk/2 returns {Continuation, Data}; swap the pair so that it
%% has the {Data, Continuation} shape send_packet/5 works with. Other
%% results are passed through untouched.
dets_bchunk(Tab, Chunk) ->
    case dets:bchunk(Tab, Chunk) of
        {Cont, Data} ->
            {Data, Cont};
        Other ->
            Other
    end.

%% Send packets to Pid until the table ends or the counter N reaches
%% ?MAX_NOPACKETS.
%% Returns the counter (an integer) when '$end_of_table' was reached,
%% otherwise the data state that is still to be sent.
send_packet(N, Pid, _Chunk, '$end_of_table', OldNode) ->
    case OldNode of
        false ->
            Pid ! {self(), no_more};
        true ->
            %% Old nodes can't handle the new no_more
            ignore
    end,
    N;
send_packet(N, Pid, Chunk, {[], Cont}, OldNode) ->
    %% Empty chunk, nothing to send; just fetch the next one
    send_packet(N, Pid, Chunk, Chunk(Cont), OldNode);
send_packet(N, Pid, Chunk, {Recs, Cont}, OldNode) when N < ?MAX_NOPACKETS ->
    Payload =
        case OldNode of
            false -> Recs;
            true -> [Recs] %% Old nodes need a wrapping list
        end,
    Pid ! {self(), {more, Payload}},
    send_packet(N + 1, Pid, Chunk, Chunk(Cont), OldNode);
send_packet(_N, _Pid, _Chunk, DataState, _OldNode) ->
    DataState.
%% Sender side completion. Inside one transaction: take a table read
%% lock, sync and block the table whereabouts for the receiver's node,
%% unfix the table and cancel the subscription, register the new copy
%% with mnesia_checkpoint and hand the receiver the final
%% {no_more, DatBin} message. The fun then waits for the receiver's
%% acknowledgement and yields `no_more', or `receiver_died' if the
%% receiver's node reports copier_done. The transaction result is
%% returned.
finish_copy(Pid, Tab, Storage, RemoteS) ->
    RecNode = node(Pid),
    DatBin = dat2bin(Tab, Storage, RemoteS),
    Handshake =
        fun() ->
                mnesia:read_lock_table(Tab),
                mnesia_controller:sync_and_block_table_whereabouts(
                  Tab, RecNode, RemoteS, val({Tab, access_mode})),
                cleanup_tab_copier(Pid, Storage, Tab),
                mnesia_checkpoint:tm_add_copy(Tab, RecNode),
                Pid ! {self(), {no_more, DatBin}},
                receive
                    {Pid, no_more} ->
                        %% Don't bother about the spurious 'more' message
                        no_more;
                    {copier_done, Node} when Node == RecNode ->
                        verbose("Tab receiver ~p crashed (more): ~p~n", [Tab, Node]),
                        receiver_died
                end
        end,
    mnesia:transaction(Handshake).

%% Undo prepare_copy/3: unfix the table and cancel Pid's subscription.
cleanup_tab_copier(Pid, Storage, Tab) ->
    mnesia_lib:db_fixtable(Storage, Tab, false),
    mnesia_subscr:unsubscribe(Pid, {table, Tab}).

%% When both sides are ram_copies, read the local .DCD file (with the
%% table locked) so that it can be shipped to the receiver. Returns the
%% file contents, or `nobin' if the file could not be read or the
%% storage types are anything else.
dat2bin(Tab, ram_copies, ram_copies) ->
    mnesia_lib:lock_table(Tab),
    ReadRes = file:read_file(mnesia_lib:tab2dcd(Tab)),
    mnesia_lib:unlock_table(Tab),
    case ReadRes of
        {ok, Bin} ->
            Bin;
        _ ->
            nobin
    end;
dat2bin(_Tab, _LocalS, _RemoteS) ->
    nobin.

%% An 'EXIT' from a process on this node makes the caller exit with the
%% same reason. Exits from other nodes are ignored here; they will be
%% handled by mnesia_down soon.
handle_exit(Pid, Reason) when node(Pid) == node() ->
    exit(Reason);
handle_exit(_RemotePid, _Reason) ->
    ignore.
diff --git a/lib/mnesia/src/mnesia_locker.erl b/lib/mnesia/src/mnesia_locker.erl
new file mode 100644
index 0000000000..cfa3f171b2
--- /dev/null
+++ b/lib/mnesia/src/mnesia_locker.erl
@@ -0,0 +1,1196 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1996-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
-module(mnesia_locker).

-export([
         get_held_locks/0,
         get_lock_queue/0,
         global_lock/5,
         ixrlock/5,
         init/1,
         mnesia_down/2,
         release_tid/1,
         async_release_tid/2,
         send_release_tid/2,
         receive_release_tid_acc/2,
         rlock/3,
         rlock_table/3,
         rwlock/3,
         sticky_rwlock/3,
         start/0,
         sticky_wlock/3,
         sticky_wlock_table/3,
         wlock/3,
         wlock_no_exist/4,
         wlock_table/3
        ]).

%% sys callback functions
-export([system_continue/3,
         system_terminate/4,
         system_code_change/4
        ]).

-include("mnesia.hrl").
-import(mnesia_lib, [dbg_out/2, error/2, verbose/2]).

%% Debug tracing is compiled out: ?dbg expands to the atom `ok'.
%% Swap the two definitions below to enable it.
-define(dbg(S,V), ok).
%-define(dbg(S,V), dbg_out("~p:~p: " ++ S, [?MODULE, ?LINE] ++ V)).

%% Reserved atoms used in the key position of an Oid ({Tab, Key}).
%% ?ALL stands for the whole table; ?GLOBAL is checked in the table
%% position by grant_lock/4.
-define(ALL, '______WHOLETABLE_____').
-define(STICK, '______STICK_____').
-define(GLOBAL, '______GLOBAL_____').

%% Server state: the pid handed to init/1 by the process that started
%% the locker.
-record(state, {supervisor}).

%% One waiting lock request in mnesia_lock_queue. `pid' is the process
%% to reply to; `lucky' is the tid that the request is waiting for.
-record(queue, {oid, tid, op, pid, lucky}).

%% mnesia_held_locks: contain {Oid, Op, Tid} entries (bag)
-define(match_oid_held_locks(Oid), {Oid, '_', '_'}).
%% mnesia_tid_locks: contain {Tid, Oid, Op} entries (bag)
-define(match_oid_tid_locks(Tid), {Tid, '_', '_'}).
%% mnesia_sticky_locks: contain {Oid, Node} entries and {Tab, Node} entries (set)
-define(match_oid_sticky_locks(Oid),{Oid, '_'}).
%% mnesia_lock_queue: contain {queue, Oid, Tid, Op, ReplyTo, WaitForTid} entries (bag)
-define(match_oid_lock_queue(Oid), #queue{oid=Oid, tid='_', op = '_', pid = '_', lucky = '_'}).
%% mnesia_lock_counter: {{write, Tab}, Number} &&
%%                      {{read, Tab}, Number} entries (set)

%% Start the locker process through mnesia_monitor:start_proc/4; the
%% new process runs init/1 with the caller's pid as argument.
start() ->
    mnesia_monitor:start_proc(?MODULE, ?MODULE, init, [self()]).
%% Initialise the locker process: register it under the module name,
%% trap exits, create the four private lock tables, acknowledge the
%% start to Parent and enter the server loop. mnesia_lock_queue is
%% keyed on element 2 of its entries, the oid field of #queue{}.
init(Parent) ->
    register(?MODULE, self()),
    process_flag(trap_exit, true),
    ?ets_new_table(mnesia_held_locks, [bag, private, named_table]),
    ?ets_new_table(mnesia_tid_locks, [bag, private, named_table]),
    ?ets_new_table(mnesia_sticky_locks, [set, private, named_table]),
    ?ets_new_table(mnesia_lock_queue, [bag, private, named_table, {keypos, 2}]),

    proc_lib:init_ack(Parent, {ok, self()}),
    %% Cache the configured tid ordering in the process dictionary; it
    %% is read by allowed_to_be_queued/2 and sort_queue/1.
    case ?catch_val(pid_sort_order) of
        r9b_plain -> put(pid_sort_order, r9b_plain);
        standard -> put(pid_sort_order, standard);
        _ -> ignore
    end,
    loop(#state{supervisor = Parent}).

%% Read Var through ?catch_val; a failed lookup is handed to
%% mnesia_lib:other_val/2.
val(Var) ->
    case ?catch_val(Var) of
        {'EXIT', _ReASoN_} -> mnesia_lib:other_val(Var, _ReASoN_);
        _VaLuE_ -> _VaLuE_
    end.

%% Send reply R to From, tagged with this module and node.
reply(From, R) ->
    From ! {?MODULE, node(), R}.

%% Send request X to the locker on Node and wait for the answer.
l_request(Node, X, Store) ->
    {?MODULE, Node} ! {self(), X},
    l_req_rec(Node, Store).

%% Record Node under the `nodes' key in Store, then wait for the reply
%% from the locker on Node. If {mnesia_down, Node} arrives first the
%% request is reported as not granted.
l_req_rec(Node, Store) ->
    ?ets_insert(Store, {nodes, Node}),
    receive
        {?MODULE, Node, Reply} ->
            Reply;
        {mnesia_down, Node} ->
            {not_granted, {node_not_running, Node}}
    end.

%% Ask the local locker to release everything held by Tid (no reply).
release_tid(Tid) ->
    ?MODULE ! {release_tid, Tid}.

%% Ask the lockers on Nodes to release Tid, without waiting for replies.
async_release_tid(Nodes, Tid) ->
    rpc:abcast(Nodes, ?MODULE, {release_tid, Tid}).

%% Ask the lockers on Nodes to release Tid and to confirm it; collect
%% the confirmations with receive_release_tid_acc/2.
send_release_tid(Nodes, Tid) ->
    rpc:abcast(Nodes, ?MODULE, {self(), {sync_release_tid, Tid}}).

%% Wait, node by node in list order, for the {tid_released, Tid}
%% confirmation or a mnesia_down message from each node.
receive_release_tid_acc([Node | Nodes], Tid) ->
    receive
        {?MODULE, Node, {tid_released, Tid}} ->
            receive_release_tid_acc(Nodes, Tid);
        {mnesia_down, Node} ->
            receive_release_tid_acc(Nodes, Tid)
    end;
receive_release_tid_acc([], _Tid) ->
    ok.

%% Server loop of the lock manager. Each clause handles one request and
%% recurses, except the 'EXIT' from the supervisor (do_stop/0) and
%% system messages, which are handed over to sys.
loop(State) ->
    receive
        {From, {write, Tid, Oid}} ->
            try_sticky_lock(Tid, write, From, Oid),
            loop(State);

        %% If Key == ?ALL it's a request to lock the entire table
        %%

        {From, {read, Tid, Oid}} ->
            try_sticky_lock(Tid, read, From, Oid),
            loop(State);

        %% Really do a read, but get hold of a write lock
        %% used by mnesia:wread(Oid).

        {From, {read_write, Tid, Oid}} ->
            try_sticky_lock(Tid, read_write, From, Oid),
            loop(State);

        %% Tid has somehow terminated, clear up everything
        %% and pass locks on to queued processes.
        %% This is the purpose of the mnesia_tid_locks table

        {release_tid, Tid} ->
            do_release_tid(Tid),
            loop(State);

        %% stick lock, first tries this to the where_to_read Node
        {From, {test_set_sticky, Tid, {Tab, _} = Oid, Lock}} ->
            case ?ets_lookup(mnesia_sticky_locks, Tab) of
                [] ->
                    reply(From, not_stuck),
                    loop(State);
                [{_,Node}] when Node == node() ->
                    %% Lock is stuck here, see now if we can just set
                    %% a regular write lock
                    try_lock(Tid, Lock, From, Oid),
                    loop(State);
                [{_,Node}] ->
                    reply(From, {stuck_elsewhere, Node}),
                    loop(State)
            end;

        %% If test_set_sticky fails, we send this to all nodes
        %% after acquiring a real write lock on Oid

        {stick, {Tab, _}, N} ->
            ?ets_insert(mnesia_sticky_locks, {Tab, N}),
            loop(State);

        %% The caller which sends this message, must have first
        %% acquired a write lock on the entire table
        {unstick, Tab} ->
            ?ets_delete(mnesia_sticky_locks, Tab),
            loop(State);

        %% Index read: lock the whole table for reading, unless the
        %% table is stuck on another node, in which case the client is
        %% told to switch to that node.
        {From, {ix_read, Tid, Tab, IxKey, Pos}} ->
            case ?ets_lookup(mnesia_sticky_locks, Tab) of
                [] ->
                    set_read_lock_on_all_keys(Tid,From,Tab,IxKey,Pos),
                    loop(State);
                [{_,N}] when N == node() ->
                    set_read_lock_on_all_keys(Tid,From,Tab,IxKey,Pos),
                    loop(State);
                [{_,N}] ->
                    Req = {From, {ix_read, Tid, Tab, IxKey, Pos}},
                    From ! {?MODULE, node(), {switch, N, Req}},
                    loop(State)
            end;

        %% Same as release_tid, but confirmed to the sender
        {From, {sync_release_tid, Tid}} ->
            do_release_tid(Tid),
            reply(From, {tid_released, Tid}),
            loop(State);

        %% Sent by mnesia_down/2 when Node has gone down
        {release_remote_non_pending, Node, Pending} ->
            release_remote_non_pending(Node, Pending),
            mnesia_monitor:mnesia_down(?MODULE, Node),
            loop(State);

        {'EXIT', Pid, _} when Pid == State#state.supervisor ->
            do_stop();

        {system, From, Msg} ->
            verbose("~p got {system, ~p, ~p}~n", [?MODULE, From, Msg]),
            Parent = State#state.supervisor,
            sys:handle_system_msg(Msg, From, Parent, ?MODULE, [], State);

        %% Return the full contents of one of the lock tables
        {get_table, From, LockTable} ->
            From ! {LockTable, ?ets_match_object(LockTable, '_')},
            loop(State);

        Msg ->
            error("~p got unexpected message: ~p~n", [?MODULE, Msg]),
            loop(State)
    end.

%% Record a granted lock in both tables: by Oid in mnesia_held_locks
%% and by Tid in mnesia_tid_locks.
set_lock(Tid, Oid, Op) ->
    ?dbg("Granted ~p ~p ~p~n", [Tid,Oid,Op]),
    ?ets_insert(mnesia_held_locks, {Oid, Op, Tid}),
    ?ets_insert(mnesia_tid_locks, {Tid, Oid, Op}).

%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Acquire locks

%% Handle a lock request with respect to sticky locks. If the table is
%% not stuck, or is stuck on this node, the request is tried locally;
%% if it is stuck on another node N the client is told to switch to N
%% and repeat the request there.
try_sticky_lock(Tid, Op, Pid, {Tab, _} = Oid) ->
    case ?ets_lookup(mnesia_sticky_locks, Tab) of
        [] ->
            try_lock(Tid, Op, Pid, Oid);
        [{_,N}] when N == node() ->
            try_lock(Tid, Op, Pid, Oid);
        [{_,N}] ->
            Req = {Pid, {Op, Tid, Oid}},
            Pid ! {?MODULE, node(), {switch, N, Req}}
    end.

%% Expand Op into the operation used when granting (SimpleOp) and the
%% lock type to set. read_write is granted as a read but takes a write
%% lock; for the other operations all three are the same.
try_lock(Tid, read_write, Pid, Oid) ->
    try_lock(Tid, read_write, read, write, Pid, Oid);
try_lock(Tid, Op, Pid, Oid) ->
    try_lock(Tid, Op, Op, Op, Pid, Oid).
%% Try to take Lock on Oid for Tid and answer Pid.
%%   yes          -> the lock is granted and the grant_lock/4 result
%%                   is sent as the reply
%%   {no, Lucky}  -> {not_granted, #cyclic{}} is sent
%%   {queue, Lucky} -> no reply is sent; the request is stored in
%%                   mnesia_lock_queue and noted as {queued, Op} in
%%                   mnesia_tid_locks
try_lock(Tid, Op, SimpleOp, Lock, Pid, Oid) ->
    case can_lock(Tid, Lock, Oid, {no, bad_luck}) of
        yes ->
            Reply = grant_lock(Tid, SimpleOp, Lock, Oid),
            reply(Pid, Reply);
        {no, Lucky} ->
            C = #cyclic{op = SimpleOp, lock = Lock, oid = Oid, lucky = Lucky},
            ?dbg("Rejected ~p ~p ~p ~p ~n", [Tid, Oid, Lock, Lucky]),
            reply(Pid, {not_granted, C});
        {queue, Lucky} ->
            ?dbg("Queued ~p ~p ~p ~p ~n", [Tid, Oid, Lock, Lucky]),
            %% Append to queue: Nice place for trace output
            ?ets_insert(mnesia_lock_queue,
                        #queue{oid = Oid, tid = Tid, op = Op,
                               pid = Pid, lucky = Lucky}),
            ?ets_insert(mnesia_tid_locks, {Tid, Oid, {queued, Op}})
    end.

%% Set the lock and build the reply for the client.
%%
%% Clause 1 (read of a single key): a client on this node is told to do
%% the lookup itself (lookup_in_client); for a remote client the value
%% is read here and returned with the grant. Any exception raised while
%% reading or setting the lock is turned into a not_granted reply.
%% Clause 2 (index read): returns the records found through the index.
%% Clauses 3 and 4: table-wide read, and write.
grant_lock(Tid, read, Lock, Oid = {Tab, Key})
  when Key /= ?ALL, Tab /= ?GLOBAL ->
    case node(Tid#tid.pid) == node() of
        true ->
            set_lock(Tid, Oid, Lock),
            {granted, lookup_in_client};
        false ->
            try
                Val = mnesia_lib:db_get(Tab, Key), %% lookup as well
                set_lock(Tid, Oid, Lock),
                {granted, Val}
            catch _:_Reason ->
                    %% Table has been deleted from this node,
                    %% restart the transaction.
                    C = #cyclic{op = read, lock = Lock, oid = Oid,
                                lucky = nowhere},
                    {not_granted, C}
            end
    end;
grant_lock(Tid, {ix_read,IxKey,Pos}, Lock, Oid = {Tab, _}) ->
    try
        Res = ix_read_res(Tab, IxKey,Pos),
        set_lock(Tid, Oid, Lock),
        {granted, Res, [?ALL]}
    catch _:_ ->
            {not_granted, {no_exists, Tab, {index, [Pos]}}}
    end;
grant_lock(Tid, read, Lock, Oid) ->
    set_lock(Tid, Oid, Lock),
    {granted, ok};
grant_lock(Tid, write, Lock, Oid) ->
    set_lock(Tid, Oid, Lock),
    granted.

%% 1) Impose an ordering on all transactions favour old (low tid) transactions
%%    newer (higher tid) transactions may never wait on older ones,
%% 2) When releasing the tids from the queue always begin with youngest (high tid)
%%    because of 1) it will avoid the deadlocks.
%% 3) TabLocks is the problem :-) They should not starve and not deadlock
%%    handle tablocks in queue as they had locks on unlocked records.
%% Decide whether Tid may take a read or write lock on Oid.
%% Returns `yes', {queue, WaitForTid} or {no, WaitForTid}.
%%
%% The clauses differ in which held locks they collect before calling
%% check_lock/7: a read only conflicts with write locks, a write
%% conflicts with every lock; a single key is checked against locks on
%% that key and on the whole table, a whole table against locks on any
%% key of the table.
can_lock(Tid, read, {Tab, Key}, AlreadyQ) when Key /= ?ALL ->
    %% The key is bound, no need for the other BIF
    Oid = {Tab, Key},
    ObjLocks = ?ets_match_object(mnesia_held_locks, {Oid, write, '_'}),
    TabLocks = ?ets_match_object(mnesia_held_locks, {{Tab, ?ALL}, write, '_'}),
    check_lock(Tid, Oid, ObjLocks, TabLocks, yes, AlreadyQ, read);

can_lock(Tid, read, Oid, AlreadyQ) -> % Whole tab
    Tab = element(1, Oid),
    ObjLocks = ?ets_match_object(mnesia_held_locks, {{Tab, '_'}, write, '_'}),
    check_lock(Tid, Oid, ObjLocks, [], yes, AlreadyQ, read);

can_lock(Tid, write, {Tab, Key}, AlreadyQ) when Key /= ?ALL ->
    Oid = {Tab, Key},
    ObjLocks = ?ets_lookup(mnesia_held_locks, Oid),
    TabLocks = ?ets_lookup(mnesia_held_locks, {Tab, ?ALL}),
    check_lock(Tid, Oid, ObjLocks, TabLocks, yes, AlreadyQ, write);

can_lock(Tid, write, Oid, AlreadyQ) -> % Whole tab
    Tab = element(1, Oid),
    ObjLocks = ?ets_match_object(mnesia_held_locks, ?match_oid_held_locks({Tab, '_'})),
    check_lock(Tid, Oid, ObjLocks, [], yes, AlreadyQ, write).

%% Check held locks for conflicting locks
%%
%% Walks the object locks and then the table locks. X is the verdict so
%% far (`yes' or {queue, Tid}). Locks held by Tid itself are skipped.
%% For a lock held by another tid, the request may wait if
%% allowed_to_be_queued/2 says so; otherwise it is refused with
%% {no, WaitForTid}, except for the same-process case handled below.
check_lock(Tid, Oid, [Lock | Locks], TabLocks, X, AlreadyQ, Type) ->
    case element(3, Lock) of
        Tid ->
            check_lock(Tid, Oid, Locks, TabLocks, X, AlreadyQ, Type);
        WaitForTid ->
            Queue = allowed_to_be_queued(WaitForTid,Tid),
            if Queue == true ->
                    check_lock(Tid, Oid, Locks, TabLocks, {queue, WaitForTid}, AlreadyQ, Type);
               %% Both tids belong to the same process
               Tid#tid.pid == WaitForTid#tid.pid ->
                    dbg_out("Spurious lock conflict ~w ~w: ~w -> ~w~n",
                            [Oid, Lock, Tid, WaitForTid]),
                    %% Test..
                    %% Refuse if anything is already queued on the
                    %% object or on the whole table, otherwise wait.
                    {Tab, _Key} = Oid,
                    HaveQ = (ets:lookup(mnesia_lock_queue, Oid) /= [])
                        orelse (ets:lookup(mnesia_lock_queue,{Tab,?ALL}) /= []),
                    if
                        HaveQ ->
                            {no, WaitForTid};
                        true ->
                            check_lock(Tid,Oid,Locks,TabLocks,{queue,WaitForTid},AlreadyQ,Type)
                    end;
               %%{no, WaitForTid}; Safe solution
               true ->
                    {no, WaitForTid}
            end
    end;

check_lock(_, _, [], [], X, {queue, bad_luck}, _) ->
    X; %% The queue should be correct already no need to check it again

check_lock(_, _, [], [], X = {queue, _Tid}, _AlreadyQ, _) ->
    X;

%% No conflicting held locks were found; now look at the lock queue.
check_lock(Tid, Oid, [], [], X, AlreadyQ, Type) ->
    {Tab, Key} = Oid,
    if
        Type == write ->
            check_queue(Tid, Tab, X, AlreadyQ);
        Key == ?ALL ->
            %% hmm should be solvable by a clever select expr but not today...
            check_queue(Tid, Tab, X, AlreadyQ);
        true ->
            %% If there is a queue on that object, read_lock shouldn't be granted
            ObjLocks = ets:lookup(mnesia_lock_queue, Oid),
            case max(ObjLocks) of
                empty ->
                    check_queue(Tid, Tab, X, AlreadyQ);
                ObjL ->
                    case allowed_to_be_queued(ObjL,Tid) of
                        false ->
                            %% Starvation Preemption (write waits for read)
                            {no, ObjL};
                        true ->
                            check_queue(Tid, Tab, {queue, ObjL}, AlreadyQ)
                    end
            end
    end;

%% Object locks are done, continue with the table locks.
check_lock(Tid, Oid, [], TabLocks, X, AlreadyQ, Type) ->
    check_lock(Tid, Oid, TabLocks, [], X, AlreadyQ, Type).

%% True if WaitForTid > Tid -> % Important order
%% The comparison used depends on the pid_sort_order value that init/1
%% stored in the process dictionary: plain term comparison when it is
%% unset, otherwise cmp_tid/3.
allowed_to_be_queued(WaitForTid, Tid) ->
    case get(pid_sort_order) of
        undefined -> WaitForTid > Tid;
        r9b_plain ->
            cmp_tid(true, WaitForTid, Tid) =:= 1;
        standard ->
            cmp_tid(false, WaitForTid, Tid) =:= 1
    end.
%% Check queue for conflicting locks
%% Assume that all queued locks belong to other tids
%%
%% Looks at the requests queued on the whole table Tab. With no such
%% request, or when the greatest queued tid is Tid itself, the verdict
%% so far (X) stands. Otherwise the request may wait behind that tid if
%% allowed_to_be_queued/2 permits it, and is refused if not. The
%% refusing branch is only defined for AlreadyQ =:= {no, bad_luck}.
check_queue(Tid, Tab, X, AlreadyQ) ->
    TabLocks = ets:lookup(mnesia_lock_queue, {Tab,?ALL}),
    Greatest = max(TabLocks),
    case Greatest of
        empty -> X;
        Tid -> X;
        WaitForTid ->
            case allowed_to_be_queued(WaitForTid,Tid) of
                true ->
                    {queue, WaitForTid};
                false when AlreadyQ =:= {no, bad_luck} ->
                    {no, WaitForTid}
            end
    end.

%% Sort queue entries on their tid, greatest first. The ordering
%% follows the pid_sort_order value in the process dictionary, in the
%% same way as allowed_to_be_queued/2.
sort_queue(QL) ->
    case get(pid_sort_order) of
        undefined ->
            lists:reverse(lists:keysort(#queue.tid, QL));
        r9b_plain ->
            lists:sort(fun(#queue{tid=X},#queue{tid=Y}) ->
                               cmp_tid(true, X, Y) == 1
                       end, QL);
        standard ->
            lists:sort(fun(#queue{tid=X},#queue{tid=Y}) ->
                               cmp_tid(false, X, Y) == 1
                       end, QL)
    end.

%% The greatest tid among the queue entries, or `empty' for no entries.
max([]) -> empty;
max([#queue{tid=Max}]) -> Max;
max(L) ->
    [#queue{tid=Max}|_] = sort_queue(L),
    Max.

%% Index read: take a read lock on the whole table Tab for Tid and
%% answer From.
%%
%% This used to repeat the body of try_lock/6 line for line. It now
%% calls try_lock/6 with the ix_read tuple as both the queued operation
%% and the operation to grant, and `read' as the lock type, which is
%% also how try_waiter/1 re-tries a queued ix_read request.
set_read_lock_on_all_keys(Tid, From, Tab, IxKey, Pos) ->
    Op = {ix_read, IxKey, Pos},
    try_lock(Tid, Op, Op, read, From, {Tab, ?ALL}).
%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Release of locks

%% A node has gone down: forget its sticky locks and release every
%% transaction that was started on it, except those listed in Pending.
release_remote_non_pending(Node, Pending) ->
    %% Clear the mnesia_sticky_locks table first, to avoid
    %% unnecessary requests to the failing node
    ?ets_match_delete(mnesia_sticky_locks, {'_' , Node}),

    %% Then we have to release all locks held by processes
    %% running at the failed node and also simply remove all
    %% queued requests back to the failed node
    Known = ?ets_match(mnesia_tid_locks, {'$1', '_', '_'}),
    Stale = [Tid || [Tid] <- Known,
                    Node == node(Tid#tid.pid),
                    not lists:member(Tid, Pending)],
    do_release_tids(Stale).

%% Release every tid in the list, in list order.
do_release_tids(Tids) ->
    lists:foreach(fun(Tid) -> do_release_tid(Tid) end, Tids).

%% Release everything registered for Tid, held locks as well as queued
%% requests, and then give the waiters on the affected objects a new
%% chance.
do_release_tid(Tid) ->
    Owned = ?ets_lookup(mnesia_tid_locks, Tid),
    ?dbg("Release ~p ~p ~n", [Tid, Owned]),
    ?ets_delete(mnesia_tid_locks, Tid),
    release_locks(Owned),
    %% Each object only needs to be looked at once
    rearrange_queue(keyunique(lists:sort(Owned), [])).

%% Push the entries onto Acc, skipping an entry whose Oid equals the
%% Oid of the entry currently on top of Acc. The result is therefore in
%% reverse order relative to the input.
keyunique(Entries, Acc0) ->
    lists:foldl(fun({_Tid, Oid, _Op}, [{_, Oid, _} | _] = Acc) ->
                        Acc;
                   (Entry, Acc) ->
                        [Entry | Acc]
                end, Acc0, Entries).

%% Release each entry in the list, in list order.
release_locks(Locks) ->
    lists:foreach(fun(Lock) -> release_lock(Lock) end, Locks).

%% Remove one mnesia_tid_locks entry from the table it refers to: a
%% queued request from mnesia_lock_queue, a held lock from
%% mnesia_held_locks. Releasing a write lock deletes every entry stored
%% under Oid; releasing a read lock deletes just this tid's entry.
release_lock({Tid, Oid, {queued, _}}) ->
    Waiting = #queue{oid = Oid, tid = Tid, op = '_', pid = '_', lucky = '_'},
    ?ets_match_delete(mnesia_lock_queue, Waiting);
release_lock({Tid, Oid, Op}) ->
    if
        Op == read ->
            ets:delete_object(mnesia_held_locks, {Oid, Op, Tid});
        Op == write ->
            ?ets_delete(mnesia_held_locks, Oid)
    end.

%% For every released lock, retry the queued requests that may have
%% been waiting for it. Releasing a single key retries the waiters on
%% that key and on the whole table; releasing a whole-table lock
%% retries every waiter of the table.
rearrange_queue([{_Tid, {Tab, Key}, _} | Remaining]) ->
    if
        Key /= ?ALL ->
            Queue =
                ets:lookup(mnesia_lock_queue, {Tab, ?ALL}) ++
                ets:lookup(mnesia_lock_queue, {Tab, Key}),
            case Queue of
                [] ->
                    ok;
                [_ | _] ->
                    try_waiters_obj(sort_queue(Queue))
            end;
        true ->
            Queue = ?ets_match_object(mnesia_lock_queue,
                                      ?match_oid_lock_queue({Tab, '_'})),
            try_waiters_tab(sort_queue(Queue))
    end,
    ?dbg("RearrQ ~p~n", [Queue]),
    rearrange_queue(Remaining);
rearrange_queue([]) ->
    ok.

%% Retry waiters in order; stop at the first one that stays queued.
try_waiters_obj([]) ->
    ok;
try_waiters_obj([Waiter | Others]) ->
    case try_waiter(Waiter) of
        queued -> no;
        _ -> try_waiters_obj(Others)
    end.

%% Retry waiters of a whole table. A table-lock waiter that stays
%% queued stops the scan. A key-lock waiter that stays queued only
%% removes the remaining waiters on the same oid from this pass.
try_waiters_tab([]) ->
    ok;
try_waiters_tab([Waiter | Others]) ->
    Oid = Waiter#queue.oid,
    case {try_waiter(Waiter), Oid} of
        {queued, {_Tab, ?ALL}} ->
            no;
        {queued, _} ->
            try_waiters_tab(key_delete_all(Oid, #queue.oid, Others));
        _ ->
            try_waiters_tab(Others)
    end.

%% Translate the queued operation into the operation to perform and
%% the lock to ask for, then retry the request.
try_waiter({queue, Oid, Tid, Op, ReplyTo, _}) ->
    {SimpleOp, Lock} =
        case Op of
            read_write -> {read, write};
            {ix_read, _, _} -> {Op, read};
            _ -> {Op, Op}
        end,
    try_waiter(Oid, Op, SimpleOp, Lock, ReplyTo, Tid).

%% Retry one queued request.
%% Returns 'locked' when the lock was granted and the waiter answered,
%% 'queued' when the waiter has to keep waiting, and 'removed' when the
%% request was refused and the waiter was told to restart.
try_waiter(Oid, Op, SimpleOp, Lock, ReplyTo, Tid) ->
    QueueEntry = #queue{oid = Oid, tid = Tid, op = Op,
                        pid = ReplyTo, lucky = '_'},
    case can_lock(Tid, Lock, Oid, {queue, bad_luck}) of
        yes ->
            %% Delete from queue: Nice place for trace output
            ?ets_match_delete(mnesia_lock_queue, QueueEntry),
            Granted = grant_lock(Tid, SimpleOp, Lock, Oid),
            reply(ReplyTo, Granted),
            locked;
        {queue, _Why} ->
            ?dbg("Keep ~p ~p ~p ~p~n", [Tid, Oid, Lock, _Why]),
            queued; % Keep waiter in queue
        {no, Lucky} ->
            Cyclic = #cyclic{op = SimpleOp, lock = Lock, oid = Oid, lucky = Lucky},
            verbose("** WARNING ** Restarted transaction, possible deadlock in lock queue ~w: cyclic = ~w~n",
                    [Tid, Cyclic]),
            ?ets_match_delete(mnesia_lock_queue, QueueEntry),
            reply(ReplyTo, {not_granted, Cyclic}),
            removed
    end.

%% Remove every tuple whose element at Pos equals Key; the order of
%% the remaining elements is preserved.
key_delete_all(Key, Pos, TupleList) ->
    key_delete_all(Key, Pos, TupleList, []).

key_delete_all(Key, Pos, TupleList, Ack) ->
    Keep = fun(T) when element(Pos, T) == Key -> false;
              (_) -> true
           end,
    lists:reverse(Ack, lists:filter(Keep, TupleList)).

%% Read all records of Tab that the index on position Pos lists under
%% IxKey.
ix_read_res(Tab, IxKey, Pos) ->
    Index = mnesia_index:get_index_table(Tab, Pos),
    RealKeys = mnesia_lib:elems(2, mnesia_index:db_get(Index, IxKey)),
    lists:flatmap(fun(Real) -> mnesia_lib:db_get(Tab, Real) end, RealKeys).

%% ********************* end server code ********************
%% The following code executes at the client side of a transactions

%% Tell the locker that mnesia went down on node N.
mnesia_down(N, Pending) ->
    case whereis(?MODULE) of
        undefined ->
            %% Takes care of mnesia_down's in early startup
            mnesia_monitor:mnesia_down(?MODULE, N);
        Locker ->
            %% A synchronous call is needed in order to avoid a
            %% race with mnesia_tm's coordinator processes
            %% that may restart and acquire new locks.
            %% mnesia_monitor ensures the sync.
            Locker ! {release_remote_non_pending, N, Pending}
    end.

%% Acquire a write lock, but do a read; used by
%% mnesia:wread/1

rwlock(Tid, Store, Oid) ->
    {Tab, Key} = Oid,
    case val({Tab, where_to_read}) of
        nowhere ->
            mnesia:abort({no_exists, Tab});
        ReadNode ->
            Lock = write,
            case need_lock(Store, Tab, Key, Lock) of
                yes ->
                    WriteNodes = w_nodes(Tab),
                    Result = get_rwlocks_on_nodes(WriteNodes, rwlock, ReadNode,
                                                  Store, Tid, Oid),
                    ?ets_insert(Store, {{locks, Tab, Key}, Lock}),
                    Result;
                no ->
                    %% The lock is already recorded in the store
                    if
                        Key == ?ALL; Tab == ?GLOBAL ->
                            w_nodes(Tab);
                        true ->
                            dirty_rpc(ReadNode, Tab, Key, Lock)
                    end
            end
    end.

%% Return the non-empty list of where_to_write nodes for Tab, or abort
%% the transaction with {no_exists, Tab}.

w_nodes(Tab) ->
    case ?catch_val({Tab, where_to_write}) of
        [_ | _] = Nodes -> Nodes;
        _ -> mnesia:abort({no_exists, Tab})
    end.

%% Acquire a sticky wlock. A sticky lock is a lock
%% which remains at this node after the termination of the
%% transaction.

sticky_wlock(Tid, Store, Oid) ->
    sticky_lock(Tid, Store, Oid, write).

sticky_rwlock(Tid, Store, Oid) ->
    sticky_lock(Tid, Store, Oid, read_write).

%% Sticky locks are only handled when the table is read from the
%% local node; otherwise the transaction aborts with not_local.
sticky_lock(Tid, Store, {Tab, Key} = Oid, Lock) ->
    ReadNode = val({Tab, where_to_read}),
    if
        node() == ReadNode ->
            case need_lock(Store, Tab, Key, write) of
                yes ->
                    do_sticky_lock(Tid, Store, Oid, Lock);
                no ->
                    dirty_sticky_lock(Tab, Key, [ReadNode], Lock)
            end;
        true ->
            mnesia:abort({not_local, Tab})
    end.

%% Ask the local locker to test-and-set the sticky lock and act on
%% its answer.
do_sticky_lock(Tid, Store, {Tab, Key} = Oid, Lock) ->
    %% Records the lock and the write nodes in the transaction store
    Remember =
        fun() ->
                ?ets_insert(Store, {{locks, Tab, Key}, write}),
                lists:foreach(fun(W) -> ?ets_insert(Store, {nodes, W}) end,
                              w_nodes(Tab))
        end,
    ?MODULE ! {self(), {test_set_sticky, Tid, Oid, Lock}},
    Here = node(),
    receive
        {?MODULE, Here, granted} ->
            Remember(),
            granted;
        {?MODULE, Here, {granted, Val}} -> %% for rwlocks
            case opt_lookup_in_client(Val, Oid, write) of
                Cyclic = #cyclic{} ->
                    exit({aborted, Cyclic});
                Found ->
                    Remember(),
                    Found
            end;
        {?MODULE, Here, {not_granted, Reason}} ->
            exit({aborted, Reason});
        {?MODULE, Here, not_stuck} ->
            not_stuck(Tid, Store, Tab, Key, Oid, Lock, Here),
            dirty_sticky_lock(Tab, Key, [Here], Lock);
        {mnesia_down, Down} ->
            Abort = {aborted, {node_not_running, Down}},
            flush_remaining([Here], Down, Abort);
        {?MODULE, Here, {stuck_elsewhere, _Other}} ->
            stuck_elsewhere(Tid, Store, Tab, Key, Oid, Lock),
            dirty_sticky_lock(Tab, Key, [Here], Lock)
    end.

%% Nobody holds the sticky lock: take ordinary locks, then tell the
%% lockers on the write nodes that the lock sticks to node N.
not_stuck(Tid, Store, Tab, _Key, Oid, _Lock, N) ->
    rlock(Tid, Store, {Tab, ?ALL}),   %% needed?
    wlock(Tid, Store, Oid),           %% perfect sync
    wlock(Tid, Store, {Tab, ?STICK}), %% max one sticker/table
    Writers = val({Tab, where_to_write}),
    rpc:abcast(Writers, ?MODULE, {stick, Oid, N}).

%% The lock is stuck at another node: take ordinary locks, then tell
%% the lockers on the write nodes to unstick the table.
stuck_elsewhere(Tid, Store, Tab, _Key, Oid, _Lock) ->
    rlock(Tid, Store, {Tab, ?ALL}),   %% needed?
    wlock(Tid, Store, Oid),           %% perfect sync
    wlock(Tid, Store, {Tab, ?STICK}), %% max one sticker/table
    Writers = val({Tab, where_to_write}),
    rpc:abcast(Writers, ?MODULE, {unstick, Tab}).

%% Result of a sticky lock request once the lock is in place: the
%% record(s) for read_write, the node list for table and global
%% locks, otherwise ok.
dirty_sticky_lock(Tab, Key, Nodes, Lock) ->
    if
        Lock == read_write ->
            mnesia_lib:db_get(Tab, Key);
        Key == ?ALL; Tab == ?GLOBAL ->
            Nodes;
        true ->
            ok
    end.

sticky_wlock_table(Tid, Store, Tab) ->
    sticky_lock(Tid, Store, {Tab, ?ALL}, write).

%% Acquire a wlock on Oid.
%% The write request is sent to the locker on every where_to_write
%% node of the table. An item {{locks, Tab, Key}, write} is put in the
%% local store before the requests are sent.
%%
wlock(Tid, Store, Oid) ->
    {Tab, Key} = Oid,
    case need_lock(Store, Tab, Key, write) of
        yes ->
            Writers = w_nodes(Tab),
            Request = {self(), {write, Tid, Oid}},
            ?ets_insert(Store, {{locks, Tab, Key}, write}),
            get_wlocks_on_nodes(Writers, Writers, Store, Request, Oid);
        no when Key /= ?ALL, Tab /= ?GLOBAL ->
            [];
        no ->
            w_nodes(Tab)
    end.

wlock_table(Tid, Store, Tab) ->
    wlock(Tid, Store, {Tab, ?ALL}).

%% Write lock even if the table does not exist

wlock_no_exist(Tid, Store, Tab, Ns) ->
    Oid = {Tab, ?ALL},
    Request = {self(), {write, Tid, Oid}},
    get_wlocks_on_nodes(Ns, Ns, Store, Request, Oid).

%% 'no' when the store already holds a lock matching LockPattern on
%% the whole table or on the key, otherwise 'yes'.
need_lock(Store, Tab, Key, LockPattern) ->
    case ?ets_match_object(Store, {{locks, Tab, ?ALL}, LockPattern}) of
        [] ->
            case ?ets_match_object(Store, {{locks, Tab, Key}, LockPattern}) of
                [] -> yes;
                _ -> no
            end;
        _ ->
            no
    end.

add_debug(Nodes) ->  % Use process dictionary for debug info
    put(mnesia_wlock_nodes, Nodes).

del_debug() ->
    erase(mnesia_wlock_nodes).

%% We first send lock request to the local node if it is part of the lockers
%% then the first sorted node then to the rest of the lockmanagers on all
%% nodes holding a copy of the table

get_wlocks_on_nodes([First | Others], Orig, Store, Request, Oid) ->
    {?MODULE, First} ! Request,
    ?ets_insert(Store, {nodes, First}),
    receive_wlocks([First], undefined, Store, Oid),
    case First =:= node() of
        true ->
            %% Local done, try one more
            get_wlocks_on_nodes(Others, Orig, Store, Request, Oid);
        false ->
            %% The first succeeded, continue with the rest
            get_wlocks_on_nodes(Others, Store, Request),
            receive_wlocks(Others, Orig, Store, Oid)
    end;
get_wlocks_on_nodes([], Orig, _Store, _Request, _Oid) ->
    Orig.

%% Send Request to the locker on every node and note each node in the
%% store. The replies are not collected here.
get_wlocks_on_nodes(Nodes, Store, Request) ->
    lists:foreach(fun(Node) ->
                          {?MODULE, Node} ! Request,
                          ?ets_insert(Store, {nodes, Node})
                  end,
                  Nodes).

%% Collect write locks on all nodes; the node the table is read from
%% (ReadNode) gets a read_write request so that it also returns the
%% value. One node is asked first and its answer awaited before the
%% remaining nodes are asked.
get_rwlocks_on_nodes([ReadNode | Others], _Res, ReadNode, Store, Tid, Oid) ->
    {?MODULE, ReadNode} ! {self(), {read_write, Tid, Oid}},
    ?ets_insert(Store, {nodes, ReadNode}),
    Value = receive_wlocks([ReadNode], undefined, Store, Oid),
    case ReadNode =:= node() of
        true ->
            get_rwlocks_on_nodes(Others, Value, ReadNode, Store, Tid, Oid);
        false ->
            get_wlocks_on_nodes(Others, Store, {self(), {write, Tid, Oid}}),
            receive_wlocks(Others, Value, Store, Oid)
    end;
get_rwlocks_on_nodes([Node | Others], Res, ReadNode, Store, Tid, Oid) ->
    WriteReq = {self(), {write, Tid, Oid}},
    {?MODULE, Node} ! WriteReq,
    ?ets_insert(Store, {nodes, Node}),
    receive_wlocks([Node], undefined, Store, Oid),
    if
        node() == Node ->
            get_rwlocks_on_nodes(Others, Res, ReadNode, Store, Tid, Oid);
        Res == rwlock -> %% Hmm
            %% No value has been read yet: ask ReadNode with a
            %% read_write request and the remaining nodes with plain
            %% write requests.
            Remaining = lists:delete(ReadNode, Others),
            {?MODULE, ReadNode} ! {self(), {read_write, Tid, Oid}},
            ?ets_insert(Store, {nodes, ReadNode}),
            get_wlocks_on_nodes(Remaining, Store, {self(), {write, Tid, Oid}}),
            receive_wlocks([ReadNode | Remaining], undefined, Store, Oid);
        true ->
            get_wlocks_on_nodes(Others, Store, {self(), {write, Tid, Oid}}),
            receive_wlocks(Others, Res, Store, Oid)
    end;
get_rwlocks_on_nodes([], Res, _, _, _, _) ->
    Res.

%% Wait for an answer from the locker on each of the given nodes.
%% Returns Res, or the value delivered with a {granted, Val} answer.
%% A refusal, or mnesia going down on the first awaited node, makes
%% the process exit via flush_remaining/3.
receive_wlocks([], Res, _Store, _Oid) ->
    del_debug(),
    Res;
receive_wlocks(Pending = [First | Others], Res, Store, Oid) ->
    add_debug(Pending),
    receive
        {?MODULE, Node, granted} ->
            receive_wlocks(lists:delete(Node, Pending), Res, Store, Oid);
        {?MODULE, Node, {granted, Val}} -> %% for rwlocks
            case opt_lookup_in_client(Val, Oid, write) of
                Cyclic = #cyclic{} ->
                    flush_remaining(Pending, Node, {aborted, Cyclic});
                Found ->
                    receive_wlocks(lists:delete(Node, Pending), Found, Store, Oid)
            end;
        {?MODULE, Node, {not_granted, Reason}} ->
            flush_remaining(Pending, Node, {aborted, Reason});
        {?MODULE, Node, {switch, Sticky, _Req}} -> %% for rwlocks
            Left = lists:delete(Node, Pending),
            Nonstuck = lists:delete(Sticky, Left),
            lists:foreach(fun(NS) -> ?ets_insert(Store, {nodes, NS}) end,
                          Nonstuck),
            sticky_flush(Nonstuck, Store),
            case lists:member(Sticky, Left) of
                true ->
                    receive_wlocks([Sticky], Res, Store, Oid);
                false ->
                    Res
            end;
        {mnesia_down, First} -> % Only look for down from Nodes in list
            Abort = {aborted, {node_not_running, First}},
            flush_remaining(Others, First, Abort)
    end.

%% Consume one locker answer from each listed node, in list order.
sticky_flush([], _) ->
    del_debug(),
    ok;
sticky_flush(Pending = [Node | Others], Store) ->
    add_debug(Pending),
    receive
        {?MODULE, Node, _} ->
            sticky_flush(Others, Store);
        {mnesia_down, Node} ->
            Abort = {aborted, {node_not_running, Node}},
            flush_remaining(Others, Node, Abort)
    end.

%% Consume the outstanding answers from all listed nodes except
%% SkipNode, then exit with Res. A mnesia_down from an awaited node
%% replaces the exit reason.
flush_remaining([], _SkipNode, Res) ->
    del_debug(),
    exit(Res);
flush_remaining([SkipNode | Others], SkipNode, Res) ->
    flush_remaining(Others, SkipNode, Res);
flush_remaining(Pending = [Node | Others], SkipNode, Res) ->
    add_debug(Pending),
    receive
        {?MODULE, Node, _} ->
            flush_remaining(Others, SkipNode, Res);
        {mnesia_down, Node} ->
            flush_remaining(Others, SkipNode, {aborted, {node_not_running, Node}})
    end.

%% When the locker answered 'lookup_in_client', read the object in
%% the calling process; any other value is passed through untouched.
opt_lookup_in_client(lookup_in_client, Oid, Lock) ->
    {Tab, Key} = Oid,
    case catch mnesia_lib:db_get(Tab, Key) of
        {'EXIT', _} ->
            %% Table has been deleted from this node,
            %% restart the transaction.
            #cyclic{op = read, lock = Lock, oid = Oid, lucky = nowhere};
        Found ->
            Found
    end;
opt_lookup_in_client(Val, _Oid, _Lock) ->
    Val.

%% Table locks and global locks answer with the node list; all other
%% locks answer with 'granted'.
return_granted_or_nodes({_, ?ALL}, Nodes) -> Nodes;
return_granted_or_nodes({?GLOBAL, _}, Nodes) -> Nodes;
return_granted_or_nodes(_, _Nodes) -> granted.

%% Acquire a read lock and read the object.
%% The request goes to the locker on the node the table is read from.
%% Once granted, {{locks, Tab, Key}, read} and {nodes, Node} are
%% recorded in the local store (see rlock_get_reply/4).

%% Oid's are always {Tab, Key} tuples
rlock(Tid, Store, Oid) ->
    {Tab, Key} = Oid,
    case val({Tab, where_to_read}) of
        nowhere ->
            mnesia:abort({no_exists, Tab});
        ReadNode ->
            case need_lock(Store, Tab, Key, '_') of
                yes ->
                    Reply = l_request(ReadNode, {read, Tid, Oid}, Store),
                    rlock_get_reply(ReadNode, Store, Oid, Reply);
                no ->
                    %% Some lock on the object is already recorded
                    if
                        Key == ?ALL; Tab == ?GLOBAL ->
                            [ReadNode];
                        true ->
                            dirty_rpc(ReadNode, Tab, Key, read)
                    end
            end
    end.

%% Read the object on Node without asking for a lock.
dirty_rpc(nowhere, Tab, Key, _Lock) ->
    mnesia:abort({no_exists, {Tab, Key}});
dirty_rpc(Node, _Tab, ?ALL, _Lock) ->
    [Node];
dirty_rpc(Node, ?GLOBAL, _Key, _Lock) ->
    [Node];
dirty_rpc(Node, Tab, Key, Lock) ->
    Args = [Tab, Key],
    case rpc:call(Node, mnesia_lib, db_get, Args) of
        {badrpc, Reason} ->
            case val({Tab, where_to_read}) of
                Node ->
                    %% Still the read node, so this is a real failure
                    Tag = mnesia_lib:dirty_rpc_error_tag(Reason),
                    mnesia:abort({Tag, Args});
                _Moved ->
                    %% Table has been deleted from the node,
                    %% restart the transaction.
                    Cyclic = #cyclic{op = read, lock = Lock,
                                     oid = {Tab, Key}, lucky = nowhere},
                    exit({aborted, Cyclic})
            end;
        Records ->
            Records
    end.

%% Act on the locker's answer to a read lock request. A granted lock
%% is recorded in the store together with the node that granted it.
rlock_get_reply(Node, Store, Oid, {granted, V}) ->
    {Tab, Key} = Oid,
    ?ets_insert(Store, {{locks, Tab, Key}, read}),
    ?ets_insert(Store, {nodes, Node}),
    case opt_lookup_in_client(V, Oid, read) of
        Cyclic = #cyclic{} ->
            mnesia:abort(Cyclic);
        Found ->
            Found
    end;
rlock_get_reply(Node, Store, Oid, granted) ->
    {Tab, Key} = Oid,
    ?ets_insert(Store, {{locks, Tab, Key}, read}),
    ?ets_insert(Store, {nodes, Node}),
    return_granted_or_nodes(Oid, [Node]);
rlock_get_reply(Node, Store, Tab, {granted, V, RealKeys}) ->
    %% Kept for backwards compatibility, keep until no old nodes
    %% are available
    lists:foreach(fun(K) -> ?ets_insert(Store, {{locks, Tab, K}, read}) end,
                  RealKeys),
    ?ets_insert(Store, {nodes, Node}),
    V;
rlock_get_reply(_Node, _Store, _Oid, {not_granted, Reason}) ->
    exit({aborted, Reason});
rlock_get_reply(_Node, Store, Oid, {switch, N2, Req}) ->
    %% Redirected: resend the request to N2 and handle its answer
    ?ets_insert(Store, {nodes, N2}),
    {?MODULE, N2} ! Req,
    rlock_get_reply(N2, Store, Oid, l_req_rec(N2, Store)).

rlock_table(Tid, Store, Tab) ->
    rlock(Tid, Store, {Tab, ?ALL}).

%% Index read. The index is read directly when the table is read from
%% the local node and a read lock on the whole table is already
%% recorded in the store; otherwise the locker on the read node is
%% asked.
ixrlock(Tid, Store, Tab, IxKey, Pos) ->
    case val({Tab, where_to_read}) of
        nowhere ->
            mnesia:abort({no_exists, Tab});
        ReadNode ->
            case need_lock(Store, Tab, ?ALL, read) of
                no when ReadNode =:= node() ->
                    ix_read_res(Tab, IxKey, Pos);
                _ -> %% yes or need to get the result from other node
                    Reply = l_request(ReadNode,
                                      {ix_read, Tid, Tab, IxKey, Pos},
                                      Store),
                    rlock_get_reply(ReadNode, Store, Tab, Reply)
            end
    end.

%% Grabs the locks or exits
global_lock(Tid, Store, Item, write, Ns) ->
    Oid = {?GLOBAL, Item},
    Request = {self(), {write, Tid, Oid}},
    get_wlocks_on_nodes(Ns, Ns, Store, Request, Oid);
global_lock(Tid, Store, Item, read, Ns) ->
    Oid = {?GLOBAL, Item},
    send_requests(Ns, {read, Tid, Oid}),
    rec_requests(Ns, Oid, Store),
    Ns.

%% Send request X, tagged with our pid, to the locker on every node.
send_requests(Nodes, X) ->
    lists:foreach(fun(Node) -> {?MODULE, Node} ! {self(), X} end, Nodes).

%% Collect one answer per node. On failure the answers from the
%% remaining nodes are consumed before the process exits.
rec_requests([], _Oid, _Store) ->
    ok;
rec_requests([Node | Others], Oid, Store) ->
    Answer = l_req_rec(Node, Store),
    case catch rlock_get_reply(Node, Store, Oid, Answer) of
        {'EXIT', Reason} ->
            flush_remaining(Others, Node, Reason);
        _ ->
            rec_requests(Others, Oid, Store)
    end.

%% Ask the locker process for the contents of mnesia_held_locks.
get_held_locks() ->
    ?MODULE ! {get_table, self(), mnesia_held_locks},
    receive
        {mnesia_held_locks, Locks} -> Locks
    end.

%% Ask the locker process for the contents of mnesia_lock_queue and
%% present each entry as {Oid, Op, Pid, Tid, WaitForTid}.
get_lock_queue() ->
    ?MODULE ! {get_table, self(), mnesia_lock_queue},
    Queue = receive
                {mnesia_lock_queue, Locks} -> Locks
            end,
    [{Oid, Op, Pid, Tid, WFT} || {queue, Oid, Tid, Op, Pid, WFT} <- Queue].

do_stop() ->
    exit(shutdown).

%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% System upgrade

system_continue(_Parent, _Debug, State) ->
    loop(State).

system_terminate(_Reason, _Parent, _Debug, _State) ->
    do_stop().

system_code_change(State, _Module, _OldVsn, _Extra) ->
    {ok, State}.


%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% AXD301 patch sort pids according to R9B sort order
%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% If R9B == true, the comparison is done as in R9B plain.
%% If R9B == false, the comparison is done as in all other releases.
%% cmp_tid(R9B, T1, T2) returns -1 if T1 < T2, 0 if T1 = T2 and
%% 1 if T1 > T2.

-define(VERSION_MAGIC, 131).
-define(ATOM_EXT, 100).
-define(PID_EXT, 103).

-record(pid_info, {serial, number, nodename, creation}).

%% Tids are ordered by counter first; equal counters fall back to a
%% comparison of the pids.
cmp_tid(R9B, #tid{} = T, #tid{} = T) when is_boolean(R9B) ->
    0;
cmp_tid(R9B, #tid{counter = C, pid = P1}, #tid{counter = C, pid = P2})
  when is_boolean(R9B) ->
    cmp_pid_info(R9B, pid_to_pid_info(P1), pid_to_pid_info(P2));
cmp_tid(R9B, #tid{counter = C1}, #tid{counter = C2})
  when is_boolean(R9B) ->
    cmp(C1, C2).

%% Compare two pid_info records field by field; the first field that
%% differs decides. The field order depends on the R9B flag:
%%   false: serial, number, nodename, creation
%%   true:  nodename, creation, serial, number
cmp_pid_info(_, #pid_info{} = PI, #pid_info{} = PI) ->
    0;
cmp_pid_info(R9B, #pid_info{} = A, #pid_info{} = B) ->
    cmp_fields(pid_info_key(R9B, A), pid_info_key(R9B, B)).

%% The fields of a pid_info in order of significance.
pid_info_key(false, #pid_info{serial = S, number = N,
                              nodename = NN, creation = C}) ->
    [S, N, NN, C];
pid_info_key(true, #pid_info{serial = S, number = N,
                             nodename = NN, creation = C}) ->
    [NN, C, S, N].

%% Compare two equally long lists element by element.
cmp_fields([Same | As], [Same | Bs]) ->
    cmp_fields(As, Bs);
cmp_fields([A | _], [B | _]) ->
    cmp(A, B);
cmp_fields([], []) ->
    0.

%% Three-way comparison: 0 for identical terms, -1 when the first is
%% smaller, otherwise 1.
cmp(A, B) when A =:= B -> 0;
cmp(A, B) when A < B -> -1;
cmp(_A, _B) -> 1.

%% Pick a pid apart by decoding its external term format: after the
%% version byte, the PID_EXT tag and an ATOM_EXT node name come four
%% bytes of number, four bytes of serial and one creation byte.
pid_to_pid_info(Pid) when is_pid(Pid) ->
    [?VERSION_MAGIC, ?PID_EXT, ?ATOM_EXT, LenHi, LenLo | AfterLen]
        = binary_to_list(term_to_binary(Pid)),
    %% Skip the characters of the node name
    [N3, N2, N1, N0, S3, S2, S1, S0, Creation] =
        drop(bytes2int(LenHi, LenLo), AfterLen),
    #pid_info{serial = bytes2int(S3, S2, S1, S0),
              number = bytes2int(N3, N2, N1, N0),
              nodename = node(Pid),
              creation = Creation}.

%% Remove the first N elements of a list; a list that is too short
%% gives [].
drop(0, List) -> List;
drop(N, [_ | Rest]) when is_integer(N), N > 0 -> drop(N - 1, Rest);
drop(N, []) when is_integer(N), N > 0 -> [].

%% Combine big-endian bytes into an integer.
bytes2int(B1, B0) when 0 =< B1, B1 =< 255,
                       0 =< B0, B0 =< 255 ->
    (B1 bsl 8) bor B0.
bytes2int(B3, B2, B1, B0) when 0 =< B3, B3 =< 255,
                               0 =< B2, B2 =< 255,
                               0 =< B1, B1 =< 255,
                               0 =< B0, B0 =< 255 ->
    (B3 bsl 24) bor (B2 bsl 16) bor (B1 bsl 8) bor B0.
+ diff --git a/lib/mnesia/src/mnesia_log.erl b/lib/mnesia/src/mnesia_log.erl new file mode 100644 index 0000000000..00ec4740ee --- /dev/null +++ b/lib/mnesia/src/mnesia_log.erl @@ -0,0 +1,1025 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 1996-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% +%% This module administers three kinds of log files: +%% +%% 1 The transaction log +%% mnesia_tm appends to the log (via mnesia_log) at the +%% end of each transaction (or dirty write) and +%% mnesia_dumper reads the log and performs the ops in +%% the dat files. The dump_log is done at startup and +%% at intervals controlled by the user. +%% +%% 2 The mnesia_down log +%% mnesia_tm appends to the log (via mnesia_log) when it +%% realizes that mnesia goes up or down on another node. +%% mnesia_init reads the log (via mnesia_log) at startup. +%% +%% 3 The backup log +%% mnesia_schema produces one tiny log when the schema is +%% initially created. mnesia_schema also reads the log +%% when the user wants tables (possibly incl the schema) +%% to be restored. mnesia_log appends to the log when the +%% user wants to produce a real backup. +%% +%% The actual access to the backup media is performed via the +%% mnesia_backup module for both read and write. 
mnesia_backup +%% uses the disk_log (*), BUT the user may write an own module +%% with the same interface as mnesia_backup and configure +%% Mnesia so the alternate module performs the actual accesses +%% to the backup media. This means that the user may put the +%% backup on media that Mnesia does not know about possibly on +%% hosts where Erlang is not running. +%% +%% All these logs have to some extent a common structure. +%% They are all using the disk_log module (*) for the basic +%% file structure. The disk_log has a repair feature that +%% can be used to skip erroneous log records if one comes to +%% the conclusion that it is more important to reuse some +%% of the log records than the risk of obtaining inconsistent +%% data. If the data becomes inconsistent it is solely up to the +%% application to make it consistent again. The automatic +%% repair of the disk_log is very powerful, but use it +%% with extreme care. +%% +%% First in all Mnesia's log file is a mnesia log header. +%% It contains a list with a log_header record as single +%% element. The structure of the log_header may never be +%% changed since it may be written to very old backup files. +%% By holding this record definition stable we can be +%% able to comprehend backups from timepoint 0. It also +%% allows us to use the backup format as an interchange +%% format between Mnesia releases. +%% +%% An op-list is a list of tuples with arity 3. Each tuple +%% has this structure: {Oid, Recs, Op} where Oid is the tuple +%% {Tab, Key}, Recs is a (possibly empty) list of records and +%% Op is an atom. +%% +%% The log file structure for the transaction log is as follows. +%% +%% After the mnesia log section follows an extended record section +%% containing op-lists. There are several values that Op may +%% have, such as write, delete, update_counter, delete_object, +%% and replace. There is no special end of section marker. 
+%% +%% +-----------------+ +%% | mnesia log head | +%% +-----------------+ +%% | extended record | +%% | section | +%% +-----------------+ +%% +%% The log file structure for the mnesia_down log is as follows. +%% +%% After the mnesia log section follows a mnesia_down section +%% containing lists with yoyo records as single element. +%% +%% +-----------------+ +%% | mnesia log head | +%% +-----------------+ +%% | mnesia_down | +%% | section | +%% +-----------------+ +%% +%% The log file structure for the backup log is as follows. +%% +%% After the mnesia log section follows a schema section +%% containing record lists. A record list is a list of tuples +%% where {schema, Tab} is interpreted as a delete_table(Tab) and +%% {schema, Tab, CreateList} are interpreted as create_table. +%% +%% The record section also contains record lists. In this section +%% {Tab, Key} is interpreted as delete({Tab, Key}) and other tuples +%% as write(Tuple). There is no special end of section marker. +%% +%% +-----------------+ +%% | mnesia log head | +%% +-----------------+ +%% | schema section | +%% +-----------------+ +%% | record section | +%% +-----------------+ +%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-module(mnesia_log). 

-export([
         append/2,
         backup/1,
         backup/2,
         backup_checkpoint/2,
         backup_checkpoint/3,
         backup_log_header/0,
         backup_master/2,
         chunk_decision_log/1,
         chunk_decision_tab/1,
         chunk_log/1,
         chunk_log/2,
         close_decision_log/0,
         close_decision_tab/0,
         close_log/1,
         unsafe_close_log/1,
         confirm_log_dump/1,
         confirm_decision_log_dump/0,
         previous_log_file/0,
         previous_decision_log_file/0,
         latest_log_file/0,
         decision_log_version/0,
         decision_log_file/0,
         decision_tab_file/0,
         decision_tab_version/0,
         dcl_version/0,
         dcd_version/0,
         ets2dcd/1,
         ets2dcd/2,
         dcd2ets/1,
         dcd2ets/2,
         init/0,
         init_log_dump/0,
         log/1,
         slog/1,
         log_decision/1,
         log_files/0,
         open_decision_log/0,
         trans_log_header/0,
         open_decision_tab/0,
         dcl_log_header/0,
         dcd_log_header/0,
         open_log/4,
         open_log/6,
         prepare_decision_log_dump/0,
         prepare_log_dump/1,
         save_decision_tab/1,
         purge_all_logs/0,
         purge_some_logs/0,
         stop/0,
         tab_copier/3,
         version/0,
         view/0,
         view/1,
         write_trans_log_header/0
        ]).


-include("mnesia.hrl").
-import(mnesia_lib, [val/1, dir/1]).
-import(mnesia_lib, [exists/1, fatal/2, error/2, dbg_out/2]).

%% One header constructor per kind of log; each pairs the log kind
%% with the version string of its format.
trans_log_header() ->
    log_header(trans_log, version()).

backup_log_header() ->
    log_header(backup_log, "1.2").

decision_log_header() ->
    log_header(decision_log, decision_log_version()).

decision_tab_header() ->
    log_header(decision_tab, decision_tab_version()).

dcl_log_header() ->
    log_header(dcl_log, dcl_version()).

dcd_log_header() ->
    log_header(dcd_log, dcd_version()).

%% Build the log_header record that is written first in a log file.
log_header(Kind, Version) ->
    #log_header{log_version = Version,
                log_kind = Kind,
                mnesia_version = mnesia:system_info(version),
                node = node(),
                now = now()}.

%% Format versions
version() -> "4.3".

decision_log_version() -> "3.0".

decision_tab_version() -> "1.0".

dcl_version() -> "1.0".

dcd_version() -> "1.0".

%% Asynchronous append; binaries are written as they are.
append(Log, Bin) when is_binary(Bin) ->
    disk_log:balog(Log, Bin);
append(Log, Term) ->
    disk_log:alog(Log, Term).

%% Synced append; crashes unless disk_log answers ok
sappend(Log, Bin) when is_binary(Bin) ->
    ok = disk_log:blog(Log, Bin);
sappend(Log, Term) ->
    ok = disk_log:log(Log, Term).

%% Write commit records to the latest_log.
%% A commit record without disc_copies, disc_only_copies and
%% schema_ops is ignored, and nothing is written when Mnesia runs
%% without a directory.
log(C) when C#commit.disc_copies == [],
            C#commit.disc_only_copies == [],
            C#commit.schema_ops == [] ->
    ignore;
log(C) ->
    case mnesia_monitor:use_dir() of
        true ->
            append(latest_log, strip_volatile(C)),
            mnesia_dumper:incr_log_writes();
        false ->
            ignore
    end.

%% Synced variant of log/1

slog(C) when C#commit.disc_copies == [],
             C#commit.disc_only_copies == [],
             C#commit.schema_ops == [] ->
    ignore;
slog(C) ->
    case mnesia_monitor:use_dir() of
        true ->
            sappend(latest_log, strip_volatile(C)),
            mnesia_dumper:incr_log_writes();
        false ->
            ignore
    end.

%% Clear the ram_copies and snmp fields of a commit record before it
%% is logged. Anything else (a commit record as binary, or decision
%% related info) is logged as it is.
strip_volatile(C) when is_record(C, commit) ->
    C#commit{ram_copies = [], snmp = []};
strip_volatile(Other) ->
    Other.


%% Stuff related to the file LOG

%% Returns a list of logfiles. The oldest is first.
log_files() ->
    [previous_log_file(), latest_log_file(), decision_tab_file()].

latest_log_file() ->
    dir(latest_log_name()).

previous_log_file() ->
    dir("PREVIOUS.LOG").

decision_log_file() ->
    dir(decision_log_name()).

decision_tab_file() ->
    dir(decision_tab_name()).

previous_decision_log_file() ->
    dir("PDECISION.LOG").

latest_log_name() ->
    "LATEST.LOG".

decision_log_name() ->
    "DECISION.LOG".

decision_tab_name() ->
    "DECISION_TAB.LOG".

%% Open a fresh transaction log. It is a fatal error if a previous or
%% latest log file already exists.
init() ->
    case mnesia_monitor:use_dir() of
        true ->
            verify_no_exists(previous_log_file()),
            Latest = latest_log_file(),
            verify_no_exists(Latest),
            open_log(latest_log, trans_log_header(), Latest);
        false ->
            ok
    end.

%% Fatal error if the file is already there.
verify_no_exists(Fname) ->
    case exists(Fname) of
        true ->
            fatal("Log file exists: ~p~n", [Fname]);
        false ->
            ok
    end.

%% open_log/3..6: each shorter form fills in one more default and
%% calls the next one.
open_log(Name, Header, Fname) ->
    open_log(Name, Header, Fname, exists(Fname)).

open_log(Name, Header, Fname, Exists) ->
    open_log(Name, Header, Fname, Exists,
             mnesia_monitor:get_env(auto_repair)).

open_log(Name, Header, Fname, Exists, Repair) ->
    %% Only the previous log is opened read only
    Mode = case Name == previous_log of
               true -> read_only;
               false -> read_write
           end,
    open_log(Name, Header, Fname, Exists, Repair, Mode).

%% Open the log through mnesia_monitor and return it. The header is
%% written when the file did not exist beforehand. When repair is
%% enabled, a file that cannot be opened is deleted and created anew.
open_log(Name, Header, Fname, Exists, Repair, Mode) ->
    Args = [{file, Fname}, {name, Name}, {repair, Repair}, {mode, Mode}],
    case mnesia_monitor:open_log(Args) of
        {ok, Log} when Exists == true ->
            Log;
        {ok, Log} ->
            write_header(Log, Header),
            Log;
        {repaired, Log, _, {badbytes, 0}} when Exists == true ->
            Log;
        {repaired, Log, _, {badbytes, 0}} ->
            write_header(Log, Header),
            Log;
        {repaired, Log, _Recover, BadBytes} ->
            mnesia_lib:important("Data may be missing, log ~p repaired: Lost ~p bytes~n",
                                 [Fname, BadBytes]),
            Log;
        {error, Reason} when Repair == true ->
            file:delete(Fname),
            mnesia_lib:important("Data may be missing, Corrupt logfile deleted: ~p, ~p ~n",
                                 [Fname, Reason]),
            %% Create a new
            open_log(Name, Header, Fname, false, false, read_write);
        {error, Reason} ->
            fatal("Cannot open log file ~p: ~p~n", [Fname, Reason])
    end.

write_header(Log, Header) ->
    append(Log, Header).

write_trans_log_header() ->
    write_header(latest_log, trans_log_header()).

%% Close the transaction log, if Mnesia uses a directory at all.
stop() ->
    case mnesia_monitor:use_dir() of
        true -> close_log(latest_log);
        false -> ok
    end.

%% Sync the log to disk and close it. A failed sync is reported but
%% does not prevent the close; a log in read only mode is simply
%% closed.
close_log(Log) ->
    case disk_log:sync(Log) of
        ok ->
            ok;
        {error, {read_only_mode, Log}} ->
            ok;
        {error, Reason} ->
            mnesia_lib:important("Failed syncing ~p to_disk reason ~p ~n",
                                 [Log, Reason])
    end,
    mnesia_monitor:close_log(Log).

unsafe_close_log(Log) ->
    mnesia_monitor:unsafe_close_log(Log).


%% Close the latest log and delete it together with the decision tab.
purge_some_logs() ->
    mnesia_monitor:unsafe_close_log(latest_log),
    file:delete(latest_log_file()),
    file:delete(decision_tab_file()).

%% Delete the previous log, the latest log and the decision tab.
purge_all_logs() ->
    file:delete(previous_log_file()),
    file:delete(latest_log_file()),
    file:delete(decision_tab_file()).

%% Prepare dump by renaming the open logfile if possible.
%% Returns already_dumped, {needs_dump, Diff} or {error, Reason},
%% where Diff is the number of log writes since the previous dump.
prepare_log_dump(InitBy) ->
    Written = mnesia_dumper:get_log_writes(),
    Dumped = mnesia_lib:read_counter(trans_log_writes_prev),
    Diff = Written - Dumped,
    if
        Diff == 0, InitBy /= startup ->
            already_dumped;
        true ->
            case mnesia_monitor:use_dir() of
                true ->
                    Prev = previous_log_file(),
                    prepare_prev(Diff, InitBy, Prev, exists(Prev));
                false ->
                    already_dumped
            end
    end.

%% The last argument tells whether the previous log file exists.
%% If it does, it is dumped as it is. If not, the latest log becomes
%% the previous one: by a plain rename at startup, otherwise by
%% asking mnesia_monitor to reopen the log.
prepare_prev(Diff, _, _, true) ->
    {needs_dump, Diff};
prepare_prev(Diff, startup, Prev, false) ->
    Latest = latest_log_file(),
    case exists(Latest) of
        false ->
            already_dumped;
        true ->
            case file:rename(Latest, Prev) of
                ok -> {needs_dump, Diff};
                {error, Reason} -> {error, Reason}
            end
    end;
prepare_prev(Diff, _InitBy, Prev, false) ->
    case mnesia_monitor:reopen_log(latest_log, Prev, trans_log_header()) of
        ok ->
            {needs_dump, Diff};
        {error, Reason} ->
            {error, {"Cannot rename log file",
                     [latest_log_file(), Prev, Reason]}}
    end.
%% Open the previous transaction log for dumping.
%% Returns the atom start, the first continuation to give chunk_log/1.
init_log_dump() ->
    PrevFile = previous_log_file(),
    open_log(previous_log, trans_log_header(), PrevFile),
    start.


%% Read the next chunk of previous_log.
chunk_log(Cont) ->
    chunk_log(previous_log, Cont).

%% Read the next chunk of Log starting from continuation Cont.
%% Returns eof, {NextCont, Terms}, or whatever else the (caught) call to
%% disk_log:chunk/2 produced. Lost bytes are reported and then dropped
%% from the result; a chunk error goes to fatal/2.
chunk_log(_Log, eof) ->
    eof;
chunk_log(Log, Cont) ->
    case catch disk_log:chunk(Log, Cont) of
        {error, Why} ->
            fatal("Possibly truncated ~p file: ~p~n",
                  [Log, Why]);
        {NextCont, Terms, BadBytes} ->
            %% Read-only log with damaged parts.
            %% Open question kept from the original author: should this
            %% crash when auto repair is turned off?
            mnesia_lib:important("~p repaired, lost ~p bad bytes~n", [Log, BadBytes]),
            {NextCont, Terms};
        Result ->
            Result
    end.

%% Confirm the dump: close previous_log, delete its file and add
%% Updates to the trans_log_writes_prev counter.
%% Returns dumped, or {error, Reason} if the log could not be closed.
confirm_log_dump(Updates) ->
    case mnesia_monitor:close_log(previous_log) of
        {error, Why} ->
            {error, Why};
        ok ->
            file:delete(previous_log_file()),
            mnesia_lib:incr_counter(trans_log_writes_prev, Updates),
            dumped
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Decision log

%% Open the decision log; returns the start continuation.
open_decision_log() ->
    LogFile = decision_log_file(),
    open_log(decision_log, decision_log_header(), LogFile),
    start.

%% Make sure a previous decision log file exists and open it.
%% Returns the start continuation.
prepare_decision_log_dump() ->
    PrevFile = previous_decision_log_file(),
    prepare_decision_log_dump(exists(PrevFile), PrevFile).

%% prepare_decision_log_dump(PrevExists, PrevFile)
prepare_decision_log_dump(true, PrevFile) ->
    open_log(previous_decision_log, decision_log_header(), PrevFile),
    start;
prepare_decision_log_dump(false, PrevFile) ->
    %% No previous file yet: reopen the current decision log with
    %% PrevFile, then take the branch above.
    Header = decision_log_header(),
    case mnesia_monitor:reopen_log(decision_log, PrevFile, Header) of
        {error, Why} ->
            fatal("Cannot rename decision log file ~p -> ~p: ~p~n",
                  [decision_log_file(), PrevFile, Why]);
        ok ->
            prepare_decision_log_dump(true, PrevFile)
    end.

%% Read the next chunk of the previous decision log.
chunk_decision_log(Cont) ->
    chunk_log(previous_decision_log, Cont).
%% Confirm the dump of the decision log: close previous_decision_log
%% and delete its file. A close failure is reported through fatal/2.
confirm_decision_log_dump() ->
    case mnesia_monitor:close_log(previous_decision_log) of
        ok ->
            file:delete(previous_decision_log_file());
        {error, Reason} ->
            fatal("Cannot confirm decision log dump: ~p~n",
                  [Reason])
    end.

%% Write Decisions to a temporary file and rename it over the decision
%% table file, so the old file is only replaced once the new contents
%% are complete. Crashes with badmatch if the rename fails.
save_decision_tab(Decisions) ->
    Log = decision_tab,
    Tmp = mnesia_lib:dir("DECISION_TAB.TMP"),
    file:delete(Tmp),
    open_log(Log, decision_tab_header(), Tmp),
    append(Log, Decisions),
    close_log(Log),
    TabFile = decision_tab_file(),
    ok = file:rename(Tmp, TabFile).

%% Open the decision table log; returns the start continuation.
open_decision_tab() ->
    TabFile = decision_tab_file(),
    open_log(decision_tab, decision_tab_header(), TabFile),
    start.

%% Sync and close the decision table log.
close_decision_tab() ->
    close_log(decision_tab).

%% Read the next chunk of the decision table log.
chunk_decision_tab(Cont) ->
    chunk_log(decision_tab, Cont).

%% Sync and close the decision log.
close_decision_log() ->
    close_log(decision_log).

%% Append one decision to the decision log.
log_decision(Decision) ->
    append(decision_log, Decision).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Debug functions

%% Print the contents of every file returned by log_files().
view() ->
    lists:foreach(fun(F) -> view(F) end, log_files()).

%% Print the contents of one log file.
%% Returns nolog when the file does not exist; otherwise the result of
%% view_file/2, or of the error report when the log cannot be opened.
view(File) ->
    mnesia_lib:show("***** ~p ***** ~n", [File]),
    case exists(File) of
        false ->
            nolog;
        true ->
            %% The log is always opened under the same name, so
            %% view_file/2 must close it on every exit path.
            N = view_only,
            Args = [{file, File}, {name, N}, {mode, read_only}],
            case disk_log:open(Args) of
                {ok, N} ->
                    view_file(start, N);
                {repaired, _, _, _} ->
                    view_file(start, N);
                {error, Reason} ->
                    error("Cannot open log ~p: ~p~n", [File, Reason])
            end
    end.

%% Print all terms of the open log Log, starting at continuation C.
%% Returns eof when the whole log was shown and error when a chunk
%% could not be read. The log is closed in both cases.
view_file(C, Log) ->
    case disk_log:chunk(Log, C) of
        {error, Reason} ->
            error("** Possibly truncated FILE ~p~n", [Reason]),
            %% Close here as well; otherwise the view_only log stays
            %% open and the name is still in use for the next view/1.
            disk_log:close(Log),
            error;
        eof ->
            disk_log:close(Log),
            eof;
        {C2, Terms, BadBytes} ->
            dbg_out("Lost ~p bytes in ~p ~n", [BadBytes, Log]),
            lists:foreach(fun(X) -> mnesia_lib:show("~p~n", [X]) end,
                          Terms),
            view_file(C2, Log);
        {C2, Terms} ->
            lists:foreach(fun(X) -> mnesia_lib:show("~p~n", [X]) end,
                          Terms),
            view_file(C2, Log)
    end.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Backup

%% Working state of one backup run.
-record(backup_args, {name, module, opaque, scope, prev_name, tables, cookie}).

%% Back up all tables to Opaque with default arguments.
backup(Opaque) ->
    backup(Opaque, []).

%% backup(Opaque, ModOrArgs)
%% A bare module name is shorthand for [{module, Mod}].
%% Activates a temporary checkpoint over all tables with maximum
%% redundancy, backs it up and deactivates it again.
%% Returns the backup result or {error, Reason} when the checkpoint
%% cannot be activated.
backup(Opaque, Mod) when is_atom(Mod) ->
    backup(Opaque, [{module, Mod}]);
backup(Opaque, Args) when is_list(Args) ->
    CheckpointArgs = [{ram_overrides_dump, false}, {max, val({schema, tables})}],
    case mnesia_checkpoint:activate(CheckpointArgs) of
        {error, Why} ->
            {error, Why};
        {ok, Name, _Nodes} ->
            Result = backup_checkpoint(Name, Opaque, Args),
            mnesia_checkpoint:deactivate(Name),
            Result
    end.

%% Back up the checkpoint Name to Opaque with default arguments.
backup_checkpoint(Name, Opaque) ->
    backup_checkpoint(Name, Opaque, []).

%% backup_checkpoint(Name, Opaque, ModOrArgs)
%% Validates the arguments, then lets a linked backup_master process do
%% the work and waits for its answer.
%% Returns that answer, or {error, {badarg, Arg}} for a bad argument.
backup_checkpoint(Name, Opaque, Mod) when is_atom(Mod) ->
    backup_checkpoint(Name, Opaque, [{module, Mod}]);
backup_checkpoint(Name, Opaque, Args) when is_list(Args) ->
    Defaults = #backup_args{name = Name,
                            module = mnesia_monitor:get_env(backup_module),
                            opaque = Opaque,
                            scope = global,
                            tables = all,
                            prev_name = Name},
    case check_backup_args(Args, Defaults) of
        {error, Why} ->
            {error, Why};
        {ok, Checked} ->
            %% Decentralized backup
            %% Incremental
            Client = self(),
            Master = spawn_link(?MODULE, backup_master, [Client, Checked]),
            receive
                {Master, Client, Result} -> Result
            end
    end.

%% Fold the argument list into the #backup_args{} record.
%% Returns {ok, Record} or {error, {badarg, Arg}} for the first argument
%% that check_backup_arg_type/2 exits on.
check_backup_args([], B) ->
    {ok, B};
check_backup_args([Arg | Rest], B) ->
    case catch check_backup_arg_type(Arg, B) of
        {'EXIT', _Why} ->
            {error, {badarg, Arg}};
        Updated ->
            check_backup_args(Rest, Updated)
    end.

%% Apply a single backup argument to the record. Exits (case_clause)
%% on anything that is not a recognised argument.
check_backup_arg_type(Arg, B) ->
    case Arg of
        {scope, global} ->
            B#backup_args{scope = global};
        {scope, local} ->
            B#backup_args{scope = local};
        {module, Mod} ->
            Checked = mnesia_monitor:do_check_type(backup_module, Mod),
            B#backup_args{module = Checked};
        {incremental, PrevName} ->
            B#backup_args{prev_name = PrevName};
        {tables, Tabs} when is_list(Tabs) ->
            B#backup_args{tables = Tabs}
    end.
%% Entry point of the backup master process spawned by
%% backup_checkpoint/3. Runs the backup, sends
%% {self(), ClientPid, Result} to the client and exits normally.
%% An exit in the backup code is reported as {error, {'EXIT', Reason}}.
backup_master(ClientPid, B) ->
    process_flag(trap_exit, true),
    Result =
        case catch do_backup_master(B) of
            {'EXIT', Why} -> {error, {'EXIT', Why}};
            Other -> Other
        end,
    ClientPid ! {self(), ClientPid, Result},
    unlink(ClientPid),
    exit(normal).

%% Write a complete backup: header, schema, then every selected table,
%% and commit it. Returns ok; failures are thrown by abort_write/4.
do_backup_master(B) ->
    Name = B#backup_args.name,
    Opened = safe_apply(B, open_write, [B#backup_args.opaque]),
    WithHeader = safe_write(Opened, [backup_log_header()]),
    case mnesia_checkpoint:tables_and_cookie(Name) of
        {error, Why} ->
            abort_write(WithHeader, {?MODULE, backup_master}, [B], {error, Why});
        {ok, AllTabs, Cookie} ->
            Tabs = select_tables(AllTabs, WithHeader),
            WithCookie = WithHeader#backup_args{cookie = Cookie},
            %% The schema always goes first in the backup file
            WithSchema = backup_schema(WithCookie, Tabs),
            Done = lists:foldl(fun backup_tab/2, WithSchema, Tabs -- [schema]),
            safe_apply(Done, commit_write, [Done#backup_args.opaque]),
            ok
    end.

%% Pick the tables to back up: either all checkpointed tables or the
%% explicitly requested ones. With local scope only the tables whose
%% most local node is this node are kept.
select_tables(AllTabs, B) ->
    Wanted =
        case B#backup_args.tables of
            all -> AllTabs;
            Listed when is_list(Listed) -> Listed
        end,
    case B#backup_args.scope of
        local ->
            Name = B#backup_args.name,
            [T || T <- Wanted,
                  mnesia_checkpoint:most_local_node(Name, T) == {ok, node()}];
        global ->
            Wanted
    end.

%% Write records through the backup module; an empty list is a no-op.
safe_write(B, []) ->
    B;
safe_write(B, Recs) ->
    safe_apply(B, write, [B#backup_args.opaque, Recs]).

%% Put schema information first in the backup: the schema table itself
%% if it is among Tabs, otherwise one definition per table.
backup_schema(B, Tabs) ->
    case lists:member(schema, Tabs) of
        false ->
            Defs = [{schema, T, mnesia_schema:get_create_list(T)} || T <- Tabs],
            safe_write(B, Defs);
        true ->
            backup_tab(schema, B)
    end.

%% Call What in the backup callback module and store the new opaque
%% value in the record. A pending 'EXIT' message, or any result other
%% than {ok, Opaque}, aborts the backup through abort_write/4.
safe_apply(B, write, [_, []]) ->
    B;
safe_apply(B, What, Args) ->
    Abort = fun(Why) -> abort_write(B, What, Args, Why) end,
    receive
        {'EXIT', Pid, Why} ->
            Abort({'EXIT', Pid, Why})
    after 0 ->
            Mod = B#backup_args.module,
            case catch apply(Mod, What, Args) of
                {ok, NewOpaque} ->
                    B#backup_args{opaque = NewOpaque};
                {error, Why} ->
                    Abort(Why);
                Unexpected ->
                    Abort(Unexpected)
            end
    end.
%% Give the backup module a chance to clean up, then throw
%% {error, Reason}. A failing abort_write callback is reported but does
%% not change what is thrown.
abort_write(B, What, Args, Reason) ->
    Mod = B#backup_args.module,
    Opaque = B#backup_args.opaque,
    dbg_out("Failed to perform backup. M=~p:F=~p:A=~p -> ~p~n",
            [Mod, What, Args, Reason]),
    case catch apply(Mod, abort_write, [Opaque]) of
        {ok, _Res} ->
            ok;
        Other ->
            error("Failed to abort backup. ~p:~p~p -> ~p~n",
                  [Mod, abort_write, [Opaque], Other])
    end,
    throw({error, Reason}).

%% Back up one table. A table whose most local node is this node is
%% copied in this process; otherwise a linked tab_copier is started on
%% that node and its records are received and written here.
%% Returns the updated #backup_args{}.
backup_tab(Tab, B) ->
    Name = B#backup_args.name,
    ThisNode = node(),
    case mnesia_checkpoint:most_local_node(Name, Tab) of
        {ok, ThisNode} ->
            tab_copier(self(), B, Tab);
        {ok, Remote} ->
            Copier = spawn_link(Remote, ?MODULE, tab_copier, [self(), B, Tab]),
            RecName = val({Tab, record_name}),
            tab_receiver(Copier, B, Tab, RecName, 0);
        {error, Why} ->
            abort_write(B, {?MODULE, backup_tab}, [Tab, B], {error, Why})
    end.

%% Copy the records of Tab to Pid, which is either this process (local
%% copy) or the receiving backup master.
tab_copier(Pid, B, Tab) when is_record(B, backup_args) ->
    %% Intentional crash at exit
    Name = B#backup_args.name,
    {FirstName, FirstSource} = select_source(Tab, Name, B#backup_args.prev_name),

    ?eval_debug_fun({?MODULE, tab_copier, pre}, [{name, Name}, {tab, Tab}]),
    Result = handle_more(Pid, B, Tab, FirstName, FirstSource, Name),
    ?eval_debug_fun({?MODULE, tab_copier, post}, [{name, Name}, {tab, Tab}]),

    handle_last(Pid, Result).

%% Decide which checkpoint and source the first pass reads from.
%% Returns {CheckpointName, table | retainer}.
select_source(schema, Name, _PrevName) ->
    %% The schema is always backed up in full
    {Name, table};
select_source(_Tab, Name, PrevName) when Name == PrevName ->
    %% Full backup
    {Name, table};
select_source(Tab, Name, PrevName) ->
    %% Incremental backup requested
    ThisNode = node(),
    case mnesia_checkpoint:most_local_node(PrevName, Tab) of
        {ok, ThisNode} ->
            %% Incremental backup is possible
            {PrevName, retainer};
        _ ->
            %% Fall back to a full backup
            dbg_out("Incremental backup escalated to full backup: ~p~n", [Tab]),
            {Name, table}
    end.
%% Run the copy passes for one table and return the final
%% {Count, BackupArgs} accumulator.
%% Both checkpoints must agree on whether the table is really retained;
%% if they do not, the backup is aborted.
handle_more(Pid, B, Tab, FirstName, FirstSource, Name) ->
    Acc0 = {0, B},
    RetainNow = mnesia_checkpoint:really_retain(Name, Tab),
    RetainFirst = mnesia_checkpoint:really_retain(FirstName, Tab),
    case {RetainNow, RetainFirst} of
        {true, true} ->
            Acc1 = iterate(B, FirstName, Tab, Pid, FirstSource, latest, first, Acc0),
            iterate(B, Name, Tab, Pid, retainer, checkpoint, last, Acc1);
        {false, false} ->
            %% The dumped file goes into the backup instead of the ram
            %% table. Only applies to ram_copies.
            iterate(B, Name, Tab, Pid, retainer, checkpoint, last, Acc0);
        Mismatch ->
            Reason = {"Checkpoints for incremental backup must have same "
                      "setting of ram_overrides_dump",
                      Tab, Name, FirstName, Mismatch},
            abort_write(B, {?MODULE, backup_tab}, [Tab, B], {error, Reason})
    end.

%% Finish a copy. A local copy returns the updated #backup_args{};
%% a remote copier tells the receiver it is done and exits normally.
handle_last(Pid, {_Count, B}) when Pid == self() ->
    B;
handle_last(Pid, _Acc) ->
    unlink(Pid),
    Pid ! {self(), {last, {ok, dummy}}},
    exit(normal).

%% Iterate over the checkpoint contents of Tab, either writing the
%% records directly (local copy) or sending them to Pid.
%% Returns the new accumulator; an iteration error aborts the backup.
iterate(B, Name, Tab, Pid, Source, Age, Pass, Acc) ->
    Handler =
        case Pid == self() of
            true ->
                RecName = val({Tab, record_name}),
                fun(Recs, A) -> copy_records(RecName, Tab, Recs, A) end;
            false ->
                fun(Recs, A) -> send_records(Pid, Tab, Recs, Pass, A) end
        end,
    case mnesia_checkpoint:iterate(Name, Tab, Handler, Acc, Source, Age) of
        {error, Why} ->
            Failure = {error, {"Tab copier iteration failed", Why}},
            abort_write(B, {?MODULE, iterate}, [self(), B, Tab], Failure);
        {ok, NewAcc} ->
            NewAcc
    end.

%% Write one batch of records locally; empty batches are skipped.
copy_records(_RecName, _Tab, [], Acc) ->
    Acc;
copy_records(RecName, Tab, Recs, {Count, B}) ->
    Filtered = rec_filter(B, Tab, RecName, Recs),
    {Count + 1, safe_write(B, Filtered)}.

%% Wait for the receiver to ask for batch number Count and send it.
%% An empty batch in the last pass is not sent. Any other message
%% makes the copier exit.
send_records(Pid, Tab, Recs, Pass, {Count, B}) ->
    receive
        {Pid, more, Count} ->
            case {Pass, Recs} of
                {last, []} ->
                    {Count, B};
                _ ->
                    Next = Count + 1,
                    Pid ! {self(), {more, Next, Recs}},
                    {Next, B}
            end;
        Unexpected ->
            exit({send_records_unexpected_msg, Tab, Unexpected})
    end.

%% Receiving side of a remote table copy: request batch Slot from the
%% copier, write what arrives and loop until the copier reports last.
%% Returns the updated #backup_args{}; a crashed copier or a stray
%% message aborts the backup.
tab_receiver(Pid, B, Tab, RecName, Slot) ->
    Pid ! {self(), more, Slot},
    receive
        {Pid, {more, Next, Recs}} ->
            Filtered = rec_filter(B, Tab, RecName, Recs),
            Written = safe_write(B, Filtered),
            tab_receiver(Pid, Written, Tab, RecName, Next);

        {Pid, {last, {ok, _}}} ->
            B;

        {'EXIT', Pid, {error, Why}} ->
            Crash = {error, {"Tab copier crashed", Why}},
            abort_write(B, {?MODULE, remote_tab_sender}, [self(), B, Tab], Crash);
        {'EXIT', Pid, Why} ->
            Crash = {error, {"Tab copier crashed", {'EXIT', Why}}},
            abort_write(B, {?MODULE, remote_tab_sender}, [self(), B, Tab], Crash);
        Unexpected ->
            Stray = {error, {"Tab receiver got unexpected msg", Unexpected}},
            abort_write(B, {?MODULE, remote_tab_sender}, [self(), B, Tab], Stray)
    end.

%% Adjust records before they are written to the backup.
%% Schema records get the backup cookie refreshed when possible; for
%% other tables the record tag is replaced by the table name unless
%% the two are already equal.
rec_filter(B, schema, _RecName, Recs) ->
    case catch mnesia_bup:refresh_cookie(Recs, B#backup_args.cookie) of
        Refreshed when is_list(Refreshed) ->
            Refreshed;
        {error, _Why} ->
            %% No schema table cookie
            Recs
    end;
rec_filter(_B, Tab, Tab, Recs) ->
    Recs;
rec_filter(_B, Tab, _RecName, Recs) ->
    [setelement(1, Rec, Tab) || Rec <- Recs].

%% Dump the ram_copies table Tab to its .DCD file.
ets2dcd(Tab) ->
    ets2dcd(Tab, dcd).

%% Dump the ram_copies table Tab to a dcd or dmp file.
%% The data is written to a temporary log first and renamed into place.
%% Returns ok; a failing step crashes with badmatch.
ets2dcd(Tab, Ftype) ->
    Target =
        case Ftype of
            dcd -> mnesia_lib:tab2dcd(Tab);
            dmp -> mnesia_lib:tab2dmp(Tab)
        end,
    TmpFile = mnesia_lib:tab2tmp(Tab),
    file:delete(TmpFile),
    Log = open_log({Tab, ets2dcd}, dcd_log_header(), TmpFile, false),
    mnesia_lib:db_fixtable(ram_copies, Tab, true),
    ok = ets2dcd(mnesia_lib:db_init_chunk(ram_copies, Tab, 1000), Tab, Log),
    mnesia_lib:db_fixtable(ram_copies, Tab, false),
    close_log(Log),
    ok = file:rename(TmpFile, Target),
    %% The old log data is now part of the new dcd, so remove it.
    %% No one else should be accessing this file!
    file:delete(mnesia_lib:tab2dcl(Tab)),
    ok.

%% Write table chunks to the log until the table is exhausted.
ets2dcd('$end_of_table', _Tab, _Log) ->
    ok;
ets2dcd({Recs, Cont}, Tab, Log) ->
    ok = disk_log:alog_terms(Log, Recs),
    ets2dcd(mnesia_lib:db_chunk(ram_copies, Cont), Tab, Log).

%% Load Tab from disc using the configured auto_repair setting.
dcd2ets(Tab) ->
    dcd2ets(Tab, mnesia_monitor:get_env(auto_repair)).
%% Load Tab into ets from its disc files, with repair setting Rep.
%% If a .DCD file exists it is loaded and the .DCL log is applied on
%% top; the result is what load_dcl/2 returns. Otherwise the table is
%% loaded from an old dets file, which is then converted to .DCD and
%% deleted; that path returns 0.
dcd2ets(Tab, Rep) ->
    DcdFile = mnesia_lib:tab2dcd(Tab),
    case mnesia_lib:exists(DcdFile) of
        false ->
            %% Old dets files, and conversion from disc_only to disc
            DatFile = mnesia_lib:tab2dat(Tab),
            Type = val({Tab, setorbag}),
            case mnesia_lib:dets_to_ets(Tab, Tab, DatFile, Type, Rep, yes) of
                {error, Why} ->
                    erlang:error({"Failed to load table from disc", [Tab, Why]});
                loaded ->
                    ets2dcd(Tab),
                    file:delete(DatFile),
                    0
            end;
        true ->
            Log = open_log({Tab, dcd2ets}, dcd_log_header(), DcdFile,
                           true, Rep, read_only),
            FirstChunk = chunk_log(Log, start),
            ok = insert_dcdchunk(FirstChunk, Log, Tab),
            close_log(Log),
            load_dcl(Tab, Rep)
    end.

%% Insert every chunk of a .DCD log into the ets table Tab.
%% A leading dcd log header (version "1.0" or later) is skipped.
insert_dcdchunk({Cont, [LogH | Rest]}, Log, Tab)
  when is_record(LogH, log_header),
       LogH#log_header.log_kind == dcd_log,
       LogH#log_header.log_version >= "1.0" ->
    insert_dcdchunk({Cont, Rest}, Log, Tab);

insert_dcdchunk({Cont, Recs}, Log, Tab) ->
    true = ets:insert(Tab, Recs),
    NextChunk = chunk_log(Log, Cont),
    insert_dcdchunk(NextChunk, Log, Tab);
insert_dcdchunk(eof, _Log, _Tab) ->
    ok.

%% Apply the .DCL log of Tab, if there is one.
%% Returns the count accumulated by add_recs/2, or 0 without a file.
load_dcl(Tab, Rep) ->
    DclFile = mnesia_lib:tab2dcl(Tab),
    case mnesia_lib:exists(DclFile) of
        false ->
            0;
        true ->
            LogName = {load_dcl,Tab},
            open_log(LogName, dcl_log_header(), DclFile, true, Rep, read_only),
            First = chunk_log(LogName, start),
            Count = insert_logchunk(First, LogName, 0),
            close_log(LogName),
            Count
    end.

%% Apply all chunks of the log named by the second argument.
%% The third argument is passed to add_recs/2 as the starting count and
%% its result is added on top.
insert_logchunk({Cont, Recs}, Log, Count) ->
    Added = add_recs(Recs, Count),
    insert_logchunk(chunk_log(Log, Cont), Log, Count + Added);
insert_logchunk(eof, _Log, Count) ->
    Count.
%% Apply a list of logged operations to their ets tables.
%% N is the running count: write, delete, delete_object and
%% update_counter entries add one each, a dcl log header (version "1.0"
%% or later) adds nothing, and clear_table adds the number of objects
%% that were removed from the table.
%% Returns the final count.
add_recs([{{Tab, _Key}, Val, write} | Rest], N) ->
    true = ets:insert(Tab, Val),
    add_recs(Rest, N+1);
add_recs([{{Tab, Key}, _Val, delete} | Rest], N) ->
    true = ets:delete(Tab, Key),
    add_recs(Rest, N+1);
add_recs([{{Tab, _Key}, Val, delete_object} | Rest], N) ->
    true = ets:match_delete(Tab, Val),
    add_recs(Rest, N+1);
add_recs([{{Tab, Key}, Val, update_counter} | Rest], N) ->
    {RecName, Incr} = Val,
    case catch ets:update_counter(Tab, Key, Incr) of
        CounterVal when is_integer(CounterVal) ->
            ok;
        %% The counter could not be updated: insert one instead.
        %% A negative increment starts it at 0.
        _ when Incr < 0 ->
            Zero = {RecName, Key, 0},
            true = ets:insert(Tab, Zero);
        _ ->
            Zero = {RecName, Key, Incr},
            true = ets:insert(Tab, Zero)
    end,
    add_recs(Rest, N+1);
add_recs([LogH|Rest], N)
  when is_record(LogH, log_header),
       LogH#log_header.log_kind == dcl_log,
       LogH#log_header.log_version >= "1.0" ->
    add_recs(Rest, N);
add_recs([{{Tab, _Key}, _Val, clear_table} | Rest], N) ->
    %% The size has to be read before the table is emptied; reading it
    %% afterwards always gives 0.
    Size = ets:info(Tab, size),
    true = ets:match_delete(Tab, '_'),
    add_recs(Rest, N+Size);
add_recs([], N) ->
    N.
diff --git a/lib/mnesia/src/mnesia_monitor.erl b/lib/mnesia/src/mnesia_monitor.erl
new file mode 100644
index 0000000000..05ae943e3b
--- /dev/null
+++ b/lib/mnesia/src/mnesia_monitor.erl
@@ -0,0 +1,823 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1996-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
-module(mnesia_monitor).

-behaviour(gen_server).
%% Public exports
-export([
         close_dets/1,
         close_log/1,
         detect_inconcistency/2,
         get_env/1,
         init/0,
         mktab/2,
         unsafe_mktab/2,
         mnesia_down/2,
         needs_protocol_conversion/1,
         negotiate_protocol/1,
         disconnect/1,
         open_dets/2,
         unsafe_open_dets/2,
         open_log/1,
         patch_env/2,
         protocol_version/0,
         reopen_log/3,
         set_env/2,
         start/0,
         start_proc/4,
         terminate_proc/3,
         unsafe_close_dets/1,
         unsafe_close_log/1,
         use_dir/0,
         do_check_type/2
        ]).

%% gen_server callbacks
-export([
         init/1,
         handle_call/3,
         handle_cast/2,
         handle_info/2,
         terminate/2,
         code_change/3
        ]).

%% Internal exports
-export([
         call/1,
         cast/1,
         detect_partitioned_network/2,
         has_remote_mnesia_down/1,
         negotiate_protocol_impl/2
        ]).

-import(mnesia_lib, [dbg_out/2, verbose/2, error/2, fatal/2, set/2]).

-include("mnesia.hrl").

%% Server state.
-record(state, {supervisor, pending_negotiators = [],
                going_down = [], tm_started = false, early_connects = [],
                connecting, mq = []}).

-define(current_protocol_version, {7,6}).

-define(previous_protocol_version, {7,5}).

%% Start the monitor as a locally registered gen_server linked to the
%% caller, which is passed on to init/1.
start() ->
    StartOpts = [{timeout, infinity}],
    gen_server:start_link({local, ?MODULE}, ?MODULE, [self()], StartOpts).

%% Send the init request to the running monitor.
init() ->
    call(init).

%% Asynchronously tell the monitor that From has handled the fact that
%% Mnesia went down on Node.
mnesia_down(From, Node) ->
    cast({mnesia_down, From, Node}).

%% The functions below forward one request each to the monitor process.
%% For the unsafe_ variants the server returns an error (or ignores the
%% failure) where the plain variant reports a fatal error.

%% Create an ets table.
mktab(Tab, Args) ->
    unsafe_call({mktab, Tab, Args}).
unsafe_mktab(Tab, Args) ->
    unsafe_call({unsafe_mktab, Tab, Args}).

%% Open a dets table.
open_dets(Tab, Args) ->
    unsafe_call({open_dets, Tab, Args}).
unsafe_open_dets(Tab, Args) ->
    unsafe_call({unsafe_open_dets, Tab, Args}).

%% Close a dets table.
close_dets(Tab) ->
    unsafe_call({close_dets, Tab}).

unsafe_close_dets(Name) ->
    unsafe_call({unsafe_close_dets, Name}).

%% Open a disk log with the given disk_log arguments.
open_log(Args) ->
    unsafe_call({open_log, Args}).

%% Reopen the disk log Name, passing Fname and Head to disk_log:reopen/3.
reopen_log(Name, Fname, Head) ->
    unsafe_call({reopen_log, Name, Fname, Head}).

%% Close a disk log.
close_log(Name) ->
    unsafe_call({close_log, Name}).

unsafe_close_log(Name) ->
    unsafe_call({unsafe_close_log, Name}).
%% Asynchronously ask the monitor to unlink from the monitor on Node.
disconnect(Node) ->
    cast({disconnect, Node}).

%% Returns GoodNodes.
%% Creates a link to each compatible monitor and sets
%% protocol_version to the agreed version upon success.

negotiate_protocol([]) -> [];
negotiate_protocol(Nodes) ->
    call({negotiate_protocol, Nodes}).

%% Body of the negotiation process spawned by the monitor: ask the
%% monitors on Nodes, evaluate their replies and send the outcome back
%% to the local monitor as {protocol_negotiated, Requester, Result}.
negotiate_protocol_impl(Nodes, Requester) ->
    Version = mnesia:system_info(version),
    Protocols = acceptable_protocol_versions(),
    MonitorPid = whereis(?MODULE),
    Request = {negotiate_protocol, MonitorPid, Version, Protocols},
    {Replies, _BadNodes} = multicall(Nodes, Request),
    Result = check_protocol(Replies, Protocols),
    ?MODULE ! {protocol_negotiated,Requester,Result},
    unlink(whereis(?MODULE)),
    ok.

%% Walk through the replies and return the nodes that accepted one of
%% our protocols. For each such node {protocol, Node} is set to
%% {Protocol, NeedsConversion}, where NeedsConversion is true when the
%% agreed protocol differs from the one this node currently runs.
%% When all replies are handled, protocol_version is set to the first
%% acceptable protocol.
check_protocol([{Node, {accept, Mon, Version, Protocol}} | Tail], Protocols) ->
    case lists:member(Protocol, Protocols) of
        false ->
            verbose("Failed to connect with ~p. ~p protocols rejected. "
                    "expected version = ~p, expected protocol = ~p~n",
                    [Node, Protocols, Version, Protocol]),
            unlink(Mon), % The link is of no use
            check_protocol(Tail, Protocols);
        true ->
            NeedsConversion = Protocol /= protocol_version(),
            set({protocol, Node}, {Protocol, NeedsConversion}),
            [node(Mon) | check_protocol(Tail, Protocols)]
    end;
check_protocol([{Node, {reject, _Mon, Version, Protocol}} | Tail], Protocols) ->
    verbose("Failed to connect with ~p. ~p protocols rejected. "
            "expected version = ~p, expected protocol = ~p~n",
            [Node, Protocols, Version, Protocol]),
    check_protocol(Tail, Protocols);
check_protocol([{error, Why} | Tail], Protocols) ->
    dbg_out("~p connect failed error: ~p~n", [?MODULE, Why]),
    check_protocol(Tail, Protocols);
check_protocol([{badrpc, Why} | Tail], Protocols) ->
    dbg_out("~p connect failed badrpc: ~p~n", [?MODULE, Why]),
    check_protocol(Tail, Protocols);
check_protocol([], [Preferred | _Others]) ->
    set(protocol_version, Preferred),
    [].
%% The protocol version in use: the stored protocol_version value, or
%% the current default when none has been stored.
protocol_version() ->
    case ?catch_val(protocol_version) of
        {'EXIT', _} -> ?current_protocol_version;
        Stored -> Stored
    end.

%% A list of acceptable protocols, the preferred one first.
acceptable_protocol_versions() ->
    [protocol_version(), ?previous_protocol_version].

%% Tells whether messages exchanged with Node need protocol conversion.
%% false when nothing is stored for the node. Otherwise the stored flag
%% is used as is while we run the current protocol version, and negated
%% when we run another one.
needs_protocol_conversion(Node) ->
    case {?catch_val({protocol, Node}), protocol_version()} of
        {{'EXIT', _}, _} ->
            false;
        {{_, Flag}, ?current_protocol_version} ->
            Flag;
        {{_, Flag}, _} ->
            not Flag
    end.

%% Cast to the monitor; returns ignore when it is not running.
cast(Msg) ->
    case whereis(?MODULE) of
        undefined ->
            ignore;
        Monitor ->
            gen_server:cast(Monitor, Msg)
    end.

%% Call the monitor without linking to it.
%% Returns {error, {node_not_running, Node}} when it is not registered.
unsafe_call(Msg) ->
    case whereis(?MODULE) of
        undefined ->
            {error, {node_not_running, node()}};
        Monitor ->
            gen_server:call(Monitor, Msg, infinity)
    end.

%% Call the monitor while linked to it.
%% Returns the reply, or {error, {node_not_running, Node}} when the
%% monitor is not registered or an exit message from it is found in the
%% mailbox after the call.
call(Msg) ->
    case whereis(?MODULE) of
        undefined ->
            {error, {node_not_running, node()}};
        Monitor ->
            link(Monitor),
            Reply = gen_server:call(Monitor, Msg, infinity),
            unlink(Monitor),

            %% An exit signal arrives if the server dies
            receive
                {'EXIT', Monitor, _Why} ->
                    {error, {node_not_running, node()}}
            after 0 ->
                    Reply
            end
    end.

%% Run call(Msg) on every node in Nodes.
multicall(Nodes, Msg) ->
    rpc:multicall(Nodes, ?MODULE, call, [Msg]).

%% Start a linked process running mnesia_sp:init_proc/4 and wait
%% without a time limit for it to acknowledge.
start_proc(Who, Mod, Fun, Args) ->
    InitArgs = [Who, Mod, Fun, Args],
    proc_lib:start_link(mnesia_sp, init_proc, InitArgs, infinity).

%% Common terminate handling: reasons other than shutdown and killed
%% are reported as fatal, the rest are just logged.
terminate_proc(Who, R, State) when R /= shutdown, R /= killed ->
    fatal("~p crashed: ~p state: ~p~n", [Who, R, State]);

terminate_proc(Who, Reason, _State) ->
    mnesia_lib:verbose("~p terminated: ~p~n", [Who, Reason]),
    ok.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Callback functions from gen_server

%%----------------------------------------------------------------------
%% Func: init/1
%% Purpose: Create the global tables, process the configuration and
%%          initialise status values and counters
%% Returns: {ok, State}          |
%%          {stop, {bad_config, Reason}} when the configuration is bad
%%----------------------------------------------------------------------
init([Parent]) ->
    process_flag(trap_exit, true),
    ?ets_new_table(mnesia_gvar, [set, public, named_table]),
    ?ets_new_table(mnesia_stats, [set, public, named_table]),
    set(subscribers, []),
    mnesia_lib:verbose("~p starting: ~p~n", [?MODULE, self()]),
    Version = mnesia:system_info(version),
    set(version, Version),
    dbg_out("Version: ~p~n", [Version]),

    case catch process_config_args(env()) of
        {'EXIT', Why} ->
            mnesia_lib:report_fatal("Bad configuration: ~p~n", [Why]),
            {stop, {bad_config, Why}};
        ok ->
            mnesia_lib:set({'$$$_report', current_pos}, 0),
            DebugLevel = mnesia_lib:val(debug),
            mnesia_lib:verbose("Mnesia debug level set to ~p\n", [DebugLevel]),
            set(mnesia_status, starting), %% set start status
            set({current, db_nodes}, [node()]),
            set(use_dir, use_dir()),
            lists:foreach(fun mnesia_lib:create_counter/1,
                          [trans_aborts, trans_commits, trans_log_writes]),
            Threshold = get_env(dump_log_write_threshold),
            mnesia_lib:set_counter(trans_log_writes_left, Threshold),
            lists:foreach(fun mnesia_lib:create_counter/1,
                          [trans_log_writes_prev, trans_restarts, trans_failures]),
            set(checkpoints, []),
            set(pending_checkpoints, []),
            set(pending_checkpoint_pids, []),

            {ok, #state{supervisor = Parent}}
    end.

%% Tells whether Mnesia uses a directory on disc.
%% The stored use_dir value wins; without one the answer follows the
%% schema_location setting: disc -> true, ram -> false and
%% opt_disc -> whatever non_empty_dir() says.
use_dir() ->
    case ?catch_val(use_dir) of
        {'EXIT', _} ->
            case get_env(schema_location) of
                ram -> false;
                disc -> true;
                opt_disc -> non_empty_dir()
            end;
        Stored ->
            Stored
    end.
%% Returns true if the Mnesia directory contains important files:
%% a fallback backup, a schema dump or a schema dets file.
non_empty_dir() ->
    mnesia_lib:exists(mnesia_bup:fallback_bup()) or
        mnesia_lib:exists(mnesia_lib:tab2dmp(schema)) or
        mnesia_lib:exists(mnesia_lib:tab2dat(schema)).

%%----------------------------------------------------------------------
%% Func: handle_call/3
%% Returns: {reply, Reply, State}          |
%%          {reply, Reply, State, Timeout} |
%%          {noreply, State}               |
%%          {noreply, State, Timeout}      |
%%          {stop, Reason, Reply, State}     (terminate/2 is called)
%%
%% The {noreply, State} after a call to fatal/2 presumably never gets
%% evaluated; it is kept as in the original code.
%%----------------------------------------------------------------------

%% Create an ets table; failure is fatal.
handle_call({mktab, Tab, Args}, _From, State) ->
    case catch ?ets_new_table(Tab, Args) of
        {'EXIT', ExitReason} ->
            Text = "Cannot create ets table",
            Failure = {system_limit, Text, Tab, Args, ExitReason},
            fatal("~p~n", [Failure]),
            {noreply, State};
        Reply ->
            {reply, Reply, State}
    end;

%% Create an ets table; failure is returned to the caller.
handle_call({unsafe_mktab, Tab, Args}, _From, State) ->
    case catch ?ets_new_table(Tab, Args) of
        {'EXIT', ExitReason} ->
            {reply, {error, ExitReason}, State};
        Reply ->
            {reply, Reply, State}
    end;


%% Open a dets table; failure is fatal.
handle_call({open_dets, Tab, Args}, _From, State) ->
    case mnesia_lib:dets_sync_open(Tab, Args) of
        {ok, Tab} ->
            {reply, {ok, Tab}, State};

        {error, Why} ->
            Text = "Cannot open dets table",
            Failure = {error, {Text, Tab, Args, Why}},
            fatal("~p~n", [Failure]),
            {noreply, State}
    end;

%% Open a dets table; failure is returned to the caller.
handle_call({unsafe_open_dets, Tab, Args}, _From, State) ->
    case mnesia_lib:dets_sync_open(Tab, Args) of
        {ok, Tab} ->
            {reply, {ok, Tab}, State};
        {error, Why} ->
            {reply, {error,Why}, State}
    end;

%% Close a dets table; anything but ok crashes the server.
handle_call({close_dets, Tab}, _From, State) ->
    ok = mnesia_lib:dets_sync_close(Tab),
    {reply, ok, State};

%% Close a dets table and ignore the outcome.
handle_call({unsafe_close_dets, Tab}, _From, State) ->
    mnesia_lib:dets_sync_close(Tab),
    {reply, ok, State};

%% Open a disk log with notification turned on, which makes this
%% process the owner that receives the disk_log messages.
handle_call({open_log, Args}, _From, State) ->
    Result = disk_log:open([{notify, true}|Args]),
    {reply, Result, State};

%% Reopen a disk log; failure is fatal.
handle_call({reopen_log, Name, Fname, Head}, _From, State) ->
    case disk_log:reopen(Name, Fname, Head) of
        ok ->
            {reply, ok, State};

        {error, Why} ->
            Text = "Cannot rename disk_log file",
            Failure = {error, {Text, Name, Fname, Head, Why}},
            fatal("~p~n", [Failure]),
            {noreply, State}
    end;

%% Close a disk log; failure is fatal.
handle_call({close_log, Name}, _From, State) ->
    case disk_log:close(Name) of
        ok ->
            {reply, ok, State};

        {error, Why} ->
            Text = "Cannot close disk_log file",
            Failure = {error, {Text, Name, Why}},
            fatal("~p~n", [Failure]),
            {noreply, State}
    end;

%% Close a disk log and ignore the outcome.
handle_call({unsafe_close_log, Name}, _From, State) ->
    disk_log:close(Name),
    {reply, ok, State};

%% A negotiation request that arrives before the init call has been
%% handled is rejected, and the node is remembered as an early connect.
handle_call({negotiate_protocol, Mon, _Version, _Protocols}, _From, State)
  when State#state.tm_started == false ->
    Early = [node(Mon) | State#state.early_connects],
    State2 = State#state{early_connects = Early},
    {reply, {node(), {reject, self(), uninitialized, uninitialized}}, State2};

%% From remote monitor..
handle_call({negotiate_protocol, Mon, Version, Protocols}, From, State)
  when node(Mon) /= node() ->
    Protocol = protocol_version(),
    MyVersion = mnesia:system_info(version),
    case lists:member(Protocol, Protocols) of
        true ->
            accept_protocol(Mon, MyVersion, Protocol, From, State);
        false ->
            %% This release is also able to speak the previous
            %% protocol, if that is what the other side prefers
            case hd(Protocols) of
                ?previous_protocol_version ->
                    accept_protocol(Mon, MyVersion, ?previous_protocol_version, From, State);
                _ ->
                    verbose("Connection with ~p rejected. "
                            "version = ~p, protocols = ~p, "
                            "expected version = ~p, expected protocol = ~p~n",
                            [node(Mon), Version, Protocols, MyVersion, Protocol]),
                    {reply, {node(), {reject, self(), MyVersion, Protocol}}, State}
            end
    end;

%% Local request to negotiate with other monitors (nodes).
%% The work is done by a linked helper process; the reply is sent when
%% its protocol_negotiated message arrives.
handle_call({negotiate_protocol, Nodes}, From, State) ->
    case mnesia_lib:intersect(State#state.going_down, Nodes) of
        [] ->
            spawn_link(?MODULE, negotiate_protocol_impl, [Nodes, From]),
            {noreply, State#state{connecting={From,Nodes}}};
        _ -> %% Cannot connect now, still processing mnesia down
            {reply, busy, State}
    end;

%% Start watching node up/down events, mark the server as started and
%% return the nodes that tried to connect too early.
handle_call(init, _From, State) ->
    net_kernel:monitor_nodes(true),
    EarlyNodes = State#state.early_connects,
    {reply, EarlyNodes, State#state{tm_started = true}};

%% Unknown requests are reported and left unanswered.
handle_call(Msg, _From, State) ->
    error("~p got unexpected call: ~p~n", [?MODULE, Msg]),
    {noreply, State}.

%% Accept a remote monitor's negotiation request.
%% If the node is still being taken down the reply is parked in
%% pending_negotiators; otherwise the monitor is linked, the agreed
%% protocol is recorded and the reply is sent at once.
accept_protocol(Mon, Version, Protocol, From, State) ->
    Reply = {node(), {accept, self(), Version, Protocol}},
    Node = node(Mon),
    Pending = lists:keydelete(Node, 1, State#state.pending_negotiators),
    case lists:member(Node, State#state.going_down) of
        false ->
            %% No need to wait
            link(Mon),  %% link to remote Monitor
            NeedsConversion = Protocol /= protocol_version(),
            set({protocol, Node}, {Protocol, NeedsConversion}),
            {reply, Reply, State#state{pending_negotiators = Pending}};
        true ->
            %% Reply once the mnesia_down has been processed
            Parked = Pending ++ [{Node, Mon, From, Reply}],
            {noreply, State#state{pending_negotiators = Parked}}
    end.
%%----------------------------------------------------------------------
%% Func: handle_cast/2
%% Returns: {noreply, State}          |
%%          {noreply, State, Timeout} |
%%          {stop, Reason, State}            (terminate/2 is called)
%%
%% The mnesia_down casts form a chain: the controller's notification is
%% passed to mnesia_tm, the one from mnesia_tm to mnesia_locker, and the
%% locker's notification completes the handling of the node.
%%----------------------------------------------------------------------

handle_cast({mnesia_down, mnesia_controller, Node}, State) ->
    mnesia_tm:mnesia_down(Node),
    {noreply, State};

handle_cast({mnesia_down, mnesia_tm, {Node, Pending}}, State) ->
    mnesia_locker:mnesia_down(Node, Pending),
    {noreply, State};

%% Last step: report the event, take the node off the going_down list
%% and answer a remote monitor that was waiting for this to finish.
handle_cast({mnesia_down, mnesia_locker, Node}, State) ->
    mnesia_lib:report_system_event({mnesia_down, Node}),
    StillGoingDown = lists:delete(Node, State#state.going_down),
    State2 = State#state{going_down = StillGoingDown},
    Waiting = State#state.pending_negotiators,
    case lists:keysearch(Node, 1, Waiting) of
        false ->
            %% No pending remote monitors
            {noreply, State2};
        {value, {Node, Mon, ReplyTo, Reply}} ->
            %% Late reply to remote monitor
            link(Mon),  %% link to remote Monitor
            gen_server:reply(ReplyTo, Reply),
            Remaining = lists:keydelete(Node, 1,Waiting),
            process_q(State2#state{pending_negotiators = Remaining})
    end;

%% Remove the link to the monitor on Node, if there is one to be found.
handle_cast({disconnect, Node}, State) ->
    case rpc:call(Node, erlang, whereis, [?MODULE]) of
        RemoteMon when is_pid(RemoteMon) ->
            unlink(RemoteMon);
        undefined ->
            ignore;
        {badrpc, _} ->
            ignore
    end,
    {noreply, State};

%% Forward an inconsistency report as a system event.
handle_cast({inconsistent_database, Context, Node}, State) ->
    mnesia_lib:report_system_event({inconsistent_database, Context, Node}),
    {noreply, State};

%% Anything else is reported and dropped.
handle_cast(Msg, State) ->
    error("~p got unexpected cast: ~p~n", [?MODULE, Msg]),
    {noreply, State}.
%%----------------------------------------------------------------------
%% Func: handle_info/2
%% Returns: {noreply, State}          |
%%          {noreply, State, Timeout} |
%%          {stop, Reason, State}            (terminate/2 is called)
%%----------------------------------------------------------------------

%% The supervisor went away: stop with the same reason.
handle_info({'EXIT', Pid, R}, State) when Pid == State#state.supervisor ->
    dbg_out("~p was ~p by supervisor~n",[?MODULE, R]),
    {stop, R, State};

%% A local linked process exited with reason fatal: shut down the
%% supervisor.
handle_info({'EXIT', Pid, fatal}, State) when node(Pid) == node() ->
    dbg_out("~p got FATAL ERROR from: ~p~n",[?MODULE, Pid]),
    exit(State#state.supervisor, shutdown),
    {noreply, State};

handle_info(Msg = {'EXIT',Pid,_}, State) ->
    Node = node(Pid),
    if
        Node /= node(), State#state.connecting == undefined ->
            %% A remotely linked process died; assume that it was a
            %% mnesia_monitor and start the mnesia_down handling
            mnesia_recover:mnesia_down(Node),
            mnesia_controller:mnesia_down(Node),
            {noreply, State#state{going_down = [Node | State#state.going_down]}};
        Node /= node() ->
            %% A negotiation is in progress: queue the message until it
            %% has finished (see process_q/1)
            {noreply, State#state{mq = State#state.mq ++ [{info, Msg}]}};
        true ->
            %% We have probably got an exit signal from
            %% disk_log or dets
            Hint = "Hint: check that the disk still is writable",
            fatal("~p got unexpected info: ~p; ~p~n",
                  [?MODULE, Msg, Hint])
    end;

%% The negotiation helper is done: answer the waiting caller and handle
%% whatever was queued meanwhile. Crashes with badmatch if From is not
%% the caller stored in the connecting field.
handle_info({protocol_negotiated, From,Res}, State) ->
    From = element(1,State#state.connecting),
    gen_server:reply(From, Res),
    process_q(State#state{connecting = undefined});

handle_info({nodeup, Node}, State) ->
    %% Ok, we are connected to yet another Erlang node
    %% Let's check if Mnesia is running there in order
    %% to detect if the network has been partitioned
    %% due to communication failure.

    HasDown   = mnesia_recover:has_mnesia_down(Node),
    ImRunning = mnesia_lib:is_running(),

    if
        %% If I'm not running the test will be made later.
        HasDown == true, ImRunning == yes ->
            spawn_link(?MODULE, detect_partitioned_network, [self(), Node]);
        true ->
            ignore
    end,
    {noreply, State};

handle_info({nodedown, _Node}, State) ->
    %% Ignore, we only care about nodeup's
    {noreply, State};

%% Notifications from disk_log: truncations are expected, everything
%% else is reported as a warning.
handle_info({disk_log, _Node, Log, Info}, State) ->
    case Info of
        {truncated, _No} ->
            ok;
        _ ->
            mnesia_lib:important("Warning Log file ~p error reason ~s~n",
                                 [Log, disk_log:format_error(Info)])
    end,
    {noreply, State};

%% Anything else is reported and dropped. The explicit {noreply, State}
%% matters: gen_server treats any other return value as a bad return
%% and stops the server, and the catch-all clauses of handle_call/3 and
%% handle_cast/2 already end this way.
handle_info(Msg, State) ->
    error("~p got unexpected info (~p): ~p~n", [?MODULE, State, Msg]),
    {noreply, State}.

%% Handle the oldest message that was queued in the mq field, if any.
%% Returns what the corresponding handler returns.
process_q(State = #state{mq=[]}) -> {noreply,State};
process_q(State = #state{mq=[{info,Msg}|R]}) ->
    handle_info(Msg, State#state{mq=R});
process_q(State = #state{mq=[{cast,Msg}|R]}) ->
    handle_cast(Msg, State#state{mq=R});
process_q(State = #state{mq=[{call,From,Msg}|R]}) ->
    handle_call(Msg, From, State#state{mq=R}).

%%----------------------------------------------------------------------
%% Func: terminate/2
%% Purpose: Shutdown the server
%% Returns: any (ignored by gen_server)
%%----------------------------------------------------------------------
terminate(Reason, State) ->
    terminate_proc(?MODULE, Reason, State).

%%----------------------------------------------------------------------
%% Func: code_change/3
%% Purpose: Upgrade process when its code is to be changed
%% Returns: {ok, NewState}
%%----------------------------------------------------------------------


%% A state tuple with five fields is converted to the current record;
%% the fields it lacks get their default values.
code_change(_, {state, SUP, PN, GD, TMS, EC}, _) ->
    {ok, #state{supervisor=SUP, pending_negotiators=PN,
                going_down = GD, tm_started =TMS, early_connects = EC}};

code_change(_OldVsn, State, _Extra) ->
    {ok, State}.
%%%----------------------------------------------------------------------
%%% Internal functions
%%%----------------------------------------------------------------------

%% Resolves every configuration parameter in the list with get_env/1
%% and stores the result with mnesia_lib:set/2.
process_config_args([]) ->
    ok;
process_config_args([Key | Keys]) ->
    Value = get_env(Key),
    dbg_out("Env ~p: ~p~n", [Key, Value]),
    mnesia_lib:set(Key, Value),
    process_config_args(Keys).

%% Type checks Val and stores it under E.
set_env(E,Val) ->
    Checked = check_type(E, Val),
    mnesia_lib:set(E, Checked),
    ok.

%% Returns the value of parameter E. An already stored value wins;
%% otherwise the application environment of mnesia is consulted and,
%% as a last resort, default_env/1. Values from the two latter sources
%% are type checked.
get_env(E) ->
    case ?catch_val(E) of
        {'EXIT', _} ->
            Raw = case application:get_env(mnesia, E) of
                      {ok, Configured} -> Configured;
                      undefined        -> default_env(E)
                  end,
            check_type(E, Raw);
        Stored ->
            Stored
    end.

%% The configuration parameters known to this module.
env() ->
    [access_module,
     auto_repair,
     backup_module,
     debug,
     dir,
     dump_log_load_regulation,
     dump_log_time_threshold,
     dump_log_update_in_place,
     dump_log_write_threshold,
     embedded_mnemosyne,
     event_module,
     extra_db_nodes,
     ignore_fallback_at_startup,
     fallback_error_function,
     max_wait_for_decision,
     schema_location,
     core_dir,
     pid_sort_order,
     no_table_loaders,
     dc_dump_limit].

%% Default value of each configuration parameter.
default_env(access_module)              -> mnesia;
default_env(auto_repair)                -> true;
default_env(backup_module)              -> mnesia_backup;
default_env(debug)                      -> none;
default_env(dir) ->
    %% "Mnesia.<node name>", made absolute
    filename:absname(lists:concat(["Mnesia.", node()]));
default_env(dump_log_load_regulation)   -> false;
default_env(dump_log_time_threshold)    -> timer:minutes(3);
default_env(dump_log_update_in_place)   -> true;
default_env(dump_log_write_threshold)   -> 1000;
default_env(embedded_mnemosyne)         -> false;
default_env(event_module)               -> mnesia_event;
default_env(extra_db_nodes)             -> [];
default_env(ignore_fallback_at_startup) -> false;
default_env(fallback_error_function)    -> {mnesia, lkill};
default_env(max_wait_for_decision)      -> infinity;
default_env(schema_location)            -> opt_disc;
default_env(core_dir)                   -> false;
default_env(pid_sort_order)             -> false;
default_env(no_table_loaders)           -> 2;
default_env(dc_dump_limit)              -> 4.
%% Validates (and normalizes) the value of a configuration parameter.
%% Exits with {bad_config, Env, Val} when the value is not accepted.
check_type(Env, Val) ->
    case catch do_check_type(Env, Val) of
        {'EXIT', _Reason} ->
            exit({bad_config, Env, Val});
        Checked ->
            Checked
    end.

%% One clause (or group of clauses) per parameter. A value that no
%% clause accepts makes the call crash, which the callers catch.
do_check_type(access_module, Mod) when is_atom(Mod) ->
    Mod;
do_check_type(auto_repair, Flag) ->
    bool(Flag);
do_check_type(backup_module, Mod) when is_atom(Mod) ->
    Mod;
%% debug: true/false are aliases for debug/none
do_check_type(debug, true) ->
    debug;
do_check_type(debug, false) ->
    none;
do_check_type(debug, Level) when Level =:= debug;
                                 Level =:= none;
                                 Level =:= trace;
                                 Level =:= verbose ->
    Level;
do_check_type(dir, Dir) ->
    filename:absname(Dir);
do_check_type(dump_log_load_regulation, Flag) ->
    bool(Flag);
do_check_type(dump_log_time_threshold, Int) when is_integer(Int), Int > 0 ->
    Int;
do_check_type(dump_log_update_in_place, Flag) ->
    bool(Flag);
do_check_type(dump_log_write_threshold, Int) when is_integer(Int), Int > 0 ->
    Int;
do_check_type(event_module, Mod) when is_atom(Mod) ->
    Mod;
do_check_type(ignore_fallback_at_startup, Flag) ->
    bool(Flag);
do_check_type(fallback_error_function, {Mod, Func})
  when is_atom(Mod), is_atom(Func) ->
    {Mod, Func};
do_check_type(embedded_mnemosyne, Flag) ->
    bool(Flag);
do_check_type(extra_db_nodes, Nodes) when is_list(Nodes) ->
    %% Drop the local node; any element that is not an atom makes
    %% the fun crash and thereby rejects the whole value
    KeepRemote = fun(N) when N == node() -> false;
                    (N) when is_atom(N)  -> true
                 end,
    lists:filter(KeepRemote, Nodes);
do_check_type(max_wait_for_decision, infinity) ->
    infinity;
do_check_type(max_wait_for_decision, Int) when is_integer(Int), Int > 0 ->
    Int;
do_check_type(schema_location, Media) ->
    media(Media);
do_check_type(core_dir, "false") ->
    false;
do_check_type(core_dir, false) ->
    false;
do_check_type(core_dir, Dir) when is_list(Dir) ->
    Dir;
%% pid_sort_order: atom or string form; anything else means false
do_check_type(pid_sort_order, r9b_plain) ->
    r9b_plain;
do_check_type(pid_sort_order, "r9b_plain") ->
    r9b_plain;
do_check_type(pid_sort_order, standard) ->
    standard;
do_check_type(pid_sort_order, "standard") ->
    standard;
do_check_type(pid_sort_order, _) ->
    false;
do_check_type(no_table_loaders, Int) when is_integer(Int), Int > 0 ->
    Int;
do_check_type(dc_dump_limit, Num) when is_number(Num), Num > 0 ->
    Num.
%% Accepts exactly the two boolean atoms.
bool(true)  -> true;
bool(false) -> false.

%% Accepts exactly the three schema location atoms.
media(disc)     -> disc;
media(opt_disc) -> opt_disc;
media(ram)      -> ram.

%% Type checks Val and, when accepted, stores the checked value in the
%% application environment of mnesia. Returns the checked value or
%% {error, {bad_type, Env, Val}}.
patch_env(Env, Val) ->
    case catch do_check_type(Env, Val) of
        {'EXIT', _Reason} ->
            {error, {bad_type, Env, Val}};
        Checked ->
            application_controller:set_env(mnesia, Env, Checked),
            Checked
    end.

%% Body of the process spawned when a nodeup arrives: run the check
%% for Node, unlink from the monitor and terminate normally.
detect_partitioned_network(Mon, Node) ->
    detect_inconcistency([Node], running_partitioned_network),
    unlink(Mon),
    exit(normal).

%% Asks every node in Nodes that we have noted as down whether it has
%% noted us as down as well, and reports each such node.
detect_inconcistency([], _Context) ->
    ok;
detect_inconcistency(Nodes, Context) ->
    Suspects = [N || N <- Nodes, mnesia_recover:has_mnesia_down(N)],
    {Answers, _BadNodes} =
        rpc:multicall(Suspects, ?MODULE, has_remote_mnesia_down, [node()]),
    report_inconsistency(Answers, Context, ok).

%% Evaluated on the remote side: {true, node()} when this node has
%% noted Node as down and no master nodes are set for the schema,
%% {false, node()} otherwise.
has_remote_mnesia_down(Node) ->
    HasDown = mnesia_recover:has_mnesia_down(Node),
    Master  = mnesia_recover:get_master_nodes(schema),
    Inconsistent = (HasDown == true) andalso (Master == []),
    {Inconsistent, node()}.

%% Walks through the replies. For each {true, Node} an
%% inconsistent_database system event is generated: Mnesia is already
%% running on the other node AND both nodes regard each other as down.
%% The database is potentially inconsistent and the applications have
%% to be told about it, so that they may perform some clever recovery
%% action. Returns inconsistent_database if the last {true, _} reply
%% set it, otherwise the incoming Status.
report_inconsistency(Replies, Context, Status) ->
    Note = fun({true, Node}, _Acc) ->
                   Event = {inconsistent_database, Context, Node},
                   mnesia_lib:report_system_event(Event),
                   inconsistent_database;
              ({false, _Node}, Acc) ->
                   Acc;
              ({badrpc, _Reason}, Acc) ->
                   Acc
           end,
    lists:foldl(Note, Status, Replies).
diff --git a/lib/mnesia/src/mnesia_recover.erl b/lib/mnesia/src/mnesia_recover.erl new file mode 100644 index 0000000000..6c53c2e752 --- /dev/null +++ b/lib/mnesia/src/mnesia_recover.erl @@ -0,0 +1,1196 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 1997-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% +-module(mnesia_recover). + +-behaviour(gen_server). + +-export([ + allow_garb/0, + call/1, + connect_nodes/1, + disconnect/1, + dump_decision_tab/0, + get_master_node_info/0, + get_master_node_tables/0, + get_master_nodes/1, + get_mnesia_downs/0, + has_mnesia_down/1, + incr_trans_tid_serial/0, + init/0, + log_decision/1, + log_master_nodes/3, + log_mnesia_down/1, + log_mnesia_up/1, + mnesia_down/1, + note_decision/2, + note_log_decision/2, + outcome/2, + start/0, + start_garb/0, + still_pending/1, + sync_trans_tid_serial/1, + sync/0, + wait_for_decision/2, + what_happened/3 + ]). + +%% gen_server callbacks +-export([init/1, + handle_call/3, + handle_cast/2, + handle_info/2, + terminate/2, + code_change/3 + ]). + + +-include("mnesia.hrl"). +-import(mnesia_lib, [set/2, verbose/2, error/2, fatal/2]). + +-record(state, {supervisor, + unclear_pid, + unclear_decision, + unclear_waitfor, + tm_queue_len = 0, + initiated = false, + early_msgs = [] + }). + +%%-define(DBG(F, A), mnesia:report_event(list_to_atom(lists:flatten(io_lib:format(F, A))))). 
%%-define(DBG(F, A), io:format("DBG: " ++ F, A)).

-record(transient_decision, {tid, outcome}).

%% Starts the server, locally registered under the module name. The
%% calling process is handed to init/1 as the only argument.
start() ->
    Opts = [{timeout, infinity}
            %%, {debug, [trace]}
           ],
    gen_server:start_link({local, ?MODULE}, ?MODULE, [self()], Opts).

init() ->
    call(init).

%% Arms the two periodic timers of the server: garb_decisions every
%% second minute and check_overload every tenth second.
start_garb() ->
    Server = whereis(mnesia_recover),
    {ok, _} = timer:send_interval(timer:minutes(2), Server, garb_decisions),
    {ok, _} = timer:send_interval(timer:seconds(10), Server, check_overload).

allow_garb() ->
    cast(allow_garb).


%% The transaction log has either been switched (latest -> previous) or
%% there is nothing to be dumped. This means that the previous
%% transaction log only may contain commit records which refer to
%% transactions noted in the last two of the 'Prev' tables. All other
%% tables may now be garbed by 'garb_decisions' (after 2 minutes).
%% Max 10 tables are kept.
do_allow_garb() ->
    %% The order of the following stuff is important!
    Curr = val(latest_transient_decision),
    %% Don't garb small tables, they are created on every
    %% dump_log and may be small (empty) for schema transactions
    %% which are dumped twice
    case ets:info(Curr, size) of
        Size when Size > 20 ->
            Old = val(previous_transient_decisions),
            Next = create_transient_decision(),
            {Keep, Expired} = sublist([Curr | Old], 10, []),
            [?ets_delete_table(Tab) || Tab <- Expired],
            set(previous_transient_decisions, Keep),
            set(latest_transient_decision, Next);
        _ ->
            ignore
    end.

%% Splits a list into {the first N elements, the rest}. A list shorter
%% than N ends up entirely in the first part.
sublist([Elem | Rest], Left, Taken) when Left > 0 ->
    sublist(Rest, Left - 1, [Elem | Taken]);
sublist(Rest, _Left, Taken) ->
    {lists:reverse(Taken), Rest}.

%% Keeps the two first of the previous transient decision tables and
%% deletes the others.
do_garb_decisions() ->
    case val(previous_transient_decisions) of
        [First, Second | Expired] ->
            set(previous_transient_decisions, [First, Second]),
            [?ets_delete_table(Tab) || Tab <- Expired];
        _ ->
            ignore
    end.

connect_nodes(Ns) ->
    call({connect_nodes, Ns}).

disconnect(Node) ->
    call({disconnect, Node}).

log_decision(D) ->
    cast({log_decision, D}).
%% Reads a stored value; a failed lookup is delegated to
%% mnesia_lib:other_val/2.
val(Var) ->
    case ?catch_val(Var) of
        {'EXIT', Reason} -> mnesia_lib:other_val(Var, Reason);
        Value            -> Value
    end.

%% Synchronous request to the local server. The caller is linked to
%% the server for the duration of the call; an exit signal from the
%% server found in the mailbox afterwards turns the result into
%% {error, {node_not_running, node()}}.
call(Msg) ->
    case whereis(?MODULE) of
        undefined ->
            {error, {node_not_running, node()}};
        Pid ->
            link(Pid),
            Res = gen_server:call(Pid, Msg, infinity),
            unlink(Pid),

            %% We get an exit signal if server dies
            receive
                {'EXIT', Pid, _Reason} ->
                    {error, {node_not_running, node()}}
            after 0 ->
                    Res
            end
    end.

%% call/1 evaluated on each of Nodes.
multicall(Nodes, Msg) ->
    rpc:multicall(Nodes, ?MODULE, call, [Msg]).

%% Asynchronous request to the local server; dropped when the server
%% is not registered.
cast(Msg) ->
    case whereis(?MODULE) of
        undefined -> ignore;
        Pid       -> gen_server:cast(Pid, Msg)
    end.

abcast(Nodes, Msg) ->
    gen_server:abcast(Nodes, ?MODULE, Msg).

%% Records the outcome of Tid in the latest transient decision table.
note_decision(Tid, Outcome) ->
    Latest = val(latest_transient_decision),
    Entry = #transient_decision{tid = Tid, outcome = Outcome},
    ?ets_insert(Latest, Entry).

%% A node that is up has no mnesia_down entry.
note_up(Node, _Date, _Time) ->
    ?ets_delete(mnesia_decision, Node).

note_down(Node, Date, Time) ->
    ?ets_insert(mnesia_decision, {mnesia_down, Node, Date, Time}).

%% An empty master node list removes the entry of the table.
note_master_nodes(Tab, []) ->
    ?ets_delete(mnesia_decision, Tab);
note_master_nodes(Tab, Nodes) when is_list(Nodes) ->
    ?ets_insert(mnesia_decision, {master_nodes, Tab, Nodes}).

%% A decision without disc nodes is only noted as a transient decision
%% (and removed from mnesia_decision); a decision with disc nodes is
%% stored in mnesia_decision.
note_outcome(D) when D#decision.disc_nodes == [] ->
%%    ?DBG("~w: note_tmp_decision: ~w~n", [node(), D]),
    Outcome = filter_outcome(D#decision.outcome),
    note_decision(D#decision.tid, Outcome),
    ?ets_delete(mnesia_decision, D#decision.tid);
note_outcome(D) when D#decision.disc_nodes /= [] ->
%%    ?DBG("~w: note_decision: ~w~n", [node(), D]),
    ?ets_insert(mnesia_decision, D).

%% A decision with a clear outcome is merged with what is already
%% known about the transaction, and the other nodes are told about it.
do_log_decision(D) when D#decision.outcome /= unclear ->
    Known  = decision(D#decision.tid),
    Merged = merge_decisions(node(), Known, D),
    do_log_decision(Merged, true, D);
do_log_decision(D) ->
    do_log_decision(D, false, undefined).
%% Notes the decision and, when a directory is in use, appends it to
%% the latest log. With DoTell == true and a clear outcome the disc
%% and ram nodes of NodeD (except this node) are informed.
do_log_decision(D, DoTell, NodeD) ->
    RemoteDiscNs = D#decision.disc_nodes -- [node()],
    Outcome = D#decision.outcome,
    %% A final outcome needs no entry for the local node
    Final = lists:member(Outcome, [aborted, committed]),
    D2 = case Final of
             true  -> D#decision{disc_nodes = RemoteDiscNs};
             false -> D
         end,
    note_outcome(D2),
    case mnesia_monitor:use_dir() of
        true ->
            mnesia_log:append(latest_log, D2),
            case (DoTell == true) andalso (Outcome /= unclear) of
                true ->
                    tell_im_certain(NodeD#decision.disc_nodes--[node()],D2),
                    tell_im_certain(NodeD#decision.ram_nodes--[node()], D2);
                false ->
                    ignore
            end;
        false ->
            ignore
    end.

%% Broadcasts an im_certain message about D to Nodes.
tell_im_certain([], _D) ->
    ignore;
tell_im_certain(Nodes, D) ->
    %% mnesia_lib:verbose("~w: tell: ~w~n", [Msg, Nodes]),
    abcast(Nodes, {im_certain, node(), D}).

sync() ->
    call(sync).

log_mnesia_up(Node) ->
    call({log_mnesia_up, Node}).

log_mnesia_down(Node) ->
    call({log_mnesia_down, Node}).

%% All nodes that have a mnesia_down entry in mnesia_decision.
get_mnesia_downs() ->
    Pattern = {mnesia_down, '_', '_', '_'},
    Entries = ?ets_match_object(mnesia_decision, Pattern),
    [Node || {mnesia_down, Node, _Date, _Time} <- Entries].

%% Check if we have got a mnesia_down from Node
has_mnesia_down(Node) ->
    case ?ets_lookup(mnesia_decision, Node) of
        [] ->
            false;
        [{mnesia_down, Node, _Date, _Time}] ->
            true
    end.

%% Removes Node from recover_nodes and notifies the server. Nothing is
%% done before recover_nodes has been set.
mnesia_down(Node) ->
    case ?catch_val(recover_nodes) of
        {'EXIT', _} ->
            %% Not started yet
            ignore;
        _ ->
            mnesia_lib:del(recover_nodes, Node),
            cast({mnesia_down, Node})
    end.
%% Logs the master nodes given as a list of {Tab, Nodes}. When Mnesia
%% is running the server does the work. When it is not running and a
%% directory is in use, the latest log is opened (and closed again)
%% for the occasion.
log_master_nodes(Args, UseDir, IsRunning) ->
    case {IsRunning, UseDir} of
        {yes, _} ->
            log_master_nodes2(Args, UseDir, IsRunning, ok);
        {_, false} ->
            ok;
        _ ->
            Name   = latest_log,
            Fname  = mnesia_log:latest_log_file(),
            Exists = mnesia_lib:exists(Fname),
            Repair = mnesia:system_info(auto_repair),
            OpenArgs = [{file, Fname}, {name, Name}, {repair, Repair}],
            case disk_log:open(OpenArgs) of
                {ok, Name} ->
                    log_master_nodes2(Args, UseDir, IsRunning, ok);
                {repaired, Name, {recovered, _R}, {badbytes, _B}}
                  when Exists == true ->
                    log_master_nodes2(Args, UseDir, IsRunning, ok);
                {repaired, Name, {recovered, _R}, {badbytes, _B}}
                  when Exists == false ->
                    %% The file did not exist before: give it a header
                    mnesia_log:write_trans_log_header(),
                    log_master_nodes2(Args, UseDir, IsRunning, ok);
                {error, Reason} ->
                    {error, Reason}
            end
    end.

%% Logs one {Tab, Nodes} pair at a time; the last error seen (if any)
%% becomes the result. The log is closed at the end when Mnesia is not
%% running.
log_master_nodes2([{Tab, Nodes} | Tail], UseDir, IsRunning, WorstRes) ->
    Res =
        case IsRunning of
            yes ->
                Reply = call({log_master_nodes, Tab, Nodes, UseDir, IsRunning}),
                mnesia_controller:master_nodes_updated(Tab, Nodes),
                Reply;
            _ ->
                do_log_master_nodes(Tab, Nodes, UseDir, IsRunning)
        end,
    NextWorst =
        case Res of
            ok              -> WorstRes;
            {error, Reason} -> {error, Reason}
        end,
    log_master_nodes2(Tail, UseDir, IsRunning, NextWorst);
log_master_nodes2([], _UseDir, yes, WorstRes) ->
    WorstRes;
log_master_nodes2([], _UseDir, _NotRunning, WorstRes) ->
    disk_log:close(latest_log),
    WorstRes.

%% All master_nodes entries of mnesia_decision, or [] when the lookup
%% fails.
get_master_node_info() ->
    Pattern = {master_nodes, '_', '_'},
    case catch mnesia_lib:db_match_object(ram_copies, mnesia_decision, Pattern) of
        {'EXIT', _} -> [];
        Masters     -> Masters
    end.

%% The tables that have master nodes.
get_master_node_tables() ->
    [Tab || {master_nodes, Tab, _Nodes} <- get_master_node_info()].

%% The master nodes of Tab, or [] when there is no entry.
get_master_nodes(Tab) ->
    case catch ?ets_lookup_element(mnesia_decision, Tab, 3) of
        {'EXIT', _} -> [];
        Nodes       -> Nodes
    end.
%% Determine what has happened to the transaction. The local server is
%% asked first when this node is one of Nodes; the other nodes are
%% only consulted when the local outcome is unclear.
what_happened(Tid, Protocol, Nodes) ->
    Default = default_outcome(Protocol),
    This = node(),
    case lists:member(This, Nodes) of
        false ->
            what_happened_remotely(Tid, Default, Nodes);
        true ->
            {ok, Outcome} = call({what_happened, Default, Tid}),
            Others = Nodes -- [This],
            case filter_outcome(Outcome) of
                committed -> committed;
                aborted   -> aborted;
                unclear   -> what_happened_remotely(Tid, Default, Others)
            end
    end.

%% Outcome to assume for a transaction nothing is known about.
default_outcome(asym_trans) -> aborted;
default_outcome(_) -> unclear.   %% sym_trans and sync_sym_trans

%% Asks the servers on Nodes and sums up their answers.
what_happened_remotely(Tid, Default, Nodes) ->
    {Replies, _} = multicall(Nodes, {what_happened, Default, Tid}),
    check_what_happened(Replies, 0, 0).

%% Counts the aborted and committed answers. Unclear answers and
%% failed calls are not counted.
check_what_happened([{ok, R} | T], Aborts, Commits) ->
    case filter_outcome(R) of
        committed -> check_what_happened(T, Aborts, Commits + 1);
        aborted   -> check_what_happened(T, Aborts + 1, Commits);
        unclear   -> check_what_happened(T, Aborts, Commits)
    end;
check_what_happened([{error, _} | T], Aborts, Commits) ->
    check_what_happened(T, Aborts, Commits);
check_what_happened([{badrpc, _} | T], Aborts, Commits) ->
    check_what_happened(T, Aborts, Commits);
check_what_happened([], Aborts, Commits) ->
    if
        Aborts > 0               -> aborted;   % Somebody has aborted
        Aborts == 0, Commits > 0 -> committed; % All have committed
        Aborts == 0, Commits == 0 -> aborted   % None of the active nodes knows
    end.

%% Determine what has happened to the transaction
%% and possibly wait forever for the decision.
wait_for_decision(presume_commit, _InitBy) ->
    %% sym_trans
    {{presume_commit, self()}, committed};

wait_for_decision(D, InitBy) when D#decision.outcome == presume_abort ->
    %% asym_trans: start with retry count 0
    wait_for_decision(D, InitBy, 0).
%% asym_trans: looks up the outcome of the transaction.
%%  - committed/aborted are returned at once;
%%  - presume_abort is retried every 10 ms and turned into aborted
%%    once the retry count N exceeds 10;
%%  - any other outcome is retried every 100 ms unless we are in
%%    startup, in which case the server is asked to wait for the
%%    decision.
wait_for_decision(D, InitBy, N) ->
    Tid = D#decision.tid,
    Max = 10,
    case outcome(Tid, D#decision.outcome) of
        committed ->
            {Tid, committed};
        aborted ->
            {Tid, aborted};
        presume_abort when N > Max ->
            {Tid, aborted};
        presume_abort ->
            %% busy loop for ets decision moving
            timer:sleep(10),
            wait_for_decision(D, InitBy, N+1);
        _ when InitBy /= startup ->
            %% Wait a while for active transactions
            %% to end and try again
            timer:sleep(100),
            wait_for_decision(D, InitBy, N);
        _ ->
            {ok, Res} = call({wait_for_decision, D}),
            {Tid, Res}
    end.

%% The subset of the transaction ids whose outcome still is unclear.
still_pending(Tids) ->
    [Tid || Tid <- Tids,
            filter_outcome(outcome(Tid, unclear)) =:= unclear].

%% Reads the decision tab chunk by chunk and notes every item.
load_decision_tab() ->
    Cont = mnesia_log:open_decision_tab(),
    load_decision_tab(Cont, load_decision_tab),
    mnesia_log:close_decision_tab().

load_decision_tab(eof, _InitBy) ->
    ok;
load_decision_tab(Cont, InitBy) ->
    case mnesia_log:chunk_decision_tab(Cont) of
        eof ->
            ok;
        {NextCont, Decisions} ->
            note_log_decisions(Decisions, InitBy),
            load_decision_tab(NextCont, InitBy)
    end.

%% Dumps DECISION.LOG and PDECISION.LOG and removes them.
%% From now on all decisions are logged in the transaction log file
convert_old() ->
    HasOldStuff =
        mnesia_lib:exists(mnesia_log:previous_decision_log_file()) or
        mnesia_lib:exists(mnesia_log:decision_log_file()),
    case HasOldStuff of
        false ->
            ignore;
        true ->
            mnesia_log:open_decision_log(),
            %% Two dumps, one after the other
            dump_decision_log(startup),
            dump_decision_log(startup),
            mnesia_log:close_decision_log(),
            ok = file:delete(mnesia_log:decision_log_file())
    end.

dump_decision_log(InitBy) ->
    %% Assumed to be run in transaction log dumper process
    Cont = mnesia_log:prepare_decision_log_dump(),
    perform_dump_decision_log(Cont, InitBy).
%% Reads the decision log chunk by chunk (at startup only), notes the
%% items found and finally confirms the dump.
perform_dump_decision_log(eof, _InitBy) ->
    confirm_decision_log_dump();
perform_dump_decision_log(Cont, startup = InitBy) ->
    case mnesia_log:chunk_decision_log(Cont) of
        eof ->
            confirm_decision_log_dump();
        {NextCont, Decisions} ->
            note_log_decisions(Decisions, InitBy),
            perform_dump_decision_log(NextCont, InitBy)
    end;
perform_dump_decision_log(_Cont, _InitBy) ->
    confirm_decision_log_dump().

confirm_decision_log_dump() ->
    dump_decision_tab(),
    mnesia_log:confirm_decision_log_dump().

%% Saves the whole content of mnesia_decision as one decision_list.
dump_decision_tab() ->
    Everything = mnesia_lib:db_match_object(ram_copies, mnesia_decision, '_'),
    mnesia_log:save_decision_tab({decision_list, Everything}).

%% Notes the items of a log chunk in order.
note_log_decisions(Items, InitBy) ->
    lists:foreach(fun(Item) -> note_log_decision(Item, InitBy) end, Items).

%% Notes one item read from a decision log or a decision tab.
%% pre_commit is treated as unclear.
note_log_decision(NewD, InitBy) when NewD#decision.outcome == pre_commit ->
    note_log_decision(NewD#decision{outcome = unclear}, InitBy);

note_log_decision(NewD, _InitBy) when is_record(NewD, decision) ->
    sync_trans_tid_serial(NewD#decision.tid),
    note_outcome(NewD);
%% A logged serial is not used at startup
note_log_decision({trans_tid, serial, _Serial}, startup) ->
    ignore;
note_log_decision({trans_tid, serial, Serial}, _InitBy) ->
    sync_trans_tid_serial(Serial);
note_log_decision({mnesia_up, Node, Date, Time}, _InitBy) ->
    note_up(Node, Date, Time);
note_log_decision({mnesia_down, Node, Date, Time}, _InitBy) ->
    note_down(Node, Date, Time);
note_log_decision({master_nodes, Tab, Nodes}, _InitBy) ->
    note_master_nodes(Tab, Nodes);
%% Log headers: only the version is checked
note_log_decision(H, _InitBy) when H#log_header.log_kind == decision_log ->
    V = mnesia_log:decision_log_version(),
    case H#log_header.log_version of
        V ->
            ok;
        "2.0" ->
            verbose("Accepting an old version format of decision log: ~p~n",
                    [V]),
            ok;
        _ ->
            fatal("Bad version of decision log: ~p~n", [H])
    end;
note_log_decision(H, _InitBy) when H#log_header.log_kind == decision_tab ->
    V = mnesia_log:decision_tab_version(),
    case H#log_header.log_version of
        V ->
            ok;
        _ ->
            fatal("Bad version of decision tab: ~p~n", [H])
    end;
note_log_decision({decision_list, ItemList}, InitBy) ->
    note_log_decisions(ItemList, InitBy);
note_log_decision(BadItem, InitBy) ->
    exit({"Bad decision log item", BadItem, InitBy}).

%% The transaction id serial is kept as {trans_tid, serial, Value} in
%% mnesia_decision.
trans_tid_serial() ->
    ?ets_lookup_element(mnesia_decision, serial, 3).

set_trans_tid_serial(Val) ->
    ?ets_insert(mnesia_decision, {trans_tid, serial, Val}).

incr_trans_tid_serial() ->
    ?ets_update_counter(mnesia_decision, serial, 1).

%% Makes sure that the local serial is greater than a counter seen
%% elsewhere. Accepts a counter or a transaction id.
sync_trans_tid_serial(ThatCounter) when is_integer(ThatCounter) ->
    case ThatCounter > trans_tid_serial() of
        true  -> set_trans_tid_serial(ThatCounter + 1);
        false -> ignore
    end;
sync_trans_tid_serial(Tid) ->
    sync_trans_tid_serial(Tid#tid.counter).



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Callback functions from gen_server

%%----------------------------------------------------------------------
%% Func: init/1
%% Purpose: Creates the first transient decision table and sets up
%%          the stored values used by the server.
%% Returns: {ok, State}          |
%%          {ok, State, Timeout} |
%%          {stop, Reason}
%%----------------------------------------------------------------------
init([Parent]) ->
    process_flag(trap_exit, true),
    mnesia_lib:verbose("~p starting: ~p~n", [?MODULE, self()]),
    set(latest_transient_decision, create_transient_decision()),
    set(previous_transient_decisions, []),
    set(recover_nodes, []),
    {ok, #state{supervisor = Parent}}.

create_transient_decision() ->
    ?ets_new_table(mnesia_transient_decision, [{keypos, 2}, set, public]).
%%----------------------------------------------------------------------
%% Func: handle_call/3
%% Purpose: Handles the synchronous requests. Until the init request
%%          has been served every other request is buffered.
%% Returns: {reply, Reply, State}          |
%%          {reply, Reply, State, Timeout} |
%%          {noreply, State}               |
%%          {noreply, State, Timeout}      |
%%          {stop, Reason, Reply, State}   | (terminate/2 is called)
%%----------------------------------------------------------------------

%% Creates mnesia_decision, resets the serial and, when a directory is
%% in use, loads what has been saved on disc. Buffered requests are
%% served afterwards.
handle_call(init, From, State) when State#state.initiated == false ->
    TabOpts = [{keypos, 2}, set, public, named_table],
    case mnesia_monitor:use_dir() of
        false ->
            ?ets_new_table(mnesia_decision, TabOpts),
            set_trans_tid_serial(0);
        true ->
            ?ets_new_table(mnesia_decision, TabOpts),
            set_trans_tid_serial(0),
            TabFile = mnesia_log:decision_tab_file(),
            case mnesia_lib:exists(TabFile) of
                true  -> load_decision_tab();
                false -> ignore
            end,
            convert_old(),
            mnesia_dumper:opt_dump_log(scan_decisions)
    end,
    handle_early_msgs(State, From);

handle_call(Msg, From, State) when State#state.initiated == false ->
    %% Buffer early messages
    Buffered = [{call, Msg, From} | State#state.early_msgs],
    {noreply, State#state{early_msgs = Buffered}};

handle_call({disconnect, Node}, _From, State) ->
    mnesia_monitor:disconnect(Node),
    mnesia_lib:del(recover_nodes, Node),
    {reply, ok, State};

handle_call({connect_nodes, Ns}, From, State) ->
    %% Determine which nodes we should try to connect
    AlreadyConnected = val(recover_nodes),
    {_, Nodes} = mnesia_lib:search_delete(node(), Ns),
    Check = Nodes -- AlreadyConnected,
    case mnesia_monitor:negotiate_protocol(Check) of
        busy ->
            %% The monitor is disconnecting some nodes, retry
            %% the request (to avoid deadlock).
            erlang:send_after(2, self(), {connect_nodes,Ns,From}),
            {noreply, State};
        [] ->
            %% No good nodes to connect to!
            %% We can't return a reply tuple here because this
            %% function can be called from handle_info
            gen_server:reply(From, {[], AlreadyConnected}),
            {noreply, State};
        GoodNodes ->
            %% Now we have agreed upon a protocol with some new nodes
            %% and we may use them when we recover transactions
            mnesia_lib:add_list(recover_nodes, GoodNodes),
            cast({announce_all, GoodNodes}),
            case get_master_nodes(schema) of
                [] ->
                    Context = starting_partitioned_network,
                    mnesia_monitor:detect_inconcistency(GoodNodes, Context);
                _ ->
                    %% If master_nodes is set ignore old inconsistencies
                    ignore
            end,
            gen_server:reply(From, {GoodNodes, AlreadyConnected}),
            {noreply,State}
    end;

handle_call({what_happened, Default, Tid}, _From, State) ->
    sync_trans_tid_serial(Tid),
    {reply, {ok, outcome(Tid, Default)}, State};

%% Either answers aborted at once or asks the other nodes and leaves
%% the caller waiting; the caller and the decision are kept in the
%% unclear_* fields of the state.
handle_call({wait_for_decision, D}, From, State) ->
    Recov = val(recover_nodes),
    AliveRam = (mnesia_lib:intersect(D#decision.ram_nodes, Recov) -- [node()]),
    RemoteDisc = D#decision.disc_nodes -- [node()],
    case {AliveRam, RemoteDisc} of
        {[], []} ->
            %% Nobody else to wait for and we may safely abort
            {reply, {ok, aborted}, State};
        _ ->
            verbose("Transaction ~p is unclear. "
                    "Wait for disc nodes: ~w ram: ~w~n",
                    [D#decision.tid, RemoteDisc, AliveRam]),
            AliveDisc = mnesia_lib:intersect(RemoteDisc, Recov),
            Question = {what_decision, node(), D},
            abcast(AliveRam, Question),
            abcast(AliveDisc, Question),
            case val(max_wait_for_decision) of
                infinity ->
                    ignore;
                MaxWait ->
                    %% Give up when the time limit is reached
                    ForceMsg = {force_decision, D#decision.tid},
                    {ok, _} = timer:send_after(MaxWait, ForceMsg)
            end,
            {noreply, State#state{unclear_pid = From,
                                  unclear_decision = D,
                                  unclear_waitfor = (RemoteDisc ++ AliveRam)}}
    end;

handle_call({log_mnesia_up, Node}, _From, State) ->
    do_log_mnesia_up(Node),
    {reply, ok, State};

handle_call({log_mnesia_down, Node}, _From, State) ->
    do_log_mnesia_down(Node),
    {reply, ok, State};

handle_call({log_master_nodes, Tab, Nodes, UseDir, IsRunning}, _From, State) ->
    do_log_master_nodes(Tab, Nodes, UseDir, IsRunning),
    {reply, ok, State};

%% Answered when all earlier requests have been handled
handle_call(sync, _From, State) ->
    {reply, ok, State};

handle_call(Msg, _From, State) ->
    error("~p got unexpected call: ~p~n", [?MODULE, Msg]),
    {noreply, State}.

%% Logs (when a directory is in use) and notes that Node is up.
do_log_mnesia_up(Node) ->
    Date = date(),
    Time = time(),
    Event = {mnesia_up, Node, Date, Time},
    case mnesia_monitor:use_dir() of
        true ->
            mnesia_log:append(latest_log, Event),
            disk_log:sync(latest_log);
        false ->
            ignore
    end,
    note_up(Node, Date, Time).

%% Logs (when a directory is in use) and notes that Node is down.
do_log_mnesia_down(Node) ->
    Date = date(),
    Time = time(),
    Event = {mnesia_down, Node, Date, Time},
    case mnesia_monitor:use_dir() of
        true ->
            mnesia_log:append(latest_log, Event),
            disk_log:sync(latest_log);
        false ->
            ignore
    end,
    note_down(Node, Date, Time).

%% Logs the master nodes of Tab when UseDir is true and notes them in
%% mnesia_decision when Mnesia is running. Returns the result of the
%% log append, or ok when nothing was logged.
do_log_master_nodes(Tab, Nodes, UseDir, IsRunning) ->
    Res =
        case UseDir of
            false ->
                ok;
            true ->
                Entry = {master_nodes, Tab, Nodes},
                LogRes = mnesia_log:append(latest_log, Entry),
                disk_log:sync(latest_log),
                LogRes
        end,
    case IsRunning of
        yes         -> note_master_nodes(Tab, Nodes);
        _NotRunning -> ignore
    end,
    Res.
%%----------------------------------------------------------------------
%% Func: handle_cast/2
%% Purpose: Handles the asynchronous requests. Until the init request
%%          has been served every request is buffered.
%% Returns: {noreply, State}          |
%%          {noreply, State, Timeout} |
%%          {stop, Reason, State}            (terminate/2 is called)
%%----------------------------------------------------------------------

handle_cast(Msg, State) when State#state.initiated == false ->
    %% Buffer early messages
    Buffered = [{cast, Msg} | State#state.early_msgs],
    {noreply, State#state{early_msgs = Buffered}};

%% Another node is certain about a transaction: merge its decision
%% with ours and log the result, without telling anybody.
handle_cast({im_certain, Node, NewD}, State) ->
    Known  = decision(NewD#decision.tid),
    Merged = merge_decisions(Node, Known, NewD),
    do_log_decision(Merged, false, undefined),
    {noreply, State};

handle_cast({log_decision, D}, State) ->
    do_log_decision(D),
    {noreply, State};

handle_cast(allow_garb, State) ->
    do_allow_garb(),
    {noreply, State};

%% Decisions sent by Node; the sender is added to recover_nodes.
handle_cast({decisions, Node, Decisions}, State) ->
    mnesia_lib:add(recover_nodes, Node),
    {noreply, add_remote_decisions(Node, Decisions, State)};

%% Node asks about a transaction: answer with our own decision, or
%% with the one it sent when we have none.
handle_cast({what_decision, Node, OtherD}, State) ->
    Tid = OtherD#decision.tid,
    sync_trans_tid_serial(Tid),
    Answer =
        case decision(Tid) of
            MyD when is_record(MyD, decision) -> MyD;
            no_decision -> OtherD
        end,
    announce([Node], [Answer], [], true),
    {noreply, State};

%% Node went down. It only matters when we wait for an unclear
%% decision that has Node among its ram nodes.
handle_cast({mnesia_down, Node}, State) ->
    NewState =
        case State#state.unclear_decision of
            undefined ->
                State;
            D ->
                case lists:member(Node, D#decision.ram_nodes) of
                    true  -> add_remote_decision(Node, D, State);
                    false -> State
                end
        end,
    {noreply, NewState};

handle_cast({announce_all, Nodes}, State) ->
    announce_all(Nodes),
    {noreply, State};

handle_cast(Msg, State) ->
    error("~p got unexpected cast: ~p~n", [?MODULE, Msg]),
    {noreply, State}.
%%----------------------------------------------------------------------
%% Func: handle_info/2
%% Purpose: Handle plain messages sent to the recover server
%% Returns: {noreply, State}          |
%%          {noreply, State, Timeout} |
%%          {stop, Reason, State}            (terminate/2 is called)
%%
%% Unlike calls and casts, info messages are not buffered while the
%% server is uninitiated; they are handled as they arrive.
%%----------------------------------------------------------------------

handle_info({connect_nodes, Ns, From}, State) ->
    handle_call({connect_nodes, Ns}, From, State);

handle_info(check_overload, S) ->
    %% Time to check if mnesia_tm is overloaded
    case whereis(mnesia_tm) of
        Pid when is_pid(Pid) ->
            %% process_info/2 returns undefined when mnesia_tm has
            %% terminated after the whereis/1 lookup. Treat that the
            %% same way as an unregistered mnesia_tm instead of
            %% crashing this server with a badmatch.
            case process_info(Pid, message_queue_len) of
                {message_queue_len, Len} ->
                    Threshold = 100,
                    Prev = S#state.tm_queue_len,
                    if
                        Len > Threshold, Prev > Threshold ->
                            %% Above the threshold on two consecutive
                            %% checks: report it and start over
                            What = {mnesia_tm, message_queue_len, [Prev, Len]},
                            mnesia_lib:report_system_event({mnesia_overload, What}),
                            {noreply, S#state{tm_queue_len = 0}};

                        Len > Threshold ->
                            %% Remember the length for the next check
                            {noreply, S#state{tm_queue_len = Len}};

                        true ->
                            {noreply, S#state{tm_queue_len = 0}}
                    end;
                undefined ->
                    {noreply, S}
            end;
        undefined ->
            {noreply, S}
    end;

handle_info(garb_decisions, State) ->
    do_garb_decisions(),
    {noreply, State};

handle_info({force_decision, Tid}, State) ->
    %% Enforce a transaction recovery decision,
    %% if we still are waiting for the outcome
    case State#state.unclear_decision of
        U when U#decision.tid == Tid ->
            verbose("Decided to abort transaction ~p since "
                    "max_wait_for_decision has been exceeded~n",
                    [Tid]),
            D = U#decision{outcome = aborted},
            State2 = add_remote_decision(node(), D, State),
            {noreply, State2};
        _ ->
            {noreply, State}
    end;

%% The supervisor has terminated: shut down as well.
handle_info({'EXIT', Pid, R}, State) when Pid == State#state.supervisor ->
    mnesia_lib:dbg_out("~p was ~p~n",[?MODULE, R]),
    {stop, shutdown, State};

handle_info(Msg, State) ->
    error("~p got unexpected info: ~p~n", [?MODULE, Msg]),
    {noreply, State}.

%%----------------------------------------------------------------------
%% Func: terminate/2
%% Purpose: Shutdown the server
%% Returns: any (ignored by gen_server)
%%----------------------------------------------------------------------

terminate(Reason, State) ->
    mnesia_monitor:terminate_proc(?MODULE, Reason, State).

%%----------------------------------------------------------------------
%% Func: code_change/3
%% Purpose: Upgrade process when its code is to be changed
%% Returns: {ok, NewState}
%%----------------------------------------------------------------------
code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

%%%----------------------------------------------------------------------
%%% Internal functions
%%%----------------------------------------------------------------------

%% Marks the server as initiated, empties the early message buffer and
%% processes the buffered messages. From is sent the reply ok once the
%% buffered messages have been processed. Returns the result of the
%% message processing ({noreply, State} or {stop, Reason, State}).
handle_early_msgs(State, From) ->
    Res = do_handle_early_msgs(State#state.early_msgs,
                               State#state{early_msgs = [],
                                           initiated = true}),
    gen_server:reply(From, ok),
    Res.

%% The buffer holds the newest message first, so the tail is processed
%% before the head in order to handle the messages in arrival order.
%% Processing stops at the first {stop, Reason, State} result. Note
%% that a {stop, Reason, Reply, State} result is not handled here.
do_handle_early_msgs([Msg | Msgs], State) ->
    %% The messages are in reverted order
    case do_handle_early_msgs(Msgs, State) of
        {stop, Reason, State2} ->
            {stop, Reason, State2};
        {noreply, State2} ->
            handle_early_msg(Msg, State2)
    end;

do_handle_early_msgs([], State) ->
    {noreply, State}.

%% Dispatches one buffered message to the callback it was originally
%% meant for. A {reply, R, S} result from handle_call/3 is sent
%% explicitly to the caller and turned into {noreply, S}.
handle_early_msg({call, Msg, From}, State) ->
    case handle_call(Msg, From, State) of
        {reply, R, S} ->
            gen_server:reply(From, R),
            {noreply, S};
        Other ->
            Other
    end;
handle_early_msg({cast, Msg}, State) ->
    handle_cast(Msg, State);
handle_early_msg({info, Msg}, State) ->
    handle_info(Msg, State).

%% Returns the decision tables to search for a transaction outcome.
tabs() ->
    Curr = val(latest_transient_decision),    % Do not miss any trans even
    Prev = val(previous_transient_decisions), % if the tabs are switched
    [Curr, mnesia_decision | Prev].           % Ordered by hit probability

%% Looks up the decision for Tid in all decision tables.
%% Returns a #decision{} record or no_decision.
decision(Tid) ->
    decision(Tid, tabs()).
%% Searches the given decision tables, in order, for a decision about
%% Tid. A transient decision is converted into a #decision{} record
%% without nodes. Returns no_decision when no table knows about Tid.
decision(Tid, [Tab | Rest]) ->
    case catch ?ets_lookup(Tab, Tid) of
        [#decision{} = Found] ->
            Found;
        [#transient_decision{} = Transient] ->
            #decision{tid = Transient#transient_decision.tid,
                      outcome = Transient#transient_decision.outcome,
                      disc_nodes = [],
                      ram_nodes = []};
        [] ->
            decision(Tid, Rest);
        {'EXIT', _} ->
            %% Recently switched transient decision table
            decision(Tid, Rest)
    end;
decision(_Tid, []) ->
    no_decision.

%% Returns the outcome stored for Tid in the first decision table that
%% has an entry for it, or Default when there is no such entry.
outcome(Tid, Default) ->
    outcome(Tid, Default, tabs()).

outcome(_Tid, Default, []) ->
    Default;
outcome(Tid, Default, [Tab | Rest]) ->
    %% The outcome is the third element of the stored record
    case catch ?ets_lookup_element(Tab, Tid, 3) of
        {'EXIT', _} ->
            outcome(Tid, Default, Rest);
        Stored ->
            Stored
    end.

%% Maps a stored outcome onto unclear | aborted | committed.
filter_outcome(Outcome) ->
    case Outcome of
        unclear       -> unclear;
        aborted       -> aborted;
        presume_abort -> aborted;
        committed     -> committed;
        pre_commit    -> unclear
    end.

%% A presumed abort is treated as a plain abort.
filter_aborted(#decision{outcome = presume_abort} = D) ->
    D#decision{outcome = aborted};
filter_aborted(D) ->
    D.

%% Merge old decision D with new (probably remote) decision
merge_decisions(Node, D, NewD0) ->
    NewD = filter_aborted(NewD0),
    if
        D == no_decision, node() /= Node ->
            %% We did not know anything about this txn
            NewD#decision{disc_nodes = []};
        D == no_decision ->
            NewD;
        is_record(D, decision) ->
            RemainingDiscNs = D#decision.disc_nodes -- ([node(), Node]),
            OldD = filter_aborted(D#decision{disc_nodes = RemainingDiscNs}),
            if
                OldD#decision.outcome == unclear,
                NewD#decision.outcome == unclear ->
                    %% Nobody knows yet, keep the old decision untouched
                    D;

                OldD#decision.outcome == NewD#decision.outcome ->
                    %% We have come to the same decision
                    OldD;

                OldD#decision.outcome == committed,
                NewD#decision.outcome == aborted ->
                    %% Interesting! We have already committed, but
                    %% someone else has aborted, so now there is an
                    %% inconsistency. The other node (or someone else)
                    %% has enforced a recovery decision when
                    %% max_wait_for_decision was exceeded. We pretend
                    %% that we have obeyed the forced recovery
                    %% decision, but also generate an event in case
                    %% the application wants to act on it.
                    Event = {inconsistent_database, bad_decision, Node},
                    mnesia_lib:report_system_event(Event),
                    OldD#decision{outcome = aborted};

                OldD#decision.outcome == aborted ->
                    %% aborted overrides anything
                    OldD#decision{outcome = aborted};

                NewD#decision.outcome == aborted ->
                    %% aborted overrides anything
                    OldD#decision{outcome = aborted};

                OldD#decision.outcome == committed,
                NewD#decision.outcome == unclear ->
                    %% committed overrides unclear
                    OldD#decision{outcome = committed};

                OldD#decision.outcome == unclear,
                NewD#decision.outcome == committed ->
                    %% committed overrides unclear
                    OldD#decision{outcome = committed}
            end
    end.

%% Processes a list of items announced by Node, one at a time, and
%% returns the resulting server state.
add_remote_decisions(_Node, [], State) ->
    State;

add_remote_decisions(Node, [D | Rest], State) when is_record(D, decision) ->
    add_remote_decisions(Node, Rest, add_remote_decision(Node, D, State));

add_remote_decisions(Node, [C | Rest], State)
  when is_record(C, transient_decision) ->
    %% Transient decisions carry no node information
    D = #decision{tid = C#transient_decision.tid,
                  outcome = C#transient_decision.outcome,
                  disc_nodes = [],
                  ram_nodes = []},
    add_remote_decisions(Node, Rest, add_remote_decision(Node, D, State));

add_remote_decisions(Node, [{mnesia_down, _, _, _} | Rest], State) ->
    add_remote_decisions(Node, Rest, State);

add_remote_decisions(Node, [{trans_tid, serial, Serial} | Rest], State) ->
    sync_trans_tid_serial(Serial),
    %% If we are waiting for the outcome of a transaction, ask Node
    %% about it unless Node is one of the ram nodes of that decision.
    case State#state.unclear_decision of
        undefined ->
            ignore;
        Unclear ->
            case lists:member(Node, Unclear#decision.ram_nodes) of
                true ->
                    ignore;
                false ->
                    abcast([Node], {what_decision, node(), Unclear})
            end
    end,
    add_remote_decisions(Node, Rest, State).
%% Merges the decision NewD reported by Node with the local knowledge,
%% logs the merged decision and, when the transaction that this server
%% is waiting for is concerned, answers the waiting client once the
%% outcome is known. Returns the updated server state.
add_remote_decision(Node, NewD, State) ->
    Tid = NewD#decision.tid,
    OldD = decision(Tid),
    Merged = merge_decisions(Node, OldD, NewD),
    do_log_decision(Merged, false, undefined),
    Outcome = Merged#decision.outcome,
    %% Tell Node about a certain outcome, but only if we already knew
    %% about the transaction and are listed among the nodes of NewD.
    case {OldD, Outcome} of
        {no_decision, _} ->
            ignore;
        {_, unclear} ->
            ignore;
        _ ->
            InvolvesUs =
                lists:member(node(), NewD#decision.disc_nodes) or
                lists:member(node(), NewD#decision.ram_nodes),
            case InvolvesUs of
                true ->
                    tell_im_certain([Node], Merged);
                false ->
                    ignore
            end
    end,
    case State#state.unclear_decision of
        Unclear when Unclear#decision.tid == Tid ->
            WaitFor = State#state.unclear_waitfor -- [Node],
            case {Outcome, WaitFor} of
                {unclear, []} ->
                    %% Everybody are uncertain, lets abort
                    NewOutcome = aborted,
                    CertainD = Merged#decision{outcome = NewOutcome,
                                               disc_nodes = [],
                                               ram_nodes = []},
                    tell_im_certain(Merged#decision.disc_nodes, CertainD),
                    tell_im_certain(Merged#decision.ram_nodes, CertainD),
                    do_log_decision(CertainD, false, undefined),
                    verbose("Decided to abort transaction ~p "
                            "since everybody are uncertain ~p~n",
                            [Tid, CertainD]),
                    gen_server:reply(State#state.unclear_pid, {ok, NewOutcome}),
                    State#state{unclear_pid = undefined,
                                unclear_decision = undefined,
                                unclear_waitfor = undefined};
                {unclear, _} ->
                    %% Still waiting for the remaining nodes
                    State#state{unclear_waitfor = WaitFor};
                _ ->
                    verbose("~p told us that transaction ~p was ~p~n",
                            [Node, Tid, Outcome]),
                    gen_server:reply(State#state.unclear_pid, {ok, Outcome}),
                    State#state{unclear_pid = undefined,
                                unclear_decision = undefined,
                                unclear_waitfor = undefined}
            end;
        _ ->
            State
    end.

%% Announces the current transaction serial to the given nodes.
announce_all([]) ->
    ok;
announce_all(ToNodes) ->
    Serial = trans_tid_serial(),
    announce(ToNodes, [{trans_tid, serial, Serial}], [], false).

%% Arranges every decision per receiving node, starting from Acc, and
%% then sends the arranged decisions.
announce(ToNodes, Decisions, Acc, ForceSend) ->
    Arranged = lists:foldl(fun(Decision, PerNode) ->
                                   arrange(ToNodes, Decision, PerNode, ForceSend)
                           end,
                           Acc,
                           Decisions),
    send_decisions(Arranged).
%% Sends each {Node, Decisions} pair to the node it is meant for.
send_decisions(PerNode) ->
    lists:foreach(fun({Node, Decisions}) ->
                          abcast([Node], {decisions, node(), Decisions})
                  end,
                  PerNode).

%% Adds the decision to the accumulated list of every receiving node
%% that should get it. A #decision{} is added for nodes listed among
%% its disc or ram nodes, or for all nodes when ForceSend is true.
%% A {trans_tid, serial, Serial} tuple is added for every node.
arrange([], _Decision, Acc, _ForceSend) ->
    Acc;

arrange([To | ToNodes], D, Acc, ForceSend) when is_record(D, decision) ->
    NeedsAdd = (ForceSend or
                lists:member(To, D#decision.disc_nodes) or
                lists:member(To, D#decision.ram_nodes)),
    case NeedsAdd of
        true ->
            arrange(ToNodes, D, add_decision(To, D, Acc), ForceSend);
        false ->
            arrange(ToNodes, D, Acc, ForceSend)
    end;

arrange([To | ToNodes], {trans_tid, serial, _Serial} = Marker, Acc, ForceSend) ->
    %% Do the lamport thing plus release the others
    %% from uncertainty.
    arrange(ToNodes, Marker, add_decision(To, Marker, Acc), ForceSend).

%% Prepends Decision to the list kept for Node. A new entry is
%% appended last when Node has no entry yet.
add_decision(Node, Decision, []) ->
    [{Node, [Decision]}];
add_decision(Node, Decision, [{Node, Decisions} | Rest]) ->
    [{Node, [Decision | Decisions]} | Rest];
add_decision(Node, Decision, [Other | Rest]) ->
    [Other | add_decision(Node, Decision, Rest)].

diff --git a/lib/mnesia/src/mnesia_registry.erl b/lib/mnesia/src/mnesia_registry.erl new file mode 100644 index 0000000000..9805d48697 --- /dev/null +++ b/lib/mnesia/src/mnesia_registry.erl @@ -0,0 +1,280 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 1998-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. 
+%% +%% %CopyrightEnd% +%% + +%% +-module(mnesia_registry). + +%%%---------------------------------------------------------------------- +%%% File : mnesia_registry.erl +%%% Purpose : Support dump and restore of a registry on a C-node +%%% This is an OTP internal module and is not publicly available. +%%% +%%% Example : Dump some hardcoded records into the Mnesia table Tab +%%% +%%% case rpc:call(Node, mnesia_registry, start_dump, [Tab, self()]) of +%%% Pid when is_pid(Pid) -> +%%% Pid ! {write, key1, key_size1, val_type1, val_size1, val1}, +%%% Pid ! {delete, key3}, +%%% Pid ! {write, key2, key_size2, val_type2, val_size2, val2}, +%%% Pid ! {write, key4, key_size4, val_type4, val_size4, val4}, +%%% Pid ! {commit, self()}, +%%% receive +%%% {ok, Pid} -> +%%% ok; +%%% {'EXIT', Pid, Reason} -> +%%% exit(Reason) +%%% end; +%%% {badrpc, Reason} -> +%%% exit(Reason) +%%% end. +%%% +%%% Example : Restore the corresponding Mnesia table Tab +%%% +%%% case rpc:call(Node, mnesia_registry, start_restore, [Tab, self()]) of +%%% {size, Pid, N, LargestKey, LargestVal} -> +%%% Pid ! {send_records, self()}, +%%% Fun = fun(_) -> +%%% receive +%%% {restore, KeySize, ValSize, ValType, Key, Val} -> +%%% {Key, Val}; +%%% {'EXIT', Pid, Reason} -> +%%% exit(Reason) +%%% end +%%% end, +%%% lists:map(Fun, lists:seq(1, N)); +%%% {badrpc, Reason} -> +%%% exit(Reason) +%%% end. +%%% +%%%---------------------------------------------------------------------- + +%% External exports +-export([start_dump/2, start_restore/2]). +-export([create_table/1, create_table/2]). + +%% Internal exports +-export([init/4]). + +-record(state, {table, ops = [], link_to}). + +-record(registry_entry, {key, key_size, val_type, val_size, val}). + +-record(size, {pid = self(), n_values = 0, largest_key = 0, largest_val = 0}). 
%%%----------------------------------------------------------------------
%%% Client
%%%----------------------------------------------------------------------

%% Spawns a process running init/4, linked to the caller, and waits
%% for its first {ok, Result} message, which is returned. An exit
%% message from the spawned process is only considered when the caller
%% is also the LinkTo process.
start(Type, Tab, LinkTo) ->
    Starter = self(),
    Pid = spawn_link(?MODULE, init, [Type, Starter, LinkTo, Tab]),
    %% The receiver process may unlink the current process
    receive
        {ok, Res} ->
            Res;
        {'EXIT', Pid, Reason} when LinkTo == Starter ->
            exit(Reason)
    end.

%% Starts a receiver process and returns its Pid. The Mnesia table is
%% created at commit, with suitable default values, if it does not
%% already exist.
%%
%% The receiver process accumulates Mnesia operations and performs
%% all operations or none at commit. The understood messages are:
%%
%%    {write, Key, KeySize, ValType, ValSize, Val} ->
%%        accumulates mnesia:write({Tab, Key, KeySize, ValType, ValSize, Val})
%%        (no reply)
%%    {delete, Key} ->
%%        accumulates mnesia:delete({Tab, Key}) (no reply)
%%    {commit, ReplyTo} ->
%%        commits all accumulated operations
%%        and stops the process (replies {ok, Pid})
%%    abort ->
%%        stops the process (no reply)
%%
%% The receiver process is linked to the process with the process
%% identifier LinkTo. If some error occurs the receiver process will
%% invoke exit(Reason) and it is up to the LinkTo process to act
%% properly when it receives an exit signal.

start_dump(Tab, LinkTo) ->
    start(dump, Tab, LinkTo).

%% Starts a sender process which sends restore messages back to the
%% ReplyTo process. First some statistics about the table are
%% determined and returned as a 5-tuple:
%%
%%    {size, SenderPid, N, LargestKeySize, LargestValSize}
%%
%% where N is the number of records in the table. Then the sender
%% process waits for a 2-tuple message:
%%
%%    {send_records, ReplyTo}
%%
%% Finally N 6-tuple messages are sent to the ReplyTo process:
%%
%%    ReplyTo ! {restore, KeySize, ValSize, ValType, Key, Val}
%%
%% If some error occurs the sender process will invoke exit(Reason)
%% and it is up to the LinkTo process to act properly when it receives
%% an exit signal.

start_restore(Tab, LinkTo) ->
    start(restore, Tab, LinkTo).


%% Creates the Mnesia table Tab, unless it already exists, on the
%% local node using the same storage type as the schema.
%% Returns ok or EXIT's
create_table(Tab) ->
    SchemaStorage = mnesia:table_info(schema, storage_type),
    create_table(Tab, [{SchemaStorage, [node()]}]).

%% Same as create_table/1, but with explicit table options. The
%% attributes are always those of the registry_entry record.
create_table(Tab, TabDef) ->
    Options = [{attributes, record_info(fields, registry_entry)} | TabDef],
    case mnesia:create_table(Tab, Options) of
        {atomic, ok} ->
            ok;
        {aborted, {already_exists, Tab}} ->
            ok;
        {aborted, Reason} ->
            exit(Reason)
    end.

%%%----------------------------------------------------------------------
%%% Server
%%%----------------------------------------------------------------------

%% Entry point of the spawned process. The link to the starter is
%% replaced by a link to LinkTo when they are different processes.
init(Type, Starter, LinkTo, Tab) ->
    case LinkTo /= Starter of
        true ->
            link(LinkTo),
            unlink(Starter);
        false ->
            ignore
    end,
    case Type of
        dump ->
            Starter ! {ok, self()},
            dump_loop(#state{table = Tab, link_to = LinkTo});
        restore ->
            restore_table(Tab, Starter, LinkTo)
    end.
%%%----------------------------------------------------------------------
%%% Dump loop
%%%----------------------------------------------------------------------

%% Accumulates write and delete operations, newest first, until a
%% commit or an abort message arrives. Any other message terminates
%% the process with {bad_message, Msg}.
dump_loop(S) ->
    Tab = S#state.table,
    Pending = S#state.ops,
    receive
        {write, Key, KeySize, ValType, ValSize, Val} ->
            Entry = #registry_entry{key = Key,
                                    key_size = KeySize,
                                    val_type = ValType,
                                    val_size = ValSize,
                                    val = Val},
            dump_loop(S#state{ops = [{write, Entry} | Pending]});
        {delete, Key} ->
            dump_loop(S#state{ops = [{delete, Key} | Pending]});
        {commit, ReplyTo} ->
            create_table(Tab),
            RecName = mnesia:table_info(Tab, record_name),
            %% The operations are in reverse order, but there is no
            %% need for reversing the list of accumulated operations
            case mnesia:transaction(fun handle_ops/3, [Tab, RecName, Pending]) of
                {atomic, ok} ->
                    ReplyTo ! {ok, self()},
                    stop(S#state.link_to);
                {aborted, Reason} ->
                    exit({aborted, Reason})
            end;
        abort ->
            stop(S#state.link_to);
        BadMsg ->
            exit({bad_message, BadMsg})
    end.

%% Terminates normally after the link to LinkTo has been removed.
stop(LinkTo) ->
    unlink(LinkTo),
    exit(normal).

%% Grab a write lock for the entire table
%% and iterate over all accumulated operations
handle_ops(Tab, RecName, Ops) ->
    mnesia:write_lock_table(Tab),
    do_handle_ops(Tab, RecName, Ops).

%% Performs the operations one by one. A registry entry is stored
%% under the record name of the table.
do_handle_ops(_Tab, _RecName, []) ->
    ok;
do_handle_ops(Tab, RecName, [{write, RegEntry} | Rest]) ->
    mnesia:write(Tab, setelement(1, RegEntry, RecName), write),
    do_handle_ops(Tab, RecName, Rest);
do_handle_ops(Tab, RecName, [{delete, Key} | Rest]) ->
    mnesia:delete(Tab, Key, write),
    do_handle_ops(Tab, RecName, Rest).
%%%----------------------------------------------------------------------
%%% Restore table
%%%----------------------------------------------------------------------

%% Reads all records of Tab in one transaction, reports their size
%% statistics to Starter and then waits for a {send_records, ReplyTo}
%% message before the records are sent to ReplyTo. Always terminates
%% the calling process.
restore_table(Tab, Starter, LinkTo) ->
    WildPat = mnesia:table_info(Tab, wild_pattern),
    ReadAll = fun() -> mnesia:match_object(Tab, WildPat, read) end,
    case mnesia:transaction(ReadAll) of
        {aborted, Reason} ->
            exit(Reason);
        {atomic, AllRecords} ->
            Starter ! {ok, calc_size(AllRecords, #size{})},
            receive
                {send_records, ReplyTo} ->
                    send_records(AllRecords, ReplyTo),
                    unlink(LinkTo),
                    exit(normal);
                BadMsg ->
                    exit({bad_message, BadMsg})
            end
    end.

%% Counts the records and keeps track of the largest key size and the
%% largest value size seen so far.
calc_size([], Size) ->
    Size;
calc_size([Rec | Rest], Size) ->
    LargestKey = max(element(#registry_entry.key_size, Rec), Size#size.largest_key),
    LargestVal = max(element(#registry_entry.val_size, Rec), Size#size.largest_val),
    Count = Size#size.n_values + 1,
    calc_size(Rest, Size#size{n_values = Count,
                              largest_key = LargestKey,
                              largest_val = LargestVal}).

%% Returns New only when it is strictly greater than Old.
max(New, Old) ->
    if
        New > Old -> New;
        true -> Old
    end.

%% Sends one restore message per record, in list order, to ReplyTo.
send_records(Records, ReplyTo) ->
    lists:foreach(
      fun(Rec) ->
              KeySize = element(#registry_entry.key_size, Rec),
              ValSize = element(#registry_entry.val_size, Rec),
              ValType = element(#registry_entry.val_type, Rec),
              Key = element(#registry_entry.key, Rec),
              Val = element(#registry_entry.val, Rec),
              ReplyTo ! {restore, KeySize, ValSize, ValType, Key, Val}
      end,
      Records).

diff --git a/lib/mnesia/src/mnesia_schema.erl b/lib/mnesia/src/mnesia_schema.erl new file mode 100644 index 0000000000..354431a296 --- /dev/null +++ b/lib/mnesia/src/mnesia_schema.erl @@ -0,0 +1,3027 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 1996-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. 
If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% +%% In this module we provide a number of explicit functions +%% to manipulate the schema. All these functions are called +%% within a special schema transaction. +%% +%% We also have an init/1 function defined here, this func is +%% used by mnesia:start() to initialize the entire schema. + +-module(mnesia_schema). + +-export([ + add_snmp/2, + add_table_copy/3, + add_table_index/2, + arrange_restore/3, + attr_tab_to_pos/2, + attr_to_pos/2, + change_table_copy_type/3, + change_table_access_mode/2, + change_table_load_order/2, + change_table_frag/2, + clear_table/1, + create_table/1, + cs2list/1, + del_snmp/1, + del_table_copy/2, + del_table_index/2, + delete_cstruct/2, + delete_schema/1, + delete_schema2/0, + delete_table/1, + delete_table_property/2, + dump_tables/1, + ensure_no_schema/1, + get_create_list/1, + get_initial_schema/2, + get_table_properties/1, + info/0, + info/1, + init/1, + insert_cstruct/3, + is_remote_member/1, + list2cs/1, + lock_schema/0, + merge_schema/0, + move_table/3, + opt_create_dir/2, + prepare_commit/3, + purge_dir/2, + purge_tmp_files/0, + ram_delete_table/2, +% ram_delete_table/3, + read_cstructs_from_disc/0, + read_nodes/0, + remote_read_schema/0, + restore/1, + restore/2, + restore/3, + schema_coordinator/3, + set_where_to_read/3, + transform_table/4, + undo_prepare_commit/2, + unlock_schema/0, + version/0, + write_table_property/2 + ]). + +%% Exports for mnesia_frag +-export([ + get_tid_ts_and_lock/2, + make_create_table/1, + ensure_active/1, + pick/4, + verify/3, + incr_version/1, + check_keys/3, + check_duplicates/2, + make_delete_table/2 + ]). 
%% Needed outside to be able to use/set table_properties
%% from user (not supported)
-export([schema_transaction/1,
         insert_schema_ops/2,
         do_create_table/1,
         do_delete_table/1,
         do_read_table_property/2,
         do_delete_table_property/2,
         do_write_table_property/2]).

-include("mnesia.hrl").
-include_lib("kernel/include/file.hrl").

-import(mnesia_lib, [set/2, del/2, verbose/2, dbg_out/2]).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Here comes the init function which also resides in
%% this module, it is called upon by the trans server
%% at startup of the system
%%
%% We have a meta table which looks like
%% {table, schema,
%%    {type, set},
%%    {disc_copies, all},
%%    {arity, 2}
%%    {attributes, [key, val]}
%%
%% This means that we have a series of {schema, Name, Cs} tuples
%% in a table called schema !!

%% Reads the schema, keeping the schema ets table, and sets up the
%% global variables for every table found in it. Exits when the schema
%% cannot be read.
init(IgnoreFallback) ->
    {ok, Source, _CreateList} =
        exit_on_error(read_schema(true, IgnoreFallback)),
    verbose("Schema initiated from: ~p~n", [Source]),
    set({schema, tables}, []),
    set({schema, local_tables}, []),
    Tabs = set_schema(?ets_first(schema)),
    lists:foreach(fun clear_whereabouts/1, Tabs),
    set({schema, where_to_read}, node()),
    set({schema, load_node}, node()),
    set({schema, load_reason}, initial),
    mnesia_controller:add_active_replica(schema, node()).

%% Exits with Reason on {error, Reason}, returns anything else as is.
exit_on_error({error, Reason}) ->
    exit(Reason);
exit_on_error(GoodRes) ->
    GoodRes.

%% Reads a global variable. A failed lookup is handed over to
%% mnesia_lib:other_val/2.
val(Var) ->
    case ?catch_val(Var) of
        {'EXIT', Reason} -> mnesia_lib:other_val(Var, Reason);
        Found -> Found
    end.

%% This function traverses all cstructs in the schema and
%% sets all values in mnesia_gvar accordingly for each table/cstruct.
%% Returns the traversed table names.

set_schema('$end_of_table') ->
    [];
set_schema(Tab) ->
    do_set_schema(Tab),
    [Tab | set_schema(?ets_next(schema, Tab))].

%% Returns the stored table definition list of Tab, which is the third
%% element of its entry in the schema ets table.
get_create_list(Tab) ->
    ?ets_lookup_element(schema, Tab, 3).

%% Sets the global variables of Tab from its stored table definition.
do_set_schema(Tab) ->
    do_set_schema(Tab, list2cs(get_create_list(Tab))).
%% Sets all global variables that are derived from the cstruct of Tab
%% and registers Tab among the known tables. Tab is also registered as
%% a local table when this node holds a replica of it, or when Tab is
%% the schema itself.
do_set_schema(Tab, Cs) ->
    Type = Cs#cstruct.type,
    UserProps = Cs#cstruct.user_properties,
    FragProps = Cs#cstruct.frag_properties,
    Attrs = Cs#cstruct.attributes,
    SetItem = fun({Item, Value}) -> set({Tab, Item}, Value) end,
    lists:foreach(SetItem,
                  [{setorbag, Type},
                   {local_content, Cs#cstruct.local_content},
                   {ram_copies, Cs#cstruct.ram_copies},
                   {disc_copies, Cs#cstruct.disc_copies},
                   {disc_only_copies, Cs#cstruct.disc_only_copies},
                   {load_order, Cs#cstruct.load_order},
                   {access_mode, Cs#cstruct.access_mode},
                   {snmp, Cs#cstruct.snmp},
                   {user_properties, UserProps}]),
    %% Each user property is also stored under its own name
    lists:foreach(fun(P) -> set({Tab, user_property, element(1, P)}, P) end,
                  UserProps),
    set({Tab, frag_properties}, FragProps),
    mnesia_frag:set_frag_hash(Tab, FragProps),
    set({Tab, attributes}, Attrs),
    %% The record name occupies the first position of each record
    Arity = length(Attrs) + 1,
    set({Tab, arity}, Arity),
    RecName = Cs#cstruct.record_name,
    set({Tab, record_name}, RecName),
    set({Tab, record_validation}, {RecName, Arity, Type}),
    set({Tab, wild_pattern}, wild(RecName, Arity)),
    %% create actual index tabs later
    lists:foreach(SetItem,
                  [{index, Cs#cstruct.index},
                   {cookie, Cs#cstruct.cookie},
                   {version, Cs#cstruct.version},
                   {cstruct, Cs}]),
    Storage = mnesia_lib:schema_cs_to_storage_type(node(), Cs),
    set({Tab, storage_type}, Storage),
    mnesia_lib:add({schema, tables}, Tab),
    IsLocal = lists:member(node(), mnesia_lib:cs_to_nodes(Cs)),
    if
        IsLocal; Tab == schema ->
            mnesia_lib:add({schema, local_tables}, Tab);
        true ->
            ignore
    end.

%% Returns a record of the given arity where every field is '_'.
wild(RecName, Arity) ->
    AllWild = list_to_tuple(lists:duplicate(Arity, '_')),
    setelement(1, AllWild, RecName).

%% Temporarily read the local schema and return a list
%% of all nodes mentioned in the schema.DAT file
read_nodes() ->
    %% Ensure that we access the intended Mnesia
    %% directory. This function may not be called
    %% during startup since it will cause the
    %% application_controller to get into deadlock
    case mnesia_lib:ensure_loaded(?APPLICATION) of
        {error, Reason} ->
            {error, Reason};
        ok ->
            case read_schema(false) of
                {ok, _Source, CreateList} ->
                    SchemaCs = list2cs(CreateList),
                    {ok, SchemaCs#cstruct.disc_copies ++ SchemaCs#cstruct.ram_copies};
                {error, Reason} ->
                    {error, Reason}
            end
    end.

%% Returns the first element of the schema version tuple when a
%% non-default schema can be read. Otherwise {1,0} is returned when
%% the Mnesia directory exists and {0,0} when it does not.
version() ->
    case read_schema(false) of
        {ok, Source, CreateList} when Source /= default ->
            SchemaCs = list2cs(CreateList),
            {Version, _Details} = SchemaCs#cstruct.version,
            Version;
        _ ->
            case dir_exists(mnesia_lib:dir()) of
                true -> {1,0};
                false -> {0,0}
            end
    end.

%% Calculate next table version from old cstruct
incr_version(Cs) ->
    {{Major, Minor}, _} = Cs#cstruct.version,
    DiscReplicas = mnesia_lib:intersect(val({schema, disc_copies}),
                                        mnesia_lib:cs_to_nodes(Cs)),
    Next =
        case DiscReplicas -- val({Cs#cstruct.name, active_replicas}) of
            [] -> {Major + 1, 0};    % All replicas are active
            _ -> {Major, Minor + 1}  % Some replicas are inactive
        end,
    Cs#cstruct{version = {Next, {node(), now()}}}.

%% Stores the cstruct in the schema table and updates the global
%% variables of the table. Unless KeepWhereabouts is true, the
%% whereabouts of the table are cleared when it has no active replicas.
%% Returns the stored schema record {schema, Tab, TabDef}.
insert_cstruct(Tid, Cs, KeepWhereabouts) ->
    Tab = Cs#cstruct.name,
    Val = {schema, Tab, cs2list(Cs)},
    mnesia_checkpoint:tm_retain(Tid, schema, Tab, write),
    mnesia_subscr:report_table_event(schema, Tid, Val, write),
    Active = val({Tab, active_replicas}),

    case KeepWhereabouts of
        true ->
            ignore;
        false when Active == [] ->
            clear_whereabouts(Tab);
        false ->
            %% Someone else has initiated table
            ignore
    end,
    set({Tab, cstruct}, Cs),
    ?ets_insert(schema, Val),
    do_set_schema(Tab, Cs),
    Val.
%% Resets all global variables that describe where and how the table
%% is currently accessible.
clear_whereabouts(Tab) ->
    lists:foreach(fun({Item, Initial}) -> set({Tab, Item}, Initial) end,
                  [{checkpoints, []},
                   {subscribers, []},
                   {where_to_read, nowhere},
                   {active_replicas, []},
                   {commit_work, []},
                   {where_to_write, []},
                   {where_to_commit, []},
                   {load_by_force, false},
                   {load_node, unknown},
                   {load_reason, unknown}]).

%% Removes the table from the schema table and deletes all global
%% variables of the table.
%% Returns the deleted schema record {schema, Tab, TabDef}.
delete_cstruct(Tid, Cs) ->
    Tab = Cs#cstruct.name,
    Val = {schema, Tab, cs2list(Cs)},
    mnesia_checkpoint:tm_retain(Tid, schema, Tab, delete),
    mnesia_subscr:report_table_event(schema, Tid, Val, delete),
    Cleanup =
        fun() ->
                ?ets_match_delete(mnesia_gvar, {{Tab, '_'}, '_'}),
                ?ets_match_delete(mnesia_gvar, {{Tab, '_', '_'}, '_'}),
                del({schema, local_tables}, Tab),
                del({schema, tables}, Tab),
                ?ets_delete(schema, Tab)
        end,
    mnesia_controller:update(Cleanup),
    Val.

%% Delete the Mnesia directory on all given nodes
%% Requires that Mnesia is not running anywhere
%% Returns ok | {error,Reason}
delete_schema(Ns) when is_list(Ns), Ns /= [] ->
    Reason = "Cannot delete schema on all nodes",
    case mnesia_lib:running_nodes(Ns) of
        [] ->
            case rpc:multicall(Ns, ?MODULE, delete_schema2, []) of
                {Replies, []} ->
                    case [R || R <- Replies, R /= ok] of
                        [] ->
                            ok;
                        BadReplies ->
                            verbose("~s: ~p~n", [Reason, BadReplies]),
                            {error, {"All nodes not running", BadReplies}}
                    end;
                {_Replies, BadNs} ->
                    verbose("~s: ~p~n", [Reason, BadNs]),
                    {error, {"All nodes not running", BadNs}}
            end;
        RunningNs ->
            verbose("~s: ~p~n", [Reason, RunningNs]),
            {error, {"Mnesia is not stopped everywhere", RunningNs}}
    end;
delete_schema(Ns) ->
    {error, {badarg, Ns}}.

%% Purges the local Mnesia directory, provided that Mnesia is not
%% running on this node.
delete_schema2() ->
    %% Ensure that we access the intended Mnesia
    %% directory. This function may not be called
    %% during startup since it will cause the
    %% application_controller to get into deadlock
    case mnesia_lib:ensure_loaded(?APPLICATION) of
        {error, Reason} ->
            {error, Reason};
        ok ->
            case mnesia_lib:is_running() of
                no ->
                    purge_dir(mnesia_lib:dir(), []),
                    ok;
                _ ->
                    {error, {"Mnesia still running", node()}}
            end
    end.

%% Verifies that none of the given nodes has a schema of its own.
%% Returns ok, or a tuple describing the first offending node.
ensure_no_schema([]) ->
    ok;
ensure_no_schema([Node | Rest]) when is_atom(Node) ->
    case rpc:call(Node, ?MODULE, remote_read_schema, []) of
        {badrpc, Reason} ->
            {Node, {"All nodes not running", Node, Reason}};
        {ok, Source, _} when Source /= default ->
            {Node, {already_exists, Node}};
        _ ->
            ensure_no_schema(Rest)
    end;
ensure_no_schema([BadNode | _]) ->
    {error, {badarg, BadNode}}.

%% Reads the local schema without keeping the schema ets table.
remote_read_schema() ->
    %% Ensure that we access the intended Mnesia
    %% directory. This function may not be called
    %% during startup since it will cause the
    %% application_controller to get into deadlock
    case mnesia_lib:ensure_loaded(?APPLICATION) of
        {error, Reason} ->
            {error, Reason};
        ok ->
            %% The schema is read in the same way regardless of the
            %% configured schema location
            _ = mnesia_monitor:get_env(schema_location),
            read_schema(false)
    end.

%% Tells whether Dir exists. The answer is always false when Mnesia
%% is not using a directory.
dir_exists(Dir) ->
    dir_exists(Dir, mnesia_monitor:use_dir()).

dir_exists(_Dir, false) ->
    false;
dir_exists(Dir, true) ->
    case file:read_file_info(Dir) of
        {ok, _} -> true;
        _ -> false
    end.

%% Creates the directory Dir unless it already exists, in which case
%% it is verified to be writable. Returns ok | {error, Reason}.
opt_create_dir(false, _) ->
    {error, {has_no_disc, node()}};
opt_create_dir(UseDir, Dir) when UseDir == true ->
    case dir_exists(Dir, UseDir) of
        true ->
            check_can_write(Dir);
        false ->
            case file:make_dir(Dir) of
                ok ->
                    verbose("Create Directory ~p~n", [Dir]),
                    ok;
                {error, Reason} ->
                    verbose("Cannot create mnesia dir ~p~n", [Reason]),
                    {error, {"Cannot create Mnesia dir", Dir, Reason}}
            end
    end.
%% Verifies that Dir is a directory with read and write access.
%% Returns ok | {error, {Text, Dir}}.
%%
%% The error is a regular {error, Reason} pair, like the other errors
%% returned via opt_create_dir/2, so that callers matching on
%% {error, Reason} also catch these failures.
check_can_write(Dir) ->
    case file:read_file_info(Dir) of
        {ok, FI} when FI#file_info.type == directory,
                      FI#file_info.access == read_write ->
            ok;
        {ok, _} ->
            {error, {"Not allowed to write in Mnesia dir", Dir}};
        _ ->
            {error, {"Non existent Mnesia dir", Dir}}
    end.

%% Takes the lock that protects the schema.
lock_schema() ->
    mnesia_lib:lock_table(schema).

%% Releases the lock that protects the schema.
unlock_schema() ->
    mnesia_lib:unlock_table(schema).

%% Reads the schema without ignoring a fallback, see read_schema/2.
read_schema(Keep) ->
    read_schema(Keep, false).

%% The schema may be read for several reasons.
%% If Mnesia is not already started we normally do not want
%% the ets table named schema to be left around.
%% If Keep == true, the ets table schema is kept
%% If Keep == false, the ets table schema is removed
%%
%% Returns {ok, Source, SchemaCstruct} or {error, Reason}
%% Source may be: default | ram | disc | fallback
%%
%% The schema lock is held while reading and is released even if the
%% reading raises an exception.

read_schema(Keep, IgnoreFallback) ->
    lock_schema(),
    try
        case mnesia:system_info(is_running) of
            yes ->
                {ok, ram, get_create_list(schema)};
            _IsRunning ->
                case mnesia_monitor:use_dir() of
                    true ->
                        read_disc_schema(Keep, IgnoreFallback);
                    false when Keep == true ->
                        %% No directory in use: create a default
                        %% schema and keep it in the schema ets table
                        Args = [{keypos, 2}, public, named_table, set],
                        mnesia_monitor:mktab(schema, Args),
                        CreateList = get_initial_schema(ram_copies, []),
                        ?ets_insert(schema,{schema, schema, CreateList}),
                        {ok, default, CreateList};
                    false when Keep == false ->
                        CreateList = get_initial_schema(ram_copies, []),
                        {ok, default, CreateList}
                end
        end
    after
        unlock_schema()
    end.
%% Reads the schema from disc. An existing fallback is used unless it
%% is to be ignored or Mnesia is running. Otherwise the schema DAT
%% file is read, or the DMP file when there is no DAT file.
read_disc_schema(Keep, IgnoreFallback) ->
    Running = mnesia:system_info(is_running),
    case mnesia_bup:fallback_exists() of
        true when IgnoreFallback == false, Running /= yes ->
            mnesia_bup:fallback_to_schema();
        _ ->
            %% If we're running, we read the schema file even
            %% if fallback exists
            DatFile = mnesia_lib:tab2dat(schema),
            case mnesia_lib:exists(DatFile) of
                true ->
                    do_read_disc_schema(DatFile, Keep);
                false ->
                    DmpFile = mnesia_lib:tab2dmp(schema),
                    case mnesia_lib:exists(DmpFile) of
                        true ->
                            %% May only happen when toggling of
                            %% schema storage type has been
                            %% interrupted
                            do_read_disc_schema(DmpFile, Keep);
                        false ->
                            {error, "No schema file exists"}
                    end
            end
    end.

%% Loads the schema file into an ets table. The table is a named table
%% that is kept when Keep is true, otherwise it is an unnamed table
%% that is deleted again once the schema definition has been fetched.
do_read_disc_schema(Fname, Keep) ->
    T =
        case Keep of
            false ->
                ?ets_new_table(schema, [{keypos, 2}, public, set]);
            true ->
                mnesia_monitor:mktab(schema,
                                     [{keypos, 2}, public, named_table, set])
        end,
    Repair = mnesia_monitor:get_env(auto_repair),
    Res =  % BUGBUG Fix dcl!
        case mnesia_lib:dets_to_ets(schema, T, Fname, set, Repair, no) of
            loaded -> {ok, disc, ?ets_lookup_element(T, schema, 3)};
            Other -> {error, {"Cannot read schema", Fname, Other}}
        end,
    case Keep of
        true -> ignore;
        false -> ?ets_delete_table(T)
    end,
    Res.

%% Returns the table definition list of a fresh schema table that is
%% stored, with the given storage type, on the given nodes.
get_initial_schema(SchemaStorage, Nodes) ->
    Base = #cstruct{name = schema,
                    record_name = schema,
                    attributes = [table, cstruct]},
    WithCopies =
        case SchemaStorage of
            ram_copies -> Base#cstruct{ram_copies = Nodes};
            disc_copies -> Base#cstruct{disc_copies = Nodes}
        end,
    cs2list(WithCopies).
%% Reads every table definition from the schema dets file and converts
%% each one to a #cstruct{} with list2cs/1.
%%
%% Assumptions:
%%  - local schema lock in global
%%  - use_dir is true
%%  - Mnesia is not running
%%  - Ignore fallback
%%
%% Returns {ok, Cstructs} or {error, Reason}.
read_cstructs_from_disc() ->
    Fname = mnesia_lib:tab2dat(schema),
    case mnesia_lib:exists(Fname) of
        true ->
            Args = [{file, Fname},
                    {keypos, 2},
                    {repair, mnesia_monitor:get_env(auto_repair)},
                    {type, set}],
            case dets:open_file(make_ref(), Args) of
                {ok, Tab} ->
                    Fun = fun({_, _, List}) ->
                                  {continue, list2cs(List)}
                          end,
                    %% Always close the file, also when list2cs/1
                    %% aborts in the middle of the traversal.
                    Traversed =
                        try
                            dets:traverse(Tab, Fun)
                        after
                            dets:close(Tab)
                        end,
                    %% dets:traverse/2 reports failures as
                    %% {error, Reason}; do not wrap that in {ok, _}.
                    case Traversed of
                        {error, TraverseReason} ->
                            {error, TraverseReason};
                        Cstructs ->
                            {ok, Cstructs}
                    end;
                {error, Reason} ->
                    {error, Reason}
            end;
        false ->
            {error, "No schema file exists"}
    end.

%% We run a very special type of transaction when we
%% want to manipulate the schema.

%% Fetches the current activity state from the process dictionary and,
%% depending on Intent (read | write | none), takes a table lock on Tab.
%% Returns the {Mod, Tid, Ts} state; aborts with no_transaction when the
%% caller is not inside a transaction with a #tidstore{}.
get_tid_ts_and_lock(Tab, Intent) ->
    TidTs = get(mnesia_activity_state),
    case TidTs of
        {_Mod, Tid, Ts} when is_record(Ts, tidstore) ->
            Store = Ts#tidstore.store,
            case Intent of
                read -> mnesia_locker:rlock_table(Tid, Store, Tab);
                write -> mnesia_locker:wlock_table(Tid, Store, Tab);
                none -> ignore
            end,
            TidTs;
        _ ->
            mnesia:abort(no_transaction)
    end.

%% Runs Fun as a schema transaction in a separate, linked coordinator
%% process and waits for its result. Returns the transaction result,
%% {aborted, {transaction_crashed, R}} if an exit message from the
%% coordinator is received first, or {aborted, nested_transaction} when
%% the caller already has an activity state.
schema_transaction(Fun) ->
    case get(mnesia_activity_state) of
        undefined ->
            Args = [self(), Fun, whereis(mnesia_controller)],
            Pid = spawn_link(?MODULE, schema_coordinator, Args),
            receive
                {transaction_done, Res, Pid} -> Res;
                {'EXIT', Pid, R} -> {aborted, {transaction_crashed, R}}
            end;
        _ ->
            {aborted, nested_transaction}
    end.

%% This process may dump the transaction log, and should
%% therefore not be run in an application process
%%
%% Third argument is whereis(mnesia_controller): undefined means that
%% Mnesia is not running on this node.
schema_coordinator(Client, _Fun, undefined) ->
    Res = {aborted, {node_not_running, node()}},
    Client ! {transaction_done, Res, self()},
    unlink(Client);

schema_coordinator(Client, Fun, Controller) when is_pid(Controller) ->
    %% Do not trap exit in order to automatically die
    %% when the controller dies

    link(Controller),
    unlink(Client),

    %% Fulfill the transaction even if the client dies
    Res = mnesia:transaction(Fun),
    Client ! {transaction_done, Res, self()},
    unlink(Controller),         % Avoids spurious exit message
    unlink(whereis(mnesia_tm)), % Avoids spurious exit message
    exit(normal).

%% The make* routines return a list of ops, this function
%% inserts them all in the Store and maintains the local order
%% of ops.

insert_schema_ops({_Mod, _Tid, Ts}, SchemaIOps) ->
    do_insert_schema_ops(Ts#tidstore.store, SchemaIOps).

%% Inserts each op into Store, front to back. Returns ok.
do_insert_schema_ops(Store, [Head | Tail]) ->
    ?ets_insert(Store, Head),
    do_insert_schema_ops(Store, Tail);
do_insert_schema_ops(_Store, []) ->
    ok.

%% Converts a #cstruct{} to a [{Field, Value}] create list. A value
%% that is already a list is returned unchanged.
cs2list(Cs) when is_record(Cs, cstruct) ->
    Tags = record_info(fields, cstruct),
    rec2list(Tags, 2, Cs);
cs2list(CreateList) when is_list(CreateList) ->
    CreateList.

%% Pairs each tag with the tuple element at the matching position,
%% starting at Pos (2 skips the record name).
rec2list([Tag | Tags], Pos, Rec) ->
    Val = element(Pos, Rec),
    [{Tag, Val} | rec2list(Tags, Pos + 1, Rec)];
rec2list([], _Pos, _Rec) ->
    [].
%% Converts a [{Key, Value}] table definition into a #cstruct{}.
%% Missing keys get defaults (name is mandatory). If no replica list is
%% given at all, the table becomes a ram copy on the local node.
%% Index entries given as attribute names are translated to positions,
%% and user/frag properties are stored sorted. Unknown or duplicated
%% keys, as well as anything that is not a list, abort the transaction.
list2cs(List) when is_list(List) ->
    Name = pick(unknown, name, List, must),
    Type = pick(Name, type, List, set),
    GivenRam = pick(Name, ram_copies, List, []),
    DiscCopies = pick(Name, disc_copies, List, []),
    DiscOnly = pick(Name, disc_only_copies, List, []),
    RamCopies =
        if
            GivenRam =:= [], DiscCopies =:= [], DiscOnly =:= [] ->
                [node()];
            true ->
                GivenRam
        end,
    LocalContent = pick(Name, local_content, List, false),
    RecName = pick(Name, record_name, List, Name),
    Attrs = pick(Name, attributes, List, [key, val]),
    Snmp = pick(Name, snmp, List, []),
    LoadOrder = pick(Name, load_order, List, 0),
    AccessMode = pick(Name, access_mode, List, read_write),

    UserProps = pick(Name, user_properties, List, []),
    verify({alt, [nil, list]}, mnesia_lib:etype(UserProps),
           {bad_type, Name, {user_properties, UserProps}}),

    Cookie = pick(Name, cookie, List, ?unique_cookie),
    Version = pick(Name, version, List, {{2, 0}, []}),

    Index = pick(Name, index, List, []),
    verify({alt, [nil, list]}, mnesia_lib:etype(Index),
           {bad_type, Name, {index, [Index]}}),
    IndexPositions = [attr_to_pos(I, Attrs) || I <- Index],

    FragProps = pick(Name, frag_properties, List, []),
    verify({alt, [nil, list]}, mnesia_lib:etype(FragProps),
           {badarg, Name, {frag_properties, FragProps}}),

    %% Every key must be a known cstruct field and occur only once.
    Keys = check_keys(Name, List, record_info(fields, cstruct)),
    check_duplicates(Name, Keys),

    #cstruct{name = Name,
             type = Type,
             ram_copies = RamCopies,
             disc_copies = DiscCopies,
             disc_only_copies = DiscOnly,
             load_order = LoadOrder,
             access_mode = AccessMode,
             index = IndexPositions,
             snmp = Snmp,
             local_content = LocalContent,
             record_name = RecName,
             attributes = Attrs,
             user_properties = lists:sort(UserProps),
             frag_properties = lists:sort(FragProps),
             cookie = Cookie,
             version = Version};
list2cs(Other) ->
    mnesia:abort({badarg, Other}).
%% Looks up Key in the table definition List.
%% Returns the value, or Default when the key is absent. With
%% Default == must a missing key aborts, and so does an entry for Key
%% that is not a two-element tuple.
pick(Tab, Key, List, Default) ->
    case lists:keysearch(Key, 1, List) of
        false when Default == must ->
            mnesia:abort({badarg, Tab, "Missing key", Key, List});
        false ->
            Default;
        {value, {Key, Value}} ->
            Value;
        {value, BadArg} ->
            mnesia:abort({bad_type, Tab, BadArg})
    end.

%% Convert attribute name to integer if necessary, using the
%% attribute list currently registered for Tab.
attr_tab_to_pos(_Tab, Pos) when is_integer(Pos) ->
    Pos;
attr_tab_to_pos(Tab, Attr) ->
    attr_to_pos(Attr, val({Tab, attributes})).

%% Convert attribute name to integer if necessary.
%% The first attribute has position 2 (position 1 is the record name).
attr_to_pos(Pos, _Attrs) when is_integer(Pos) ->
    Pos;
attr_to_pos(Attr, Attrs) when is_atom(Attr) ->
    attr_to_pos(Attr, Attrs, 2);
attr_to_pos(Attr, _) ->
    mnesia:abort({bad_type, Attr}).

attr_to_pos(Attr, [Attr | _Attrs], Pos) ->
    Pos;
attr_to_pos(Attr, [_ | Attrs], Pos) ->
    attr_to_pos(Attr, Attrs, Pos + 1);
attr_to_pos(Attr, _, _) ->
    mnesia:abort({bad_type, Attr}).

%% Returns the keys of a [{Key, Val}] list, aborting on the first key
%% that is not a member of Items or on a malformed list.
check_keys(Tab, [{Key, _Val} | Tail], Items) ->
    case lists:member(Key, Items) of
        true -> [Key | check_keys(Tab, Tail, Items)];
        false -> mnesia:abort({badarg, Tab, Key})
    end;
check_keys(_, [], _) ->
    [];
check_keys(Tab, Arg, _) ->
    mnesia:abort({badarg, Tab, Arg}).

%% Aborts if any key occurs more than once.
check_duplicates(Tab, Keys) ->
    case has_duplicates(Keys) of
        false -> ok;
        true -> mnesia:abort({badarg, Tab, "Duplicate keys", Keys})
    end.

has_duplicates([H | T]) ->
    case lists:member(H, T) of
        true -> true;
        false -> has_duplicates(T)
    end;
has_duplicates([]) ->
    false.

%% This is the only place where we check the validity of data.
%% Returns ok or aborts the transaction with a descriptive reason.
verify_cstruct(Cs) when is_record(Cs, cstruct) ->
    verify_nodes(Cs),

    Tab = Cs#cstruct.name,
    verify(atom, mnesia_lib:etype(Tab), {bad_type, Tab}),
    Type = Cs#cstruct.type,
    verify(true, lists:member(Type, [set, bag, ordered_set]),
           {bad_type, Tab, {type, Type}}),

    %% Currently ordered_set is not supported for disc_only_copies.
    if
        Type == ordered_set, Cs#cstruct.disc_only_copies /= [] ->
            mnesia:abort({bad_type, Tab, {not_supported, Type, disc_only_copies}});
        true ->
            ok
    end,

    RecName = Cs#cstruct.record_name,
    verify(atom, mnesia_lib:etype(RecName),
           {bad_type, Tab, {record_name, RecName}}),

    Attrs = Cs#cstruct.attributes,
    verify(list, mnesia_lib:etype(Attrs),
           {bad_type, Tab, {attributes, Attrs}}),

    %% At least two attributes (key plus one value) are required.
    Arity = length(Attrs) + 1,
    verify(true, Arity > 2, {bad_type, Tab, {attributes, Attrs}}),

    %% Every attribute must be an atom other than snmp, and unique.
    lists:foldl(fun(Attr, _Other) when Attr == snmp ->
                        mnesia:abort({bad_type, Tab, {attributes, [Attr]}});
                   (Attr, Other) ->
                        verify(atom, mnesia_lib:etype(Attr),
                               {bad_type, Tab, {attributes, [Attr]}}),
                        verify(false, lists:member(Attr, Other),
                               {combine_error, Tab, {attributes, [Attr | Other]}}),
                        [Attr | Other]
                end,
                [],
                Attrs),

    Index = Cs#cstruct.index,
    verify({alt, [nil, list]}, mnesia_lib:etype(Index),
           {bad_type, Tab, {index, Index}}),

    %% Index positions must be integers in 3..Arity; position 2 is the
    %% key and is not accepted here.
    IxFun =
        fun(Pos) ->
                verify(true, fun() ->
                                     if
                                         is_integer(Pos),
                                         Pos > 2,
                                         Pos =< Arity ->
                                             true;
                                         true -> false
                                     end
                             end,
                       {bad_type, Tab, {index, [Pos]}})
        end,
    lists:foreach(IxFun, Index),

    LC = Cs#cstruct.local_content,
    verify({alt, [true, false]}, LC,
           {bad_type, Tab, {local_content, LC}}),
    Access = Cs#cstruct.access_mode,
    verify({alt, [read_write, read_only]}, Access,
           {bad_type, Tab, {access_mode, Access}}),

    Snmp = Cs#cstruct.snmp,
    verify(true, mnesia_snmp_hook:check_ustruct(Snmp),
           {badarg, Tab, {snmp, Snmp}}),

    CheckProp = fun(Prop) when is_tuple(Prop), size(Prop) >= 1 -> ok;
                   (Prop) -> mnesia:abort({bad_type, Tab, {user_properties, [Prop]}})
                end,
    lists:foreach(CheckProp, Cs#cstruct.user_properties),

    case Cs#cstruct.cookie of
        %% The node part must be checked through the bound variable;
        %% testing the literal atom 'node' would always succeed.
        {{MegaSecs, Secs, MicroSecs}, Node}
          when is_integer(MegaSecs), is_integer(Secs),
               is_integer(MicroSecs), is_atom(Node) ->
            ok;
        Cookie ->
            mnesia:abort({bad_type, Tab, {cookie, Cookie}})
    end,
    case Cs#cstruct.version of
        {{Major, Minor}, _Detail}
          when is_integer(Major), is_integer(Minor) ->
            ok;
        Version ->
            mnesia:abort({bad_type, Tab, {version, Version}})
    end.

%% Validates the replica lists and the load order of Cs: each replica
%% list must be a list, the schema may have no disc_only_copies, the
%% combined node list must be non-empty and free of duplicates, and
%% every node must be an atom.
verify_nodes(Cs) ->
    Tab = Cs#cstruct.name,
    Ram = Cs#cstruct.ram_copies,
    Disc = Cs#cstruct.disc_copies,
    DiscOnly = Cs#cstruct.disc_only_copies,
    LoadOrder = Cs#cstruct.load_order,

    verify({alt, [nil, list]}, mnesia_lib:etype(Ram),
           {bad_type, Tab, {ram_copies, Ram}}),
    verify({alt, [nil, list]}, mnesia_lib:etype(Disc),
           {bad_type, Tab, {disc_copies, Disc}}),
    case Tab of
        schema ->
            verify([], DiscOnly, {bad_type, Tab, {disc_only_copies, DiscOnly}});
        _ ->
            verify({alt, [nil, list]},
                   mnesia_lib:etype(DiscOnly),
                   {bad_type, Tab, {disc_only_copies, DiscOnly}})
    end,
    verify(integer, mnesia_lib:etype(LoadOrder),
           {bad_type, Tab, {load_order, LoadOrder}}),

    Nodes = Ram ++ Disc ++ DiscOnly,
    verify(list, mnesia_lib:etype(Nodes),
           {combine_error, Tab,
            [{ram_copies, []}, {disc_copies, []}, {disc_only_copies, []}]}),
    verify(false, has_duplicates(Nodes), {combine_error, Tab, Nodes}),
    AtomCheck = fun(N) -> verify(atom, mnesia_lib:etype(N), {bad_type, Tab, N}) end,
    lists:foreach(AtomCheck, Nodes).

%% verify(Expected, Actual, Error) aborts with Error unless Actual
%% equals Expected, or is a member of Values when Expected is
%% {alt, Values}. If Actual is a fun, it is called (under catch) and
%% its result is what gets compared.
verify(Expected, Fun, Error) when is_function(Fun) ->
    do_verify(Expected, catch Fun(), Error);
verify(Expected, Actual, Error) ->
    do_verify(Expected, Actual, Error).

do_verify({alt, Values}, Value, Error) ->
    case lists:member(Value, Values) of
        true -> ok;
        false -> mnesia:abort(Error)
    end;
do_verify(Value, Value, _) ->
    ok;
do_verify(_Value, _, Error) ->
    mnesia:abort(Error).

%% Aborts with {read_only, Tab} when nobody can be written to.
ensure_writable(Tab) ->
    case val({Tab, where_to_write}) of
        [] -> mnesia:abort({read_only, Tab});
        _ -> ok
    end.

%% Ensure that all replicas on disk full nodes are active
ensure_active(Cs) ->
    ensure_active(Cs, active_replicas).
%% Ensures that every replica of the table which lives on a node
%% holding a disc copy of the schema is listed under {Tab, What}.
%% For local_content tables the missing nodes are asked directly,
%% otherwise the transaction is aborted with not_active.
ensure_active(Cs, What) ->
    Tab = Cs#cstruct.name,
    Key = {Tab, What},
    ensure_non_empty(Key),
    SchemaDiscNodes = val({schema, disc_copies}),
    Expected = mnesia_lib:intersect(SchemaDiscNodes,
                                    mnesia_lib:cs_to_nodes(Cs)),
    case Expected -- val(Key) of
        [] ->
            ok;
        Missing ->
            Expl = "All replicas on diskfull nodes are not active yet",
            case val({Tab, local_content}) of
                false ->
                    mnesia:abort({not_active, Expl, Tab, Missing});
                true ->
                    case rpc:multicall(Missing, ?MODULE, is_remote_member, [Key]) of
                        {Replies, []} ->
                            check_active(Replies, Expl, Tab);
                        {_Replies, BadNodes} ->
                            mnesia:abort({not_active, Expl, Tab, BadNodes})
                    end
            end
    end.

%% Aborts with {no_exists, Tab} when the value stored under the
%% given {Tab, Item} key is the empty list.
ensure_non_empty({Tab, _Item} = Key) ->
    case val(Key) of
        [] -> mnesia:abort({no_exists, Tab});
        _ -> ok
    end.

%% Only defined for the schema table: succeeds when Node is not among
%% the active replicas, provided that there are some.
ensure_not_active(Tab = schema, Node) ->
    Active = val({Tab, active_replicas}),
    case {lists:member(Node, Active), Active} of
        {true, _} ->
            Expl = "Mnesia is running",
            mnesia:abort({active, Expl, Node});
        {false, []} ->
            mnesia:abort({no_exists, Tab});
        {false, _} ->
            ok
    end.

%% Called via rpc from ensure_active/2: tells whether the local node
%% is a member of the list stored under Key.
is_remote_member(Key) ->
    Here = node(),
    {lists:member(Here, val(Key)), Here}.

%% Walks the rpc replies and aborts on the first node that is not
%% active or did not answer properly.
check_active([], _Expl, _Tab) ->
    ok;
check_active([{true, _Node} | More], Expl, Tab) ->
    check_active(More, Expl, Tab);
check_active([{false, Node} | _More], Expl, Tab) ->
    mnesia:abort({not_active, Expl, Tab, [Node]});
check_active([{badrpc, Reason} | _More], Expl, Tab) ->
    mnesia:abort({not_active, Expl, Tab, Reason}).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Here's the real interface function to create a table

create_table(TabDef) ->
    schema_transaction(fun() -> do_multi_create_table(TabDef) end).

%% And the corresponding do routines ....
%% Creates the table described by TabDef. A definition carrying
%% frag_properties is first expanded by mnesia_frag into one cstruct
%% per table to create.
do_multi_create_table(TabDef) ->
    get_tid_ts_and_lock(schema, write),
    ensure_writable(schema),
    Cs = list2cs(TabDef),
    ToCreate =
        case Cs#cstruct.frag_properties of
            [] -> [Cs];
            _Props -> mnesia_frag:expand_cstruct(Cs)
        end,
    lists:foreach(fun do_create_table/1, ToCreate),
    ok.

%% Adds the ops needed to create a single table to the transaction store.
do_create_table(Cs) ->
    TidTs = get_tid_ts_and_lock(schema, none),
    insert_schema_ops(TidTs, make_create_table(Cs)).

%% Like unsafe_make_create_table/1, but first aborts with
%% {already_exists, Tab} if the table is already there.
make_create_table(Cs) ->
    Name = Cs#cstruct.name,
    verify(false, check_if_exists(Name), {already_exists, Name}),
    unsafe_make_create_table(Cs).

%% Validates Cs, requires every disc replica node to be running, takes
%% a write lock for the not yet existing table on the running replica
%% nodes, and returns the create_table op. Does not check whether the
%% table already exists.
unsafe_make_create_table(Cs) ->
    {_Mod, Tid, Ts} = get_tid_ts_and_lock(schema, none),
    verify_cstruct(Cs),
    Name = Cs#cstruct.name,

    %% Check that we have all disc replica nodes running
    OnDisc = Cs#cstruct.disc_copies ++ Cs#cstruct.disc_only_copies,
    Running = val({current, db_nodes}),
    lists:foreach(fun(N) ->
                          verify(true, lists:member(N, Running),
                                 {not_active, Name, N})
                  end,
                  OnDisc),

    LockNodes = mnesia_lib:intersect(mnesia_lib:cs_to_nodes(Cs), Running),
    mnesia_locker:wlock_no_exist(Tid, Ts#tidstore.store, Name, LockNodes),
    [{op, create_table, cs2list(Cs)}].

%% Tells whether Tab exists from this transaction's point of view:
%% starts from whether it existed before and then lets the
%% create_table/delete_table ops found in the store decide.
check_if_exists(Tab) ->
    {_, _, Ts} = get_tid_ts_and_lock(schema, write),
    Replay =
        fun({op, create_table, [{name, T} | _]}, _Acc) when T == Tab ->
                true;
           ({op, delete_table, [{name, T} | _]}, _Acc) when T == Tab ->
                false;
           (_Other, Acc) ->
                Acc
        end,
    ets:foldl(Replay, existed_before(Tab), Ts#tidstore.store).

%% True when a cstruct for Tab can be looked up with ?catch_val.
existed_before(Tab) ->
    element(1, ?catch_val({Tab, cstruct})) =/= 'EXIT'.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Delete a table entirely on all nodes.
%% Deletes Tab on all nodes. Returns the schema transaction result.
delete_table(Tab) ->
    schema_transaction(fun() -> do_delete_table(Tab) end).

%% The schema table itself may never be deleted.
do_delete_table(schema) ->
    mnesia:abort({bad_type, schema});
do_delete_table(Tab) ->
    TidTs = get_tid_ts_and_lock(schema, write),
    ensure_writable(schema),
    insert_schema_ops(TidTs, make_delete_table(Tab, whole_table)).

%% Returns the ops that delete Tab.
%% Mode is whole_table (a fragmented base table is deleted together
%% with all its fragments) or single_frag (only Tab itself).
make_delete_table(Tab, Mode) ->
    case existed_before(Tab) of
        false ->
            %% Deleting a table that was created in this very
            %% schema transaction. Delete all ops in the Store
            %% that operate on this table. We cannot run a normal
            %% delete operation, since that involves checking live
            %% nodes etc.
            TidTs = get_tid_ts_and_lock(schema, write),
            {_, _, Ts} = TidTs,
            Store = Ts#tidstore.store,
            Deleted = ets:select_delete(
                        Store, [{{op, '$1', [{name, Tab} | '_']},
                                 [{'or',
                                   {'==', '$1', create_table},
                                   {'==', '$1', delete_table}}], [true]}]),
            %% Property ops are stored as four-element tuples tagged
            %% write_property | delete_property (see
            %% make_write_table_properties/3 and
            %% make_delete_table_properties/3), so those are the tags
            %% that must be matched here.
            ets:select_delete(
              Store, [{{op, '$1', [{name, Tab} | '_'], '_'},
                       [{'or',
                         {'==', '$1', write_property},
                         {'==', '$1', delete_property}}],
                       [true]}]),
            case Deleted of
                0 -> mnesia:abort({no_exists, Tab});
                _ -> []
            end;
        true ->
            case Mode of
                whole_table ->
                    case val({Tab, frag_properties}) of
                        [] ->
                            [make_delete_table2(Tab)];
                        _Props ->
                            %% Check if it is a base table
                            mnesia_frag:lookup_frag_hash(Tab),

                            %% Check for foreigners
                            F = mnesia_frag:lookup_foreigners(Tab),
                            verify([], F, {combine_error,
                                           Tab, "Too many foreigners", F}),
                            [make_delete_table2(T) ||
                                T <- mnesia_frag:frag_names(Tab)]
                    end;
                single_frag ->
                    [make_delete_table2(Tab)]
            end
    end.

%% Write locks Tab, checks that it is active and writable, and
%% returns its delete_table op.
make_delete_table2(Tab) ->
    get_tid_ts_and_lock(Tab, write),
    Cs = val({Tab, cstruct}),
    ensure_active(Cs),
    ensure_writable(Tab),
    {op, delete_table, cs2list(Cs)}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Change fragmentation of a table

change_table_frag(Tab, Change) ->
    schema_transaction(fun() -> do_change_table_frag(Tab, Change) end).
%% Lets mnesia_frag compute the groups of ops for the requested change
%% and stores each group. The schema table, and anything that is not
%% an atom, is rejected with bad_type.
do_change_table_frag(Tab, Change) when is_atom(Tab), Tab /= schema ->
    TidTs = get_tid_ts_and_lock(schema, write),
    OpGroups = mnesia_frag:change_table_frag(Tab, Change),
    lists:foreach(fun(Group) -> insert_schema_ops(TidTs, Group) end,
                  OpGroups),
    ok;
do_change_table_frag(Tab, _Change) ->
    mnesia:abort({bad_type, Tab}).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Clear a table

%% Runs as a schema transaction, like the other operations here.
clear_table(Tab) ->
    schema_transaction(fun() -> do_clear_table(Tab) end).

%% The schema table may not be cleared.
do_clear_table(schema) ->
    mnesia:abort({bad_type, schema});
do_clear_table(Tab) ->
    TidTs = get_tid_ts_and_lock(schema, write),
    get_tid_ts_and_lock(Tab, write),
    insert_schema_ops(TidTs, make_clear_table(Tab)).

%% Returns the clear_table op for a writable table.
make_clear_table(Tab) ->
    Current = val({Tab, cstruct}),
    ensure_writable(Tab),
    [{op, clear_table, cs2list(Current)}].

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Adds a replica of Tab with the given Storage on Node.
add_table_copy(Tab, Node, Storage) ->
    schema_transaction(fun() -> do_add_table_copy(Tab, Node, Storage) end).

do_add_table_copy(Tab, Node, Storage) when is_atom(Tab), is_atom(Node) ->
    TidTs = get_tid_ts_and_lock(schema, write),
    Ops = make_add_table_copy(Tab, Node, Storage),
    insert_schema_ops(TidTs, Ops);
do_add_table_copy(Tab, Node, _) ->
    mnesia:abort({badarg, Tab, Node}).
%% Returns the op that adds a replica of Tab on Node.
%% Aborts if Node already holds the table. A schema copy may only be
%% added as ram_copies and only on a node that is not running; for
%% other tables a disc based copy requires the node to be running.
make_add_table_copy(Tab, Node, Storage) ->
    ensure_writable(schema),
    OldCs = incr_version(val({Tab, cstruct})),
    Holders = mnesia_lib:cs_to_nodes(OldCs),
    verify(false, lists:member(Node, Holders), {already_exists, Tab, Node}),
    NewCs = new_cs(OldCs, Node, Storage, add),
    verify_cstruct(NewCs),

    %% Check storage and if node is running
    NodeIsUp = lists:member(Node, val({current, db_nodes})),
    case {Tab, Storage, NodeIsUp} of
        {schema, Other, _} when Other /= ram_copies ->
            mnesia:abort({badarg, Tab, Storage});
        {schema, _, true} ->
            mnesia:abort({already_exists, Tab, Node});
        {schema, _, false} ->
            ignore;
        {_, ram_copies, _} ->
            ignore;
        {_, _, true} ->
            ignore;
        {_, _, false} ->
            mnesia:abort({not_active, schema, Node})
    end,
    [{op, add_table_copy, Storage, Node, cs2list(NewCs)}].

%% Removes the replica of Tab that resides on Node.
del_table_copy(Tab, Node) ->
    schema_transaction(fun() -> do_del_table_copy(Tab, Node) end).

do_del_table_copy(Tab, Node) when is_atom(Node) ->
    TidTs = get_tid_ts_and_lock(schema, write),
    %% The table itself is deliberately not locked here.
    Ops = make_del_table_copy(Tab, Node),
    insert_schema_ops(TidTs, Ops);
do_del_table_copy(Tab, Node) ->
    mnesia:abort({badarg, Tab, Node}).

%% Returns the ops that remove Node's replica of Tab.
%% Removing the last replica of an ordinary table deletes the whole
%% table; the last schema replica can never be removed. Removing a
%% schema replica also removes the node from all other tables.
make_del_table_copy(Tab, Node) ->
    ensure_writable(schema),
    OldCs = incr_version(val({Tab, cstruct})),
    Storage = mnesia_lib:schema_cs_to_storage_type(Node, OldCs),
    NewCs = new_cs(OldCs, Node, Storage, del),
    case {mnesia_lib:cs_to_nodes(NewCs), Tab} of
        {[], schema} ->
            mnesia:abort({combine_error, Tab, "Last replica"});
        {[], _} ->
            ensure_active(OldCs),
            dbg_out("Last replica deleted in table ~p~n", [Tab]),
            make_delete_table(Tab, whole_table);
        {_, schema} ->
            ensure_not_active(Tab, Node),
            verify_cstruct(NewCs),
            OtherOps = remove_node_from_tabs(val({schema, tables}), Node),
            [{op, del_table_copy, ram_copies, Node, cs2list(NewCs)} | OtherOps];
        {_, _} ->
            ensure_active(OldCs),
            verify_cstruct(NewCs),
            [{op, del_table_copy, Storage, Node, cs2list(NewCs)}]
    end.
%% Returns, in table order, the ops needed to remove Node from every
%% table in Tabs. The schema table is skipped.
remove_node_from_tabs(Tabs, Node) ->
    lists:append([node_removal_ops(Tab, Node) || Tab <- Tabs, Tab =/= schema]).

%% Ops needed to remove Node from one table; [] when the table is not
%% affected at all.
node_removal_ops(Tab, Node) ->
    {Cs, IsFragModified} =
        mnesia_frag:remove_node(Node, incr_version(val({Tab, cstruct}))),
    case mnesia_lib:schema_cs_to_storage_type(Node, Cs) of
        unknown ->
            %% No replica on Node; only the frag properties may
            %% have been touched.
            case IsFragModified of
                true ->
                    [{op, change_table_frag, {del_node, Node}, cs2list(Cs)}];
                false ->
                    []
            end;
        Storage ->
            Reduced = new_cs(Cs, Node, Storage, del),
            case mnesia_lib:cs_to_nodes(Reduced) of
                [] ->
                    [{op, delete_table, cs2list(Cs)}];
                _Left ->
                    verify_cstruct(Reduced),
                    [{op, del_table_copy, ram_copies, Node, cs2list(Reduced)}]
            end
    end.

%% Returns Cs with Node added to (Op == add) or deleted from
%% (Op == del) the replica list selected by Storage. Any other
%% combination aborts with badarg.
new_cs(Cs, Node, Storage, Op) ->
    case {Storage, Op} of
        {ram_copies, add} ->
            Cs#cstruct{ram_copies = opt_add(Node, Cs#cstruct.ram_copies)};
        {disc_copies, add} ->
            Cs#cstruct{disc_copies = opt_add(Node, Cs#cstruct.disc_copies)};
        {disc_only_copies, add} ->
            Cs#cstruct{disc_only_copies =
                           opt_add(Node, Cs#cstruct.disc_only_copies)};
        {ram_copies, del} ->
            Cs#cstruct{ram_copies = lists:delete(Node, Cs#cstruct.ram_copies)};
        {disc_copies, del} ->
            Cs#cstruct{disc_copies = lists:delete(Node, Cs#cstruct.disc_copies)};
        {disc_only_copies, del} ->
            Cs#cstruct{disc_only_copies =
                           lists:delete(Node, Cs#cstruct.disc_only_copies)};
        _ ->
            mnesia:abort({badarg, Cs#cstruct.name, Storage})
    end.


%% Puts N first in L, after dropping one earlier occurrence of it.
opt_add(N, L) ->
    Without = lists:delete(N, L),
    [N | Without].

%% Moves the replica of Tab from FromNode to ToNode.
move_table(Tab, FromNode, ToNode) ->
    schema_transaction(fun() -> do_move_table(Tab, FromNode, ToNode) end).
%% The schema table may not be moved; both nodes must be atoms.
do_move_table(schema, _FromNode, _ToNode) ->
    mnesia:abort({bad_type, schema});
do_move_table(Tab, FromNode, ToNode) when is_atom(FromNode), is_atom(ToNode) ->
    TidTs = get_tid_ts_and_lock(schema, write),
    Ops = make_move_table(Tab, FromNode, ToNode),
    insert_schema_ops(TidTs, Ops);
do_move_table(Tab, FromNode, ToNode) ->
    mnesia:abort({badarg, Tab, FromNode, ToNode}).

%% Returns the ops that move a replica: add the copy on ToNode with
%% the storage type it has on FromNode, sync, then delete the copy on
%% FromNode. ToNode must be running and must not already hold the
%% table; FromNode must be writable; local_content tables are refused.
make_move_table(Tab, FromNode, ToNode) ->
    ensure_writable(schema),
    Cs0 = incr_version(val({Tab, cstruct})),
    Holders = mnesia_lib:cs_to_nodes(Cs0),
    verify(false, lists:member(ToNode, Holders),
           {already_exists, Tab, ToNode}),
    verify(true, lists:member(FromNode, val({Tab, where_to_write})),
           {not_active, Tab, FromNode}),
    verify(false, val({Tab, local_content}),
           {"Cannot move table with local content", Tab}),
    ensure_active(Cs0),
    DbNodes = val({current, db_nodes}),
    Storage = mnesia_lib:schema_cs_to_storage_type(FromNode, Cs0),
    verify(true, lists:member(ToNode, DbNodes),
           {not_active, schema, ToNode}),

    WithBoth = new_cs(Cs0, ToNode, Storage, add),
    Moved = new_cs(WithBoth, FromNode, Storage, del),
    verify_cstruct(Moved),
    [{op, add_table_copy, Storage, ToNode, cs2list(WithBoth)},
     {op, sync_trans},
     {op, del_table_copy, Storage, FromNode, cs2list(Moved)}].

%% end of functions to add and delete nodes to tables
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%

%% Changes the storage type of the replica of Tab on Node to ToS.
change_table_copy_type(Tab, Node, ToS) ->
    schema_transaction(fun() -> do_change_table_copy_type(Tab, Node, ToS) end).

do_change_table_copy_type(Tab, Node, ToS) when is_atom(Node) ->
    TidTs = get_tid_ts_and_lock(schema, write),
    get_tid_ts_and_lock(Tab, write), % ensure global sync
    Ops = make_change_table_copy_type(Tab, Node, ToS),
    insert_schema_ops(TidTs, Ops);
do_change_table_copy_type(Tab, Node, _ToS) ->
    mnesia:abort({badarg, Tab, Node}).
%% Returns the op that changes the storage type of Node's replica.
%% Changing to unknown is the same as deleting the replica.
make_change_table_copy_type(Tab, Node, unknown) ->
    make_del_table_copy(Tab, Node);
make_change_table_copy_type(Tab, Node, ToS) ->
    ensure_writable(schema),
    Cs0 = incr_version(val({Tab, cstruct})),
    FromS = mnesia_lib:storage_type_at_node(Node, Tab),

    case compare_storage_type(false, FromS, ToS) of
        {same, _} ->
            mnesia:abort({already_exists, Tab, Node, ToS});
        {diff, _} ->
            ignore;
        incompatible ->
            ensure_active(Cs0)
    end,

    Dropped = new_cs(Cs0, Node, FromS, del),
    Changed = new_cs(Dropped, Node, ToS, add),
    verify_cstruct(Changed),

    [{op, change_table_copy_type, Node, FromS, ToS, cs2list(Changed)}].

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% change index functions ....
%% Pos is already incremented by 1 in both of these functions

add_table_index(Tab, Pos) ->
    schema_transaction(fun() -> do_add_table_index(Tab, Pos) end).

%% Attr may be an attribute name or an already computed position.
do_add_table_index(schema, _Attr) ->
    mnesia:abort({bad_type, schema});
do_add_table_index(Tab, Attr) ->
    TidTs = get_tid_ts_and_lock(schema, write),
    get_tid_ts_and_lock(Tab, read),
    Position = attr_tab_to_pos(Tab, Attr),
    insert_schema_ops(TidTs, make_add_table_index(Tab, Position)).

%% Returns the add_index op; the index list is kept sorted.
make_add_table_index(Tab, Pos) ->
    ensure_writable(schema),
    Cs0 = incr_version(val({Tab, cstruct})),
    ensure_active(Cs0),
    Existing = Cs0#cstruct.index,
    verify(false, lists:member(Pos, Existing), {already_exists, Tab, Pos}),
    Indexed = Cs0#cstruct{index = lists:sort([Pos | Existing])},
    verify_cstruct(Indexed),
    [{op, add_index, Pos, cs2list(Indexed)}].

del_table_index(Tab, Pos) ->
    schema_transaction(fun() -> do_del_table_index(Tab, Pos) end).

%% Attr may be an attribute name or an already computed position.
do_del_table_index(schema, _Attr) ->
    mnesia:abort({bad_type, schema});
do_del_table_index(Tab, Attr) ->
    TidTs = get_tid_ts_and_lock(schema, write),
    get_tid_ts_and_lock(Tab, read),
    Position = attr_tab_to_pos(Tab, Attr),
    insert_schema_ops(TidTs, make_del_table_index(Tab, Position)).
%% Returns the del_index op; aborts with no_exists if Pos is not indexed.
make_del_table_index(Tab, Pos) ->
    ensure_writable(schema),
    Cs0 = incr_version(val({Tab, cstruct})),
    ensure_active(Cs0),
    Existing = Cs0#cstruct.index,
    verify(true, lists:member(Pos, Existing), {no_exists, Tab, Pos}),
    Pruned = Cs0#cstruct{index = lists:delete(Pos, Existing)},
    verify_cstruct(Pruned),
    [{op, del_index, Pos, cs2list(Pruned)}].

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Attaches the SNMP structure Ustruct to Tab.
add_snmp(Tab, Ustruct) ->
    schema_transaction(fun() -> do_add_snmp(Tab, Ustruct) end).

do_add_snmp(schema, _Ustruct) ->
    mnesia:abort({bad_type, schema});
do_add_snmp(Tab, Ustruct) ->
    TidTs = get_tid_ts_and_lock(schema, write),
    get_tid_ts_and_lock(Tab, read),
    Ops = make_add_snmp(Tab, Ustruct),
    insert_schema_ops(TidTs, Ops).

%% Returns the add_snmp op. The table must not already have an snmp
%% structure, and Ustruct must pass mnesia_snmp_hook:check_ustruct/1.
make_add_snmp(Tab, Ustruct) ->
    ensure_writable(schema),
    Cs0 = incr_version(val({Tab, cstruct})),
    ensure_active(Cs0),
    verify([], Cs0#cstruct.snmp, {already_exists, Tab, snmp}),
    verify(true, mnesia_snmp_hook:check_ustruct(Ustruct),
           {badarg, Tab, snmp, Ustruct}),
    WithSnmp = Cs0#cstruct{snmp = Ustruct},
    verify_cstruct(WithSnmp),
    [{op, add_snmp, Ustruct, cs2list(WithSnmp)}].

%% Removes the SNMP structure from Tab.
del_snmp(Tab) ->
    schema_transaction(fun() -> do_del_snmp(Tab) end).

do_del_snmp(schema) ->
    mnesia:abort({bad_type, schema});
do_del_snmp(Tab) ->
    TidTs = get_tid_ts_and_lock(schema, write),
    get_tid_ts_and_lock(Tab, read),
    Ops = make_del_snmp(Tab),
    insert_schema_ops(TidTs, Ops).

%% Returns the del_snmp op, with the snmp field reset to [].
make_del_snmp(Tab) ->
    ensure_writable(schema),
    Cs0 = incr_version(val({Tab, cstruct})),
    ensure_active(Cs0),
    WithoutSnmp = Cs0#cstruct{snmp = []},
    verify_cstruct(WithoutSnmp),
    [{op, del_snmp, cs2list(WithoutSnmp)}].
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%

%% Changes the attribute list and record name of Tab. Fun is either a
%% fun or the atom ignore. Badly typed arguments give an aborted
%% result without any transaction being started.
transform_table(Tab, Fun, NewAttrs, NewRecName)
  when is_function(Fun), is_list(NewAttrs), is_atom(NewRecName);
       Fun =:= ignore, is_list(NewAttrs), is_atom(NewRecName) ->
    schema_transaction(fun() ->
                               do_transform_table(Tab, Fun, NewAttrs, NewRecName)
                       end);
transform_table(Tab, Fun, NewAttrs, NewRecName) ->
    {aborted, {bad_type, Tab, Fun, NewAttrs, NewRecName}}.

do_transform_table(schema, _Fun, _NewAttrs, _NewRecName) ->
    mnesia:abort({bad_type, schema});
do_transform_table(Tab, Fun, NewAttrs, NewRecName) ->
    TidTs = get_tid_ts_and_lock(schema, write),
    get_tid_ts_and_lock(Tab, write),
    Ops = make_transform(Tab, Fun, NewAttrs, NewRecName),
    insert_schema_ops(TidTs, Ops).

%% Returns the ops for a transform. A table with indexes gets every
%% index deleted before the transform op and added again after it.
make_transform(Tab, Fun, NewAttrs, NewRecName) ->
    ensure_writable(schema),
    Cs0 = incr_version(val({Tab, cstruct})),
    ensure_active(Cs0),
    ensure_writable(Tab),
    case mnesia_lib:val({Tab, index}) of
        [] ->
            Changed = Cs0#cstruct{attributes = NewAttrs,
                                  record_name = NewRecName},
            verify_cstruct(Changed),
            [{op, transform, Fun, cs2list(Changed)}];
        Positions ->
            DropIx =
                fun(Pos, Acc) ->
                        Next = Acc#cstruct{index =
                                               lists:delete(Pos, Acc#cstruct.index)},
                        {{op, del_index, Pos, cs2list(Next)}, Next}
                end,
            AddIx =
                fun(Pos, Acc) ->
                        Next = Acc#cstruct{index =
                                               lists:sort([Pos | Acc#cstruct.index])},
                        {{op, add_index, Pos, cs2list(Next)}, Next}
                end,
            {DelOps, Bare} = lists:mapfoldl(DropIx, Cs0, Positions),
            Changed = Bare#cstruct{attributes = NewAttrs,
                                   record_name = NewRecName},
            {AddOps, Final} = lists:mapfoldl(AddIx, Changed, Positions),
            verify_cstruct(Final),
            DelOps ++ [{op, transform, Fun, cs2list(Changed)} | AddOps]
    end.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%

%% Switches Tab between read_write and read_only.
change_table_access_mode(Tab, Mode) ->
    schema_transaction(fun() -> do_change_table_access_mode(Tab, Mode) end).

%% Besides the schema write lock, wlock_no_exist locks are taken on
%% the active replicas of both the schema and Tab.
do_change_table_access_mode(Tab, Mode) ->
    {_Mod, Tid, Ts} = get_tid_ts_and_lock(schema, write),
    Store = Ts#tidstore.store,
    SchemaReplicas = val({schema, active_replicas}),
    mnesia_locker:wlock_no_exist(Tid, Store, schema, SchemaReplicas),
    TabReplicas = val({Tab, active_replicas}),
    mnesia_locker:wlock_no_exist(Tid, Store, Tab, TabReplicas),
    do_insert_schema_ops(Store, make_change_table_access_mode(Tab, Mode)).

%% Returns the change_table_access_mode op; aborts if Mode is already
%% the current mode.
make_change_table_access_mode(Tab, Mode) ->
    ensure_writable(schema),
    Cs0 = incr_version(val({Tab, cstruct})),
    ensure_active(Cs0),
    Previous = Cs0#cstruct.access_mode,
    verify(false, Previous == Mode, {already_exists, Tab, Mode}),
    Changed = Cs0#cstruct{access_mode = Mode},
    verify_cstruct(Changed),
    [{op, change_table_access_mode, cs2list(Changed), Previous, Mode}].

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Sets the load order of Tab.
change_table_load_order(Tab, LoadOrder) ->
    schema_transaction(fun() -> do_change_table_load_order(Tab, LoadOrder) end).

do_change_table_load_order(schema, _LoadOrder) ->
    mnesia:abort({bad_type, schema});
do_change_table_load_order(Tab, LoadOrder) ->
    TidTs = get_tid_ts_and_lock(schema, write),
    get_tid_ts_and_lock(Tab, none),
    Ops = make_change_table_load_order(Tab, LoadOrder),
    insert_schema_ops(TidTs, Ops).

%% Returns the change_table_load_order op, carrying both the old and
%% the new load order.
make_change_table_load_order(Tab, LoadOrder) ->
    ensure_writable(schema),
    Cs0 = incr_version(val({Tab, cstruct})),
    ensure_active(Cs0),
    Previous = Cs0#cstruct.load_order,
    Changed = Cs0#cstruct{load_order = LoadOrder},
    verify_cstruct(Changed),
    [{op, change_table_load_order, cs2list(Changed), Previous, LoadOrder}].

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Stores the user property Prop, a tuple whose first element is the
%% property key, in the definition of Tab.
write_table_property(Tab, Prop) when is_tuple(Prop), size(Prop) >= 1 ->
    schema_transaction(fun() -> do_write_table_property(Tab, Prop) end);
write_table_property(Tab, Prop) ->
    {aborted, {bad_type, Tab, Prop}}.
%% Writes Prop into the definition of Tab. If the store already holds
%% a create_table or property op for Tab, Prop is merged into that op;
%% otherwise a new write_property op is added.
do_write_table_property(Tab, Prop) ->
    TidTs = get_tid_ts_and_lock(schema, write),
    {_, _, Ts} = TidTs,
    Store = Ts#tidstore.store,
    case change_prop_in_existing_op(Tab, Prop, write_property, Store) of
        true ->
            dbg_out("change_prop_in_existing_op"
                    "(~p,~p,write_property,Store) -> true~n",
                    [Tab,Prop]),
            %% we have merged the table prop into the create_table op
            ok;
        false ->
            dbg_out("change_prop_in_existing_op"
                    "(~p,~p,write_property,Store) -> false~n",
                    [Tab,Prop]),
            %% this must be an existing table
            get_tid_ts_and_lock(Tab, none),
            insert_schema_ops(TidTs, make_write_table_properties(Tab, [Prop]))
    end.

%% Returns one write_property op per property in Props.
make_write_table_properties(Tab, Props) ->
    ensure_writable(schema),
    Cs = incr_version(val({Tab, cstruct})),
    ensure_active(Cs),
    make_write_table_properties(Tab, Props, Cs).

make_write_table_properties(Tab, [Prop | Props], Cs) ->
    OldProps = Cs#cstruct.user_properties,
    PropKey = element(1, Prop),
    %% Replace any previous property with the same key.
    DelProps = lists:keydelete(PropKey, 1, OldProps),
    MergedProps = lists:merge(DelProps, [Prop]),
    Cs2 = Cs#cstruct{user_properties = MergedProps},
    verify_cstruct(Cs2),
    [{op, write_property, cs2list(Cs2), Prop} |
     make_write_table_properties(Tab, Props, Cs2)];
make_write_table_properties(_Tab, [], _Cs) ->
    [].

%% Tries to fold a property change into an op that is already in the
%% store. On success the whole store content is rewritten and true is
%% returned; otherwise the store is left alone and false is returned.
change_prop_in_existing_op(Tab, Prop, How, Store) ->
    Ops = ets:match_object(Store, '_'),
    case update_existing_op(Ops, Tab, Prop, How, []) of
        {true, Ops1} ->
            ets:match_delete(Store, '_'),
            [ets:insert(Store, Op) || Op <- Ops1],
            true;
        false ->
            false
    end.

%% Returns {true, UpdatedOps} when the first create_table or property
%% op for Tab has been updated, false when there is no such op.
update_existing_op([{op, Op, L = [{name,Tab}|_], _OldProp}|Ops],
                   Tab, Prop, How, Acc) when Op == write_property;
                                             Op == delete_property ->
    %% Apparently, mnesia_dumper doesn't care about OldProp here -- just L,
    %% so we will throw away OldProp (not that it matters...) and insert Prop
    %% as the fourth element of the op tuple.
    L1 = insert_prop(Prop, L, How),
    NewOp = {op, How, L1, Prop},
    {true, lists:reverse(Acc) ++ [NewOp|Ops]};
update_existing_op([Op = {op, create_table, L}|Ops], Tab, Prop, How, Acc) ->
    case lists:keysearch(name, 1, L) of
        {value, {_, Tab}} ->
            %% Tab is being created here -- insert Prop into L
            L1 = insert_prop(Prop, L, How),
            {true, lists:reverse(Acc) ++ [{op, create_table, L1}|Ops]};
        _ ->
            update_existing_op(Ops, Tab, Prop, How, [Op|Acc])
    end;
update_existing_op([Op|Ops], Tab, Prop, How, Acc) ->
    update_existing_op(Ops, Tab, Prop, How, [Op|Acc]);
update_existing_op([], _, _, _, _) ->
    false.

%% Looks for the property Key among the user properties carried by the
%% ops for Tab in the store. Returns the property tuple, or undefined.
do_read_table_property(Tab, Key) ->
    TidTs = get_tid_ts_and_lock(schema, read),
    {_, _, Ts} = TidTs,
    Store = Ts#tidstore.store,
    Props = ets:foldl(
              fun({op, create_table, [{name, T}|Opts]}, _Acc)
                    when T==Tab ->
                      find_props(Opts);
                 %% T==Tab has to be repeated in each alternative:
                 %% ';' separates complete guard sequences, so a bare
                 %% Op==delete_property would match ops of any table.
                 ({op, Op, [{name,T}|Opts], _Prop}, _Acc)
                    when T==Tab, Op==write_property;
                         T==Tab, Op==delete_property ->
                      find_props(Opts);
                 ({op, delete_table, [{name,T}|_]}, _Acc)
                    when T==Tab ->
                      [];
                 (_Other, Acc) ->
                      Acc
              end, [], Store),
    case lists:keysearch(Key, 1, Props) of
        {value, Property} ->
            Property;
        false ->
            undefined
    end.


%% perhaps a misnomer. How could also be delete_property... never mind.
%% Returns the modified L.
insert_prop(Prop, L, How) ->
    Prev = find_props(L),
    MergedProps = merge_with_previous(How, Prop, Prev),
    replace_props(L, MergedProps).

%% Returns the value of the user_properties entry in a create list.
find_props([{user_properties, P}|_]) -> P;
find_props([_H|T]) -> find_props(T).
%% we shouldn't reach []

%% Replaces the value of the user_properties entry in a create list.
replace_props([{user_properties, _}|T], P) -> [{user_properties, P}|T];
replace_props([H|T], P) -> [H|replace_props(T, P)].
%% again, we shouldn't reach []

%% For write_property the second argument is the property tuple, for
%% delete_property it is the property key.
merge_with_previous(write_property, Prop, Prev) ->
    Key = element(1, Prop),
    Prev1 = lists:keydelete(Key, 1, Prev),
    lists:sort([Prop|Prev1]);
merge_with_previous(delete_property, PropKey, Prev) ->
    lists:keydelete(PropKey, 1, Prev).
%% Runs the property removal inside a schema transaction.
delete_table_property(Tab, PropKey) ->
    Delete = fun() -> do_delete_table_property(Tab, PropKey) end,
    schema_transaction(Delete).

%% Removes a user property. The removal is merged into an op already
%% held by the transaction store when possible; otherwise a new
%% delete_property op is inserted.
do_delete_table_property(Tab, PropKey) ->
    TidTs = get_tid_ts_and_lock(schema, write),
    {_, _, Ts} = TidTs,
    Store = Ts#tidstore.store,
    Merged = change_prop_in_existing_op(Tab, PropKey, delete_property, Store),
    case Merged of
        false ->
            dbg_out("change_prop_in_existing_op"
                    "(~p,~p,delete_property,Store) -> false~n",
                    [Tab,PropKey]),
            %% this must be an existing table
            get_tid_ts_and_lock(Tab, none),
            SchemaOps = make_delete_table_properties(Tab, [PropKey]),
            insert_schema_ops(TidTs, SchemaOps);
        true ->
            dbg_out("change_prop_in_existing_op"
                    "(~p,~p,delete_property,Store) -> true~n",
                    [Tab,PropKey]),
            %% we have merged the table prop into the create_table op
            ok
    end.

%% Builds one delete_property op per key in PropKeys.
make_delete_table_properties(Tab, PropKeys) ->
    ensure_writable(schema),
    Cs0 = incr_version(val({Tab, cstruct})),
    ensure_active(Cs0),
    make_delete_table_properties(Tab, PropKeys, Cs0).

%% Every op is derived from the cstruct left by the previous key.
make_delete_table_properties(_Tab, [], _Cs) ->
    [];
make_delete_table_properties(Tab, [PropKey | Rest], Cs) ->
    Remaining = lists:keydelete(PropKey, 1, Cs#cstruct.user_properties),
    NewCs = Cs#cstruct{user_properties = Remaining},
    verify_cstruct(NewCs),
    [{op, delete_property, cs2list(NewCs), PropKey} |
     make_delete_table_properties(Tab, Rest, NewCs)].
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Ensure that the transaction can be committed even
%% if the node crashes and Mnesia is restarted
%%
%% Returns {Modified, Commit2, DumperMode} where Modified tells whether
%% the list of schema ops was changed while preparing, Commit2 carries
%% the prepared ops and DumperMode is optional | mandatory.
prepare_commit(Tid, Commit, WaitFor) ->
    case Commit#commit.schema_ops of
        [] ->
            {false, Commit, optional};
        OrigOps ->
            {Modified, Ops, DumperMode} =
                prepare_ops(Tid, OrigOps, WaitFor, false, [], optional),
            InitBy = schema_prepare,
            %% prepare_ops/6 accumulates in reverse order
            GoodRes = {Modified,
                       Commit#commit{schema_ops = lists:reverse(Ops)},
                       DumperMode},
            case DumperMode of
                optional ->
                    dbg_out("Transaction log dump skipped (~p): ~w~n",
                            [DumperMode, InitBy]);
                mandatory ->
                    case mnesia_controller:sync_dump_log(InitBy) of
                        dumped ->
                            GoodRes;
                        {error, Reason} ->
                            mnesia:abort(Reason)
                    end
            end,
            case Ops of
                [] ->
                    ignore;
                _ ->
                    %% We need to grab a dumper lock here, the log may not
                    %% be dumped by others, during the schema commit phase.
                    mnesia_controller:wait_for_schema_commit_lock()
            end,
            GoodRes
    end.

%% Prepares every op in turn. Changed becomes true as soon as an op is
%% replaced or dropped, and DumperMode becomes (and stays) mandatory as
%% soon as one op requires it. Acc is built in reverse order.
prepare_ops(Tid, [Op | Ops], WaitFor, Changed, Acc, DumperMode) ->
    case prepare_op(Tid, Op, WaitFor) of
        {true, mandatory} ->
            prepare_ops(Tid, Ops, WaitFor, Changed, [Op | Acc], mandatory);
        {true, optional} ->
            prepare_ops(Tid, Ops, WaitFor, Changed, [Op | Acc], DumperMode);
        {true, Ops2, mandatory} ->
            prepare_ops(Tid, Ops, WaitFor, true, Ops2 ++ Acc, mandatory);
        {true, Ops2, optional} ->
            prepare_ops(Tid, Ops, WaitFor, true, Ops2 ++ Acc, DumperMode);
        {false, optional} ->
            prepare_ops(Tid, Ops, WaitFor, true, Acc, DumperMode)
    end;
prepare_ops(_Tid, [], _WaitFor, Changed, Acc, DumperMode) ->
    {Changed, Acc, DumperMode}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Prepare for commit
%% returns {true, DumperMode}         if Op should be included as is,
%%                                    i.e. unmodified
%%         {true, NewOps, DumperMode} if NewOps should be included
%%                                    instead of Op, i.e. modified
%%         {false, optional}          if Op should NOT be included,
%%                                    i.e. modified
%% where DumperMode is optional | mandatory.
%%
prepare_op(_Tid, {op, rec, unknown, Rec}, _WaitFor) ->
    {{Tab, Key}, Items, _Op} = Rec,
    case val({Tab, storage_type}) of
        unknown ->
            {false, optional};
        Storage ->
            mnesia_tm:prepare_snmp(Tab, Key, Items), % May exit
            {true, [{op, rec, Storage, Rec}], optional}
    end;

prepare_op(_Tid, {op, announce_im_running, Node, SchemaDef, Running, RemoteRunning}, _WaitFor) ->
    SchemaCs = list2cs(SchemaDef),
    if
        Node == node() -> %% Announce has already run on local node
            ignore;       %% from do_merge_schema
        true ->
            NewNodes = mnesia_lib:uniq(Running++RemoteRunning) -- val({current,db_nodes}),
            %% Remembered so that undo_prepare_op/2 can unannounce
            mnesia_lib:set(prepare_op, {announce_im_running,NewNodes}),
            announce_im_running(NewNodes, SchemaCs)
    end,
    {false, optional};

prepare_op(_Tid, {op, sync_trans}, {part, CoordPid}) ->
    CoordPid ! {sync_trans, self()},
    receive
        {sync_trans, CoordPid} ->
            {false, optional};
        {mnesia_down, _Node} = Else ->
            mnesia_lib:verbose("sync_op terminated due to ~p~n", [Else]),
            mnesia:abort(Else);
        {'EXIT', _, _} = Else ->
            mnesia_lib:verbose("sync_op terminated due to ~p~n", [Else]),
            mnesia:abort(Else)
    end;

prepare_op(_Tid, {op, sync_trans}, {coord, Nodes}) ->
    case receive_sync(Nodes, []) of
        {abort, Reason} ->
            mnesia_lib:verbose("sync_op terminated due to ~p~n", [Reason]),
            mnesia:abort(Reason);
        Pids ->
            [Pid ! {sync_trans, self()} || Pid <- Pids],
            {false, optional}
    end;
prepare_op(Tid, {op, create_table, TabDef}, _WaitFor) ->
    Cs = list2cs(TabDef),
    Storage = mnesia_lib:cs_to_storage_type(node(), Cs),
    UseDir = mnesia_monitor:use_dir(),
    Tab = Cs#cstruct.name,
    case Storage of
        disc_copies when UseDir == false ->
            UseDirReason = {bad_type, Tab, Storage, node()},
            mnesia:abort(UseDirReason);
        disc_only_copies when UseDir == false ->
            UseDirReason = {bad_type, Tab, Storage, node()},
            mnesia:abort(UseDirReason);
        ram_copies ->
            mnesia_lib:set({Tab, create_table},true),
            create_ram_table(Tab, Cs#cstruct.type),
            insert_cstruct(Tid, Cs, false),
            {true, optional};
        disc_copies ->
            mnesia_lib:set({Tab, create_table},true),
            create_ram_table(Tab, Cs#cstruct.type),
            create_disc_table(Tab),
            insert_cstruct(Tid, Cs, false),
            {true, optional};
        disc_only_copies ->
            mnesia_lib:set({Tab, create_table},true),
            create_disc_only_table(Tab,Cs#cstruct.type),
            insert_cstruct(Tid, Cs, false),
            {true, optional};
        unknown -> %% No replica on this node
            mnesia_lib:set({Tab, create_table},true),
            insert_cstruct(Tid, Cs, false),
            {true, optional}
    end;

prepare_op(Tid, {op, add_table_copy, Storage, Node, TabDef}, _WaitFor) ->
    Cs = list2cs(TabDef),
    Tab = Cs#cstruct.name,

    if
        Tab == schema ->
            {true, optional};

        Node == node() ->
            case mnesia_lib:val({schema, storage_type}) of
                ram_copies when Storage /= ram_copies ->
                    Error = {combine_error, Tab, "has no disc", Node},
                    mnesia:abort(Error);
                _ ->
                    ok
            end,
            %% Tables are created by mnesia_loader get_network code
            insert_cstruct(Tid, Cs, true),
            case mnesia_controller:get_network_copy(Tab, Cs) of
                {loaded, ok} ->
                    {true, optional};
                {not_loaded, ErrReason} ->
                    Reason = {system_limit, Tab, {Node, ErrReason}},
                    mnesia:abort(Reason)
            end;
        Node /= node() ->
            %% Verify that ram table not has been dumped to disc
            if
                Storage /= ram_copies ->
                    case mnesia_lib:schema_cs_to_storage_type(node(), Cs) of
                        ram_copies ->
                            Dat = mnesia_lib:tab2dcd(Tab),
                            case mnesia_lib:exists(Dat) of
                                true ->
                                    mnesia:abort({combine_error, Tab, Storage,
                                                  "Table dumped to disc", node()});
                                false ->
                                    ok
                            end;
                        _ ->
                            ok
                    end;
                true ->
                    ok
            end,
            insert_cstruct(Tid, Cs, true),
            {true, optional}
    end;

prepare_op(Tid, {op, del_table_copy, _Storage, Node, TabDef}, _WaitFor) ->
    Cs = list2cs(TabDef),
    Tab = Cs#cstruct.name,

    if
        %% Schema table lock is always required to run a schema op.
        %% No need to lock it.
        node(Tid#tid.pid) == node(), Tab /= schema ->
            Self = self(),
            Pid = spawn_link(fun() -> lock_del_table(Tab, Node, Cs, Self) end),
            put(mnesia_lock, Pid),
            receive
                {Pid, updated} ->
                    {true, optional};
                {Pid, FailReason} ->
                    mnesia:abort(FailReason);
                {'EXIT', Pid, Reason} ->
                    mnesia:abort(Reason)
            end;
        true ->
            {true, optional}
    end;

prepare_op(_Tid, {op, change_table_copy_type, N, FromS, ToS, TabDef}, _WaitFor)
  when N == node() ->
    Cs = list2cs(TabDef),
    Tab = Cs#cstruct.name,

    NotActive = mnesia_lib:not_active_here(Tab),

    if
        NotActive == true ->
            mnesia:abort({not_active, Tab, node()});

        Tab == schema ->
            case {FromS, ToS} of
                {ram_copies, disc_copies} ->
                    case mnesia:system_info(schema_location) of
                        opt_disc ->
                            ignore;
                        _ ->
                            mnesia:abort({combine_error, Tab, node(),
                                          "schema_location must be opt_disc"})
                    end,
                    Dir = mnesia_lib:dir(),
                    case opt_create_dir(true, Dir) of
                        ok ->
                            purge_dir(Dir, []),
                            mnesia_log:purge_all_logs(),
                            set(use_dir, true),
                            mnesia_log:init(),
                            Ns = val({current, db_nodes}), %mnesia_lib:running_nodes(),
                            F = fun(U) -> mnesia_recover:log_mnesia_up(U) end,
                            lists:foreach(F, Ns),

                            mnesia_dumper:raw_named_dump_table(Tab, dmp),
                            mnesia_checkpoint:tm_change_table_copy_type(Tab, FromS, ToS);
                        {error, Reason} ->
                            mnesia:abort(Reason)
                    end;
                {disc_copies, ram_copies} ->
                    Ltabs = val({schema, local_tables}) -- [schema],
                    Dtabs = [L || L <- Ltabs,
                                  val({L, storage_type}) /= ram_copies],
                    verify([], Dtabs, {"Disc resident tables", Dtabs, N});
                _ ->
                    mnesia:abort({combine_error, Tab, ToS})
            end;

        FromS == ram_copies ->
            case mnesia_monitor:use_dir() of
                true ->
                    Dat = mnesia_lib:tab2dcd(Tab),
                    case mnesia_lib:exists(Dat) of
                        true ->
                            mnesia:abort({combine_error, Tab, node(),
                                          "Table dump exists"});
                        false ->
                            case ToS of
                                disc_copies ->
                                    mnesia_log:ets2dcd(Tab, dmp);
                                disc_only_copies ->
                                    mnesia_dumper:raw_named_dump_table(Tab, dmp)
                            end,
                            mnesia_checkpoint:tm_change_table_copy_type(Tab, FromS, ToS)
                    end;
                false ->
                    mnesia:abort({has_no_disc, node()})
            end;

        FromS == disc_copies, ToS == disc_only_copies ->
            mnesia_dumper:raw_named_dump_table(Tab, dmp);
        FromS == disc_only_copies ->
            Type = Cs#cstruct.type,
            create_ram_table(Tab, Type),
            Datname = mnesia_lib:tab2dat(Tab),
            Repair = mnesia_monitor:get_env(auto_repair),
            case mnesia_lib:dets_to_ets(Tab, Tab, Datname, Type, Repair, no) of
                loaded -> ok;
                Reason ->
                    Err = "Failed to copy disc data to ram",
                    mnesia:abort({system_limit, Tab, {Err,Reason}})
            end;
        true ->
            ignore
    end,
    {true, mandatory};

prepare_op(_Tid, {op, change_table_copy_type, N, _FromS, _ToS, _TabDef}, _WaitFor)
  when N /= node() ->
    {true, mandatory};

prepare_op(_Tid, {op, delete_table, _TabDef}, _WaitFor) ->
    {true, mandatory};

prepare_op(_Tid, {op, dump_table, unknown, TabDef}, _WaitFor) ->
    Cs = list2cs(TabDef),
    Tab = Cs#cstruct.name,
    case lists:member(node(), Cs#cstruct.ram_copies) of
        true ->
            case mnesia_monitor:use_dir() of
                true ->
                    mnesia_log:ets2dcd(Tab, dmp),
                    Size = mnesia:table_info(Tab, size),
                    %% The op is rewritten to carry the table size
                    {true, [{op, dump_table, Size, TabDef}], optional};
                false ->
                    mnesia:abort({has_no_disc, node()})
            end;
        false ->
            {false, optional}
    end;

prepare_op(_Tid, {op, add_snmp, Ustruct, TabDef}, _WaitFor) ->
    Cs = list2cs(TabDef),
    case mnesia_lib:cs_to_storage_type(node(), Cs) of
        unknown ->
            {true, optional};
        Storage ->
            Tab = Cs#cstruct.name,
            Stab = mnesia_snmp_hook:create_table(Ustruct, Tab, Storage),
            mnesia_lib:set({Tab, {index, snmp}}, Stab),
            {true, optional}
    end;

prepare_op(_Tid, {op, transform, ignore, _TabDef}, _WaitFor) ->
    {true, mandatory}; %% Apply schema changes only.
prepare_op(_Tid, {op, transform, Fun, TabDef}, _WaitFor) ->
    Cs = list2cs(TabDef),
    case mnesia_lib:cs_to_storage_type(node(), Cs) of
        unknown ->
            {true, mandatory};
        Storage ->
            Tab = Cs#cstruct.name,
            RecName = Cs#cstruct.record_name,
            Type = Cs#cstruct.type,
            NewArity = length(Cs#cstruct.attributes) + 1,
            mnesia_lib:db_fixtable(Storage, Tab, true),
            Key = mnesia_lib:db_first(Tab),
            Op = {op, transform, Fun, TabDef},
            %% The table is unfixed again on both outcomes
            case catch transform_objs(Fun, Tab, RecName,
                                      Key, NewArity, Storage, Type, [Op]) of
                {'EXIT', Reason} ->
                    mnesia_lib:db_fixtable(Storage, Tab, false),
                    exit({"Bad transform function", Tab, Fun, node(), Reason});
                Objs ->
                    mnesia_lib:db_fixtable(Storage, Tab, false),
                    {true, Objs, mandatory}
            end
    end;

prepare_op(Tid, {op, merge_schema, TabDef}, _WaitFor) ->
    Cs = list2cs(TabDef),
    case verify_merge(Cs) of
        ok  ->
            {true, optional};
        Error ->
            verbose("Merge_Schema ~p failed on ~p: ~p~n", [Tid,node(),Error]),
            mnesia:abort({bad_commit, Error})
    end;
prepare_op(_Tid, _Op, _WaitFor) ->
    {true, optional}.

%% Creates the named, public ets table for Tab with the record key in
%% position 2. Aborts the transaction with system_limit on failure.
create_ram_table(Tab, Type) ->
    Args = [{keypos, 2}, public, named_table, Type],
    case mnesia_monitor:unsafe_mktab(Tab, Args) of
        Tab ->
            ok;
        {error,Reason} ->
            Err = "Failed to create ets table",
            mnesia:abort({system_limit, Tab, {Err,Reason}})
    end.
%% Removes any old file for Tab (as named by mnesia_lib:tab2dcd/1) and
%% creates a fresh log in its place by opening and closing it. Aborts
%% the transaction with system_limit when the log cannot be opened.
create_disc_table(Tab) ->
    DcdFile = mnesia_lib:tab2dcd(Tab),
    file:delete(DcdFile),
    LogArgs = [{file, DcdFile}, {name, {mnesia,create}},
               {repair, false}, {mode, read_write}],
    case mnesia_monitor:open_log(LogArgs) of
        {error,Reason} ->
            mnesia:abort({system_limit, Tab,
                          {"Failed to create disc table",Reason}});
        {ok,Log} ->
            mnesia_monitor:unsafe_close_log(Log),
            ok
    end.

%% Removes any old file for Tab (as named by mnesia_lib:tab2dat/1) and
%% opens a new dets table there. Aborts the transaction with
%% system_limit when the table cannot be opened.
create_disc_only_table(Tab,Type) ->
    DatFile = mnesia_lib:tab2dat(Tab),
    file:delete(DatFile),
    DetsArgs = [{file, mnesia_lib:tab2dat(Tab)},
                {type, mnesia_lib:disk_type(Tab, Type)},
                {keypos, 2},
                {repair, mnesia_monitor:get_env(auto_repair)}],
    case mnesia_monitor:unsafe_open_dets(Tab, DetsArgs) of
        {error,Reason} ->
            mnesia:abort({system_limit, Tab,
                          {"Failed to create disc table",Reason}});
        {ok, _} ->
            ok
    end.


%% Waits for a {sync_trans, Pid} message from every node in the list.
%% Returns the collected pids, or {abort, Msg} as soon as any other
%% message arrives.
receive_sync([], Pids) ->
    Pids;
receive_sync(Nodes, Pids) ->
    receive
        {sync_trans, Pid} ->
            Remaining = lists:delete(node(Pid), Nodes),
            receive_sync(Remaining, [Pid | Pids]);
        Other ->
            {abort, Other}
    end.

%% Body of the helper process spawned when a table copy is deleted.
%% It takes a write lock on Tab in a transaction of its own, asks all
%% active schema replicas to update where_to_read and reports the
%% outcome to Father as {self(), updated} or {self(), Reason}.
lock_del_table(Tab, Node, Cs, Father) ->
    Replicas = val({schema, active_replicas}),
    process_flag(trap_exit,true),
    IsFailure =
        fun(ok) ->
                false;
           ({badrpc, {'EXIT', {undef, _}}}) ->
                %% This will be the case when we talk with nodes older
                %% than 3.8.2, they will set where_to_read without
                %% getting a lock.
                false;
           (_) ->
                true
        end,
    LockFun =
        fun() ->
                mnesia:write_lock_table(Tab),
                {Replies, []} = rpc:multicall(Replicas, ?MODULE,
                                              set_where_to_read,
                                              [Tab, Node, Cs]),
                case [Reply || Reply <- Replies, IsFailure(Reply)] of
                    [] ->
                        Father ! {self(), updated},
                        %% When transaction is committed the process dies
                        %% and the lock is released.
                        receive _ -> ok end;
                    Failures ->
                        Father ! {self(), {bad_commit, Failures}}
                end,
                ok
        end,
    case mnesia:transaction(LockFun) of
        {atomic, ok} -> ok;
        {aborted, Why} -> Father ! {self(), Why}
    end,
    unlink(Father),
    unlink(whereis(mnesia_tm)),
    exit(normal).
%% Called via rpc from lock_del_table/4. When this node currently
%% reads Tab from Node and the table is not local_content, the read
%% location is recalculated through
%% mnesia_lib:set_remote_where_to_read/2. Always returns ok.
set_where_to_read(Tab, Node, Cs) ->
    case mnesia_lib:val({Tab, where_to_read}) of
        Node ->
            case Cs#cstruct.local_content of
                true ->
                    ok;
                false ->
                    mnesia_lib:set_remote_where_to_read(Tab, [Node]),
                    ok
            end;
        _ ->
            ok
    end.

%% Walks the table key by key and turns the result of applying Fun to
%% the stored objects into a list of rec ops.
%% Build up the list in reverse order.
transform_objs(_Fun, _Tab, _RT, '$end_of_table', _NewArity, _Storage, _Type, Acc) ->
    Acc;
transform_objs(Fun, Tab, RecName, Key, A, Storage, Type, Acc) ->
    Objs = mnesia_lib:db_get(Tab, Key),
    NextKey = mnesia_lib:db_next_key(Tab, Key),
    Oid = {Tab, Key},
    %% Ws: transformed objects to write, Ds: one entry per object the
    %% fun asked to have deleted
    NewObjs = {Ws, Ds} = transform_obj(Tab, RecName, Key, Fun, Objs, A, Type, [], []),
    if
        NewObjs == {[], []} ->
            %% Nothing changed for this key
            transform_objs(Fun, Tab, RecName, NextKey, A, Storage, Type, Acc);
        Type == bag ->
            %% The ops are applied in reverse order of this list, i.e.
            %% the key is first deleted and then the new objects written
            transform_objs(Fun, Tab, RecName, NextKey, A, Storage, Type,
                           [{op, rec, Storage, {Oid, Ws, write}},
                            {op, rec, Storage, {Oid, [Oid], delete}} | Acc]);
        Ds == [] ->
            %% Type is set or ordered_set, no need to delete the record first
            transform_objs(Fun, Tab, RecName, NextKey, A, Storage, Type,
                           [{op, rec, Storage, {Oid, Ws, write}} | Acc]);
        Ws == [] ->
            %% Only deletions for this key. This must be a delete op;
            %% it used to be emitted as a write of Ds, which contains
            %% the atom 'delete' rather than records.
            transform_objs(Fun, Tab, RecName, NextKey, A, Storage, Type,
                           [{op, rec, Storage, {Oid, [Oid], delete}} | Acc]);
        true ->
            transform_objs(Fun, Tab, RecName, NextKey, A, Storage, Type,
                           [{op, rec, Storage, {Oid, Ws, write}},
                            {op, rec, Storage, {Oid, Ds, delete}} | Acc])
    end.
%% Applies Fun to every object stored under one key and sorts the
%% results into objects to write (Ws) and deletion markers (Ds).
%% Unchanged objects are skipped. Exits when a transformed object has
%% the wrong arity, key or record name.
%% Returns {Ws, Ds}, both in the order of the incoming objects.
transform_obj(_Tab, _RecName, _Key, _Fun, [], _NewArity, _Type, Ws, Ds) ->
    {lists:reverse(Ws), lists:reverse(Ds)};
transform_obj(Tab, RecName, Key, Fun, [Old | Objs], NewArity, Type, Ws, Ds) ->
    New = Fun(Old),
    if
        %% A guard that fails (e.g. size/1 on the atom 'delete') simply
        %% selects the next alternative.
        size(New) /= NewArity ->
            exit({"Bad arity", Old, New});
        New == Old ->
            transform_obj(Tab, RecName, Key, Fun, Objs, NewArity, Type, Ws, Ds);
        RecName == element(1, New), Key == element(2, New) ->
            transform_obj(Tab, RecName, Key, Fun, Objs, NewArity,
                          Type, [New | Ws], Ds);
        New == delete, Type == bag ->
            %% Just don't write that object
            transform_obj(Tab, RecName, Key, Fun, Objs,
                          NewArity, Type, Ws, Ds);
        New == delete ->
            transform_obj(Tab, RecName, Key, Fun, Objs, NewArity,
                          Type, Ws, [New | Ds]);
        true ->
            exit({"Bad key or Record Name", Old, New})
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Undo prepare of commit
%% Releases the schema commit lock and undoes the prepared schema ops.
%% Always returns Commit unchanged.
undo_prepare_commit(Tid, Commit) ->
    case Commit#commit.schema_ops of
        [] ->
            ignore;
        SchemaOps ->
            %% Catch to allow failure mnesia_controller may not be started
            catch mnesia_controller:release_schema_commit_lock(),
            undo_prepare_ops(Tid, SchemaOps)
    end,
    Commit.

%% Undo in reverse order
%% Only tuples tagged op or restore_op are handed to undo_prepare_op/2.
undo_prepare_ops(_Tid, []) ->
    [];
undo_prepare_ops(Tid, [Op | Ops]) ->
    case element(1, Op) of
        Tag when Tag == op; Tag == restore_op ->
            undo_prepare_ops(Tid, Ops),
            undo_prepare_op(Tid, Op);
        _Other ->
            undo_prepare_ops(Tid, Ops)
    end.
%% Undoes the local side effects of one prepared schema op. Ops
%% without a dedicated clause are ignored.
undo_prepare_op(_Tid, {op, announce_im_running, _Node, _, _Running, _RemoteRunning}) ->
    %% prepare_op is only set when nodes actually were announced
    case ?catch_val(prepare_op) of
        {announce_im_running, Announced} ->
            unannounce_im_running(Announced);
        _Else ->
            ok
    end;

undo_prepare_op(_Tid, {op, sync_trans}) ->
    ok;

undo_prepare_op(Tid, {op, create_table, TabDef}) ->
    Cs = list2cs(TabDef),
    Tab = Cs#cstruct.name,
    mnesia_lib:unset({Tab, create_table}),
    delete_cstruct(Tid, Cs),
    case mnesia_lib:cs_to_storage_type(node(), Cs) of
        unknown ->
            ok;
        ram_copies ->
            ram_delete_table(Tab, ram_copies);
        disc_copies ->
            ram_delete_table(Tab, disc_copies),
            %% disc_delete_table(Tab, Storage),
            file:delete(mnesia_lib:tab2dcd(Tab));
        disc_only_copies ->
            mnesia_monitor:unsafe_close_dets(Tab),
            %% disc_delete_table(Tab, Storage),
            file:delete(mnesia_lib:tab2dat(Tab))
    end;

undo_prepare_op(Tid, {op, add_table_copy, Storage, Node, TabDef}) ->
    Cs = list2cs(TabDef),
    Tab = Cs#cstruct.name,
    if
        Tab == schema ->
            true; % Nothing to prepare
        Node == node() ->
            mnesia_checkpoint:tm_del_copy(Tab, Node),
            mnesia_controller:unannounce_add_table_copy(Tab, Node),
            if
                Storage == disc_only_copies; Tab == schema ->
                    mnesia_monitor:close_dets(Tab),
                    file:delete(mnesia_lib:tab2dat(Tab));
                true ->
                    file:delete(mnesia_lib:tab2dcd(Tab))
            end,
            ram_delete_table(Tab, Storage),
            RevertedCs = new_cs(Cs, Node, Storage, del),
            insert_cstruct(Tid, RevertedCs, true); % Don't care about the version
        Node /= node() ->
            mnesia_controller:unannounce_add_table_copy(Tab, Node),
            RevertedCs = new_cs(Cs, Node, Storage, del),
            insert_cstruct(Tid, RevertedCs, true) % Don't care about the version
    end;

undo_prepare_op(_Tid, {op, del_table_copy, _, Node, TabDef})
  when Node == node() ->
    Cs = list2cs(TabDef),
    mnesia_lib:set({Cs#cstruct.name, where_to_read}, Node);


undo_prepare_op(_Tid, {op, change_table_copy_type, N, FromS, ToS, TabDef})
  when N == node() ->
    Cs = list2cs(TabDef),
    Tab = Cs#cstruct.name,
    %% Note the swapped storage arguments: revert ToS back to FromS
    mnesia_checkpoint:tm_change_table_copy_type(Tab, ToS, FromS),
    DmpFile = mnesia_lib:tab2dmp(Tab),

    case {FromS, ToS} of
        {ram_copies, disc_copies} when Tab == schema ->
            file:delete(DmpFile),
            mnesia_log:purge_some_logs(),
            set(use_dir, false);
        {ram_copies, disc_copies} ->
            file:delete(DmpFile);
        {ram_copies, disc_only_copies} ->
            file:delete(DmpFile);
        {disc_only_copies, _} ->
            ram_delete_table(Tab, ram_copies);
        _ ->
            ignore
    end;

undo_prepare_op(_Tid, {op, dump_table, _Size, TabDef}) ->
    Cs = list2cs(TabDef),
    case lists:member(node(), Cs#cstruct.ram_copies) of
        true ->
            file:delete(mnesia_lib:tab2dmp(Cs#cstruct.name));
        false ->
            ignore
    end;

undo_prepare_op(_Tid, {op, add_snmp, _Ustruct, TabDef}) ->
    Cs = list2cs(TabDef),
    case mnesia_lib:cs_to_storage_type(node(), Cs) of
        unknown ->
            true;
        _Storage ->
            Tab = Cs#cstruct.name,
            case ?catch_val({Tab, {index, snmp}}) of
                {'EXIT',_} ->
                    ignore;
                SnmpTab ->
                    mnesia_snmp_hook:delete_table(Tab, SnmpTab),
                    mnesia_lib:unset({Tab, {index, snmp}})
            end
    end;

undo_prepare_op(_Tid, _Op) ->
    ignore.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Removes the ram resident parts of a table: transient index data, a
%% possible snmp table and the ets table itself. Nothing is done for
%% the storage types unknown and disc_only_copies.
ram_delete_table(_Tab, unknown) ->
    ignore;
ram_delete_table(_Tab, disc_only_copies) ->
    ignore;
ram_delete_table(Tab, Storage) ->
    %% delete possible index files and data .....
    %% Got to catch this since if no info has been set in the
    %% mnesia_gvar it will crash
    catch mnesia_index:del_transient(Tab, Storage),
    case ?catch_val({Tab, {index, snmp}}) of
        {'EXIT', _} ->
            ignore;
        SnmpTab ->
            catch mnesia_snmp_hook:delete_table(Tab, SnmpTab)
    end,
    catch ?ets_delete_table(Tab).

%% Deletes every file in Dir that has one of the known suffixes,
%% except the files listed in KeepFiles.
purge_dir(Dir, KeepFiles) ->
    purge_dir(Dir, KeepFiles, known_suffixes()).

%% As purge_dir/2, but only files with one of the given suffixes are
%% deleted. Does nothing when dir_exists/1 does not return true.
purge_dir(Dir, KeepFiles, Suffixes) ->
    case dir_exists(Dir) of
        false ->
            ok;
        true ->
            {ok, FileNames} = file:list_dir(Dir),
            purge_known_files(FileNames, KeepFiles, Dir, Suffixes)
    end.
%% Cleans the Mnesia directory when a directory is in use. Normally
%% only temporary files are removed. If the schema file named by
%% mnesia_lib:tab2dat(schema) does not exist, files with any known
%% suffix are removed and use_dir is reset to false.
purge_tmp_files() ->
    case mnesia_monitor:use_dir() of
        false ->
            ok;
        true ->
            Dir = mnesia_lib:dir(),
            SchemaDat = mnesia_lib:tab2dat(schema),
            case mnesia_lib:exists(SchemaDat) of
                true ->
                    purge_dir(Dir, [], tmp_suffixes());
                false ->
                    %% Interrupted change of storage type
                    %% for schema table
                    purge_dir(Dir, [], known_suffixes()),
                    mnesia_lib:set(use_dir, false)
            end
    end.

%% Deletes each listed file in Dir that is not a member of KeepFiles
%% and that ends with one of Suffixes. The result of file:delete/1 is
%% ignored. Always returns ok.
purge_known_files(Files, KeepFiles, Dir, Suffixes) ->
    Purge =
        fun(File) ->
                Keep = lists:member(File, KeepFiles),
                case (not Keep) andalso has_known_suffix(File, Suffixes, false) of
                    true ->
                        file:delete(filename:join([Dir, File]));
                    false ->
                        ignore
                end
        end,
    lists:foreach(Purge, Files).

%% The third argument tells whether a match has already been found;
%% otherwise File is tested against each suffix.
has_known_suffix(_File, _Suffixes, true) ->
    true;
has_known_suffix(File, Suffixes, false) ->
    lists:any(fun(Suffix) -> lists:suffix(Suffix, File) end, Suffixes).

%% All file suffixes subject to purging.
known_suffixes() -> real_suffixes() ++ tmp_suffixes().

real_suffixes() -> [".DAT", ".LOG", ".BUP", ".DCL", ".DCD"].

tmp_suffixes() -> [".TMP", ".BUPTMP", ".RET", ".DMP"].

%% Prints the properties of all tables, sorted by table name.
info() ->
    [info(Tab) || Tab <- lists:sort(val({schema, tables}))],
    ok.

%% Prints the properties of one table.
info(Tab) ->
    Props = get_table_properties(Tab),
    io:format("-- Properties for ~w table --- ~n",[Tab]),
    info2(Tab, Props).

%% Prints one line per property; cstruct and frag_hash are left out.
info2(_, []) ->
    io:format("~n", []);
info2(Tab, [{Hidden, _V} | Tail]) when Hidden == cstruct; Hidden == frag_hash ->
    info2(Tab, Tail);
info2(Tab, [{P, V} | Tail]) ->
    io:format("~-20w -> ~p~n",[P,V]),
    info2(Tab, Tail).
%% Returns a sorted list of {Item, Value} for everything stored about
%% Tab in mnesia_gvar, extended with size, memory and master_nodes.
%% Returns [] when nothing is stored for Tab and aborts with no_exists
%% when mnesia_gvar cannot be read.
get_table_properties(Tab) ->
    Pattern = {{Tab, '_'}, '_'},
    case catch mnesia_lib:db_match_object(ram_copies, mnesia_gvar, Pattern) of
        {'EXIT', _} ->
            mnesia:abort({no_exists, Tab, all});
        RawGvar ->
            Stored = [{Item, Val} || {{_Tab, Item}, Val} <- RawGvar],
            case Stored of
                [] ->
                    [];
                _ ->
                    Size = {size, mnesia:table_info(Tab, size)},
                    Memory = {memory, mnesia:table_info(Tab, memory)},
                    Master = {master_nodes, mnesia:table_info(Tab, master_nodes)},
                    lists:sort([Size, Memory, Master | Stored])
            end
    end.

%%%%%%%%%%% RESTORE %%%%%%%%%%%

-record(r, {iter = schema,
            module,
            table_options = [],
            default_op = clear_tables,
            tables = [],
            opaque,
            insert_op = error_fun,
            recs = error_recs
           }).

%% Restore with no options and the configured backup module.
restore(Opaque) ->
    restore(Opaque, [], mnesia_monitor:get_env(backup_module)).

%% Restore with an option list and the configured backup module.
restore(Opaque, Args) when is_list(Args) ->
    restore(Opaque, Args, mnesia_monitor:get_env(backup_module));
restore(_Opaque, BadArg) ->
    {aborted, {badarg, BadArg}}.

%% Checks the options, reads the schema of the backup and performs the
%% restore as a schema transaction. Bad options and unreadable backups
%% yield {aborted, Reason}.
restore(Opaque, Args, Module) when is_list(Args), is_atom(Module) ->
    InitR = #r{opaque = Opaque, module = Module},
    %% check_restore_arg/2 exits on bad options
    case catch lists:foldl(fun check_restore_arg/2, InitR, Args) of
        {'EXIT', Reason} ->
            {aborted, Reason};
        #r{module = BupMod} = R ->
            case mnesia_bup:read_schema(BupMod, Opaque) of
                {error, Reason} ->
                    {aborted, Reason};
                BupSchema ->
                    Restore = fun() -> do_restore(R, BupSchema) end,
                    schema_transaction(Restore)
            end
    end;
restore(_Opaque, Args, Module) ->
    {aborted, {badarg, Args, Module}}.
%% Folds one restore option into the #r{} record and exits with
%% {badarg, _} on anything unknown. The schema table may be neither
%% cleared nor recreated.
check_restore_arg({module, Mod}, R) when is_atom(Mod) ->
    R#r{module = Mod};

check_restore_arg({Op, Tabs}, R)
  when Op == clear_tables, is_list(Tabs);
       Op == recreate_tables, is_list(Tabs) ->
    case lists:member(schema, Tabs) of
        true ->
            exit({badarg, {Op, schema}});
        false ->
            R#r{table_options = R#r.table_options ++ [{Tab, Op} || Tab <- Tabs]}
    end;
check_restore_arg({Op, Tabs}, R)
  when Op == keep_tables, is_list(Tabs);
       Op == skip_tables, is_list(Tabs) ->
    R#r{table_options = R#r.table_options ++ [{Tab, Op} || Tab <- Tabs]};
check_restore_arg({default_op, Op}, R) ->
    ValidOps = [clear_tables, recreate_tables, keep_tables, skip_tables],
    case lists:member(Op, ValidOps) of
        true ->
            R#r{default_op = Op};
        false ->
            exit({badarg, {bad_default_op, Op}})
    end;

check_restore_arg(BadArg,_) ->
    exit({badarg, BadArg}).

%% Runs inside the schema transaction: prepares the schema part of the
%% restore, registers a restore_op and returns the names of the tables
%% that will be restored.
do_restore(R, BupSchema) ->
    TidTs = get_tid_ts_and_lock(schema, write),
    Prepared = restore_schema(BupSchema, R),
    insert_schema_ops(TidTs, [{restore_op, Prepared}]),
    [element(1, TabStruct) || TabStruct <- Prepared#r.tables].

%% Iterates over the backup with restore_items/4, letting Fun
%% accumulate into Recs. Returns the final accumulator and aborts the
%% transaction when the iteration fails.
arrange_restore(R, Fun, Recs) ->
    Start = R#r{insert_op = Fun, recs = Recs},
    case mnesia_bup:iterate(R#r.module, fun restore_items/4, R#r.opaque, Start) of
        {error, Reason} -> mnesia:abort(Reason);
        {ok, Done} -> Done#r.recs
    end.
%% Iterator callback used by arrange_restore/3. Records of tables
%% listed in R#r.tables are passed to R#r.insert_op, records of all
%% other tables are skipped. Returns the updated #r{}.
restore_items([Rec | Recs], Header, Schema, R) ->
    Tab = element(1, Rec),
    case lists:keysearch(Tab, 1, R#r.tables) of
        {value, {Tab, Where0, Snmp, RecName}} ->
            %% undefined means that the table was not recreated and
            %% that its current commit nodes should be used
            Where = case Where0 of
                        undefined ->
                            val({Tab, where_to_commit});
                        _ ->
                            Where0
                    end,
            {Rest, NRecs} = restore_tab_items([Rec | Recs], Tab,
                                              RecName, Where, Snmp,
                                              R#r.recs, R#r.insert_op),
            restore_items(Rest, Header, Schema, R#r{recs = NRecs});
        false ->
            Rest = skip_tab_items(Recs, Tab),
            restore_items(Rest, Header, Schema, R)
    end;

restore_items([], _Header, _Schema, R) ->
    R.

%% Returns the restore operation to use for Tab: the table specific
%% option if there is one, otherwise the default.
restore_func(Tab, R) ->
    case lists:keysearch(Tab, 1, R#r.table_options) of
        {value, {Tab, OP}} ->
            OP;
        false ->
            R#r.default_op
    end.

%% Returns [{Node, Storage}] for all replicas in a table definition
%% list, ram copies first, then disc copies, then disc only copies.
where_to_commit(Tab, CsList) ->
    Ram =   [{N, ram_copies} || N <- pick(Tab, ram_copies, CsList, [])],
    Disc =  [{N, disc_copies} || N <- pick(Tab, disc_copies, CsList, [])],
    DiscO = [{N, disc_only_copies} || N <- pick(Tab, disc_only_copies, CsList, [])],
    Ram ++ Disc ++ DiscO.

%% Walks the schema records of the backup and prepares each table
%% according to its restore operation. R#r.tables collects one
%% {Tab, Where, Snmp, RecName} entry per table to restore.
%% Changes of the Meta info of schema itself is not allowed
restore_schema([{schema, schema, _List} | Schema], R) ->
    restore_schema(Schema, R);
restore_schema([{schema, Tab, List} | Schema], R) ->
    case restore_func(Tab, R) of
        clear_tables ->
            do_clear_table(Tab),
            Snmp = val({Tab, snmp}),
            RecName = val({Tab, record_name}),
            R2 = R#r{tables = [{Tab, undefined, Snmp, RecName} | R#r.tables]},
            restore_schema(Schema, R2);
        recreate_tables ->
            case ?catch_val({Tab, cstruct}) of
                {'EXIT', _} ->
                    %% The table does not exist here; lock it on the
                    %% running nodes named in the backup definition
                    TidTs = {_Mod, Tid, Ts} = get(mnesia_activity_state),
                    RunningNodes = val({current, db_nodes}),
                    Nodes = mnesia_lib:intersect(mnesia_lib:cs_to_nodes(list2cs(List)),
                                                 RunningNodes),
                    mnesia_locker:wlock_no_exist(Tid, Ts#tidstore.store, Tab, Nodes),
                    TidTs;
                _ ->
                    TidTs = get_tid_ts_and_lock(Tab, write)
            end,
            NC = {cookie, ?unique_cookie},
            List2 = lists:keyreplace(cookie, 1, List, NC),
            Where = where_to_commit(Tab, List2),
            Snmp = pick(Tab, snmp, List2, []),
            RecName = pick(Tab, record_name, List2, Tab),
            insert_schema_ops(TidTs, [{op, restore_recreate, List2}]),
            R2 = R#r{tables = [{Tab, Where, Snmp, RecName} | R#r.tables]},
            restore_schema(Schema, R2);
        keep_tables ->
            get_tid_ts_and_lock(Tab, write),
            Snmp = val({Tab, snmp}),
            RecName = val({Tab, record_name}),
            R2 = R#r{tables = [{Tab, undefined, Snmp, RecName} | R#r.tables]},
            restore_schema(Schema, R2);
        skip_tables ->
            restore_schema(Schema, R)
    end;

restore_schema([{schema, Tab} | Schema], R) ->
    do_delete_table(Tab),
    %% The entries are {Tab, Where, Snmp, RecName} tuples, so the table
    %% has to be removed by key; lists:delete/2 never matched here.
    Tabs = lists:keydelete(Tab, 1, R#r.tables),
    restore_schema(Schema, R#r{tables = Tabs});
restore_schema([], R) ->
    R.

%% Applies Op to the leading records that belong to Tab.
%% Returns {RemainingRecords, NewAccumulator}.
restore_tab_items([Rec | Rest], Tab, RecName, Where, Snmp, Recs, Op)
  when element(1, Rec) == Tab ->
    NewRecs = Op(Rec, Recs, RecName, Where, Snmp),
    restore_tab_items(Rest, Tab, RecName, Where, Snmp, NewRecs, Op);

restore_tab_items(Rest, _Tab, _RecName, _Where, _Snmp, Recs, _Op) ->
    {Rest, Recs}.

%% Drops the leading records that belong to Tab.
skip_tab_items([Rec| Rest], Tab)
  when element(1, Rec) == Tab ->
    skip_tab_items(Rest, Tab);
skip_tab_items(Recs, _) ->
    Recs.

%%%%%%%%% Dump tables %%%%%%%%%%%%%
dump_tables(Tabs) when is_list(Tabs) ->
    schema_transaction(fun() -> do_dump_tables(Tabs) end);
dump_tables(Tabs) ->
    {aborted, {bad_type, Tabs}}.

do_dump_tables(Tabs) ->
    TidTs = get_tid_ts_and_lock(schema, write),
    insert_schema_ops(TidTs, make_dump_tables(Tabs)).

%% Builds one dump_table op per table. The schema table is rejected,
%% and the verify/3 call is handed the "Only allowed on ram_copies"
%% reason for tables that have disc resident replicas.
make_dump_tables([schema | _Tabs]) ->
    mnesia:abort({bad_type, schema});
make_dump_tables([Tab | Tabs]) ->
    get_tid_ts_and_lock(Tab, read),
    TabDef = get_create_list(Tab),
    DiscResident = val({Tab, disc_copies}) ++ val({Tab, disc_only_copies}),
    verify([], DiscResident,
           {"Only allowed on ram_copies", Tab, DiscResident}),
    [{op, dump_table, unknown, TabDef} | make_dump_tables(Tabs)];
make_dump_tables([]) ->
    [].

%% Merge the local schema with the schema on other nodes
merge_schema() ->
    schema_transaction(fun() -> do_merge_schema() end).
%% Body of the schema transaction started by merge_schema/0.
%%
%% Picks the first node that is connected but not yet running and
%% inserts the schema operations needed to merge its table definitions
%% with the local ones.
%%
%% Returns:
%%   not_merged                             no node left to merge with
%%   {merged, Running, RemoteRunning}       merge operations were inserted
%%   {"Cannot get cstructs", Node, Reason}  the cstructs could not be fetched
do_merge_schema() ->
    {_Mod, Tid, Ts} = get_tid_ts_and_lock(schema, write),
    Connected = val(recover_nodes),
    Running = val({current, db_nodes}),
    Store = Ts#tidstore.store,
    %% All running nodes must hold the lock.  That might not be the
    %% case if this transaction was queued while new nodes were added.
    case Running -- ets:lookup_element(Store, nodes, 2) of
        [] ->
            %% All known nodes are locked
            ok;
        Unlocked ->
            %% Abort! The side effects below must not be executed
            mnesia:abort({bad_commit, {missing_lock, Unlocked}})
    end,
    case Connected -- Running of
        [] ->
            %% No more nodes to merge schema with
            not_merged;
        [Node | _] ->
            %% Time for a schema merging party!
            mnesia_locker:wlock_no_exist(Tid, Store, schema, [Node]),
            case rpc:call(Node, mnesia_controller, get_cstructs, []) of
                {error, Reason} ->
                    {"Cannot get cstructs", Node, Reason};
                {badrpc, Reason} ->
                    {"Cannot get cstructs", Node, {badrpc, Reason}};
                {cstructs, Cstructs, ReportedRunning} ->
                    AlreadyLocked = Running ++ [Node],
                    {New, Old} = mnesia_recover:connect_nodes(ReportedRunning),
                    RemoteRunning =
                        mnesia_lib:intersect(New ++ Old, ReportedRunning),
                    case RemoteRunning == ReportedRunning of
                        true ->
                            ok;
                        false ->
                            mnesia_lib:error("Mnesia on ~p could not connect to node(s) ~p~n",
                                             [node(), ReportedRunning -- RemoteRunning])
                    end,
                    mnesia_locker:wlock_no_exist(Tid, Store, schema,
                                                 RemoteRunning -- AlreadyLocked),
                    {value, SchemaCs} =
                        lists:keysearch(schema, #cstruct.name, Cstructs),

                    %% Announce that Node is running
                    AnnounceOps = [{op, announce_im_running, node(),
                                    cs2list(SchemaCs), Running, RemoteRunning}],
                    do_insert_schema_ops(Store, AnnounceOps),

                    %% Introduce remote tables to local node
                    do_insert_schema_ops(Store, make_merge_schema(Node, Cstructs)),

                    %% Introduce local tables to remote nodes: every local
                    %% table that has no cstruct in the remote list
                    LocalTabs = val({schema, tables}),
                    LocalOnlyOps =
                        [{op, merge_schema, get_create_list(Tab)}
                         || Tab <- LocalTabs,
                            not lists:keymember(Tab, #cstruct.name, Cstructs)],
                    do_insert_schema_ops(Store, LocalOnlyOps),

                    %% Ensure that the txn will be committed on all nodes
                    NewNodes = RemoteRunning -- Running,
                    mnesia_lib:set(prepare_op, {announce_im_running, NewNodes}),
                    announce_im_running(NewNodes, SchemaCs),
                    {merged, Running, RemoteRunning}
            end
    end.

%% Concatenates, in list order, the merge operations produced for
%% each remote cstruct.
make_merge_schema(Node, Cstructs) ->
    lists:flatmap(fun(Cs) -> do_make_merge_schema(Node, Cs) end, Cstructs).

%% Merge definitions of schema table
do_make_merge_schema(Node, RemoteCs)
  when RemoteCs#cstruct.name == schema ->
    LocalCs = val({schema, cstruct}),
    Masters = mnesia_recover:get_master_nodes(schema),
    RemoteIsMaster = lists:member(Node, Masters),
    LocalIsMaster = lists:member(node(), Masters),
    Force = LocalIsMaster or RemoteIsMaster,
    %% Storage type of this node and of Node, according to the local
    %% and to the remote schema definition respectively
    MineByLocal = mnesia_lib:cs_to_storage_type(node(), LocalCs),
    MineByRemote = mnesia_lib:cs_to_storage_type(node(), RemoteCs),
    TheirsByLocal = mnesia_lib:cs_to_storage_type(Node, LocalCs),
    TheirsByRemote = mnesia_lib:cs_to_storage_type(Node, RemoteCs),

    if
        LocalCs#cstruct.cookie == RemoteCs#cstruct.cookie,
        LocalCs#cstruct.version == RemoteCs#cstruct.version ->
            %% Same cookie and version, nothing to merge
            [];

        LocalCs#cstruct.cookie /= RemoteCs#cstruct.cookie,
        LocalCs#cstruct.disc_copies /= [],
        RemoteCs#cstruct.disc_copies /= [] ->
            %% Both cstructs involve disc nodes and cannot be merged;
            %% only a one-sided master node setting can settle it
            case {LocalIsMaster, RemoteIsMaster} of
                {true, false} ->
                    %% The local cstruct is the master
                    [{op, merge_schema, cs2list(LocalCs)}];
                {false, true} ->
                    %% The remote cstruct is the master
                    [{op, merge_schema, cs2list(RemoteCs)}];
                _ ->
                    throw(io_lib:format("Incompatible schema cookies. "
                                        "Please, restart from old backup."
                                        "~w = ~w, ~w = ~w~n",
                                        [Node, cs2list(RemoteCs),
                                         node(), cs2list(LocalCs)]))
            end;

        MineByLocal /= MineByRemote,
        MineByRemote /= unknown,
        MineByLocal /= ram_copies ->
            throw(io_lib:format("Incompatible schema storage types (local). "
                                "on ~w storage ~w, on ~w storage ~w~n",
                                [node(), MineByLocal, Node, MineByRemote]));

        TheirsByLocal /= TheirsByRemote,
        TheirsByLocal /= unknown,
        TheirsByRemote /= ram_copies ->
            throw(io_lib:format("Incompatible schema storage types (remote). "
                                "on ~w cs ~w, on ~w rcs ~w~n",
                                [node(), cs2list(LocalCs),
                                 Node, cs2list(RemoteCs)]));

        LocalCs#cstruct.disc_copies /= [] ->
            %% The local cstruct involves disc nodes, use it as base
            [{op, merge_schema,
              cs2list(merge_cstructs(LocalCs, RemoteCs, Force))}];

        RemoteCs#cstruct.disc_copies /= [] ->
            %% The remote cstruct involves disc nodes, use it as base
            [{op, merge_schema,
              cs2list(merge_cstructs(RemoteCs, LocalCs, Force))}];

        LocalCs > RemoteCs ->
            %% Use the remote cstruct as base
            [{op, merge_schema,
              cs2list(merge_cstructs(RemoteCs, LocalCs, Force))}];

        true ->
            %% Use the local cstruct as base
            [{op, merge_schema,
              cs2list(merge_cstructs(LocalCs, RemoteCs, Force))}]
    end;

%% Merge definitions of normal table
do_make_merge_schema(Node, RemoteCs) ->
    Tab = RemoteCs#cstruct.name,
    Masters = mnesia_recover:get_master_nodes(schema),
    RemoteIsMaster = lists:member(Node, Masters),
    LocalIsMaster = lists:member(node(), Masters),
    Force = LocalIsMaster or RemoteIsMaster,
    case ?catch_val({Tab, cstruct}) of
        {'EXIT', _} ->
            %% A completely new table, created while Node was down
            [{op, merge_schema, cs2list(RemoteCs)}];
        LocalCs when LocalCs#cstruct.cookie == RemoteCs#cstruct.cookie ->
            if
                LocalCs#cstruct.version == RemoteCs#cstruct.version ->
                    %% Exactly the same version of the table def
                    [];

                LocalCs#cstruct.version > RemoteCs#cstruct.version ->
                    %% Different versions of the table def, merge them
                    %% with the local one as base.  The only changes
                    %% that may have occurred is that new replicas may
                    %% have been added.
                    [{op, merge_schema,
                      cs2list(merge_cstructs(LocalCs, RemoteCs, Force))}];

                LocalCs#cstruct.version < RemoteCs#cstruct.version ->
                    %% Different versions of the table def, merge them
                    %% with the remote one as base
                    [{op, merge_schema,
                      cs2list(merge_cstructs(RemoteCs, LocalCs, Force))}]
            end;
        LocalCs ->
            %% Different cookies, not possible to merge
            case {LocalIsMaster, RemoteIsMaster} of
                {true, false} ->
                    %% The local cstruct is the master
                    [{op, merge_schema, cs2list(LocalCs)}];
                {false, true} ->
                    %% The remote cstruct is the master
                    [{op, merge_schema, cs2list(RemoteCs)}];
                _ ->
                    throw(io_lib:format("Bad cookie in table definition"
                                        " ~w: ~w = ~w, ~w = ~w~n",
                                        [Tab, node(), LocalCs, Node, RemoteCs]))
            end
    end.

%% Change of table definitions (cstructs) requires all replicas
%% of the table to be active. New replicas, db_nodes and tables
%% may however be added even if some replica is inactive. These
%% invariants must be enforced in order to allow merge of cstructs.
%%
%% Returns a new cstruct or issues a fatal error.  When Force is true
%% an abort raised during the merge is swallowed and the unmerged
%% Cs is returned instead.
merge_cstructs(Cs, RemoteCs, Force) ->
    verify_cstruct(Cs),
    Outcome = (catch do_merge_cstructs(Cs, RemoteCs, Force)),
    case Outcome of
        {'EXIT', {aborted, _Reason}} when Force == true ->
            Cs;
        {'EXIT', Reason} ->
            exit(Reason);
        Merged when is_record(Merged, cstruct) ->
            Merged;
        Thrown ->
            %% Anything else was thrown by the merge; pass it on
            throw(Thrown)
    end.

%% Merges storage types node by node over the union of the nodes of
%% both cstructs, then settles the version.  Both the remote input
%% and the merged result are verified.
do_merge_cstructs(Cs, RemoteCs, Force) ->
    verify_cstruct(RemoteCs),
    AllNodes = mnesia_lib:uniq(mnesia_lib:cs_to_nodes(Cs) ++
                               mnesia_lib:cs_to_nodes(RemoteCs)),
    {AnythingNew, StorageMerged} =
        merge_storage_type(AllNodes, false, Cs, RemoteCs, Force),
    Result = merge_versions(AnythingNew, StorageMerged, RemoteCs, Force),
    verify_cstruct(Result),
    Result.
%% Walks the node list and adjusts Cs so that every node gets the
%% storage type agreed on by compare_storage_type/3.
%%
%% Returns {AnythingNew, MergedCs} where AnythingNew turns true as soon
%% as Cs had to be changed.  An incompatible pair is skipped when Force
%% is true, otherwise an error string is thrown.
merge_storage_type([], AnythingNew, MergedCs, _RemoteCs, _Force) ->
    {AnythingNew, MergedCs};
merge_storage_type([Node | Nodes], AnythingNew, Cs, RemoteCs, Force) ->
    LocalType = mnesia_lib:cs_to_storage_type(Node, Cs),
    RemoteType = mnesia_lib:cs_to_storage_type(Node, RemoteCs),
    case compare_storage_type(true, LocalType, RemoteType) of
        {same, _Storage} ->
            merge_storage_type(Nodes, AnythingNew, Cs, RemoteCs, Force);
        {diff, Storage} ->
            merge_storage_type(Nodes, true,
                               change_storage_type(Node, Storage, Cs),
                               RemoteCs, Force);
        incompatible when Force == true ->
            merge_storage_type(Nodes, AnythingNew, Cs, RemoteCs, Force);
        Other ->
            throw(io_lib:format("Cannot merge storage type for node ~w "
                                "in cstruct ~w with remote cstruct ~w (~w)~n",
                                [Node, Cs, RemoteCs, Other]))
    end.

%% Compares two storage types.
%%
%% Returns {same, Type}, {diff, TypeToUse} or incompatible.  While
%% Retry is true a pair that matches no rule is tried once more with
%% the two types swapped.
compare_storage_type(Retry, Local, Remote) ->
    case {Local, Remote} of
        {Same, Same} ->
            {same, Same};
        {unknown, Known} ->
            {diff, Known};
        {ram_copies, disc_copies} ->
            {diff, disc_copies};
        {disc_copies, disc_only_copies} ->
            {diff, disc_only_copies};
        _ when Retry =:= true ->
            compare_storage_type(false, Remote, Local);
        _ when Retry =:= false ->
            incompatible
    end.

%% Adds node N to the replica list of the given storage type,
%% keeping the list free from duplicates.
change_storage_type(N, ram_copies, Cs) ->
    Cs#cstruct{ram_copies = mnesia_lib:uniq([N | Cs#cstruct.ram_copies])};
change_storage_type(N, disc_copies, Cs) ->
    Cs#cstruct{disc_copies = mnesia_lib:uniq([N | Cs#cstruct.disc_copies])};
change_storage_type(N, disc_only_copies, Cs) ->
    Cs#cstruct{disc_only_copies =
                   mnesia_lib:uniq([N | Cs#cstruct.disc_only_copies])}.

%% BUGBUG: Verify match of frag info; equality demanded for all but add_node

%% Checks that Cs and RemoteCs may be merged and, if so, settles the
%% version of the merged definition.  Force == true overrides both the
%% cookie check and the field-by-field comparison.  Throws an error
%% string when the definitions cannot be merged.
merge_versions(AnythingNew, Cs, RemoteCs, Force) ->
    %% Step 1: the cookies must agree, except for the schema table
    if
        Cs#cstruct.name == schema;
        Cs#cstruct.cookie == RemoteCs#cstruct.cookie;
        Force == true ->
            ok;
        true ->
            throw(io_lib:format("Bad cookies. Cannot merge definitions of "
                                "table ~w. Local = ~w, Remote = ~w~n",
                                [Cs#cstruct.name, Cs, RemoteCs]))
    end,
    %% Step 2: everything but the replica lists must agree
    if
        Cs#cstruct.name == RemoteCs#cstruct.name,
        Cs#cstruct.type == RemoteCs#cstruct.type,
        Cs#cstruct.local_content == RemoteCs#cstruct.local_content,
        Cs#cstruct.attributes == RemoteCs#cstruct.attributes,
        Cs#cstruct.index == RemoteCs#cstruct.index,
        Cs#cstruct.snmp == RemoteCs#cstruct.snmp,
        Cs#cstruct.access_mode == RemoteCs#cstruct.access_mode,
        Cs#cstruct.load_order == RemoteCs#cstruct.load_order,
        Cs#cstruct.user_properties == RemoteCs#cstruct.user_properties;
        Force == true ->
            do_merge_versions(AnythingNew, Cs, RemoteCs);
        true ->
            throw(io_lib:format("Cannot merge definitions of "
                                "table ~w. Local = ~w, Remote = ~w~n",
                                [Cs#cstruct.name, Cs, RemoteCs]))
    end.

%% Returns MergedCs untouched when nothing changed during the merge.
%% Otherwise a base version is chosen from the two versions and the
%% cstruct carrying it is handed to incr_version/1.
do_merge_versions(AnythingNew, MergedCs, RemoteCs) ->
    LocalVsn = MergedCs#cstruct.version,
    RemoteVsn = RemoteCs#cstruct.version,
    {{Major1, Minor1}, _Detail1} = LocalVsn,
    {{Major2, Minor2}, _Detail2} = RemoteVsn,
    if
        AnythingNew == false ->
            MergedCs;
        true ->
            Base =
                if
                    LocalVsn == RemoteVsn ->
                        {Major1, Minor1};
                    Major1 == Major2 ->
                        {Major1, lists:max([Minor1, Minor2])};
                    Major1 /= Major2 ->
                        {lists:max([Major1, Major2]), 0}
                end,
            incr_version(MergedCs#cstruct{version = {Base, dummy}})
    end.
%% Verify the basics.
%% Returns ok, or {merge_error, Cs, RemoteCs} when the local definition
%% gives this node a storage type that differs from the remote
%% definition, the remote definition has no replica on this node, and
%% no schema master nodes are set.
verify_merge(RemoteCs) ->
    Tab = RemoteCs#cstruct.name,
    Masters = mnesia_recover:get_master_nodes(schema),
    HasRemoteMaster = Masters /= [],
    case ?catch_val({Tab, cstruct}) of
        {'EXIT', _} ->
            ok;
        Cs ->
            LocalType = mnesia_lib:cs_to_storage_type(node(), Cs),
            RemoteType = mnesia_lib:cs_to_storage_type(node(), RemoteCs),
            case {LocalType, RemoteType} of
                {Same, Same} ->
                    ok;
                {unknown, _} ->
                    ok;
                {_, unknown} when HasRemoteMaster == false ->
                    {merge_error, Cs, RemoteCs};
                _ ->
                    %% Trust the merger
                    ok
            end
    end.

%% Tries to connect to each node and, for those that could be reached,
%% registers the node as a current db_node with an active schema
%% replica.  Always returns [].
announce_im_running(Nodes, SchemaCs) ->
    Announce =
        fun(N) ->
                {L1, L2} = mnesia_recover:connect_nodes([N]),
                case lists:member(N, L1) or lists:member(N, L2) of
                    true ->
                        mnesia_lib:add({current, db_nodes}, N),
                        mnesia_controller:add_active_replica(schema, N, SchemaCs);
                    false ->
                        ignore
                end
        end,
    lists:foreach(Announce, Nodes),
    [].

%% Reverts announce_im_running/2 for the given nodes.  Returns ok.
unannounce_im_running(Nodes) ->
    lists:foreach(fun(N) ->
                          mnesia_lib:del({current, db_nodes}, N),
                          mnesia_controller:del_active_replica(schema, N)
                  end,
                  Nodes).

diff --git a/lib/mnesia/src/mnesia_snmp_hook.erl b/lib/mnesia/src/mnesia_snmp_hook.erl new file mode 100644 index 0000000000..8b4b5231e1 --- /dev/null +++ b/lib/mnesia/src/mnesia_snmp_hook.erl @@ -0,0 +1,259 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1996-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
-module(mnesia_snmp_hook).

%% Hooks (called from mnesia)
-export([check_ustruct/1, create_table/3, delete_table/2,
         key_to_oid/2, key_to_oid/3, oid_to_key/2,
         update/1,
         get_row/2, get_next_index/2, get_mnesia_key/2]).

-export([key_to_oid_i/2, oid_to_key_1/2]). %% Test

-include("mnesia.hrl").

%% Reads a mnesia global variable; a failed lookup is handed over to
%% mnesia_lib:other_val/2.
val(Var) ->
    case ?catch_val(Var) of
        {'EXIT', Reason} -> mnesia_lib:other_val(Var, Reason);
        Value -> Value
    end.

%% Returns true when the snmp structure is empty (the default, not
%% SNMP'ified) or holds a single {key, Types} entry whose types are
%% all integer, string or fix_string; false otherwise.
check_ustruct([]) ->
    true;
check_ustruct([{key, Types}]) ->
    is_snmp_type(to_list(Types));
check_ustruct(_) ->
    false.

%% A tuple is turned into the list of its elements, anything else is
%% wrapped in a one-element list.
to_list(Tuple) when is_tuple(Tuple) -> tuple_to_list(Tuple);
to_list(Single) -> [Single].

%% True if every element of the list is a supported key type.
is_snmp_type([Type | Rest])
  when Type =:= integer; Type =:= string; Type =:= fix_string ->
    is_snmp_type(Rest);
is_snmp_type([]) ->
    true;
is_snmp_type(_) ->
    false.

%% Creates the snmp index for MnesiaTab and fills it with one entry
%% per key currently in the table.  The table is fixed while it is
%% traversed.  Returns the index; an empty snmp structure aborts.
create_table([], MnesiaTab, _Storage) ->
    mnesia:abort({badarg, MnesiaTab, {snmp, empty_snmpstruct}});
create_table([{key, Us}], MnesiaTab, Storage) ->
    Tree = b_new(MnesiaTab, Us),
    mnesia_lib:db_fixtable(Storage, MnesiaTab, true),
    build_table(mnesia_lib:db_first(Storage, MnesiaTab),
                MnesiaTab, Tree, Us, Storage),
    mnesia_lib:db_fixtable(Storage, MnesiaTab, false),
    Tree.

%% Inserts {Oid, MnesiaKey} for MnesiaKey and every key after it.
build_table('$end_of_table', _MnesiaTab, _Tree, _Us, _Storage) ->
    ok;
build_table(MnesiaKey, MnesiaTab, Tree, Us, Storage) ->
    b_insert(Tree, key_to_oid_i(MnesiaKey, Us), MnesiaKey),
    Next = mnesia_lib:db_next_key(Storage, MnesiaTab, MnesiaKey),
    build_table(Next, MnesiaTab, Tree, Us, Storage).

%% Removes the snmp index.  Returns ok.
delete_table(_MnesiaTab, Tree) ->
    b_delete_tree(Tree),
    ok.
%%-----------------------------------------------------------------
%% update({clear_table, MnesiaTab})
%% update({Op, MnesiaTab, MnesiaKey, SnmpKey})
%%
%% Keeps the snmp index of MnesiaTab in step with an operation on the
%% table.  Returns ok.
%%-----------------------------------------------------------------
update({clear_table, MnesiaTab}) ->
    b_clear(val({MnesiaTab, {index, snmp}})),
    ok;
update({Op, MnesiaTab, MnesiaKey, SnmpKey}) ->
    update(Op, val({MnesiaTab, {index, snmp}}), MnesiaKey, SnmpKey).

%% write adds the SnmpKey -> MnesiaKey entry, delete and delete_object
%% remove the SnmpKey entry, update_counter leaves the index untouched.
update(Op, Tree, MnesiaKey, SnmpKey) ->
    case Op of
        write ->
            b_insert(Tree, SnmpKey, MnesiaKey);
        update_counter ->
            ignore;
        Delete when Delete =:= delete; Delete =:= delete_object ->
            b_delete(Tree, SnmpKey)
    end,
    ok.

%%-----------------------------------------------------------------
%% Func: key_to_oid(Tab, Key) and key_to_oid(Tab, Key, Ustruct)
%% Args: Key     ::= key()
%%       key()   ::= int() | string() | {int() | string()}
%%       Ustruct ::= [{key, Types}], read from {Tab, snmp} by the
%%                   two-argument version
%% Turns a mnesia key into an OBJECT IDENTIFIER.
%% A string element is preceded by its length, a fix_string is not.
%% Ex. Key = {"pelle", 42} AND Types = {string, integer} =>
%%        OID [5, $p, $e, $l, $l, $e, 42]
%%     Key = {"pelle", 42} AND Types = {fix_string, integer} =>
%%        OID [$p, $e, $l, $l, $e, 42]
%% Any failure during the conversion aborts with bad_snmp_key.
%%-----------------------------------------------------------------
key_to_oid(Tab, Key) ->
    key_to_oid(Tab, Key, val({Tab, snmp})).

key_to_oid(Tab, Key, [{key, Types}]) ->
    try
        key_to_oid_i(Key, Types)
    catch
        _:_ ->
            mnesia:abort({bad_snmp_key, {Tab, Key}, Types})
    end.

%% Converts a single key according to its type.  Anything that does
%% not match a scalar type is treated as a tuple key with a tuple of
%% types.
key_to_oid_i(Int, integer) when is_integer(Int) -> [Int];
key_to_oid_i(Str, fix_string) when is_list(Str) -> Str;
key_to_oid_i(Str, string) when is_list(Str) -> [length(Str) | Str];
key_to_oid_i(Keys, Types) -> keys_to_oid(size(Keys), Keys, [], Types).

%% Converts elements Pos, Pos-1, ..., 1 of the key tuple and puts each
%% result in front of what has been accumulated so far.
keys_to_oid(0, _Keys, Acc, _Types) ->
    Acc;
keys_to_oid(Pos, Keys, Acc, Types) ->
    Part = key_to_oid_i(element(Pos, Keys), element(Pos, Types)),
    keys_to_oid(Pos - 1, Keys, Part ++ Acc, Types).
%%--------------------------------------------------
%% The reverse of key_to_oid, i.e. snmp oid to mnesia key.
%% The key could also be looked up in the index, but the index might
%% be on a remote node.  Looking it up is probably faster; use this
%% function when the index might be remote.
oid_to_key(Oid, Tab) ->
    [{key, Types}] = val({Tab, snmp}),
    oid_to_key_1(Types, Oid).

%% Scalar key types are decoded directly.  Any other type description
%% is treated as a tuple of types; if the decoding fails in any way
%% the atom unknown is returned.
oid_to_key_1(integer, [Key]) -> Key;
oid_to_key_1(fix_string, Key) -> Key;
oid_to_key_1(string, [_Len | Key]) -> Key;
oid_to_key_1(Types, Oid) ->
    try
        list_to_tuple(oid_to_key_2(1, size(Types), Types, Oid))
    catch
        _:_ -> unknown
    end.

%% Decodes element Pos and onwards of a tuple key.  A fix_string has
%% no length prefix and is therefore only accepted as the last
%% element, where it takes whatever is left of the oid.  When all
%% elements are decoded the oid must be used up.
oid_to_key_2(Pos, Sz, Types, Oid) when Pos =< Sz ->
    case element(Pos, Types) of
        integer ->
            [Int | Rest] = Oid,
            [Int | oid_to_key_2(Pos + 1, Sz, Types, Rest)];
        fix_string when Pos =:= Sz ->
            [Oid];
        fix_string ->
            throw(fix_string);
        string ->
            [Len | Tail] = Oid,
            {Str, Rest} = lists:split(Len, Tail),
            [Str | oid_to_key_2(Pos + 1, Sz, Types, Rest)]
    end;
oid_to_key_2(Pos, Sz, _Types, []) when Pos =:= (Sz + 1) ->
    [].

%%-----------------------------------------------------------------
%% Func: get_row/2
%% Args: Name is the name of the table (atom)
%%       RowIndex is an Oid
%% Returns: {ok, Row} | undefined
%% Note that the Row returned might contain columns that
%% are not visible via SNMP. e.g. the first column may be
%% ifIndex, and the last MFA ({ifIndex, col1, col2, MFA}).
%% where ifIndex is used only as index (not as a real col),
%% and MFA as extra info, used by the application.
%%-----------------------------------------------------------------
get_row(Name, RowIndex) ->
    Tree = mnesia_lib:val({Name, {index, snmp}}),
    case b_lookup(Tree, RowIndex) of
        {ok, {_RowIndex, MnesiaKey}} ->
            %% Exactly one record is expected for an indexed key
            [Row] = mnesia:dirty_read({Name, MnesiaKey}),
            {ok, Row};
        _ ->
            undefined
    end.
%%-----------------------------------------------------------------
%% Func: get_next_index/2
%% Args: Name is the name of the table (atom)
%%       RowIndex is an Oid
%% Returns: {NextIndex, MnesiaKey} | {endOfTable, undefined}
%%-----------------------------------------------------------------
get_next_index(Name, RowIndex) ->
    Tree = mnesia_lib:val({Name, {index, snmp}}),
    case b_lookup_next(Tree, RowIndex) of
        {ok, Entry} -> Entry;
        _ -> {endOfTable, undefined}
    end.

%%-----------------------------------------------------------------
%% Func: get_mnesia_key/2
%% Purpose: Get the mnesia key corresponding to the RowIndex.
%% Args: Name is the name of the table (atom)
%%       RowIndex is an Oid
%% Returns: {ok, Key} | undefined
%%-----------------------------------------------------------------
get_mnesia_key(Name, RowIndex) ->
    Tree = mnesia_lib:val({Name, {index, snmp}}),
    case b_lookup(Tree, RowIndex) of
        {ok, {_RowIndex, MnesiaKey}} -> {ok, MnesiaKey};
        _ -> undefined
    end.


%%-----------------------------------------------------------------
%% Internal implementation: the index is an ordered_set ets table
%% holding {SnmpKey, MnesiaKey} tuples.

%% Creates the index table through mnesia_monitor.
b_new(_Tab, _Us) ->
    mnesia_monitor:unsafe_mktab(?MODULE, [public, ordered_set]).

%% Deletes the whole index table.
b_delete_tree(Tree) ->
    ets:delete(Tree).  %% Close via mnesia_monitor ?

%% Empties the index but keeps the table.
b_clear(Tree) ->
    ets:delete_all_objects(Tree).

%% Adds or replaces the entry for SnmpKey.
b_insert(Tree, SnmpKey, MnesiaKey) ->
    ets:insert(Tree, {SnmpKey, MnesiaKey}).

%% Removes the entry for SnmpKey.
b_delete(Tree, SnmpKey) ->
    ets:delete(Tree, SnmpKey).

%% Returns {ok, {SnmpKey, MnesiaKey}} on an exact hit, else undefined.
b_lookup(Tree, RowIndex) ->
    case ets:lookup(Tree, RowIndex) of
        [Entry] -> {ok, Entry};
        _ -> undefined
    end.

%% Returns the entry following RowIndex in key order, or undefined
%% when RowIndex is at or beyond the last key.
b_lookup_next(Tree, RowIndex) ->
    case ets:next(Tree, RowIndex) of
        '$end_of_table' -> undefined;
        NextKey -> b_lookup(Tree, NextKey)
    end.
diff --git a/lib/mnesia/src/mnesia_snmp_sup.erl b/lib/mnesia/src/mnesia_snmp_sup.erl new file mode 100644 index 0000000000..7e86281428 --- /dev/null +++ b/lib/mnesia/src/mnesia_snmp_sup.erl @@ -0,0 +1,42 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1997-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
-module(mnesia_snmp_sup).

-behaviour(supervisor).

-export([start/0, init/1]).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% top supervisor callback functions

%% Starts the supervisor, locally registered under the module name.
start() ->
    supervisor:start_link({local, ?MODULE}, ?MODULE, []).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% sub supervisor callback functions

%% simple_one_for_one supervisor with a single transient worker spec.
%% Zero restarts are tolerated: trust the top supervisor.
init([]) ->
    KillAfter = mnesia_kernel_sup:supervisor_timeout(timer:seconds(3)),
    Worker = {?MODULE,
              {mnesia_snmp_hook, start, []},
              transient,
              KillAfter,
              worker,
              [?MODULE, mnesia_snmp_hook, supervisor]},
    {ok, {{simple_one_for_one, 0, timer:hours(24)}, [Worker]}}.
diff --git a/lib/mnesia/src/mnesia_sp.erl b/lib/mnesia/src/mnesia_sp.erl new file mode 100644 index 0000000000..58a177513f --- /dev/null +++ b/lib/mnesia/src/mnesia_sp.erl @@ -0,0 +1,42 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1999-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%

%% To be able to generate nice crash reports we need a catch on the
%% highest level.  This code can't be purged, so a code change is not
%% possible; hence this is a simple module.

-module(mnesia_sp).

-export([init_proc/4]).

%% Runs Mod:Fun(Args) on behalf of the process known as Who.
%% If the call ends with an exit or a runtime error, the failure is
%% reported through mnesia_monitor:terminate_proc/3 and the process
%% exits with the same reason.  Any other outcome, a thrown term
%% included, is returned as is.
init_proc(Who, Mod, Fun, Args) ->
    mnesia_lib:verbose("~p starting: ~p~n", [Who, self()]),
    Outcome = (catch apply(Mod, Fun, Args)),
    case Outcome of
        {'EXIT', Reason} ->
            mnesia_monitor:terminate_proc(Who, Reason, Args),
            exit(Reason);
        _ ->
            Outcome
    end.




diff --git a/lib/mnesia/src/mnesia_subscr.erl b/lib/mnesia/src/mnesia_subscr.erl new file mode 100644 index 0000000000..afd1704dec --- /dev/null +++ b/lib/mnesia/src/mnesia_subscr.erl @@ -0,0 +1,494 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1997-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
-module(mnesia_subscr).

-behaviour(gen_server).

-export([start/0,
         set_debug_level/1,
         subscribe/2,
         unsubscribe/2,
         unsubscribe_table/1,
         subscribers/0,
         report_table_event/4,
         report_table_event/5,
         report_table_event/6
        ]).

%% gen_server callbacks
-export([init/1,
         handle_call/3,
         handle_cast/2,
         handle_info/2,
         terminate/2,
         code_change/3
        ]).

-include("mnesia.hrl").

-import(mnesia_lib, [error/2]).
-record(state, {supervisor, pid_tab}).

%% Starts the subscription server, locally registered under the module
%% name.  The calling process is handed over as the server's parent.
start() ->
    gen_server:start_link({local, ?MODULE}, ?MODULE, [self()],
                          [{timeout, infinity}]).

%% Changes the mnesia debug level and returns the previous one, or
%% {error, Reason} if the new level is rejected by
%% mnesia_monitor:patch_env/2.
set_debug_level(Level) ->
    PrevEnv = application:get_env(mnesia, debug),
    case mnesia_monitor:patch_env(debug, Level) of
        {error, Reason} ->
            {error, Reason};
        Patched ->
            set_debug_level(Patched, PrevEnv)
    end.

%% While mnesia is not running only the previous application
%% environment value is reported.  Otherwise the table subscriptions
%% of the mnesia_event process are adjusted to the new level:
%% none and verbose subscribe to no table, debug to schema only and
%% trace to all local tables.
set_debug_level(Level, PrevEnv) ->
    case mnesia:system_info(is_running) of
        no when PrevEnv == undefined ->
            none;
        no ->
            {ok, PrevLevel} = PrevEnv,
            PrevLevel;
        _ ->
            Previous = mnesia_lib:val(debug),
            LocalTabs = mnesia:system_info(local_tables),
            EventPid = whereis(mnesia_event),
            Sub = fun(Tab) -> subscribe(EventPid, {table, Tab}) end,
            UnSub = fun(Tab) -> unsubscribe(EventPid, {table, Tab}) end,

            case Level of
                Quiet when Quiet =:= none; Quiet =:= verbose ->
                    lists:foreach(UnSub, LocalTabs);
                debug ->
                    lists:foreach(UnSub, LocalTabs -- [schema]),
                    Sub(schema);
                trace ->
                    lists:foreach(Sub, LocalTabs)
            end,
            mnesia_lib:set(debug, Level),
            Previous
    end.

%% Activates a subscription for ClientPid.  {table, Tab} is shorthand
%% for {table, Tab, simple}.  Anything but system or a table
%% subscription yields {error, {badarg, What}}.
subscribe(ClientPid, system) ->
    change_subscr(activate, ClientPid, system);
subscribe(ClientPid, {table, Tab}) ->
    change_subscr(activate, ClientPid, {table, Tab, simple});
subscribe(ClientPid, {table, Tab, Level})
  when Level =:= simple; Level =:= detailed ->
    change_subscr(activate, ClientPid, {table, Tab, Level});
subscribe(_ClientPid, What) ->
    {error, {badarg, What}}.
%% Deactivates a subscription for ClientPid.  {table, Tab} is
%% shorthand for {table, Tab, simple}.  Anything but system or a table
%% subscription yields {error, {badarg, What}}.
unsubscribe(ClientPid, system) ->
    change_subscr(deactivate, ClientPid, system);
unsubscribe(ClientPid, {table, Tab}) ->
    change_subscr(deactivate, ClientPid, {table, Tab, simple});
unsubscribe(ClientPid, {table, Tab, Level})
  when Level =:= simple; Level =:= detailed ->
    change_subscr(deactivate, ClientPid, {table, Tab, Level});
unsubscribe(_ClientPid, What) ->
    {error, {badarg, What}}.

%% Asks the server to drop every subscription on Tab.
unsubscribe_table(Tab) ->
    call({change, {deactivate_table, Tab}}).

%% Forwards a subscription change to the server.
change_subscr(Kind, ClientPid, What) ->
    call({change, {Kind, ClientPid, What}}).

%% The system event subscribers, with the pid registered as
%% mnesia_event first.
subscribers() ->
    [whereis(mnesia_event) | mnesia_lib:val(subscribers)].

%% Reports a table event to the subscribers found in the commit work
%% of Tab.  Nothing is done when the commit work cannot be read or
%% holds no subscribers entry.
report_table_event(Tab, Tid, Obj, Op) ->
    case ?catch_val({Tab, commit_work}) of
        {'EXIT', _} ->
            ok;
        CommitWork ->
            case lists:keysearch(subscribers, 1, CommitWork) of
                {value, Subs} ->
                    report_table_event(Subs, Tab, Tid, Obj, Op, undefined);
                false ->
                    ok
            end
    end.

%% Backwards compatible for the moment when mnesia_tm gets updated!
report_table_event(Subscr, Tab, Tid, Obj, Op) ->
    report_table_event(Subscr, Tab, Tid, Obj, Op, undefined).
%% report_table_event(Subscribers, Tab, Tid, Obj, Op, Old)
%%
%% Delivers a table event to the given subscribers.
%%   Subscribers ::= {subscribers, Simple, Detailed} | {subscribers, Simple}
%% Simple subscribers get {mnesia_table_event, {Op, Record, Tid}}.
%% Detailed subscribers get the extended event built by what/5, which
%% also carries the table name and the old records.
%%
%% clear_table is reported as a delete followed by a write of the
%% schema record of Tab.
report_table_event({subscribers, S1, S2}, Tab, Tid, _Obj, clear_table, _Old) ->
    What = {delete, {schema, Tab}, Tid},
    deliver(S1, {mnesia_table_event, What}),
    TabDef = mnesia_schema:cs2list(?catch_val({Tab, cstruct})),
    What2 = {write, {schema, Tab, TabDef}, Tid},
    deliver(S1, {mnesia_table_event, What2}),
    What3 = {delete, schema, {schema, Tab}, [{schema, Tab, TabDef}], Tid},
    deliver(S2, {mnesia_table_event, What3}),
    What4 = {write, schema, {schema, Tab, TabDef}, [], Tid},
    deliver(S2, {mnesia_table_event, What4});

%% Only simple subscribers: no need to build the extended event
report_table_event({subscribers, Subscr, []}, Tab, Tid, Obj, Op, _Old) ->
    What = {Op, patch_record(Tab, Obj), Tid},
    deliver(Subscr, {mnesia_table_event, What});

report_table_event({subscribers, S1, S2}, Tab, Tid, Obj, Op, Old) ->
    Standard = {Op, patch_record(Tab, Obj), Tid},
    deliver(S1, {mnesia_table_event, Standard}),
    Extended = what(Tab, Tid, Obj, Op, Old),
    deliver(S2, Extended);

%% Backwards compatible for the moment when mnesia_tm gets updated!
report_table_event({subscribers, Subscr}, Tab, Tid, Obj, Op, Old) ->
    report_table_event({subscribers, Subscr, []}, Tab, Tid, Obj, Op, Old).


%% Makes sure that the first element of Obj is the table name.
patch_record(Tab, Obj) ->
    case Tab == element(1, Obj) of
        true ->
            Obj;
        false ->
            setelement(1, Obj, Tab)
    end.

%% Builds the extended event for detailed subscribers, or returns
%% ignore when there is nothing to report.  When Old is undefined the
%% old records are read from the table.
what(Tab, Tid, {RecName, Key}, delete, undefined) ->
    case catch mnesia_lib:db_get(Tab, Key) of
        Old when is_list(Old) -> %% Op only allowed for set table.
            {mnesia_table_event, {delete, Tab, {RecName, Key}, Old, Tid}};
        _ ->
            %% Record just deleted by a dirty_op or
            %% the whole table has been deleted
            ignore
    end;
what(Tab, Tid, Obj, delete, Old) ->
    {mnesia_table_event, {delete, Tab, Obj, Old, Tid}};
what(Tab, Tid, Obj, delete_object, _Old) ->
    {mnesia_table_event, {delete, Tab, Obj, [Obj], Tid}};
what(Tab, Tid, Obj, write, undefined) ->
    case catch mnesia_lib:db_get(Tab, element(2, Obj)) of
        Old when is_list(Old) ->
            {mnesia_table_event, {write, Tab, Obj, Old, Tid}};
        {'EXIT', _} ->
            ignore
    end;
what(Tab, Tid, Obj, write, Old) ->
    %% The caller already knows the old records.  Without this clause
    %% a write reported together with Old ended in function_clause,
    %% unlike the corresponding delete case above.
    {mnesia_table_event, {write, Tab, Obj, Old, Tid}}.

%% Sends Msg to every pid in the list.  The atom ignore in place of a
%% message means that there is nothing to send.
deliver(_, ignore) ->
    ok;
deliver([Pid | Pids], Msg) ->
    Pid ! Msg,
    deliver(Pids, Msg);
deliver([], _Msg) ->
    ok.

%% Synchronous call to the subscription server.
%% Returns {error, {node_not_running, node()}} if the server is not
%% registered or if an exit message from it is waiting when the call
%% returns.
call(Msg) ->
    Pid = whereis(?MODULE),
    case Pid of
        undefined ->
            {error, {node_not_running, node()}};
        Pid ->
            Res = gen_server:call(Pid, Msg, infinity),
            %% We get an exit signal if server dies.  Only an exit
            %% message from the server itself may be consumed here;
            %% exit messages from other processes belong to the caller.
            receive
                {'EXIT', Pid, _Reason} ->
                    {error, {node_not_running, node()}}
            after 0 ->
                    Res
            end
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Callback functions from gen_server

%%----------------------------------------------------------------------
%% Func: init/1
%% Returns: {ok, State}          |
%%          {ok, State, Timeout} |
%%          {stop, Reason}
%%
%% Traps exits, links to the mnesia_event process and registers it as
%% the first system subscriber in a private duplicate_bag table of
%% {Pid, What} entries.
%%----------------------------------------------------------------------
init([Parent]) ->
    process_flag(trap_exit, true),
    ClientPid = whereis(mnesia_event),
    link(ClientPid),
    mnesia_lib:verbose("~p starting: ~p~n", [?MODULE, self()]),
    Tab = ?ets_new_table(mnesia_subscr, [duplicate_bag, private]),
    ?ets_insert(Tab, {ClientPid, system}),
    {ok, #state{supervisor = Parent, pid_tab = Tab}}.
%%----------------------------------------------------------------------
%% Func: handle_call/3
%% Returns: {reply, Reply, State}          |
%%          {reply, Reply, State, Timeout} |
%%          {noreply, State}               |
%%          {noreply, State, Timeout}      |
%%          {stop, Reason, Reply, State}   | (terminate/2 is called)
%%
%% {change, How} requests are carried out by do_change/2.  Any other
%% request is logged and left unanswered.
%%----------------------------------------------------------------------
handle_call({change, How}, _From, State) ->
    {reply, do_change(How, State#state.pid_tab), State};
handle_call(Msg, _From, State) ->
    mnesia_lib:error("~p got unexpected call: ~p~n", [?MODULE, Msg]),
    {noreply, State}.

%%----------------------------------------------------------------------
%% Func: handle_cast/2
%% Returns: {noreply, State}          |
%%          {noreply, State, Timeout} |
%%          {stop, Reason, State}            (terminate/2 is called)
%%
%% No casts are expected; they are only logged.
%%----------------------------------------------------------------------
handle_cast(Msg, State) ->
    mnesia_lib:error("~p got unexpected cast: ~p~n", [?MODULE, Msg]),
    {noreply, State}.

%%----------------------------------------------------------------------
%% Func: handle_info/2
%% Returns: {noreply, State}          |
%%          {noreply, State, Timeout} |
%%          {stop, Reason, State}            (terminate/2 is called)
%%
%% An exit of the supervisor stops the server.  An exit of any other
%% linked process removes the subscriptions held by that process.
%%----------------------------------------------------------------------
handle_info({'EXIT', Pid, _R}, State) when Pid == State#state.supervisor ->
    {stop, shutdown, State};
handle_info({'EXIT', ClientPid, _Reason}, State) ->
    handle_exit(ClientPid, State#state.pid_tab),
    {noreply, State};
handle_info(Msg, State) ->
    mnesia_lib:error("~p got unexpected info: ~p~n", [?MODULE, Msg]),
    {noreply, State}.

%%----------------------------------------------------------------------
%% Func: terminate/2
%% Purpose: Shutdown the server
%% Returns: any (ignored by gen_server)
%%----------------------------------------------------------------------
terminate(Reason, State) ->
    prepare_stop(State#state.pid_tab),
    mnesia_monitor:terminate_proc(?MODULE, Reason, State).
%%----------------------------------------------------------------------
%% Func: code_change/3
%% Purpose: Upgrade process when its code is to be changed
%% Returns: {ok, NewState}
%%----------------------------------------------------------------------
code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

%%%----------------------------------------------------------------------
%%% Internal functions
%%%----------------------------------------------------------------------

%% Carries out one subscription change on behalf of handle_call/3.
%%   {activate, Pid, system | {table, Tab, How}}
%%   {deactivate, Pid, system | {table, Tab, How}}
%%   {deactivate_table, Tab}
%% A table subscription can only be activated where {Tab, where_to_read}
%% is the local node.  Any other request yields {error, badarg}.
do_change({activate, ClientPid, system}, SubscrTab)
  when is_pid(ClientPid) ->
    activate(ClientPid, system, subscribers, subscribers(), SubscrTab);
do_change({activate, ClientPid, {table, Tab, How}}, SubscrTab)
  when is_pid(ClientPid) ->
    case ?catch_val({Tab, where_to_read}) of
        Node when Node == node() ->
            CommitVar = {Tab, commit_work},
            activate(ClientPid, {table, Tab, How}, CommitVar,
                     mnesia_lib:val(CommitVar), SubscrTab);
        {'EXIT', _} ->
            {error, {no_exists, Tab}};
        _Node ->
            {error, {not_active_local, Tab}}
    end;
do_change({deactivate, ClientPid, system}, SubscrTab) ->
    deactivate(ClientPid, system, subscribers, SubscrTab);
do_change({deactivate, ClientPid, {table, Tab, How}}, SubscrTab) ->
    deactivate(ClientPid, {table, Tab, How}, {Tab, commit_work}, SubscrTab);
do_change({deactivate_table, Tab}, SubscrTab) ->
    CommitVar = {Tab, commit_work},
    case ?catch_val(CommitVar) of
        {'EXIT', _} ->
            {error, {no_exists, Tab}};
        CommitWork ->
            case lists:keysearch(subscribers, 1, CommitWork) of
                false ->
                    ok;
                {value, Subs} ->
                    DropSimple =
                        fun(Pid) ->
                                deactivate(Pid, {table, Tab, simple},
                                           CommitVar, SubscrTab)
                        end,
                    DropDetailed =
                        fun(Pid) ->
                                deactivate(Pid, {table, Tab, detailed},
                                           CommitVar, SubscrTab)
                        end,
                    case Subs of
                        {subscribers, Simple, Detailed} ->
                            lists:foreach(DropSimple, Simple),
                            lists:foreach(DropDetailed, Detailed);
                        {subscribers, Simple} ->
                            %% Old format with simple subscribers only
                            lists:foreach(DropSimple, Simple)
                    end
            end,
            {ok, node()}
    end;
do_change(_, _) ->
    {error, badarg}.

%% activate(ClientPid, What, Var, OldSubscribers, SubscrTab) ->
%%     {ok, node()} | {error, {already_exists, What}}
%%                  | {error, {no_exists, ClientPid}}
%%
%% Registers ClientPid as a subscriber of What. For system events
%% (Var == subscribers) OldSubscribers is the plain pid list; for tables
%% it is the commit_work list, from which the pids are dug out of the
%% 'subscribers' entry. The client is linked before being recorded.
activate(ClientPid, What, Var, OldSubscribers, SubscrTab) ->
    Known =
        case Var of
            subscribers ->
                OldSubscribers;
            _ ->
                case lists:keysearch(subscribers, 1, OldSubscribers) of
                    false ->
                        [];
                    {value, {subscribers, Simple, Detailed}} ->
                        Simple ++ Detailed;
                    {value, {subscribers, Simple}} ->
                        Simple
                end
        end,
    case lists:member(ClientPid, Known) of
        true ->
            {error, {already_exists, What}};
        false ->
            %% Don't care about checking old links
            case catch link(ClientPid) of
                true ->
                    ?ets_insert(SubscrTab, {ClientPid, What}),
                    add_subscr(Var, What, ClientPid),
                    {ok, node()};
                {'EXIT', _Reason} ->
                    {error, {no_exists, ClientPid}}
            end
    end.

%%-record(subscribers, {pids = []}).  Old subscriber record removed
%% To solve backward compatibility, this code is a kludge..
%%
%% add_subscr(Var, What, Pid)
%% Stores Pid as a subscriber. System subscribers go into the
%% 'subscribers' variable. Table subscribers are stored both in
%% {Tab, subscribers} and in the {subscribers, SimplePids, DetailedPids}
%% entry of {Tab, commit_work}.
add_subscr(subscribers, _What, Pid) ->
    mnesia_lib:add(subscribers, Pid),
    {ok, node()};
add_subscr({Tab, commit_work} = Var, What, Pid) ->
    Commit = mnesia_lib:val(Var),
    case lists:keysearch(subscribers, 1, Commit) of
        false ->
            %% First subscriber on this table: create the entry
            Entry =
                case What of
                    {table, _, simple} ->
                        {subscribers, [Pid], []};
                    {table, _, detailed} ->
                        {subscribers, [], [Pid]}
                end,
            mnesia_lib:add({Tab, subscribers}, Pid),
            mnesia_lib:set(Var, mnesia_lib:sort_commit([Entry | Commit]));
        {value, Existing} ->
            {Simple, Detailed} =
                case Existing of
                    {subscribers, Pids} ->   %% Old Way
                        {Pids, []};
                    {subscribers, S, D} ->
                        {S, D}
                end,
            Entry =
                case What of
                    {table, _, simple} ->
                        {subscribers, [Pid | Simple], Detailed};
                    {table, _, detailed} ->
                        {subscribers, Simple, [Pid | Detailed]}
                end,
            Updated = lists:keyreplace(subscribers, 1, Commit, Entry),
            mnesia_lib:set(Var, mnesia_lib:sort_commit(Updated)),
            mnesia_lib:add({Tab, subscribers}, Pid)
    end.

%% deactivate(ClientPid, What, Var, SubscrTab) -> {ok, node()}
%%
%% Removes the {ClientPid, What} row. The client is unlinked only when
%% the lookup for ClientPid fails afterwards, i.e. when it has no rows
%% left in SubscrTab.
deactivate(ClientPid, What, Var, SubscrTab) ->
    ?ets_match_delete(SubscrTab, {ClientPid, What}),
    Remaining = (catch ?ets_lookup_element(SubscrTab, ClientPid, 1)),
    case Remaining of
        {'EXIT', _} ->
            unlink(ClientPid);
        Keys when is_list(Keys) ->
            ignore
    end,
    del_subscr(Var, What, ClientPid),
    {ok, node()}.

%% del_subscr(Var, What, Pid)
%% Counterpart of add_subscr/3. For tables the pid is removed from both
%% the simple and the detailed list, and the whole 'subscribers' entry
%% is dropped from commit_work once both lists are empty.
del_subscr(subscribers, _What, Pid) ->
    mnesia_lib:del(subscribers, Pid);
del_subscr({Tab, commit_work} = Var, What, Pid) ->
    Commit = mnesia_lib:val(Var),
    case lists:keysearch(subscribers, 1, Commit) of
        false ->
            false;
        {value, Existing} ->
            {Simple, Detailed} =
                case Existing of
                    {subscribers, Pids} ->   %% Old Way
                        {Pids, []};
                    {subscribers, S, D} ->
                        {S, D}
                end,
            Entry =
                case What of  %% Ignore user error delete subscr from any list
                    {table, _, Level} when Level == simple;
                                           Level == detailed ->
                        {subscribers,
                         lists:delete(Pid, Simple),
                         lists:delete(Pid, Detailed)}
                end,
            Updated =
                case Entry of
                    {subscribers, [], []} ->
                        lists:keydelete(subscribers, 1, Commit);
                    _ ->
                        lists:keyreplace(subscribers, 1, Commit, Entry)
                end,
            mnesia_lib:del({Tab, subscribers}, Pid),
            mnesia_lib:set(Var, mnesia_lib:sort_commit(Updated))
    end.

%% handle_exit(ClientPid, SubscrTab)
%% Drops every subscription held by ClientPid and then its table rows.
handle_exit(ClientPid, SubscrTab) ->
    Rows = ?ets_lookup(SubscrTab, ClientPid),
    do_handle_exit(Rows),
    ?ets_delete(SubscrTab, ClientPid).

%% Removes the subscriber bookkeeping for each {ClientPid, What} row.
do_handle_exit(Rows) ->
    lists:foreach(
      fun({ClientPid, What}) ->
              case What of
                  system ->
                      del_subscr(subscribers, What, ClientPid);
                  {_, Tab, _Level} ->
                      del_subscr({Tab, commit_work}, What, ClientPid)
              end
      end,
      Rows).

%% prepare_stop(SubscrTab)
%% Reports {mnesia_down, node()} as a system event and then removes all
%% subscribers, walking the table key by key.
prepare_stop(SubscrTab) ->
    mnesia_lib:report_system_event({mnesia_down, node()}),
    First = ?ets_first(SubscrTab),
    do_prepare_stop(First, SubscrTab).

%% Walks SubscrTab key by key, cleaning up and unlinking every client.
do_prepare_stop('$end_of_table', _SubscrTab) ->
    ok;
do_prepare_stop(ClientPid, SubscrTab) ->
    %% Fetch the successor before handle_exit/2 deletes ClientPid's rows
    Successor = ?ets_next(SubscrTab, ClientPid),
    handle_exit(ClientPid, SubscrTab),
    unlink(ClientPid),
    do_prepare_stop(Successor, SubscrTab).

diff --git a/lib/mnesia/src/mnesia_sup.erl b/lib/mnesia/src/mnesia_sup.erl new file mode 100644 index 0000000000..9ee4086f50 --- /dev/null +++ b/lib/mnesia/src/mnesia_sup.erl @@ -0,0 +1,131 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1996-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
%% Supervisor for the entire Mnesia application

-module(mnesia_sup).

-behaviour(application).
-behaviour(supervisor).

-export([start/0, start/2, init/1, stop/1, start_event/0, kill/0]).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% application and supervisor callback functions

%% Application start callback. Only the 'normal' start type is
%% accepted; the start arguments are wrapped in a list for init/1.
start(normal, Args) ->
    case supervisor:start_link({local, ?MODULE}, ?MODULE, [Args]) of
        {ok, Pid} ->
            {ok, Pid, {normal, Args}};
        Failure ->
            Failure
    end;
start(_, _) ->
    {error, badarg}.

%% Starts the supervisor directly, without going through the
%% application start callback.
start() ->
    supervisor:start_link({local, ?MODULE}, ?MODULE, []).

%% Application stop callback; nothing to clean up.
stop(_StartArgs) ->
    ok.

%% Supervisor init callback.
%%   []    when started through start/0              (Supervisor)
%%   [[]]  when started through start/2 with Args=[] (Application)
init(Args) when Args == []; Args == [[]] ->
    init();
init(BadArg) ->
    {error, {badarg, BadArg}}.

%% Builds the supervisor specification: the event manager first, then
%% the kernel supervisor, then (optionally) the mnemosyne supervisor.
init() ->
    Flags = {one_for_all, 0, 3600}, % Should be rest_for_one policy

    EventSpecs = event_procs(),
    KernelSpecs = kernel_procs(),
    MnemosyneSpecs = mnemosyne_procs(),

    {ok, {Flags, lists:append([EventSpecs, KernelSpecs, MnemosyneSpecs])}}.

%% Child spec for the mnesia_event gen_event manager. The shutdown
%% timeout is derived from 30 seconds via
%% mnesia_kernel_sup:supervisor_timeout/1.
event_procs() ->
    Shutdown = mnesia_kernel_sup:supervisor_timeout(timer:seconds(30)),
    [{mnesia_event, {?MODULE, start_event, []},
      permanent, Shutdown, worker, [mnesia_event, gen_event]}].

%% Child spec for the kernel supervisor.
kernel_procs() ->
    [{mnesia_kernel_sup, {mnesia_kernel_sup, start, []},
      permanent, infinity, supervisor, [mnesia_kernel_sup, supervisor]}].

%% Child spec for mnemosyne, only when embedded_mnemosyne is enabled.
mnemosyne_procs() ->
    case mnesia_monitor:get_env(embedded_mnemosyne) of
        true ->
            [{mnemosyne_sup, {mnemosyne_sup, start, []},
              permanent, infinity, supervisor, [mnemosyne_sup, supervisor]}];
        false ->
            []
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% event handler

%% Starts the locally registered mnesia_event manager and installs the
%% configured handler in it. Any failure is returned unchanged.
start_event() ->
    case gen_event:start_link({local, mnesia_event}) of
        {ok, Pid} ->
            case add_event_handler() of
                ok ->
                    {ok, Pid};
                HandlerError ->
                    HandlerError
            end;
        StartError ->
            StartError
    end.

%% Installs the module named by the event_module parameter.
add_event_handler() ->
    gen_event:add_handler(mnesia_event,
                          mnesia_monitor:get_env(event_module),
                          []).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% debug functions

%% Brutally kills every registered Mnesia process and repeats until
%% the mnesia application is no longer listed as running.
kill() ->
    Names = [mnesia_fallback | mnesia:ms()],
    lists:foreach(fun(Name) -> catch exit(whereis(Name), kill) end, Names),
    lists:foreach(fun ensure_dead/1, Names),
    timer:sleep(10),
    StillRunning =
        lists:keymember(mnesia, 1, application:which_applications()),
    case StillRunning of
        true -> kill();
        false -> ok
    end.

%% Keeps killing the process registered as Name until the name is free.
ensure_dead(Name) ->
    case whereis(Name) of
        undefined ->
            ok;
        Victim when is_pid(Victim) ->
            exit(Victim, kill),
            timer:sleep(10),
            ensure_dead(Name)
    end.

diff --git a/lib/mnesia/src/mnesia_text.erl b/lib/mnesia/src/mnesia_text.erl new file mode 100644 index 0000000000..f1a28bf43d --- /dev/null +++ b/lib/mnesia/src/mnesia_text.erl @@ -0,0 +1,194 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1996-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
-module(mnesia_text).

-export([parse/1, file/1, load_textfile/1, dump_to_textfile/1]).

%% load_textfile(File) -> Result of the loading transaction | Error
%%
%% Makes sure Mnesia is started, parses File, creates the tables named
%% in its header (or reuses existing ones) and writes all data objects
%% in one transaction. Objects belonging to tables that could not be
%% created are skipped. A parse failure is returned unchanged.
load_textfile(File) ->
    ensure_started(),
    case parse(File) of
        {ok, {Tabs, Data}} ->
            Badtabs = make_tabs(lists:map(fun validate_tab/1, Tabs)),
            load_data(del_data(Badtabs, Data, []));
        Other ->
            Other
    end.

%% dump_to_textfile(File) -> ok | {error, Reason} | error
%%
%% Writes a {tables, Defs} header followed by every object of all local
%% active tables except the schema. Returns 'error' when Mnesia is not
%% running or File cannot be opened, otherwise the result of closing
%% the file.
dump_to_textfile(File) ->
    dump_to_textfile(mnesia_lib:is_running(), file:open(File, [write])).

dump_to_textfile(yes, {ok, F}) ->
    Tabs = lists:delete(schema, mnesia_lib:local_active_tables()),
    Defs = lists:map(fun(T) ->
                             {T, [{record_name, mnesia_lib:val({T, record_name})},
                                  {attributes, mnesia_lib:val({T, attributes})}]}
                     end,
                     Tabs),
    io:format(F, "~p.~n", [{tables, Defs}]),
    lists:foreach(fun(T) -> dump_tab(F, T) end, Tabs),
    file:close(F);
dump_to_textfile(_, {ok, F}) ->
    %% Mnesia is not running, but the file was already opened by the
    %% caller: close it so the descriptor is not leaked.
    file:close(F),
    error;
dump_to_textfile(_, _) ->
    error.


%% Writes every object of table T to F as a "Term." line. The record
%% name in element 1 is replaced by the table name, which is the form
%% the loader uses to find the target table.
dump_tab(F, T) ->
    W = mnesia_lib:val({T, wild_pattern}),
    {atomic, All} =
        mnesia:transaction(fun() -> mnesia:match_object(T, W, read) end),
    lists:foreach(fun(Term) ->
                          io:format(F, "~p.~n", [setelement(1, Term, T)])
                  end,
                  All).


%% Starts Mnesia unless it is already running. A schema is created on
%% this node first when no schema.DAT exists in the Mnesia directory.
ensure_started() ->
    case mnesia_lib:is_running() of
        yes ->
            yes;
        no ->
            case mnesia_lib:exists(mnesia_lib:dir("schema.DAT")) of
                true ->
                    mnesia:start();
                false ->
                    mnesia:create_schema([node()]),
                    mnesia:start()
            end
    end.

%% del_data(Bad, Objects, Ack) -> Objects without those whose first
%% element (the table name) is a member of Bad. Order is preserved.
del_data(Bad, [H|T], Ack) ->
    case lists:member(element(1, H), Bad) of
        true -> del_data(Bad, T, Ack);
        false -> del_data(Bad, T, [H|Ack])
    end;
del_data(_Bad, [], Ack) ->
    lists:reverse(Ack).

%% This is the place to call the validate func in mnesia_schema.
%% Currently only the tuple shape is checked; anything else throws
%% {error, badtab} through error/1.
validate_tab({Tabname, List}) ->
    {Tabname, List};
validate_tab({Tabname, RecName, List}) ->
    {Tabname, RecName, List};
validate_tab(_) -> error(badtab).

%% make_tabs(TabDefs) -> list of tables that could not be created.
%% Tables that already exist are left alone and receive the data.
make_tabs([{Tab, Def} | Tail]) ->
    case catch mnesia:table_info(Tab, where_to_read) of
        {'EXIT', _} -> %% non-existing table
            case mnesia:create_table(Tab, Def) of
                {aborted, Reason} ->
                    io:format("** Failed to create table ~w ~n"
                              "** Reason = ~w, Args = ~p~n",
                              [Tab, Reason, Def]),
                    [Tab | make_tabs(Tail)];
                _ ->
                    io:format("New table ~w~n", [Tab]),
                    make_tabs(Tail)
            end;
        Node ->
            io:format("** Table ~w already exists on ~p, just entering data~n",
                      [Tab, Node]),
            make_tabs(Tail)
    end;

make_tabs([]) ->
    [].

%% Writes all objects in one transaction. Element 1 of every object is
%% the table name; it is swapped for the table's record name before
%% the write.
load_data(L) ->
    mnesia:transaction(fun() ->
                               F = fun(X) ->
                                           Tab = element(1, X),
                                           RN = mnesia:table_info(Tab, record_name),
                                           Rec = setelement(1, X, RN),
                                           mnesia:write(Tab, Rec, write) end,
                               lists:foreach(F, L)
                       end).

%% parse(File) -> {ok, {Tabs, Data}} | {error, Reason}
%%
%% Reads all terms of File and splits them into the table definitions
%% of the {tables, Tabs} header and the data objects that follow it.
%% Failures of file/1 are returned unchanged.
parse(File) ->
    case file(File) of
        {ok, Terms} ->
            case catch collect(Terms) of
                {error, X} ->
                    {error, X};
                {'EXIT', Reason} ->
                    %% A crash while collecting must not be wrapped as
                    %% {ok, {'EXIT', Reason}}: load_textfile/1 would
                    %% match that as {ok, {Tabs, Data}}.
                    {error, Reason};
                Other ->
                    {ok, Other}
            end;
        Other ->
            Other
    end.

%% The first term must be the {tables, Tabs} header; the rest is data.
%% Throws {error, bad_header} otherwise.
collect([{_, {tables, Tabs}}|L]) ->
    {Tabs, collect_data(Tabs, L)};

collect(_) ->
    io:format("No tables found\n", []),
    error(bad_header).

%% collect_data(Tabs, [{Line, Term}]) -> [Term]
%%
%% Keeps the data objects whose first element names a table in Tabs.
%% Any other term is reported on stdout and aborts the parse by
%% throwing {error, undefined_object}. This includes non-tuples and
%% the empty tuple, which has no first element to look up.
collect_data(Tabs, [{Line, Term} | Tail])
  when is_tuple(Term), tuple_size(Term) > 0 ->
    case lists:keysearch(element(1, Term), 1, Tabs) of
        {value, _} ->
            [Term | collect_data(Tabs, Tail)];
        _Other ->
            io:format("Object:~p at line ~w unknown\n", [Term,Line]),
            error(undefined_object)
    end;
collect_data(_Tabs, []) -> [];
collect_data(_Tabs, [H|_T]) ->
    io:format("Object:~p unknown\n", [H]),
    error(undefined_object).

%% Local helper: aborts by throwing {error, What}. parse/1 catches it.
error(What) -> throw({error, What}).

%% file(File) -> {ok, [{Line, Term}]} | {error, read} | {error, open}
%%
%% Reads every "Term." of File together with the line it starts on.
file(File) ->
    case file:open(File, [read]) of
        {ok, Stream} ->
            Res = read_terms(Stream, File, 1, []),
            file:close(Stream),
            Res;
        _Other ->
            {error, open}
    end.

%% Accumulates terms until eof; the first unreadable term stops the
%% scan with {error, read}.
read_terms(Stream, File, Line, L) ->
    case read_term_from_stream(Stream, File, Line) of
        {ok, Term, NextLine} ->
            read_terms(Stream, File, NextLine, [Term|L]);
        error ->
            {error, read};
        eof ->
            {ok, lists:reverse(L)}
    end.

%% Scans and parses one term starting at Line.
%% Returns {ok, {Line, Term}, EndLine} | eof | error. Problems are
%% printed on stdout before 'error' is returned.
read_term_from_stream(Stream, File, Line) ->
    R = io:request(Stream, {get_until,'',erl_scan,tokens,[Line]}),
    case R of
        {ok,Toks,EndLine} ->
            case erl_parse:parse_term(Toks) of
                {ok, Term} ->
                    {ok, {Line, Term}, EndLine};
                {error, {NewLine,Mod,What}} ->
                    Str = Mod:format_error(What),
                    io:format("Error in line:~p of:~p ~s\n",
                              [NewLine, File, Str]),
                    error;
                T ->
                    io:format("Error2 **~p~n",[T]),
                    error
            end;
        {eof,_EndLine} ->
            eof;
        Other ->
            io:format("Error1 **~p~n",[Other]),
            error
    end.


diff --git a/lib/mnesia/src/mnesia_tm.erl b/lib/mnesia/src/mnesia_tm.erl new file mode 100644 index 0000000000..3f3a10a9c1 --- /dev/null +++ b/lib/mnesia/src/mnesia_tm.erl @@ -0,0 +1,2301 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1996-2009. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
%% Transaction manager process for Mnesia. It is registered under the
%% module name and serves requests in the receive loop doit_loop/1.
-module(mnesia_tm).

-export([
         start/0,
         init/1,
         non_transaction/5,
         transaction/6,
         commit_participant/5,
         dirty/2,
         display_info/2,
         do_update_op/3,
         get_info/1,
         get_transactions/0,
         info/1,
         mnesia_down/1,
         prepare_checkpoint/2,
         prepare_checkpoint/1, % Internal
         prepare_snmp/3,
         do_snmp/2,
         put_activity_id/1,
         put_activity_id/2,
         block_tab/1,
         unblock_tab/1,
         fixtable/3
        ]).

%% sys callback functions
-export([system_continue/3,
         system_terminate/4,
         system_code_change/4
        ]).

-include("mnesia.hrl").
-import(mnesia_lib, [set/2]).
-import(mnesia_lib, [fatal/2, verbose/2, dbg_out/2]).

%% Loop state of the transaction manager.
%%   coordinators  gb_tree, Tid -> list of ets tables (transaction stores)
%%   participants  gb_tree, Tid -> #participant{}
%%   supervisor    pid of the parent; its exit stops the manager
%%   blocked_tabs  tables for which dirty operations are queued
%%   dirty_queue   queued dirty operations, newest first
%%   fixed_tabs    [{Node, [Tab]}], tables fixed on behalf of Node
-record(state, {coordinators = gb_trees:empty(), participants = gb_trees:empty(), supervisor,
                blocked_tabs = [], dirty_queue = [], fixed_tabs = []}).
%% Format on coordinators is [{Tid, EtsTabList} .....

-record(prep, {protocol = sym_trans,
               %% async_dirty | sync_dirty | sym_trans | sync_sym_trans | asym_trans
               records = [],
               prev_tab = [], % initiate to a non valid table name
               prev_types,
               prev_snmp,
               types
              }).

%% One transaction this node takes part in without coordinating it.
%% pid is 'nopid' unless a commit_participant process was spawned.
-record(participant, {tid, pid, commit, disc_nodes = [],
                      ram_nodes = [], protocol = sym_trans}).

%% Starts the transaction manager through mnesia_monitor, passing the
%% calling process as Parent to init/1.
start() ->
    mnesia_monitor:start_proc(?MODULE, ?MODULE, init, [self()]).

%% init(Parent)
%% Entry point of the transaction manager process. Registers the
%% process, runs the start-up sequence below, acknowledges the start to
%% Parent and then enters doit_loop/1; it never returns. The order of
%% the steps is significant.
init(Parent) ->
    register(?MODULE, self()),
    process_flag(trap_exit, true),

    %% Initialize the schema
    IgnoreFallback = mnesia_monitor:get_env(ignore_fallback_at_startup),
    mnesia_bup:tm_fallback_start(IgnoreFallback),
    mnesia_schema:init(IgnoreFallback),

    %% Handshake and initialize transaction recovery
    mnesia_recover:init(),
    Early = mnesia_monitor:init(),
    AllOthers = mnesia_lib:uniq(Early ++ mnesia_lib:all_nodes()) -- [node()],
    set(original_nodes, AllOthers),
    mnesia_recover:connect_nodes(AllOthers),

    %% Recover transactions, may wait for decision
    case mnesia_monitor:use_dir() of
        true ->
            P = mnesia_dumper:opt_dump_log(startup), % previous log
            L = mnesia_dumper:opt_dump_log(startup), % latest log
            Msg = "Initial dump of log during startup: ~p~n",
            mnesia_lib:verbose(Msg, [[P, L]]),
            mnesia_log:init();
        false ->
            ignore
    end,

    mnesia_schema:purge_tmp_files(),
    mnesia_recover:start_garb(),

    ?eval_debug_fun({?MODULE, init}, [{nodes, AllOthers}]),

    %% Only at debug level 'debug' or 'trace' is the event manager
    %% subscribed to schema table events.
    case val(debug) of
        Debug when Debug /= debug, Debug /= trace ->
            ignore;
        _ ->
            mnesia_subscr:subscribe(whereis(mnesia_event), {table, schema})
    end,
    proc_lib:init_ack(Parent, {ok, self()}),
    doit_loop(#state{supervisor = Parent}).

%% Reads a Mnesia variable; a failed lookup is passed on to
%% mnesia_lib:other_val/2.
val(Var) ->
    case ?catch_val(Var) of
        {'EXIT', _ReASoN_} -> mnesia_lib:other_val(Var, _ReASoN_);
        _VaLuE_ -> _VaLuE_
    end.

%% reply(From, R)
%% Sends R back to a requester. A {Pid, Ref} requester gets the reply
%% tagged with Ref, a plain pid gets it tagged with node().
reply({From,Ref}, R) ->
    From ! {?MODULE, Ref, R};
reply(From, R) ->
    From ! {?MODULE, node(), R}.

%% Replies and continues the server loop with State.
reply(From, R, State) ->
    reply(From, R),
    doit_loop(State).

%% req(R) -> Reply | {error, {node_not_running, node()}}
%% Synchronous request to the local transaction manager, correlated
%% with a fresh reference.
req(R) ->
    case whereis(?MODULE) of
        undefined ->
            {error, {node_not_running, node()}};
        Pid ->
            Ref = make_ref(),
            Pid ! {{self(), Ref}, R},
            rec(Pid, Ref)
    end.

%% Waits for any reply from the local transaction manager.
rec() ->
    rec(whereis(?MODULE)).

%% rec(Pid)
%% Accepts a reply with any tag, or the exit signal of Pid. There is
%% no timeout.
rec(Pid) when is_pid(Pid) ->
    receive
        {?MODULE, _, Reply} ->
            Reply;

        {'EXIT', Pid, _} ->
            {error, {node_not_running, node()}}
    end;
rec(undefined) ->
    {error, {node_not_running, node()}}.

%% rec(Pid, Ref)
%% Waits for the reply tagged with Ref, or for the exit signal of Pid.
%% There is no timeout.
rec(Pid, Ref) ->
    receive
        {?MODULE, Ref, Reply} ->
            Reply;
        {'EXIT', Pid, _} ->
            {error, {node_not_running, node()}}
    end.

%% Links to the requester, given either as {Pid, Ref} or as a bare pid.
tmlink({From, Ref}) when is_reference(Ref) ->
    link(From);
tmlink(From) ->
    link(From).
%% Extracts the pid from a requester given as {Pid, Ref} or as a pid.
tmpid({Pid, _Ref}) when is_pid(Pid) ->
    Pid;
tmpid(Pid) ->
    Pid.

%% Returns a list of participant transaction Tid's
mnesia_down(Node) ->
    %% Synchronous call needed in order to avoid
    %% race with mnesia_tm's coordinator processes
    %% that may restart and acquire new locks.
    %% mnesia_monitor takes care of the sync
    case whereis(?MODULE) of
        undefined ->
            mnesia_monitor:mnesia_down(?MODULE, {Node, []});
        Pid ->
            Pid ! {mnesia_down, Node}
    end.

%% Asks the transaction managers on Nodes to prepare checkpoint Cp.
prepare_checkpoint(Nodes, Cp) ->
    rpc:multicall(Nodes, ?MODULE, prepare_checkpoint, [Cp]).

%% Local part of prepare_checkpoint/2 (called through rpc).
prepare_checkpoint(Cp) ->
    req({prepare_checkpoint,Cp}).

%% Makes the transaction manager queue dirty operations on Tab.
block_tab(Tab) ->
    req({block_tab, Tab}).

%% Reverts one block_tab/1; queued operations run once no block remains.
unblock_tab(Tab) ->
    req({unblock_tab, Tab}).

%% doit_loop(State)
%% The server loop of the transaction manager. Handles one message and
%% tail-calls itself (directly or via reply/3, handle_exit/3 or the
%% sys module) with the updated state.
doit_loop(#state{coordinators=Coordinators,participants=Participants,supervisor=Sup}=State) ->
    receive
        %% Dirty operations are executed at once, or queued while the
        %% table is blocked.
        {_From, {async_dirty, Tid, Commit, Tab}} ->
            case lists:member(Tab, State#state.blocked_tabs) of
                false ->
                    do_async_dirty(Tid, Commit, Tab),
                    doit_loop(State);
                true ->
                    Item = {async_dirty, Tid, Commit, Tab},
                    State2 = State#state{dirty_queue = [Item | State#state.dirty_queue]},
                    doit_loop(State2)
            end;

        {From, {sync_dirty, Tid, Commit, Tab}} ->
            case lists:member(Tab, State#state.blocked_tabs) of
                false ->
                    do_sync_dirty(From, Tid, Commit, Tab),
                    doit_loop(State);
                true ->
                    Item = {sync_dirty, From, Tid, Commit, Tab},
                    State2 = State#state{dirty_queue = [Item | State#state.dirty_queue]},
                    doit_loop(State2)
            end;

        {From, start_outer} -> %% Create and associate ets_tab with Tid
            case catch ?ets_new_table(mnesia_trans_store, [bag, public]) of
                {'EXIT', Reason} -> %% system limit
                    Msg = "Cannot create an ets table for the "
                        "local transaction store",
                    reply(From, {error, {system_limit, Msg, Reason}}, State);
                Etab ->
                    tmlink(From),
                    C = mnesia_recover:incr_trans_tid_serial(),
                    ?ets_insert(Etab, {nodes, node()}),
                    Tid = #tid{pid = tmpid(From), counter = C},
                    A2 = gb_trees:insert(Tid,[Etab],Coordinators),
                    S2 = State#state{coordinators = A2},
                    reply(From, {new_tid, Tid, Etab}, S2)
            end;

        %% A coordinator on another node asks this node to take part.
        %% NOTE(review): both case branches require the Tid to be
        %% remote; a local Tid would raise case_clause - confirm this
        %% message is never sent to the coordinator's own node.
        {From, {ask_commit, Protocol, Tid, Commit, DiscNs, RamNs}} ->
            ?eval_debug_fun({?MODULE, doit_ask_commit},
                            [{tid, Tid}, {prot, Protocol}]),
            mnesia_checkpoint:tm_enter_pending(Tid, DiscNs, RamNs),
            Pid =
                case Protocol of
                    asym_trans when node(Tid#tid.pid) /= node() ->
                        Args = [tmpid(From), Tid, Commit, DiscNs, RamNs],
                        spawn_link(?MODULE, commit_participant, Args);
                    _ when node(Tid#tid.pid) /= node() -> %% *_sym_trans
                        reply(From, {vote_yes, Tid}),
                        nopid
                end,
            P = #participant{tid = Tid,
                             pid = Pid,
                             commit = Commit,
                             disc_nodes = DiscNs,
                             ram_nodes = RamNs,
                             protocol = Protocol},
            State2 = State#state{participants = gb_trees:insert(Tid,P,Participants)},
            doit_loop(State2);

        {Tid, do_commit} ->
            case gb_trees:lookup(Tid, Participants) of
                none ->
                    verbose("Tried to commit a non participant transaction ~p~n",[Tid]),
                    doit_loop(State);
                {value, P} ->
                    ?eval_debug_fun({?MODULE,do_commit,pre},[{tid,Tid},{participant,P}]),
                    case P#participant.pid of
                        nopid ->
                            %% No helper process: commit in this process
                            Commit = P#participant.commit,
                            Member = lists:member(node(), P#participant.disc_nodes),
                            %% Log only when this node is one of the
                            %% transaction's disc nodes
                            if Member == false ->
                                    ignore;
                               P#participant.protocol == sym_trans ->
                                    mnesia_log:log(Commit);
                               P#participant.protocol == sync_sym_trans ->
                                    mnesia_log:slog(Commit)
                            end,
                            mnesia_recover:note_decision(Tid, committed),
                            do_commit(Tid, Commit),
                            if
                                P#participant.protocol == sync_sym_trans ->
                                    Tid#tid.pid ! {?MODULE, node(), {committed, Tid}};
                                true ->
                                    ignore
                            end,
                            mnesia_locker:release_tid(Tid),
                            transaction_terminated(Tid),
                            ?eval_debug_fun({?MODULE,do_commit,post},[{tid,Tid},{pid,nopid}]),
                            doit_loop(State#state{participants=
                                                  gb_trees:delete(Tid,Participants)});
                        Pid when is_pid(Pid) ->
                            %% Forward the decision to the spawned
                            %% commit_participant process
                            Pid ! {Tid, committed},
                            ?eval_debug_fun({?MODULE, do_commit, post}, [{tid, Tid}, {pid, Pid}]),
                            doit_loop(State)
                    end
            end;

        {Tid, simple_commit} ->
            mnesia_recover:note_decision(Tid, committed),
            mnesia_locker:release_tid(Tid),
            transaction_terminated(Tid),
            doit_loop(State);

        {Tid, {do_abort, Reason}} ->
            ?eval_debug_fun({?MODULE, do_abort, pre}, [{tid, Tid}]),
            case gb_trees:lookup(Tid, Participants) of
                none ->
                    verbose("Tried to abort a non participant transaction ~p: ~p~n",
                            [Tid, Reason]),
                    mnesia_locker:release_tid(Tid),
                    doit_loop(State);
                {value, P} ->
                    case P#participant.pid of
                        nopid ->
                            Commit = P#participant.commit,
                            mnesia_recover:note_decision(Tid, aborted),
                            do_abort(Tid, Commit),
                            if
                                P#participant.protocol == sync_sym_trans ->
                                    Tid#tid.pid ! {?MODULE, node(), {aborted, Tid}};
                                true ->
                                    ignore
                            end,
                            transaction_terminated(Tid),
                            mnesia_locker:release_tid(Tid),
                            ?eval_debug_fun({?MODULE, do_abort, post}, [{tid, Tid}, {pid, nopid}]),
                            doit_loop(State#state{participants=
                                                  gb_trees:delete(Tid,Participants)});
                        Pid when is_pid(Pid) ->
                            Pid ! {Tid, {do_abort, Reason}},
                            ?eval_debug_fun({?MODULE, do_abort, post},
                                            [{tid, Tid}, {pid, Pid}]),
                            doit_loop(State)
                    end
            end;

        {From, {add_store, Tid}} -> %% new store for nested transaction
            case catch ?ets_new_table(mnesia_trans_store, [bag, public]) of
                {'EXIT', Reason} -> %% system limit
                    Msg = "Cannot create an ets table for a nested "
                        "local transaction store",
                    reply(From, {error, {system_limit, Msg, Reason}}, State);
                Etab ->
                    A2 = add_coord_store(Coordinators, Tid, Etab),
                    reply(From, {new_store, Etab},
                          State#state{coordinators = A2})
            end;

        {From, {del_store, Tid, Current, Obsolete, PropagateStore}} ->
            opt_propagate_store(Current, Obsolete, PropagateStore),
            A2 = del_coord_store(Coordinators, Tid, Current, Obsolete),
            reply(From, store_erased, State#state{coordinators = A2});

        {'EXIT', Pid, Reason} ->
            handle_exit(Pid, Reason, State);

        %% Restart of a transaction: keep only Store, emptied and
        %% re-initialised with the local node.
        {From, {restart, Tid, Store}} ->
            A2 = restore_stores(Coordinators, Tid, Store),
            clear_fixtable([Store]),
            ?ets_match_delete(Store, '_'),
            ?ets_insert(Store, {nodes, node()}),
            reply(From, {restarted, Tid}, State#state{coordinators = A2});

        {delete_transaction, Tid} ->
            %% used to clear transactions which are committed
            %% in coordinator or participant processes
            case gb_trees:is_defined(Tid, Participants) of
                false ->
                    case gb_trees:lookup(Tid, Coordinators) of
                        none ->
                            verbose("** ERROR ** Tried to delete a non transaction ~p~n",
                                    [Tid]),
                            doit_loop(State);
                        {value, Etabs} ->
                            clear_fixtable(Etabs),
                            erase_ets_tabs(Etabs),
                            transaction_terminated(Tid),
                            doit_loop(State#state{coordinators =
                                                  gb_trees:delete(Tid,Coordinators)})
                    end;
                true ->
                    transaction_terminated(Tid),
                    State2 = State#state{participants=gb_trees:delete(Tid,Participants)},
                    doit_loop(State2)
            end;

        {sync_trans_serial, Tid} ->
            %% Do the Lamport thing here
            mnesia_recover:sync_trans_tid_serial(Tid),
            doit_loop(State);

        {From, info} ->
            reply(From, {info, gb_trees:values(Participants),
                         gb_trees:to_list(Coordinators)}, State);

        {mnesia_down, N} ->
            verbose("Got mnesia_down from ~p, reconfiguring...~n", [N]),
            reconfigure_coordinators(N, gb_trees:to_list(Coordinators)),

            Tids = gb_trees:keys(Participants),
            reconfigure_participants(N, gb_trees:values(Participants)),
            NewState = clear_fixtable(N, State),
            mnesia_monitor:mnesia_down(?MODULE, {N, Tids}),
            doit_loop(NewState);

        %% The requester wants to be told when Tab is unblocked; it is
        %% answered at once if the table is not blocked.
        {From, {unblock_me, Tab}} ->
            case lists:member(Tab, State#state.blocked_tabs) of
                false ->
                    verbose("Wrong dirty Op blocked on ~p ~p ~p",
                            [node(), Tab, From]),
                    reply(From, unblocked),
                    doit_loop(State);
                true ->
                    Item = {Tab, unblock_me, From},
                    State2 = State#state{dirty_queue = [Item | State#state.dirty_queue]},
                    doit_loop(State2)
            end;

        {From, {block_tab, Tab}} ->
            State2 = State#state{blocked_tabs = [Tab | State#state.blocked_tabs]},
            reply(From, ok, State2);

        %% blocked_tabs may hold Tab several times; the queue is only
        %% processed when the last block has been removed.
        {From, {unblock_tab, Tab}} ->
            BlockedTabs2 = State#state.blocked_tabs -- [Tab],
            case lists:member(Tab, BlockedTabs2) of
                false ->
                    mnesia_controller:unblock_table(Tab),
                    Queue = process_dirty_queue(Tab, State#state.dirty_queue),
                    State2 = State#state{blocked_tabs = BlockedTabs2,
                                         dirty_queue = Queue},
                    reply(From, ok, State2);
                true ->
                    State2 = State#state{blocked_tabs = BlockedTabs2},
                    reply(From, ok, State2)
            end;

        {From, {prepare_checkpoint, Cp}} ->
            Res = mnesia_checkpoint:tm_prepare(Cp),
            case Res of
                {ok, _Name, IgnoreNew, _Node} ->
                    prepare_pending_coordinators(gb_trees:to_list(Coordinators), IgnoreNew),
                    prepare_pending_participants(gb_trees:values(Participants), IgnoreNew);
                {error, _Reason} ->
                    ignore
            end,
            reply(From, Res, State);
        {From, {fixtable, [Tab,Lock,Requester]}} ->
            case ?catch_val({Tab, storage_type}) of
                {'EXIT', _} ->
                    reply(From, error, State);
                Storage ->
                    mnesia_lib:db_fixtable(Storage,Tab,Lock),
                    NewState = manage_fixtable(Tab,Lock,Requester,State),
                    reply(From, node(), NewState)
            end;

        {system, From, Msg} ->
            dbg_out("~p got {system, ~p, ~p}~n", [?MODULE, From, Msg]),
            sys:handle_system_msg(Msg, From, Sup, ?MODULE, [], State);

        Msg ->
            verbose("** ERROR ** ~p got unexpected message: ~p~n", [?MODULE, Msg]),
            doit_loop(State)
    end.

%% Performs a dirty commit and sends the (caught) result back to From.
do_sync_dirty(From, Tid, Commit, _Tab) ->
    ?eval_debug_fun({?MODULE, sync_dirty, pre}, [{tid, Tid}]),
    Res = (catch do_dirty(Tid, Commit)),
    ?eval_debug_fun({?MODULE, sync_dirty, post}, [{tid, Tid}]),
    From ! {?MODULE, node(), {dirty_res, Res}}.

%% Performs a dirty commit; failures are caught and discarded.
do_async_dirty(Tid, Commit, _Tab) ->
    ?eval_debug_fun({?MODULE, async_dirty, pre}, [{tid, Tid}]),
    catch do_dirty(Tid, Commit),
    ?eval_debug_fun({?MODULE, async_dirty, post}, [{tid, Tid}]).


%% Process items in fifo order
%% The queue is stored newest first, so the tail is processed before
%% the head. Items for tables other than Tab are kept in the queue.
process_dirty_queue(Tab, [Item | Queue]) ->
    Queue2 = process_dirty_queue(Tab, Queue),
    case Item of
        {async_dirty, Tid, Commit, Tab} ->
            do_async_dirty(Tid, Commit, Tab),
            Queue2;
        {sync_dirty, From, Tid, Commit, Tab} ->
            do_sync_dirty(From, Tid, Commit, Tab),
            Queue2;
        {Tab, unblock_me, From} ->
            reply(From, unblocked),
            Queue2;
        _ ->
            [Item | Queue2]
    end;
process_dirty_queue(_Tab, []) ->
    [].

%% For every coordinator whose first store holds a 'pending' entry,
%% that entry is passed to the checkpoint, unless the Tid is listed in
%% IgnoreNew. A failed lookup (caught 'EXIT') is skipped.
prepare_pending_coordinators([{Tid, [Store | _Etabs]} | Coords], IgnoreNew) ->
    case catch ?ets_lookup(Store, pending) of
        [] ->
            prepare_pending_coordinators(Coords, IgnoreNew);
        [Pending] ->
            case lists:member(Tid, IgnoreNew) of
                false ->
                    mnesia_checkpoint:tm_enter_pending(Pending);
                true ->
                    ignore
            end,
            prepare_pending_coordinators(Coords, IgnoreNew);
        {'EXIT', _} ->
            prepare_pending_coordinators(Coords, IgnoreNew)
    end;
prepare_pending_coordinators([], _IgnoreNew) ->
    ok.

%% Registers every participant transaction as pending in the
%% checkpoint, except those whose Tid is listed in IgnoreNew.
prepare_pending_participants(Parts, IgnoreNew) ->
    lists:foreach(
      fun(Part) ->
              Tid = Part#participant.tid,
              DiscNs = Part#participant.disc_nodes,
              RamNs = Part#participant.ram_nodes,
              case lists:member(Tid, IgnoreNew) of
                  true ->
                      ignore;
                  false ->
                      mnesia_checkpoint:tm_enter_pending(Tid, DiscNs, RamNs)
              end
      end,
      Parts).

%% handle_exit(Pid, Reason, State)
%% Handles an exit signal received by the server loop and continues
%% the loop (or stops, when the supervisor died).
handle_exit(Pid, _Reason, State) when node(Pid) /= node() ->
    %% We got exit from a remote fool
    doit_loop(State);

handle_exit(Pid, _Reason, State) when Pid == State#state.supervisor ->
    %% Our supervisor has died, time to stop
    do_stop(State);

handle_exit(Pid, Reason, State) ->
    Coordinators = gb_trees:to_list(State#state.coordinators),
    %% Check if it is a coordinator
    case pid_search_delete(Pid, Coordinators) of
        {{Tid, Etabs}, RestC} ->
            %% A local coordinator has died and
            %% we must determine the outcome of the
            %% transaction and tell mnesia_tm on the
            %% other nodes about it and then recover
            %% locally.
            recover_coordinator(Tid, Etabs),
            doit_loop(State#state{coordinators = RestC});
        {none, _} ->
            %% Check if it is a participant
            Parts = gb_trees:values(State#state.participants),
            case mnesia_lib:key_search_delete(Pid, #participant.pid, Parts) of
                {none, _} ->
                    %% We got exit from a local fool
                    doit_loop(State);
                {#participant{pid = DeadPid, tid = PartTid}, _RestP} ->
                    fatal("Participant ~p in transaction ~p died ~p~n",
                          [DeadPid, PartTid, Reason]),
                    Remaining =
                        gb_trees:delete(PartTid, State#state.participants),
                    doit_loop(State#state{participants = Remaining})
            end
    end.

%% recover_coordinator(Tid, Etabs)
%%
%% Called when the local coordinator process of transaction Tid has
%% died. The outcome of the transaction is determined and told to the
%% other involved nodes, the local part of the transaction is
%% recovered, and finally the transaction stores are erased and the
%% locks released.
recover_coordinator(Tid, Etabs) ->
    verbose("Coordinator ~p in transaction ~p died.~n", [Tid#tid.pid, Tid]),

    Store = hd(Etabs),
    CheckNodes = get_elements(nodes,Store),
    TellNodes = CheckNodes -- [node()],
    case catch arrange(Tid, Store, async) of
        {'EXIT', Reason} ->
            %% The format string must consume both arguments; with a
            %% single ~p the formatting itself fails.
            dbg_out("Recovery of coordinator ~p failed: ~p~n", [Tid, Reason]),
            Protocol = asym_trans,
            tell_outcome(Tid, Protocol, node(), CheckNodes, TellNodes);
        {_N, Prep} ->
            %% Tell the participants about the outcome
            Protocol = Prep#prep.protocol,
            Outcome = tell_outcome(Tid, Protocol, node(), CheckNodes, TellNodes),

            %% Recover locally
            CR = Prep#prep.records,
            {DiscNs, RamNs} = commit_nodes(CR, [], []),
            case lists:keysearch(node(), #commit.node, CR) of
                {value, Local} ->
                    ?eval_debug_fun({?MODULE, recover_coordinator, pre},
                                    [{tid, Tid}, {outcome, Outcome}, {prot, Protocol}]),
                    recover_coordinator(Tid, Protocol, Outcome, Local, DiscNs, RamNs),
                    ?eval_debug_fun({?MODULE, recover_coordinator, post},
                                    [{tid, Tid}, {outcome, Outcome}, {prot, Protocol}]);
                false ->  %% When killed before the store has been
                    ok    %% copied to the new nested trans store.
            end
    end,
    erase_ets_tabs(Etabs),
    transaction_terminated(Tid),
    mnesia_locker:release_tid(Tid).

%% recover_coordinator(Tid, Protocol, Outcome, Local, DiscNs, RamNs)
%%
%% Applies the decided Outcome for the local commit record. The
%% symmetric protocols only note the decision (and perform the updates
%% when committed); asym_trans logs a full #decision{} and then commits
%% or aborts.
recover_coordinator(Tid, Protocol, committed, Local, _, _)
  when Protocol == sym_trans; Protocol == sync_sym_trans ->
    mnesia_recover:note_decision(Tid, committed),
    do_dirty(Tid, Local);
recover_coordinator(Tid, Protocol, aborted, _Local, _, _)
  when Protocol == sym_trans; Protocol == sync_sym_trans ->
    mnesia_recover:note_decision(Tid, aborted);
recover_coordinator(Tid, asym_trans, Outcome, Local, DiscNs, RamNs)
  when Outcome == committed; Outcome == aborted ->
    Decision = #decision{tid = Tid, outcome = Outcome,
                         disc_nodes = DiscNs, ram_nodes = RamNs},
    mnesia_recover:log_decision(Decision),
    case Outcome of
        committed -> do_commit(Tid, Local);
        aborted -> do_abort(Tid, Local)
    end.

%% Keeps only Store for Tid; every other store of the transaction is
%% deleted. Returns the updated coordinator tree.
restore_stores(Coords, Tid, Store) ->
    Others = lists:delete(Store, gb_trees:get(Tid, Coords)),
    erase_ets_tabs(Others),
    gb_trees:update(Tid, [Store], Coords).

%% Pushes a new (nested transaction) store in front of Tid's stores.
add_coord_store(Coords, Tid, Etab) ->
    Existing = gb_trees:get(Tid, Coords),
    gb_trees:update(Tid, [Etab | Existing], Coords).

%% Removes the Obsolete store, which must be one of the two topmost
%% stores together with Current, and leaves Current on top.
del_coord_store(Coords, Tid, Current, Obsolete) ->
    Below =
        case gb_trees:get(Tid, Coords) of
            [Obsolete, Current | Tail] -> Tail;
            [Current, Obsolete | Tail] -> Tail
        end,
    ?ets_delete_table(Obsolete),
    gb_trees:update(Tid, [Current | Below], Coords).

%% Deletes every ets table in the list.
erase_ets_tabs(Etabs) ->
    lists:foreach(fun(Etab) -> ?ets_delete_table(Etab) end, Etabs).

%% Clear all fixtables of one transaction.
%% The {Tab, Node} pairs are read from the 'fixtable' entries of the
%% first store, and an unfix request is cast to each node.
clear_fixtable([Store|_]) ->
    Release = fun({Tab, Node}) ->
                      rpc:cast(Node, ?MODULE, fixtable, [Tab, false, self()])
              end,
    lists:foreach(Release, get_elements(fixtable, Store)).
%% Clear every fixtable that has been requested from Node.
%% Tables whose storage type cannot be looked up are ignored.
clear_fixtable(Node, State = #state{fixed_tabs = FT0}) ->
    case mnesia_lib:key_search_delete(Node, 1, FT0) of
        {none, _Ft} ->
            State;
        {{Node, Tabs}, FT} ->
            Unfix =
                fun(Tab) ->
                        case ?catch_val({Tab, storage_type}) of
                            {'EXIT', _} ->
                                ignore;
                            Storage ->
                                mnesia_lib:db_fixtable(Storage,Tab,false)
                        end
                end,
            lists:foreach(Unfix, Tabs),
            State#state{fixed_tabs = FT}
    end.

%% Book-keeping of which tables each node has fixed.
%% With 'true' the table is added to the entry of the requester's node,
%% with 'false' it is removed and an entry that becomes empty is dropped.
manage_fixtable(Tab, true, Requester, State = #state{fixed_tabs = FT0}) ->
    Node = node(Requester),
    {Entry, Others} =
        case mnesia_lib:key_search_delete(Node, 1, FT0) of
            {none, FT} ->
                {{Node, [Tab]}, FT};
            {{Node, Tabs}, FT} ->
                {{Node, [Tab | Tabs]}, FT}
        end,
    State#state{fixed_tabs = [Entry | Others]};
manage_fixtable(Tab, false, Requester, State = #state{fixed_tabs = FT0}) ->
    Node = node(Requester),
    case mnesia_lib:key_search_delete(Node, 1, FT0) of
        {none, _FT} ->
            State; % Hmm? Safeguard
        {{Node, Tabs0}, FT} ->
            case lists:delete(Tab, Tabs0) of
                [] ->
                    State#state{fixed_tabs = FT};
                Tabs ->
                    State#state{fixed_tabs = [{Node, Tabs} | FT]}
            end
    end.

%% Removes the transactions owned by Pid from a list of {Tid, Ts}
%% pairs. Returns {none, Rest} or {Tr, Rest}, where Rest is a gb_tree
%% built from the remaining pairs and Tr is the last matching pair.
pid_search_delete(Pid, Trs) ->
    pid_search_delete(Pid, Trs, none, []).
pid_search_delete(Pid, [Hit = {Tid, _Ts} | Rest], _Found, Kept)
  when Tid#tid.pid == Pid ->
    pid_search_delete(Pid, Rest, Hit, Kept);
pid_search_delete(Pid, [Other | Rest], Found, Kept) ->
    pid_search_delete(Pid, Rest, Found, [Other | Kept]);
pid_search_delete(_Pid, [], Found, Kept) ->
    {Found, gb_trees:from_orddict(lists:reverse(Kept))}.

%% Tell the checkpoint handler that Tid is no longer pending. A local
%% owner is unlinked, for a remote owner the tid serial is synced.
transaction_terminated(Tid) ->
    mnesia_checkpoint:tm_exit_pending(Tid),
    case Tid#tid.pid of
        Pid when node(Pid) == node() ->
            unlink(Pid);
        _ -> %% Do the Lamport thing here
            mnesia_recover:sync_trans_tid_serial(Tid)
    end.
%% Run Fun as a dirty activity.
%%
%% If there is a surrounding transaction its context is inherited: the
%% fun is run as a nested transaction and an abort is turned into an
%% exit. Otherwise the fun is applied directly with a temporary
%% non_transaction activity state, which is replaced by the previous
%% state before the result is examined.
non_transaction(OldState = {_, _, Trans}, Fun, Args, ActivityKind, Mod)
  when Trans /= non_transaction ->
    Kind =
        if
            ActivityKind == sync_dirty -> sync;
            true -> async
        end,
    case transaction(OldState, Fun, Args, infinity, Mod, Kind) of
        {atomic, Value} ->
            Value;
        {aborted, Why} ->
            exit(Why)
    end;
non_transaction(OldState, Fun, Args, ActivityKind, Mod) ->
    put(mnesia_activity_state, {Mod, {ActivityKind, self()}, non_transaction}),
    %% Something unique is wanted here, references are expensive
    Tag = mNeSia_nOn_TrAnSacTioN,
    Outcome = (catch {Tag, apply(Fun, Args)}),
    case OldState of
        undefined -> erase(mnesia_activity_state);
        _ -> put(mnesia_activity_state, OldState)
    end,
    case Outcome of
        {Tag, {'EXIT', Reason}} ->
            exit(Reason);
        {Tag, {aborted, Reason}} ->
            mnesia:abort(Reason);
        {Tag, Res} ->
            Res;
        {'EXIT', Reason} ->
            exit(Reason);
        Thrown ->
            throw(Thrown)
    end.

%% Start a transaction. The previous activity state decides whether it
%% becomes an outer transaction or a nested one.
transaction(OldTidTs, Fun, Args, Retries, Mod, Type) ->
    case OldTidTs of
        undefined -> % Outer
            execute_outer(Mod, Fun, Args, 1, Retries, Type);
        {_, _, non_transaction} -> % Transaction inside ?sync_dirty
            Result = execute_outer(Mod, Fun, Args, 1, Retries, Type),
            put(mnesia_activity_state, OldTidTs),
            Result;
        {OldMod, Tid, Ts} -> % Nested
            execute_inner(Mod, Tid, OldMod, Ts, Fun, Args, 1, Retries, Type);
        _ -> % Bad nesting
            {aborted, nested_transaction}
    end.

%% Request a new tid and store, install them as the activity state of
%% the calling process and run the fun.
execute_outer(Mod, Fun, Args, Factor, Retries, Type) ->
    case req(start_outer) of
        {error, Reason} ->
            {aborted, Reason};
        {new_tid, Tid, Store} ->
            put(mnesia_activity_state, {Mod, Tid, #tidstore{store = Store}}),
            execute_transaction(Fun, Args, Factor, Retries, Type)
    end.
%% Start a nested transaction: a new store is requested for Tid, the
%% contents of the current store are copied into it and the current
%% store is pushed onto up_stores together with the previous module.
execute_inner(Mod, Tid, OldMod, Ts, Fun, Args, Factor, Retries, Type) ->
    case req({add_store, Tid}) of
        {error, Reason} ->
            {aborted, Reason};
        {new_store, Ets} ->
            Parent = Ts#tidstore.store,
            copy_ets(Parent, Ets),
            NewTs = Ts#tidstore{level = 1 + Ts#tidstore.level,
                                store = Ets,
                                up_stores = [{OldMod, Parent} | Ts#tidstore.up_stores]},
            put(mnesia_activity_state, {Mod, Tid, NewTs}),
            execute_transaction(Fun, Args, Factor, Retries, Type)
    end.

%% Copy every object of the ets table From into the ets table To.
copy_ets(From, To) ->
    do_copy_ets(?ets_first(From), From, To).

do_copy_ets('$end_of_table', _From, _To) ->
    ok;
do_copy_ets(Key, From, To) ->
    insert_objs(?ets_lookup(From, Key), To),
    do_copy_ets(?ets_next(From, Key), From, To).

%% Insert each object of the list into Tab.
insert_objs(Objs, Tab) ->
    lists:foreach(fun(Obj) -> ?ets_insert(Tab, Obj) end, Objs).

%% Apply the fun and commit. Exits are handed to check_exit/6, which
%% decides whether to restart. A value that is neither an exit nor a
%% commit result was thrown by the user and aborts the transaction.
execute_transaction(Fun, Args, Factor, Retries, Type) ->
    Attempt = (catch apply_fun(Fun, Args, Type)),
    case Attempt of
        {'EXIT', Reason} ->
            check_exit(Fun, Args, Factor, Retries, Reason, Type);
        {atomic, Value} ->
            mnesia_lib:incr_counter(trans_commits),
            erase(mnesia_activity_state),
            %% no need to clear locks, already done by commit ...
            %% Flush any unprocessed mnesia_down messages we might have
            flush_downs(),
            catch unlink(whereis(?MODULE)),
            {atomic, Value};
        {nested_atomic, Value} ->
            mnesia_lib:incr_counter(trans_commits),
            {atomic, Value};
        Thrown -> %% User called throw
            return_abort(Fun, Args, {aborted, {throw, Thrown}})
    end.

%% Apply the fun and then try to commit. An abort decision from the
%% commit is returned as an {'EXIT', {aborted, Reason}} tuple.
apply_fun(Fun, Args, Type) ->
    Result = apply(Fun, Args),
    case t_commit(Type) of
        do_commit ->
            {atomic, Result};
        do_commit_nested ->
            {nested_atomic, Result};
        {do_abort, Abort = {aborted, _Reason}} ->
            {'EXIT', Abort};
        {do_abort, Reason} ->
            {'EXIT', {aborted, Reason}}
    end.
%% Decide what to do with an exit caught while running a transaction.
%% Aborts caused by a cyclic lock, a node that is not running or a bad
%% commit are candidates for a restart, anything else is an abort.
check_exit(Fun, Args, Factor, Retries, Reason, Type) ->
    case Reason of
        {aborted, Why = #cyclic{}} ->
            maybe_restart(Fun, Args, Factor, Retries, Type, Why);
        {aborted, Why = {node_not_running, _N}} ->
            maybe_restart(Fun, Args, Factor, Retries, Type, Why);
        {aborted, Why = {bad_commit, _N}} ->
            maybe_restart(Fun, Args, Factor, Retries, Type, Why);
        _ ->
            return_abort(Fun, Args, Reason)
    end.

%% Restart when retries remain and this is a top level transaction,
%% otherwise abort.
maybe_restart(Fun, Args, Factor, Retries, Type, Why) ->
    {Mod, Tid, Ts} = get(mnesia_activity_state),
    case try_again(Retries) of
        yes when Ts#tidstore.level == 1 ->
            restart(Mod, Tid, Ts, Fun, Args, Factor, Retries, Type, Why);
        yes ->
            return_abort(Fun, Args, Why);
        no ->
            return_abort(Fun, Args, {aborted, nomore})
    end.

try_again(infinity) -> yes;
try_again(Left) when is_number(Left), Left > 1 -> yes;
try_again(_) -> no.

%% We can only restart toplevel transactions.
%% If a deadlock situation occurs in a nested transaction
%% the whole thing including all nested transactions needs to be
%% restarted. The stack is thus popped by a consecutive series of
%% exit({aborted, #cyclic{}}) calls

restart(Mod, Tid, Ts, Fun, Args, Factor0, Retries0, Type, Why) ->
    mnesia_lib:incr_counter(trans_restarts),
    Retries = decr(Retries0),
    case Why of
        {Tag, _N} when Tag == bad_commit; Tag == node_not_running ->
            %% The transaction is aborted and started again as a new
            %% outer transaction. For node_not_running this avoids
            %% hanging in receive_release_tid_ack.
            return_abort(Fun, Args, Why),
            Factor = 1,
            SleepTime = mnesia_lib:random_time(Factor, Tid#tid.counter),
            dbg_out("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),
            timer:sleep(SleepTime),
            execute_outer(Mod, Fun, Args, Factor, Retries, Type);
        _ ->
            SleepTime = mnesia_lib:random_time(Factor0, Tid#tid.counter),
            dbg_out("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),

            case Factor0 == 10 of
                true ->
                    %% Our serial may be much larger than other nodes ditto
                    AllNodes = val({current, db_nodes}),
                    verbose("Sync serial ~p~n", [Tid]),
                    rpc:abcast(AllNodes, ?MODULE, {sync_trans_serial, Tid});
                false ->
                    ignore
            end,
            intercept_friends(Tid, Ts),
            Store = Ts#tidstore.store,
            Nodes = get_elements(nodes,Store),
            ?MODULE ! {self(), {restart, Tid, Store}},
            mnesia_locker:send_release_tid(Nodes, Tid),
            timer:sleep(SleepTime),
            mnesia_locker:receive_release_tid_acc(Nodes, Tid),
            case get_restarted(Tid) of
                {restarted, Tid} ->
                    execute_transaction(Fun, Args, Factor0 + 1,
                                        Retries, Type);
                {error, Reason} ->
                    mnesia:abort(Reason)
            end
    end.

%% Wait for the reply to the restart request of Tid.
get_restarted(Tid) ->
    case rec() of
        {restarted, Tid} = Restarted ->
            Restarted;
        {error, _} = Error ->
            Error;
        _ -> %% We could get a couple of aborts too many.
            get_restarted(Tid)
    end.

decr(infinity) -> infinity;
decr(Left) when is_integer(Left), Left > 1 -> Left - 1;
decr(_Left) -> 0.
%% Abort the current transaction of the calling process.
%%
%% At the top level the locks are released, the transaction is deleted
%% and the activity state is erased. In a nested transaction the store
%% of the enclosing level is reinstated; cyclic, node_not_running and
%% bad_commit reasons are passed on as exits so that the enclosing
%% levels are aborted as well.
return_abort(Fun, Args, Reason) ->
    {_Mod, Tid, Ts} = get(mnesia_activity_state),
    dbg_out("Transaction ~p calling ~p with ~p failed: ~n ~p~n",
            [Tid, Fun, Args, Reason]),
    OldStore = Ts#tidstore.store,
    Nodes = get_elements(nodes, OldStore),
    intercept_friends(Tid, Ts),
    catch mnesia_lib:incr_counter(trans_failures),
    case Ts#tidstore.level of
        Level when Level == 1 ->
            mnesia_locker:async_release_tid(Nodes, Tid),
            ?MODULE ! {delete_transaction, Tid},
            erase(mnesia_activity_state),
            flush_downs(),
            catch unlink(whereis(?MODULE)),
            {aborted, mnesia_lib:fix_error(Reason)};
        Level ->
            %% Nested transaction
            [{OldMod, NewStore} | Tail] = Ts#tidstore.up_stores,
            req({del_store, Tid, NewStore, OldStore, true}),
            OuterTs = Ts#tidstore{store = NewStore,
                                  up_stores = Tail,
                                  level = Level - 1},
            put(mnesia_activity_state, {OldMod, Tid, OuterTs}),
            case Reason of
                #cyclic{} ->
                    exit({aborted, Reason});
                {Tag, _N} when Tag == node_not_running; Tag == bad_commit ->
                    exit({aborted, Reason});
                _ ->
                    {aborted, mnesia_lib:fix_error(Reason)}
            end
    end.

%% Remove pending votes and mnesia_down messages from the message queue.
flush_downs() ->
    receive
        {?MODULE, _, _} -> flush_downs(); % Votes
        {mnesia_down, _} -> flush_downs()
    after 0 -> flushed
    end.

%% Install an activity state in the calling process. For a transaction
%% state the caller, or a stop fun, is registered as a friend in the
%% store of the transaction.
put_activity_id(MTT) ->
    put_activity_id(MTT, undefined).

put_activity_id(undefined, _Fun) ->
    erase_activity_id();
put_activity_id({Mod, Tid = #tid{}, Ts = #tidstore{}}, Fun) ->
    flush_downs(),
    Store = Ts#tidstore.store,
    case is_function(Fun) of
        true ->
            ?ets_insert(Store, {friends, {stop,Fun}});
        false ->
            ?ets_insert(Store, {friends, self()})
    end,
    put(mnesia_activity_state, {Mod, Tid, Ts});
put_activity_id(SimpleState, _Fun) ->
    put(mnesia_activity_state, SimpleState).

erase_activity_id() ->
    flush_downs(),
    erase(mnesia_activity_state).

%% All values stored under the key Type in Store.
%% A failing lookup gives an empty list.
get_elements(Type, Store) ->
    case catch ?ets_lookup(Store, Type) of
        {'EXIT', _} ->
            [];
        Objs ->
            [Val || {_, Val} <- Objs]
    end.
%% When the flag is true, copy the nodes, fixtable and friends entries
%% of the Obsolete store into the Current store.
opt_propagate_store(_Current, _Obsolete, false) ->
    ok;
opt_propagate_store(Current, Obsolete, true) ->
    lists:foreach(fun(Var) ->
                          propagate_store(Current, Var, get_elements(Var, Obsolete))
                  end,
                  [nodes, fixtable, friends]).

%% Insert {Var, Val} into Store for each value in the list.
propagate_store(Store, Var, Vals) ->
    lists:foreach(fun(Val) -> ?ets_insert(Store, {Var, Val}) end, Vals).

%% Tell all processes that are cooperating with the current transaction
intercept_friends(_Tid, Ts) ->
    intercept_best_friend(get_elements(friends, Ts#tidstore.store), false).

%% Stop funs are always called. Only the first friend that is a pid
%% is notified and waited for, the remaining pids are skipped.
intercept_best_friend([], _Notified) ->
    ok;
intercept_best_friend([{stop, StopFun} | Rest], Notified) ->
    catch StopFun(),
    intercept_best_friend(Rest, Notified);
intercept_best_friend([Friend | Rest], false) ->
    Friend ! {activity_ended, undefined, self()},
    wait_for_best_friend(Friend, 0),
    intercept_best_friend(Rest, true);
intercept_best_friend([_ | Rest], true) ->
    intercept_best_friend(Rest, true).

%% Wait until Pid has answered or exited. On each timeout it is
%% checked whether Pid is still alive before waiting again.
wait_for_best_friend(Pid, Timeout) ->
    receive
        {'EXIT', Pid, _} -> ok;
        {activity_ended, _, Pid} -> ok
    after Timeout ->
            case my_process_is_alive(Pid) of
                true -> wait_for_best_friend(Pid, 1000);
                false -> ok
            end
    end.

my_process_is_alive(Pid) ->
    case catch erlang:is_process_alive(Pid) of % New BIF in R5
        true ->
            true;
        false ->
            false;
        {'EXIT', _} -> % Pre R5 backward compatibility
            case process_info(Pid, message_queue_len) of
                undefined -> false;
                _ -> true
            end
    end.

%% Perform one dirty operation with the async_dirty or the sync_dirty
%% protocol. Any other protocol aborts with bad_activity.
dirty(Protocol, Item) ->
    {{Tab, Key}, _Val, _Op} = Item,
    Tid = {dirty, self()},
    Prep = prepare_items(Tid, Tab, Key, [Item], #prep{protocol= Protocol}),
    CR = Prep#prep.records,
    {WaitFor, FirstRes} =
        case Protocol of
            async_dirty ->
                %% Send commit records to the other involved nodes,
                %% but only wait for one node to complete.
                %% Preferably the local node, if possible.
                ReadNode = val({Tab, where_to_read}),
                async_send_dirty(Tid, CR, Tab, ReadNode);
            sync_dirty ->
                %% Send commit records to the other involved nodes,
                %% and wait for all nodes to complete
                sync_send_dirty(Tid, CR, Tab, []);
            _ ->
                mnesia:abort({bad_activity, Protocol})
        end,
    rec_dirty(WaitFor, FirstRes).

%% This is the commit function. The first thing it does
%% is to find out which nodes have been participating
%% in this particular transaction. All of the mnesia_locker:lock*
%% functions insert the names of the nodes where they acquire locks
%% into the local shadow Store.
%% This function executes in the context of the user process
t_commit(Type) ->
    {_Mod, Tid, Ts} = get(mnesia_activity_state),
    Store = Ts#tidstore.store,
    case Ts#tidstore.level of
        Level when Level == 1 ->
            intercept_friends(Tid, Ts),
            %% N is number of updates
            case arrange(Tid, Store, Type) of
                {N, Prep} when N > 0 ->
                    multi_commit(Prep#prep.protocol,
                                 Tid, Prep#prep.records, Store);
                {0, Prep} ->
                    multi_commit(read_only, Tid, Prep#prep.records, Store)
            end;
        Level ->
            %% nested commit
            [{OldMod, Obsolete} | Tail] = Ts#tidstore.up_stores,
            req({del_store, Tid, Store, Obsolete, false}),
            OuterTs = Ts#tidstore{store = Store,
                                  up_stores = Tail,
                                  level = Level - 1},
            put(mnesia_activity_state, {OldMod, Tid, OuterTs}),
            do_commit_nested
    end.

%% This function arranges for all objects we shall write in S to be
%% in a list of {Node, CommitRecord}
%% Important function for the performance of mnesia.
%% Build the commit records of a transaction from its store.
%% Returns {N, Prep}, where N counts the keys that were arranged and
%% Prep holds the protocol to use together with one #commit{} per
%% involved node. A failure in do_arrange/5 aborts the transaction.
arrange(Tid, Store, Type) ->
    %% The local node is always included
    Nodes = get_elements(nodes,Store),
    Recs = prep_recs(Nodes, []),
    First = ?ets_first(Store),
    Protocol =
        case Type of
            async -> sym_trans;
            sync -> sync_sym_trans
        end,
    Prep = #prep{protocol = Protocol, records = Recs},
    case catch do_arrange(Tid, Store, First, Prep, 0) of
        {'EXIT', Reason} ->
            dbg_out("do_arrange failed ~p ~p~n", [Reason, Tid]),
            case Reason of
                {aborted, R} ->
                    mnesia:abort(R);
                _ ->
                    mnesia:abort(Reason)
            end;
        {New, Prepared} ->
            {New, Prepared#prep{records = reverse(Prepared#prep.records)}}
    end.

%% Reverse the item lists inside every commit record.
%% The order of the records themselves is kept.
reverse(Commits) ->
    lists:map(fun(C = #commit{ram_copies = Ram, disc_copies = DC,
                              disc_only_copies = DOC, snmp = Snmp}) ->
                      C#commit{ram_copies = lists:reverse(Ram),
                               disc_copies = lists:reverse(DC),
                               disc_only_copies = lists:reverse(DOC),
                               snmp = lists:reverse(Snmp)}
              end,
              Commits).

%% Put an empty presume_commit record for every node in front of Recs.
prep_recs(Nodes, Recs) ->
    lists:foldl(fun(Node, Acc) ->
                        [#commit{decision = presume_commit, node = Node} | Acc]
                end,
                Recs,
                Nodes).
%% Walk through the keys of the store and add the items found under
%% each key to the commit records in Prep. N is increased for every
%% table key, schema key and restore key, other keys are skipped.
%%
%% storage_types is a list of {Node, Storage} tuples
%% where each tuple represents an active replica
do_arrange(Tid, Store, Oid = {Tab, Key}, Prep, N) ->
    Items = ?ets_lookup(Store, Oid), %% Store is a bag
    Prep2 = prepare_items(Tid, Tab, Key, Items, Prep),
    do_arrange(Tid, Store, ?ets_next(Store, Oid), Prep2, N + 1);
do_arrange(Tid, Store, op, Prep, N) ->
    Items = ?ets_lookup(Store, op), %% Store is a bag
    Prep2 = prepare_schema_items(Tid, Items, Prep),
    do_arrange(Tid, Store, ?ets_next(Store, op), Prep2, N + 1);
do_arrange(Tid, Store, restore_op, Prep, N) ->
    [{restore_op, R}] = ?ets_lookup(Store, restore_op),
    Fun = fun({Tab, Key}, CommitRecs, _RecName, Where, Snmp) ->
                  %% A bare {Tab, Key} means that the object is deleted
                  Item = [{{Tab, Key}, {Tab, Key}, delete}],
                  do_prepare_items(Tid, Tab, Key, Where, Snmp, Item, CommitRecs);
             (BupRec, CommitRecs, RecName, Where, Snmp) ->
                  Tab = element(1, BupRec),
                  Key = element(2, BupRec),
                  %% The record is written under the record name
                  Rec =
                      case Tab == RecName of
                          true -> BupRec;
                          false -> setelement(1, BupRec, RecName)
                      end,
                  Item = [{{Tab, Key}, Rec, write}],
                  do_prepare_items(Tid, Tab, Key, Where, Snmp, Item, CommitRecs)
          end,
    Recs2 = mnesia_schema:arrange_restore(R, Fun, Prep#prep.records),
    Prep2 = Prep#prep{protocol = asym_trans, records = Recs2},
    do_arrange(Tid, Store, ?ets_next(Store, restore_op), Prep2, N + 1);
do_arrange(_Tid, _Store, '$end_of_table', Prep, N) ->
    {N, Prep};
do_arrange(Tid, Store, IgnoredKey, Prep, N) -> %% locks, nodes ... local atoms...
    do_arrange(Tid, Store, ?ets_next(Store, IgnoredKey), Prep, N).

%% Returns a prep record with all items in reverse order
prepare_schema_items(Tid, Items, Prep) ->
    Types = [{Node, schema_ops} || Node <- val({current, db_nodes})],
    Recs = prepare_nodes(Tid, Types, Items, Prep#prep.records, schema),
    Prep#prep{protocol = asym_trans, records = Recs}.
%% Returns a prep record with all items in reverse order.
%% The replica types and the snmp setting of the previous table are
%% remembered in Prep and reused while the table stays the same.
prepare_items(Tid, Tab, Key, Items, Prep) when Prep#prep.prev_tab == Tab ->
    Recs = do_prepare_items(Tid, Tab, Key,
                            Prep#prep.prev_types, Prep#prep.prev_snmp,
                            Items, Prep#prep.records),
    Prep#prep{records = Recs};

prepare_items(Tid, Tab, Key, Items, Prep) ->
    case val({Tab, where_to_commit}) of
        [] ->
            mnesia:abort({no_exists, Tab});
        {blocked, _} ->
            %% Wait until the table is unblocked and try again
            unblocked = req({unblock_me, Tab}),
            prepare_items(Tid, Tab, Key, Items, Prep);
        Types ->
            Snmp = val({Tab, snmp}),
            Recs = do_prepare_items(Tid, Tab, Key, Types,
                                    Snmp, Items, Prep#prep.records),
            check_prep(Prep#prep{records = Recs, prev_tab = Tab,
                                 prev_types = Types, prev_snmp = Snmp},
                       Types)
    end.

%% Add first the snmp items and then the table items to Recs.
do_prepare_items(Tid, Tab, Key, Types, Snmp, Items, Recs) ->
    WithSnmp = prepare_snmp(Tid, Tab, Key, Types, Snmp, Items, Recs), % May exit
    prepare_nodes(Tid, Types, Items, WithSnmp, normal).

%% The snmp operation that corresponds to the first item, or an empty
%% list when the table has no snmp setting.
prepare_snmp(Tab, Key, Items) ->
    case val({Tab, snmp}) of
        [] ->
            [];
        Ustruct when Key /= '_' ->
            {_Oid, _Val, Op} = hd(Items),
            %% Still making snmp oid (not used) because we want to catch errors here
            %% And also it keeps backwards comp. with old nodes.
            SnmpOid = mnesia_snmp_hook:key_to_oid(Tab, Key, Ustruct), % May exit
            [{Op, Tab, Key, SnmpOid}];
        _ ->
            [{clear_table, Tab}]
    end.

%% As prepare_snmp/3, but the snmp setting is passed in and the
%% operation is added to the commit records of the nodes in Types.
prepare_snmp(_Tid, _Tab, _Key, _Types, [], _Items, Recs) ->
    Recs;

prepare_snmp(Tid, Tab, Key, Types, Us, Items, Recs) ->
    case Key of
        '_' ->
            prepare_nodes(Tid, Types, [{clear_table, Tab}], Recs, snmp);
        _ ->
            {_Oid, _Val, Op} = hd(Items),
            SnmpOid = mnesia_snmp_hook:key_to_oid(Tab, Key, Us), % May exit
            prepare_nodes(Tid, Types, [{Op, Tab, Key, SnmpOid}], Recs, snmp)
    end.
%% Keep track of the replica types seen so far in the transaction.
%% The first Types are remembered in Prep; as soon as a table with
%% different Types shows up the protocol is switched to asym_trans.
check_prep(Prep, Types) when Prep#prep.types == Types ->
    Prep;
check_prep(Prep, Types) when Prep#prep.types == undefined ->
    Prep#prep{types = Types};
check_prep(Prep, _Types) ->
    Prep#prep{protocol = asym_trans}.

%% Returns a list of commit records.
%% For every {Node, Storage} the commit record of Node is picked out
%% of C, extended with Items and put first in the result. The records
%% of nodes that are not in the list follow, unchanged.
prepare_nodes(Tid, [{Node, Storage} | Rest], Items, C, Kind) ->
    {Rec, C2} = pick_node(Tid, Node, C, []),
    Rec2 = prepare_node(Node, Storage, Items, Rec, Kind),
    [Rec2 | prepare_nodes(Tid, Rest, Items, C2, Kind)];
prepare_nodes(_Tid, [], _Items, CommitRecords, _Kind) ->
    CommitRecords.

%% Returns {Rec, Others}, where Rec is the commit record of Node.
%% A dirty tid gets a new record when Node has none. For any other
%% tid a missing record aborts with {bad_commit, {missing_lock, Node}}.
pick_node(Tid, Node, [Rec | Rest], Done) ->
    if
        Rec#commit.node == Node ->
            {Rec, Done ++ Rest};
        true ->
            pick_node(Tid, Node, Rest, [Rec | Done])
    end;
pick_node({dirty,_}, Node, [], Done) ->
    {#commit{decision = presume_commit, node = Node}, Done};
pick_node(_Tid, Node, [], _Done) ->
    mnesia:abort({bad_commit, {missing_lock, Node}}).

%% Add Items to the commit record Rec. The field that is extended is
%% chosen by Kind (snmp or schema) and otherwise by Storage. Schema
%% items are only stored when the record has no schema_ops yet.
prepare_node(Node, Storage, [Item | Items], Rec, Kind) when Kind == snmp ->
    Rec2 = Rec#commit{snmp = [Item | Rec#commit.snmp]},
    prepare_node(Node, Storage, Items, Rec2, Kind);
prepare_node(Node, Storage, [Item | Items], Rec, Kind) when Kind /= schema ->
    Rec2 =
        case Storage of
            ram_copies ->
                Rec#commit{ram_copies = [Item | Rec#commit.ram_copies]};
            disc_copies ->
                Rec#commit{disc_copies = [Item | Rec#commit.disc_copies]};
            disc_only_copies ->
                Rec#commit{disc_only_copies =
                           [Item | Rec#commit.disc_only_copies]}
        end,
    prepare_node(Node, Storage, Items, Rec2, Kind);
prepare_node(_Node, _Storage, Items, Rec, Kind)
  when Kind == schema, Rec#commit.schema_ops == [] ->
    Rec#commit{schema_ops = Items};
prepare_node(_Node, _Storage, [], Rec, _Kind) ->
    Rec.

%% multi_commit(Protocol, Tid, CommitRecords, Store)
%% Local work is always performed in the user's process.
%% Returns do_commit or {do_abort, Reason}.
multi_commit(read_only, Tid, CR, _Store) ->
    %% This featherweight commit protocol is used when no
    %% updates have been performed in the transaction.

    {DiscNs, RamNs} = commit_nodes(CR, [], []),
    Msg = {Tid, simple_commit},
    rpc:abcast(DiscNs -- [node()], ?MODULE, Msg),
    rpc:abcast(RamNs -- [node()], ?MODULE, Msg),
    mnesia_recover:note_decision(Tid, committed),
    mnesia_locker:release_tid(Tid),
    ?MODULE ! {delete_transaction, Tid},
    do_commit;

multi_commit(sym_trans, Tid, CR, Store) ->
    %% This lightweight commit protocol is used when all
    %% the involved tables are replicated symmetrically.
    %% Their storage types must match on each node.
    %%
    %% 1  Ask the other involved nodes if they want to commit
    %%    All involved nodes vote yes if they are up
    %% 2a Somebody has voted no
    %%    Tell all yes voters to do_abort
    %% 2b Everybody has voted yes
    %%    Tell everybody to do_commit. I.e. that they should
    %%    prepare the commit, log the commit record and
    %%    perform the updates.
    %%
    %%    The outcome is kept 3 minutes in the transient decision table.
    %%
    %% Recovery:
    %%    If somebody dies before the coordinator has
    %%    broadcasted do_commit, the transaction is aborted.
    %%
    %%    If a participant dies, the table load algorithm
    %%    ensures that the contents of the involved tables
    %%    are picked from another node.
    %%
    %%    If the coordinator dies, each participant checks
    %%    the outcome with all the others. If all are uncertain
    %%    about the outcome, the transaction is aborted. If
    %%    somebody knows the outcome the others will follow.

    {DiscNs, RamNs} = commit_nodes(CR, [], []),
    Pending = mnesia_checkpoint:tm_enter_pending(Tid, DiscNs, RamNs),
    ?ets_insert(Store, Pending),

    {WaitFor, Local} = ask_commit(sym_trans, Tid, CR, DiscNs, RamNs),
    {Outcome, []} = rec_all(WaitFor, Tid, do_commit, []),
    ?eval_debug_fun({?MODULE, multi_commit_sym},
                    [{tid, Tid}, {outcome, Outcome}]),
    rpc:abcast(DiscNs -- [node()], ?MODULE, {Tid, Outcome}),
    rpc:abcast(RamNs -- [node()], ?MODULE, {Tid, Outcome}),
    case Outcome of
        do_commit ->
            mnesia_recover:note_decision(Tid, committed),
            do_dirty(Tid, Local),
            mnesia_locker:release_tid(Tid),
            ?MODULE ! {delete_transaction, Tid};
        {do_abort, _Reason} ->
            mnesia_recover:note_decision(Tid, aborted)
    end,
    ?eval_debug_fun({?MODULE, multi_commit_sym, post},
                    [{tid, Tid}, {outcome, Outcome}]),
    Outcome;

multi_commit(sync_sym_trans, Tid, CR, Store) ->
    %% This protocol is the same as sym_trans except that it
    %% uses synchronized calls to disk_log and synchronized commits
    %% when several nodes are involved.

    {DiscNs, RamNs} = commit_nodes(CR, [], []),
    Pending = mnesia_checkpoint:tm_enter_pending(Tid, DiscNs, RamNs),
    ?ets_insert(Store, Pending),

    {WaitFor, Local} = ask_commit(sync_sym_trans, Tid, CR, DiscNs, RamNs),
    {Outcome, []} = rec_all(WaitFor, Tid, do_commit, []),
    ?eval_debug_fun({?MODULE, multi_commit_sym_sync},
                    [{tid, Tid}, {outcome, Outcome}]),
    rpc:abcast(DiscNs -- [node()], ?MODULE, {Tid, Outcome}),
    rpc:abcast(RamNs -- [node()], ?MODULE, {Tid, Outcome}),
    case Outcome of
        do_commit ->
            mnesia_recover:note_decision(Tid, committed),
            mnesia_log:slog(Local),
            do_commit(Tid, Local),
            %% Just wait for completion, the result is ignored.
            rec_all(WaitFor, Tid, ignore, []),
            mnesia_locker:release_tid(Tid),
            ?MODULE ! {delete_transaction, Tid};
        {do_abort, _Reason} ->
            mnesia_recover:note_decision(Tid, aborted)
    end,
    ?eval_debug_fun({?MODULE, multi_commit_sym, post},
                    [{tid, Tid}, {outcome, Outcome}]),
    Outcome;

multi_commit(asym_trans, Tid, CR, Store) ->
    %% This more expensive commit protocol is used when
    %% table definitions are changed (schema transactions).
    %% It is also used when the involved tables are
    %% replicated asymmetrically. If the storage type differs
    %% on at least one node this protocol is used.
    %%
    %% 1 Ask the other involved nodes if they want to commit.
    %%   All involved nodes prepare the commit, log a presume_abort
    %%   commit record and vote yes or no depending on the
    %%   outcome of the prepare. The preparation is also performed
    %%   by the coordinator.
    %%
    %% 2a Somebody has died or voted no
    %%    Tell all yes voters to do_abort
    %% 2b Everybody has voted yes
    %%    Put an unclear marker in the log.
    %%    Tell the others to pre_commit. I.e. that they should
    %%    put an unclear marker in the log and reply
    %%    acc_pre_commit when they are done.
    %%
    %% 3a Somebody died
    %%    Tell the remaining participants to do_abort
    %% 3b Everybody has replied acc_pre_commit
    %%    Tell everybody to committed. I.e that they should
    %%    put a committed marker in the log, perform the updates
    %%    and reply done_commit when they are done. The coordinator
    %%    must wait with putting his committed marker into the log
    %%    until the committed has been sent to all the others.
    %%    Then he performs local commit before collecting replies.
    %%
    %% 4  Everybody has either died or replied done_commit
    %%    Return to the caller.
    %%
    %% Recovery:
    %%    If the coordinator dies, the participants (and
    %%    the coordinator when he starts again) must do
    %%    the following:
    %%
    %%    If we have no unclear marker in the log we may
    %%    safely abort, since we know that nobody may have
    %%    decided to commit yet.
    %%
    %%    If we have a committed marker in the log we may
    %%    safely commit since we know that everybody else
    %%    also will come to this conclusion.
    %%
    %%    If we have an unclear marker but no committed
    %%    in the log we are uncertain about the real outcome
    %%    of the transaction and must ask the others before
    %%    we can decide what to do. If someone knows the
    %%    outcome we will do the same. If nobody knows, we
    %%    will wait for the remaining involved nodes to come
    %%    up. When all involved nodes are up and uncertain,
    %%    we decide to commit (first put a committed marker
    %%    in the log, then do the updates).

    D = #decision{tid = Tid, outcome = presume_abort},
    {D2, CR2} = commit_decision(D, CR, [], []),
    DiscNs = D2#decision.disc_nodes,
    RamNs = D2#decision.ram_nodes,
    Pending = mnesia_checkpoint:tm_enter_pending(Tid, DiscNs, RamNs),
    ?ets_insert(Store, Pending),
    {WaitFor, Local} = ask_commit(asym_trans, Tid, CR2, DiscNs, RamNs),
    %% A failing local prepare is caught here and not acted upon
    %% until the votes of the other nodes have been collected.
    SchemaPrep = (catch mnesia_schema:prepare_commit(Tid, Local, {coord, WaitFor})),
    {Votes, Pids} = rec_all(WaitFor, Tid, do_commit, []),

    ?eval_debug_fun({?MODULE, multi_commit_asym_got_votes},
                    [{tid, Tid}, {votes, Votes}]),
    case Votes of
        do_commit ->
            case SchemaPrep of
                {_Modified, C = #commit{}, DumperMode} ->
                    mnesia_log:log(C), % C is not a binary
                    ?eval_debug_fun({?MODULE, multi_commit_asym_log_commit_rec},
                                    [{tid, Tid}]),

                    D3 = C#commit.decision,
                    D4 = D3#decision{outcome = unclear},
                    mnesia_recover:log_decision(D4),
                    ?eval_debug_fun({?MODULE, multi_commit_asym_log_commit_dec},
                                    [{tid, Tid}]),
                    tell_participants(Pids, {Tid, pre_commit}),
                    %% Now we are uncertain and we do not know
                    %% if all participants have logged that
                    %% they are uncertain or not
                    rec_acc_pre_commit(Pids, Tid, Store, {C,Local},
                                       do_commit, DumperMode, [], []);
                {'EXIT', Reason} ->
                    %% The others have logged the commit
                    %% record but they are not uncertain
                    mnesia_recover:note_decision(Tid, aborted),
                    ?eval_debug_fun({?MODULE, multi_commit_asym_prepare_exit},
                                    [{tid, Tid}]),
                    tell_participants(Pids, {Tid, {do_abort, Reason}}),
                    do_abort(Tid, Local),
                    {do_abort, Reason}
            end;

        {do_abort, Reason} ->
            %% The others have logged the commit
            %% record but they are not uncertain
            mnesia_recover:note_decision(Tid, aborted),
            ?eval_debug_fun({?MODULE, multi_commit_asym_do_abort}, [{tid, Tid}]),
            tell_participants(Pids, {Tid, {do_abort, Reason}}),
            do_abort(Tid, Local),
            {do_abort, Reason}
    end.

%% Collect the replies to pre_commit from the participants, one pid at
%% a time, and then carry out the decision.
%% GoodPids are the participants that have accepted the pre_commit,
%% SchemaAckPids those that will also acknowledge the schema commit.
%% Returns do_commit or {do_abort, Reason}
rec_acc_pre_commit([Pid | Tail], Tid, Store, Commit, Res, DumperMode,
                   GoodPids, SchemaAckPids) ->
    receive
        {?MODULE, _, {acc_pre_commit, Tid, Pid, true}} ->
            rec_acc_pre_commit(Tail, Tid, Store, Commit, Res, DumperMode,
                               [Pid | GoodPids], [Pid | SchemaAckPids]);

        {?MODULE, _, {acc_pre_commit, Tid, Pid, false}} ->
            rec_acc_pre_commit(Tail, Tid, Store, Commit, Res, DumperMode,
                               [Pid | GoodPids], SchemaAckPids);

        {?MODULE, _, {acc_pre_commit, Tid, Pid}} ->
            %% Kept for backwards compatibility. Remove after Mnesia 4.x
            rec_acc_pre_commit(Tail, Tid, Store, Commit, Res, DumperMode,
                               [Pid | GoodPids], [Pid | SchemaAckPids]);
        {?MODULE, _, {do_abort, Tid, Pid, _Reason}} ->
            AbortRes = {do_abort, {bad_commit, node(Pid)}},
            rec_acc_pre_commit(Tail, Tid, Store, Commit, AbortRes, DumperMode,
                               GoodPids, SchemaAckPids);
        {mnesia_down, Node} when Node == node(Pid) ->
            AbortRes = {do_abort, {bad_commit, Node}},
            catch Pid ! {Tid, AbortRes}, %% Tell him that he has died
            rec_acc_pre_commit(Tail, Tid, Store, Commit, AbortRes, DumperMode,
                               GoodPids, SchemaAckPids)
    end;
rec_acc_pre_commit([], Tid, Store, {Commit,OrigC}, Res, DumperMode, GoodPids, SchemaAckPids) ->
    D = Commit#commit.decision,
    case Res of
        do_commit ->
            %% Now everybody knows that the others
            %% have voted yes. We also know that
            %% everybody is uncertain.
            prepare_sync_schema_commit(Store, SchemaAckPids),
            tell_participants(GoodPids, {Tid, committed}),
            D2 = D#decision{outcome = committed},
            mnesia_recover:log_decision(D2),
            ?eval_debug_fun({?MODULE, rec_acc_pre_commit_log_commit},
                            [{tid, Tid}]),

            %% Now we have safely logged committed
            %% and we can recover without asking others
            do_commit(Tid, Commit, DumperMode),
            ?eval_debug_fun({?MODULE, rec_acc_pre_commit_done_commit},
                            [{tid, Tid}]),
            sync_schema_commit(Tid, Store, SchemaAckPids),
            mnesia_locker:release_tid(Tid),
            ?MODULE ! {delete_transaction, Tid};

        {do_abort, Reason} ->
            tell_participants(GoodPids, {Tid, {do_abort, Reason}}),
            D2 = D#decision{outcome = aborted},
            mnesia_recover:log_decision(D2),
            ?eval_debug_fun({?MODULE, rec_acc_pre_commit_log_abort},
                            [{tid, Tid}]),
            do_abort(Tid, OrigC),
            ?eval_debug_fun({?MODULE, rec_acc_pre_commit_done_abort},
                            [{tid, Tid}])
    end,
    Res.

%% Note all nodes in case of mnesia_down mgt:
%% a waiting_for_commit_ack entry is inserted in Store for each pid.
prepare_sync_schema_commit(_Store, []) ->
    ok;
prepare_sync_schema_commit(Store, [Pid | Pids]) ->
    ?ets_insert(Store, {waiting_for_commit_ack, node(Pid)}),
    prepare_sync_schema_commit(Store, Pids).

%% Wait for each pid to acknowledge the schema commit, or for its node
%% to go down, and remove the waiting_for_commit_ack entry of the node.
sync_schema_commit(_Tid, _Store, []) ->
    ok;
sync_schema_commit(Tid, Store, [Pid | Tail]) ->
    receive
        {?MODULE, _, {schema_commit, Tid, Pid}} ->
            ?ets_match_delete(Store, {waiting_for_commit_ack, node(Pid)}),
            sync_schema_commit(Tid, Store, Tail);

        {mnesia_down, Node} when Node == node(Pid) ->
            ?ets_match_delete(Store, {waiting_for_commit_ack, Node}),
            sync_schema_commit(Tid, Store, Tail)
    end.

%% Send Msg to every pid in the list.
tell_participants([Pid | Pids], Msg) ->
    Pid ! Msg,
    tell_participants(Pids, Msg);
tell_participants([], _Msg) ->
    ok.
%% Participant side of the asym_trans commit protocol.
%% The commit record is given either as a binary or as a #commit{}.
%% Trap exit because we can get a shutdown from application manager
commit_participant(Coord, Tid, Bin, DiscNs, RamNs) when is_binary(Bin) ->
    process_flag(trap_exit, true),
    Commit = binary_to_term(Bin),
    commit_participant(Coord, Tid, Bin, Commit, DiscNs, RamNs);
commit_participant(Coord, Tid, C = #commit{}, DiscNs, RamNs) ->
    process_flag(trap_exit, true),
    commit_participant(Coord, Tid, C, C, DiscNs, RamNs).

%% Bin is the commit record as it was received and C0 is the same
%% record as a term. The schema commit is prepared and a vote is sent
%% to Coord; after a yes vote the process waits for pre_commit and then
%% for the final decision. It always ends by releasing the locks of Tid
%% and exiting with reason normal.
commit_participant(Coord, Tid, Bin, C0, DiscNs, _RamNs) ->
    ?eval_debug_fun({?MODULE, commit_participant, pre}, [{tid, Tid}]),
    case catch mnesia_schema:prepare_commit(Tid, C0, {part, Coord}) of
        {Modified, C = #commit{}, DumperMode} ->
            %% If we can not find any local unclear decision
            %% we should presume abort at startup recovery
            case lists:member(node(), DiscNs) of
                false ->
                    ignore;
                true ->
                    %% Log the record as it was received, unless the
                    %% prepare has modified it
                    case Modified of
                        false -> mnesia_log:log(Bin);
                        true  -> mnesia_log:log(C)
                    end
            end,
            ?eval_debug_fun({?MODULE, commit_participant, vote_yes},
                            [{tid, Tid}]),
            reply(Coord, {vote_yes, Tid, self()}),

            receive
                {Tid, pre_commit} ->
                    D = C#commit.decision,
                    mnesia_recover:log_decision(D#decision{outcome = unclear}),
                    ?eval_debug_fun({?MODULE, commit_participant, pre_commit},
                                    [{tid, Tid}]),
                    Expect_schema_ack = C#commit.schema_ops /= [],
                    reply(Coord, {acc_pre_commit, Tid, self(), Expect_schema_ack}),

                    %% Now we are vulnerable for failures, since
                    %% we cannot decide without asking others
                    receive
                        {Tid, committed} ->
                            mnesia_recover:log_decision(D#decision{outcome = committed}),
                            ?eval_debug_fun({?MODULE, commit_participant, log_commit},
                                            [{tid, Tid}]),
                            do_commit(Tid, C, DumperMode),
                            case Expect_schema_ack of
                                false -> ignore;
                                true -> reply(Coord, {schema_commit, Tid, self()})
                            end,
                            ?eval_debug_fun({?MODULE, commit_participant, do_commit},
                                            [{tid, Tid}]);

                        {Tid, {do_abort, _Reason}} ->
                            mnesia_recover:log_decision(D#decision{outcome = aborted}),
                            ?eval_debug_fun({?MODULE, commit_participant, log_abort},
                                            [{tid, Tid}]),
                            mnesia_schema:undo_prepare_commit(Tid, C0),
                            ?eval_debug_fun({?MODULE, commit_participant, undo_prepare},
                                            [{tid, Tid}]);

                        {'EXIT', _, _} ->
                            mnesia_recover:log_decision(D#decision{outcome = aborted}),
                            ?eval_debug_fun({?MODULE, commit_participant, exit_log_abort},
                                            [{tid, Tid}]),
                            mnesia_schema:undo_prepare_commit(Tid, C0),
                            ?eval_debug_fun({?MODULE, commit_participant, exit_undo_prepare},
                                            [{tid, Tid}]);

                        Msg ->
                            verbose("** ERROR ** commit_participant ~p, got unexpected msg: ~p~n",
                                    [Tid, Msg])
                    end;
                {Tid, {do_abort, Reason}} ->
                    reply(Coord, {do_abort, Tid, self(), Reason}),
                    mnesia_schema:undo_prepare_commit(Tid, C0),
                    ?eval_debug_fun({?MODULE, commit_participant, pre_commit_undo_prepare},
                                    [{tid, Tid}]);

                {'EXIT', _, Reason} ->
                    reply(Coord, {do_abort, Tid, self(), {bad_commit,Reason}}),
                    mnesia_schema:undo_prepare_commit(Tid, C0),
                    ?eval_debug_fun({?MODULE, commit_participant, pre_commit_undo_prepare}, [{tid, Tid}]);

                Msg ->
                    reply(Coord, {do_abort, Tid, self(), {bad_commit,internal}}),
                    verbose("** ERROR ** commit_participant ~p, got unexpected msg: ~p~n",
                            [Tid, Msg])
            end;

        {'EXIT', Reason} ->
            ?eval_debug_fun({?MODULE, commit_participant, vote_no},
                            [{tid, Tid}]),
            reply(Coord, {vote_no, Tid, Reason}),
            mnesia_schema:undo_prepare_commit(Tid, C0)
    end,
    mnesia_locker:release_tid(Tid),
    ?MODULE ! {delete_transaction, Tid},
    unlink(whereis(?MODULE)),
    exit(normal).

%% Undo the prepared schema commit of Tid. Returns the commit record.
do_abort(Tid, Bin) when is_binary(Bin) ->
    %% Possible optimization:
    %% If we want we could pass around a flag
    %% that tells us whether the binary contains
    %% schema ops or not. Only if the binary
    %% contains schema ops is it meaningful to
    %% unpack the binary and perform
    %% mnesia_schema:undo_prepare_commit/1.
    do_abort(Tid, binary_to_term(Bin));
do_abort(Tid, Commit) ->
    mnesia_schema:undo_prepare_commit(Tid, Commit),
    Commit.
%% Log and apply a dirty update locally.  Only commit records
%% without schema operations are accepted.
do_dirty(Tid, Commit = #commit{schema_ops = []}) ->
    mnesia_log:log(Commit),
    do_commit(Tid, Commit).

%% do_commit(Tid, CommitRecord)
%% Same as do_commit/3 with the dumper mode 'optional'.  A packed
%% (binary) commit record is unpacked by do_commit/3.
do_commit(Tid, C) ->
    do_commit(Tid, C, optional).

%% Apply a commit record: schema operations first, then the snmp
%% updates and finally the table updates, one storage type at a
%% time.  The result of each step is threaded into the next one.
do_commit(Tid, Bin, DumperMode) when is_binary(Bin) ->
    do_commit(Tid, binary_to_term(Bin), DumperMode);
do_commit(Tid, C, DumperMode) ->
    mnesia_dumper:update(Tid, C#commit.schema_ops, DumperMode),
    SnmpRes = do_snmp(Tid, C#commit.snmp),
    PerStorage = [{ram_copies, C#commit.ram_copies},
                  {disc_copies, C#commit.disc_copies},
                  {disc_only_copies, C#commit.disc_only_copies}],
    lists:foldl(fun({Storage, Ops}, Res) ->
                        do_update(Tid, Storage, Ops, Res)
                end, SnmpRes, PerStorage).

%% Update the items.
%% An operation returning 'ok' keeps the accumulated result, any
%% other non-exit value replaces it.
do_update(Tid, Storage, Ops, Res0) ->
    ApplyOp =
        fun(Op, Res) ->
                case catch do_update_op(Tid, Storage, Op) of
                    ok ->
                        Res;
                    {'EXIT', Reason} ->
                        %% This may only happen when we recently have
                        %% deleted our local replica, changed
                        %% storage_type or transformed the table.
                        %% BUGBUG: Updates may be lost if storage_type
                        %% is changed.  Determine actual storage type
                        %% and try again.
                        %% BUGBUG: Updates may be lost if the table is
                        %% transformed.
                        verbose("do_update in ~w failed: ~p -> {'EXIT', ~p}~n",
                                [Tid, Op, Reason]),
                        Res;
                    NewRes ->
                        NewRes
                end
        end,
    lists:foldl(ApplyOp, Res0, Ops).
%% Perform one table operation.  The commit work registered for the
%% table (checkpoints, subscribers, index) is run first, then the
%% table itself is updated.
do_update_op(Tid, Storage, {{Tab, Key}, Rec, write}) ->
    Work = ?catch_val({Tab, commit_work}),
    commit_write(Work, Tid, Tab, Key, Rec, undefined),
    mnesia_lib:db_put(Storage, Tab, Rec);

do_update_op(Tid, Storage, {{Tab, Key}, Val, delete}) ->
    Work = ?catch_val({Tab, commit_work}),
    commit_delete(Work, Tid, Tab, Key, Val, undefined),
    mnesia_lib:db_erase(Storage, Tab, Key);

do_update_op(Tid, Storage, {{Tab, Key}, {RecName, Incr}, update_counter}) ->
    {NewObj, OldObjs} =
        case catch mnesia_lib:db_update_counter(Storage, Tab, Key, Incr) of
            Counter when is_integer(Counter), Counter >= 0 ->
                {{RecName, Key, Counter}, [{RecName, Key, Counter - Incr}]};
            _ when Incr > 0 ->
                %% The counter could not be updated: store the
                %% increment as the new value.
                Initial = {RecName, Key, Incr},
                mnesia_lib:db_put(Storage, Tab, Initial),
                {Initial, []};
            _ ->
                %% Otherwise the counter is stored as zero.
                Reset = {RecName, Key, 0},
                mnesia_lib:db_put(Storage, Tab, Reset),
                {Reset, []}
        end,
    Work = ?catch_val({Tab, commit_work}),
    commit_update(Work, Tid, Tab, Key, NewObj, OldObjs),
    %% The result of the operation is the new counter value.
    element(3, NewObj);

do_update_op(Tid, Storage, {{Tab, Key}, Rec, delete_object}) ->
    Work = ?catch_val({Tab, commit_work}),
    commit_del_object(Work, Tid, Tab, Key, Rec, undefined),
    mnesia_lib:db_match_erase(Storage, Tab, Rec);

do_update_op(Tid, Storage, {{Tab, Key}, Rec, clear_table}) ->
    Work = ?catch_val({Tab, commit_work}),
    commit_clear(Work, Tid, Tab, Key, Rec),
    mnesia_lib:db_match_erase(Storage, Tab, Rec).

%% Run the commit work items of a table for a write operation.
commit_write([], _, _, _, _, _) ->
    ok;
commit_write([{checkpoints, CpList} | Rest], Tid, Tab, Key, Rec, Old) ->
    mnesia_checkpoint:tm_retain(Tid, Tab, Key, write, CpList),
    commit_write(Rest, Tid, Tab, Key, Rec, Old);
commit_write([Item | Rest], Tid, Tab, Key, Rec, Old)
  when element(1, Item) == subscribers ->
    mnesia_subscr:report_table_event(Item, Tab, Tid, Rec, write, Old),
    commit_write(Rest, Tid, Tab, Key, Rec, Old);
commit_write([Item | Rest], Tid, Tab, Key, Rec, Old)
  when element(1, Item) == index ->
    mnesia_index:add_index(Item, Tab, Key, Rec, Old),
    commit_write(Rest, Tid, Tab, Key, Rec, Old).
%% Run the commit work items of a table for a counter update.
%% The value returned by the checkpoint retainer replaces the old
%% value handed to the remaining work items.
commit_update([], _, _, _, _, _) ->
    ok;
commit_update([{checkpoints, CpList} | Rest], Tid, Tab, Key, Rec, _) ->
    Retained = mnesia_checkpoint:tm_retain(Tid, Tab, Key, write, CpList),
    commit_update(Rest, Tid, Tab, Key, Rec, Retained);
commit_update([Item | Rest], Tid, Tab, Key, Rec, Old)
  when element(1, Item) == subscribers ->
    mnesia_subscr:report_table_event(Item, Tab, Tid, Rec, write, Old),
    commit_update(Rest, Tid, Tab, Key, Rec, Old);
commit_update([Item | Rest], Tid, Tab, Key, Rec, Old)
  when element(1, Item) == index ->
    mnesia_index:add_index(Item, Tab, Key, Rec, Old),
    commit_update(Rest, Tid, Tab, Key, Rec, Old).

%% Run the commit work items of a table for a delete operation.
commit_delete([], _, _, _, _, _) ->
    ok;
commit_delete([{checkpoints, CpList} | Rest], Tid, Tab, Key, Rec, _) ->
    Retained = mnesia_checkpoint:tm_retain(Tid, Tab, Key, delete, CpList),
    commit_delete(Rest, Tid, Tab, Key, Rec, Retained);
commit_delete([Item | Rest], Tid, Tab, Key, Rec, Old)
  when element(1, Item) == subscribers ->
    mnesia_subscr:report_table_event(Item, Tab, Tid, Rec, delete, Old),
    commit_delete(Rest, Tid, Tab, Key, Rec, Old);
commit_delete([Item | Rest], Tid, Tab, Key, Rec, Old)
  when element(1, Item) == index ->
    mnesia_index:delete_index(Item, Tab, Key),
    commit_delete(Rest, Tid, Tab, Key, Rec, Old).

%% Run the commit work items of a table for a delete_object
%% operation.
commit_del_object([], _, _, _, _, _) ->
    ok;
commit_del_object([{checkpoints, CpList} | Rest], Tid, Tab, Key, Rec, _) ->
    Retained = mnesia_checkpoint:tm_retain(Tid, Tab, Key, delete_object, CpList),
    commit_del_object(Rest, Tid, Tab, Key, Rec, Retained);
commit_del_object([Item | Rest], Tid, Tab, Key, Rec, Old)
  when element(1, Item) == subscribers ->
    mnesia_subscr:report_table_event(Item, Tab, Tid, Rec, delete_object, Old),
    commit_del_object(Rest, Tid, Tab, Key, Rec, Old);
commit_del_object([Item | Rest], Tid, Tab, Key, Rec, Old)
  when element(1, Item) == index ->
    mnesia_index:del_object_index(Item, Tab, Key, Rec, Old),
    commit_del_object(Rest, Tid, Tab, Key, Rec, Old).
%% Run the commit work items of a table for a clear_table
%% operation.
commit_clear([], _, _, _, _) ->
    ok;
commit_clear([{checkpoints, CpList} | Rest], Tid, Tab, Key, Rec) ->
    mnesia_checkpoint:tm_retain(Tid, Tab, Key, clear_table, CpList),
    commit_clear(Rest, Tid, Tab, Key, Rec);
commit_clear([Item | Rest], Tid, Tab, Key, Rec)
  when element(1, Item) == subscribers ->
    mnesia_subscr:report_table_event(Item, Tab, Tid, Rec, clear_table, undefined),
    commit_clear(Rest, Tid, Tab, Key, Rec);
commit_clear([Item | Rest], Tid, Tab, Key, Rec)
  when element(1, Item) == index ->
    mnesia_index:clear_index(Item, Tab, Key, Rec),
    commit_clear(Rest, Tid, Tab, Key, Rec).

%% Hand every snmp update to the snmp hook.  A failing update is
%% reported with verbose/2 and otherwise ignored.  Returns ok.
do_snmp(Tid, Updates) ->
    lists:foreach(
      fun(Update) ->
              case catch mnesia_snmp_hook:update(Update) of
                  ok ->
                      ignore;
                  {'EXIT', Reason} ->
                      %% This should only happen when we recently
                      %% have deleted our local replica or recently
                      %% detached the snmp table.
                      verbose("do_snmp in ~w failed: ~p -> {'EXIT', ~p}~n",
                              [Tid, Update, Reason])
              end
      end, Updates).

%% Split the nodes of a list of commit records into {DiscNodes,
%% RamNodes}.  A node is a ram node when its commit record has no
%% disc_copies, no disc_only_copies and no schema operations.
commit_nodes([#commit{disc_copies = [],
                      disc_only_copies = [],
                      schema_ops = [],
                      node = Node} | Tail], AccD, AccR) ->
    commit_nodes(Tail, AccD, [Node | AccR]);
commit_nodes([C | Tail], AccD, AccR) ->
    commit_nodes(Tail, [C#commit.node | AccD], AccR);
commit_nodes([], AccD, AccR) ->
    {AccD, AccR}.

%% Classify every node as disc node or ram node, store the
%% resulting node lists in the decision record and put that
%% decision into each commit record.  Returns {Decision, Commits}.
commit_decision(D, [C | Tail], AccD, AccR) ->
    N = C#commit.node,
    RamOnly =
        case C#commit.schema_ops of
            [] ->
                C#commit.disc_copies == [] andalso
                    C#commit.disc_only_copies == [];
            Ops ->
                ram_only_ops(N, Ops)
        end,
    {D2, Tail2} =
        case RamOnly of
            true -> commit_decision(D, Tail, AccD, [N | AccR]);
            false -> commit_decision(D, Tail, [N | AccD], AccR)
        end,
    {D2, [C#commit{decision = D2} | Tail2]};
commit_decision(D, [], AccD, AccR) ->
    {D#decision{disc_nodes = AccD, ram_nodes = AccR}, []}.
%% True when the schema operations only need ram on node N, i.e.
%% N holds no disc copy of the schema.  Changing the copy type of
%% the schema table itself on N always counts as a disc operation.
ram_only_ops(N, [{op, change_table_copy_type, N, _FromS, _ToS, Cs} | _Ops]) ->
    not (lists:member({name, schema}, Cs) orelse
         lists:member(N, val({schema, disc_copies})));
ram_only_ops(N, _Ops) ->
    not lists:member(N, val({schema, disc_copies})).

%% Returns {WaitFor, Res}
%% Remote nodes get a sync_dirty request and are added to WaitFor.
%% The local commit record is applied after the requests to the
%% nodes following it in the list have been sent, and its result
%% becomes Res.
sync_send_dirty(Tid, [Head | Tail], Tab, WaitFor) ->
    Node = Head#commit.node,
    case Node == node() of
        true ->
            {Pending, _} = sync_send_dirty(Tid, Tail, Tab, WaitFor),
            LocalRes = do_dirty(Tid, Head),
            {Pending, LocalRes};
        false ->
            {?MODULE, Node} ! {self(), {sync_dirty, Tid, Head, Tab}},
            sync_send_dirty(Tid, Tail, Tab, [Node | WaitFor])
    end;
sync_send_dirty(_Tid, [], _Tab, WaitFor) ->
    {WaitFor, {'EXIT', {aborted, {node_not_running, WaitFor}}}}.

%% Returns {WaitFor, Res}
async_send_dirty(_Tid, _Nodes, Tab, nowhere) ->
    {[], {'EXIT', {aborted, {no_exists, Tab}}}};
async_send_dirty(Tid, Nodes, Tab, ReadNode) ->
    async_send_dirty(Tid, Nodes, Tab, ReadNode, [], ok).

%% Only the read node is handled synchronously: locally when it is
%% this node, otherwise with a sync_dirty request that is added to
%% WaitFor.  All other nodes get an async_dirty request.
async_send_dirty(Tid, [Head | Tail], Tab, ReadNode, WaitFor, Res) ->
    Node = Head#commit.node,
    case {ReadNode == Node, Node == node()} of
        {true, true} ->
            LocalRes = do_dirty(Tid, Head),
            async_send_dirty(Tid, Tail, Tab, ReadNode, WaitFor, LocalRes);
        {true, false} ->
            {?MODULE, Node} ! {self(), {sync_dirty, Tid, Head, Tab}},
            Pessimistic = {'EXIT', {aborted, {node_not_running, Node}}},
            async_send_dirty(Tid, Tail, Tab, ReadNode, [Node | WaitFor], Pessimistic);
        {false, _} ->
            {?MODULE, Node} ! {self(), {async_dirty, Tid, Head, Tab}},
            async_send_dirty(Tid, Tail, Tab, ReadNode, WaitFor, Res)
    end;
async_send_dirty(_Tid, [], _Tab, _ReadNode, WaitFor, Res) ->
    {WaitFor, Res}.

%% Collect the dirty replies from the given remote nodes.
rec_dirty([Node | Tail], Res) when Node /= node() ->
    rec_dirty(Tail, get_dirty_reply(Node, Res));
rec_dirty([], Res) ->
    Res.
%% Wait for the reply to a dirty request sent to Node.  Res is the
%% result collected so far.
get_dirty_reply(Node, Res) ->
    receive
        {?MODULE, Node, {'EXIT', Reason}} ->
            {'EXIT', {aborted, {badarg, Reason}}};
        {?MODULE, Node, {dirty_res, ok}} ->
            case Res of
                {'EXIT', {aborted, {node_not_running, _}}} ->
                    ok;
                _ ->
                    %% Prioritize bad results, except
                    %% node_not_running
                    Res
            end;
        {?MODULE, Node, {dirty_res, Reply}} ->
            Reply;
        {mnesia_down, Node} ->
            case get(mnesia_activity_state) of
                {_, Tid, _Ts} when element(1,Tid) == tid ->
                    %% A dirty operation called inside a
                    %% transaction: restart the transaction to
                    %% keep it from hanging.
                    mnesia:abort({node_not_running, Node});
                _ ->
                    %% It is ok to ignore mnesia_down here since
                    %% the replicas are made consistent again when
                    %% Node is started.
                    Res
            end
    after 1000 ->
            %% Keep waiting for as long as Node is among the
            %% current db_nodes.
            case lists:member(Node, val({current, db_nodes})) of
                true -> get_dirty_reply(Node, Res);
                false -> Res
            end
    end.

%% Assume that CommitRecord is no binary
%% Returns {WaitFor, Local}: the remote nodes that were asked and
%% the commit record of the local node (no_local if there is none).
ask_commit(Protocol, Tid, CR, DiscNs, RamNs) ->
    ask_commit(Protocol, Tid, CR, DiscNs, RamNs, [], no_local).

ask_commit(Protocol, Tid, [Head | Tail], DiscNs, RamNs, WaitFor, Local) ->
    Node = Head#commit.node,
    case Node == node() of
        true ->
            %% Our own commit record is returned to the caller.
            ask_commit(Protocol, Tid, Tail, DiscNs, RamNs, WaitFor, Head);
        false ->
            Packed = opt_term_to_binary(Protocol, Head, DiscNs++RamNs),
            Request = {ask_commit, Protocol, Tid, Packed, DiscNs, RamNs},
            {?MODULE, Node} ! {self(), Request},
            ask_commit(Protocol, Tid, Tail, DiscNs, RamNs, [Node | WaitFor], Local)
    end;
ask_commit(_Protocol, _Tid, [], _DiscNs, _RamNs, WaitFor, Local) ->
    {WaitFor, Local}.

%% This used to test protocol conversion between mnesia-nodes
%% but it is really dependent on the emulator version on the
%% two nodes (if funs are sent, which they are in the transform
%% table op).  To be safe we let erts do the translation (many
%% times maybe, and thus slower, but it works).
%% Disabled clause:
% opt_term_to_binary(asym_trans, Head, Nodes) ->
%     opt_term_to_binary(Nodes, Head);
%% The commit record is passed on unchanged.
opt_term_to_binary(_Protocol, Head, _Nodes) ->
    Head.

%% Collect the votes from the nodes we are waiting for.
%% Returns {Res, Pids} where Pids are the participant processes
%% that announced themselves in their vote.  A 'vote_no' or a
%% mnesia_down from a node turns Res into {do_abort, Reason}.
rec_all([Node | Tail], Tid, Res, Pids) ->
    receive
        {?MODULE, Node, {vote_yes, Tid}} ->
            rec_all(Tail, Tid, Res, Pids);
        {?MODULE, Node, {vote_yes, Tid, Pid}} ->
            rec_all(Tail, Tid, Res, [Pid | Pids]);
        {?MODULE, Node, {vote_no, Tid, Reason}} ->
            rec_all(Tail, Tid, {do_abort, Reason}, Pids);
        {?MODULE, Node, {committed, Tid}} ->
            rec_all(Tail, Tid, Res, Pids);
        {?MODULE, Node, {aborted, Tid}} ->
            rec_all(Tail, Tid, Res, Pids);

        {mnesia_down, Node} ->
            %% Make sure that mnesia_tm knows it has died
            %% it may have been restarted
            Abort = {do_abort, {bad_commit, Node}},
            catch {?MODULE, Node} ! {Tid, Abort},
            rec_all(Tail, Tid, Abort, Pids)
    end;
rec_all([], _Tid, Res, Pids) ->
    {Res, Pids}.

%% Returns one {Counter, OwnerPid, Status} tuple for every
%% coordinator transaction known to the transaction manager.
get_transactions() ->
    {info, Participant, Coordinator} = req(info),
    lists:map(fun({Tid, _Tabs}) ->
                      Status = tr_status(Tid, Participant),
                      {Tid#tid.counter, Tid#tid.pid, Status}
              end, Coordinator).

%% Status of Tid given the list of #participant{} records:
%% 'participant' when a record for Tid is present, otherwise
%% 'coordinator'.
tr_status(Tid, Participant) ->
    %% The list holds #participant{} records, so the key is the
    %% tid field.  Element 1 of a record is the record name and
    %% can never be equal to a Tid.
    case lists:keymember(Tid, #participant.tid, Participant) of
        true -> participant;
        false -> coordinator
    end.

%% Ask the transaction manager for its participant and coordinator
%% transactions.  Returns {info, Part, Coord}, or {timeout, Timeout}
%% when the manager is not registered or does not answer in time.
get_info(Timeout) ->
    case whereis(?MODULE) of
        undefined ->
            {timeout, Timeout};
        Pid ->
            Pid ! {self(), info},
            receive
                {?MODULE, _, {info, Part, Coord}} ->
                    {info, Part, Coord}
            after Timeout ->
                    {timeout, Timeout}
            end
    end.

%% Print the result of get_info/1 on Stream.
display_info(Stream, {timeout, T}) ->
    io:format(Stream, "---> No info about coordinator and participant transactions, "
              "timeout ~p <--- ~n", [T]);

display_info(Stream, {info, Part, Coord}) ->
    io:format(Stream, "---> Participant transactions <--- ~n", []),
    lists:foreach(fun(P) -> pr_participant(Stream, P) end, Part),
    io:format(Stream, "---> Coordinator transactions <---~n", []),
    lists:foreach(fun({Tid, _Tabs}) -> pr_tid(Stream, Tid) end, Coord).
%% Print the tid of a participant transaction followed by its
%% (unpacked) commit record.
pr_participant(Stream, P) ->
    Commit =
        case P#participant.commit of
            Packed when is_binary(Packed) -> binary_to_term(Packed);
            Unpacked -> Unpacked
        end,
    pr_tid(Stream, P#participant.tid),
    io:format(Stream, "with participant objects ~p~n", [Commit]).


%% Print the serial number and the owner of a transaction.
pr_tid(Stream, Tid) ->
    io:format(Stream, "Tid: ~p (owned by ~p) ~n",
              [Tid#tid.counter, Tid#tid.pid]).

%% Print what the transaction manager knows about the transaction
%% with the given serial number.
info(Serial) ->
    io:format( "Info about transaction with serial == ~p~n", [Serial]),
    {info, Participant, Trs} = req(info),
    search_pr_participant(Serial, Participant),
    search_pr_coordinator(Serial, Trs).


%% Print the owner of every coordinator transaction whose serial
%% number matches S.
search_pr_coordinator(_S, []) ->
    no;
search_pr_coordinator(S, [{Tid, _Ts} | Tail]) ->
    case Tid#tid.counter of
        S ->
            io:format( "Tid is coordinator, owner == \n", []),
            display_pid_info(Tid#tid.pid);
        _ ->
            ok
    end,
    search_pr_coordinator(S, Tail).

%% Print the owner and the commit record of every participant
%% transaction whose serial number is equal to S.
search_pr_participant(_S, []) ->
    false;
search_pr_participant(S, [P | Tail]) ->
    Tid = P#participant.tid,
    Stored = P#participant.commit,
    if
        Tid#tid.counter == S ->
            io:format( "Tid is participant to commit, owner == \n", []),
            Owner = Tid#tid.pid,
            display_pid_info(Owner),
            io:format( "Tid wants to write objects \n",[]),
            Commit =
                case is_binary(Stored) of
                    true -> binary_to_term(Stored);
                    false -> Stored
                end,
            io:format("~p~n", [Commit]);
        true ->
            ok
    end,
    %% Keep searching: all matching participants are printed.
    search_pr_participant(S, Tail).

%% Print one line of process information about Pid: the pid, its
%% initial call, its current function, its reductions and the
%% length of its message queue.
display_pid_info(Pid) ->
    case rpc:pinfo(Pid) of
        undefined ->
            io:format( "Dead process \n");
        Info ->
            InitialCall = fetch(initial_call, Info),
            Current =
                case fetch(current_function, Info) of
                    {Mod, Fun, Args} when is_list(Args) ->
                        %% Show the arity instead of the arguments.
                        {Mod, Fun, length(Args)};
                    Other ->
                        Other
                end,
            Reductions = fetch(reductions, Info),
            QueueLen = length(fetch(messages, Info)),
            pformat(io_lib:format("~p", [Pid]),
                    io_lib:format("~p", [InitialCall]),
                    io_lib:format("~p", [Current]), Reductions, QueueLen)
    end.

%% Print five values in fixed-width columns.
pformat(A1, A2, A3, A4, A5) ->
    io:format( "~-12s ~-21s ~-21s ~9w ~4w~n", [A1,A2,A3,A4,A5]).
%% Value stored under Key in a list of {Key, Value} tuples, or 0
%% when there is no such pair.
fetch(Key, Info) ->
    case lists:keyfind(Key, 1, Info) of
        {_, Val} -> Val;
        _ -> 0
    end.


%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%  reconfigure stuff comes here ......
%%%%%%%%%%%%%%%%%%%%%

%% Node N has gone down: tell the coordinators that may still be
%% waiting for it.  Returns ok.
reconfigure_coordinators(N, Coordinators) ->
    lists:foreach(
      fun({Tid, [Store | _]}) ->
              case mnesia_recover:outcome(Tid, unknown) of
                  committed ->
                      Waiting = ?ets_lookup(Store, waiting_for_commit_ack),
                      case lists:keymember(N, 2, Waiting) of
                          true ->
                              send_mnesia_down(Tid, Store, N);
                          false ->
                              ignore % avoid spurious mnesia_down messages
                      end;
                  aborted ->
                      ignore; % avoid spurious mnesia_down messages
                  _ ->
                      %% Tell the coordinator about the mnesia_down
                      send_mnesia_down(Tid, Store, N)
              end
      end, Coordinators).

%% Send {mnesia_down, Node} to the owner of Tid and to the
%% 'friends' recorded in its store.
send_mnesia_down(Tid, Store, Node) ->
    Friends = get_elements(friends, Store),
    send_to_pids([Tid#tid.pid | Friends], {mnesia_down, Node}).

%% Send Msg to every pid in the list; other elements are skipped.
send_to_pids(Receivers, Msg) ->
    lists:foreach(fun(Pid) when is_pid(Pid) -> Pid ! Msg;
                     (_) -> ok
                  end, Receivers).

%% Node N has gone down: for every participant transaction that
%% involves N and whose coordinator lived on N, determine the
%% outcome and tell the involved nodes.  Returns [].
reconfigure_participants(N, Participants) ->
    lists:foreach(
      fun(P) ->
              case lists:member(N, P#participant.disc_nodes) or
                  lists:member(N, P#participant.ram_nodes) of
                  false ->
                      %% Ignore, since we are not a participant
                      %% in the transaction.
                      ignore;
                  true ->
                      %% We are on a participant node, lets
                      %% check if the dead one was a
                      %% participant or a coordinator.
                      Tid = P#participant.tid,
                      if
                          node(Tid#tid.pid) /= N ->
                              %% Another participant node died. Ignore.
                              ignore;
                          true ->
                              %% The coordinator node has died and
                              %% we must determine the outcome of the
                              %% transaction and tell mnesia_tm on all
                              %% nodes (including the local node)
                              %% about it
                              verbose("Coordinator ~p in transaction ~p died~n",
                                      [Tid#tid.pid, Tid]),
                              Involved = P#participant.disc_nodes ++
                                  P#participant.ram_nodes,
                              Alive = Involved -- [N],
                              Protocol = P#participant.protocol,
                              tell_outcome(Tid, Protocol, N, Alive, Alive)
                      end
              end
      end, Participants),
    [].

%% We need to determine the outcome of the transaction and
%% tell mnesia_tm on all involved nodes (including the local node)
%% about the outcome.  Returns the outcome.
tell_outcome(Tid, Protocol, Node, CheckNodes, TellNodes) ->
    Outcome = mnesia_recover:what_happened(Tid, Protocol, CheckNodes),
    Verdict =
        case Outcome of
            aborted -> {do_abort, {mnesia_down, Node}};
            committed -> do_commit
        end,
    rpc:abcast(TellNodes, ?MODULE, {Tid, Verdict}),
    Outcome.

%% Tell the owner of every coordinator transaction that the local
%% node is going down, stop the checkpoints and the log and exit
%% with reason shutdown.
do_stop(#state{coordinators = Coordinators}) ->
    Down = {mnesia_down, node()},
    Notify = fun({Tid, _}) -> Tid#tid.pid ! Down end,
    lists:foreach(Notify, gb_trees:to_list(Coordinators)),
    mnesia_checkpoint:stop(),
    mnesia_log:stop(),
    exit(shutdown).

%% Send a fixtable request to the transaction manager.  Exits with
%% {no_exists, Tab} when the answer is 'error'.
fixtable(Tab, Lock, Me) ->
    case req({fixtable, [Tab,Lock,Me]}) of
        error -> exit({no_exists, Tab});
        Answer -> Answer
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% System upgrade

%% Resume the main loop with the given state.
system_continue(_Parent, _Debug, State) ->
    doit_loop(State).

%% Shut down in the same way as an ordinary stop.
system_terminate(_Reason, _Parent, _Debug, State) ->
    do_stop(State).
%% Convert the state between the list representation and the
%% gb_trees representation of the coordinators and participants.
%% On downgrade a tuple (gb_trees) state is turned into lists, on
%% any other code change a list state is turned into gb_trees.
%% A state that already has the wanted representation is returned
%% unchanged.
system_code_change(State = #state{coordinators = CoordTree,
                                  participants = PartTree},
                   _Module, _OldVsn, downgrade) when is_tuple(CoordTree) ->
    CoordList = gb_trees:to_list(CoordTree),
    PartList = gb_trees:values(PartTree),
    {ok, State#state{coordinators = CoordList, participants = PartList}};
system_code_change(State = #state{}, _Module, _OldVsn, downgrade) ->
    {ok, State};

system_code_change(State = #state{coordinators = CoordList,
                                  participants = PartList},
                   _Module, _OldVsn, _Extra) when is_list(CoordList) ->
    CoordTree = gb_trees:from_orddict(lists:sort(CoordList)),
    %% Participants are keyed on their tid in the tree.
    Keyed = [{P#participant.tid, P} || P <- PartList],
    PartTree = gb_trees:from_orddict(lists:sort(Keyed)),
    {ok, State#state{coordinators = CoordTree, participants = PartTree}};
system_code_change(State = #state{}, _Module, _OldVsn, _Extra) ->
    {ok, State}.