aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/dialyzer/test/small_SUITE_data/src/ddfs_master/common_types.hrl6
-rw-r--r--lib/dialyzer/test/small_SUITE_data/src/ddfs_master/config.hrl148
-rw-r--r--lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs.hrl9
-rw-r--r--lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_gc.hrl17
-rw-r--r--lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_master.erl531
-rw-r--r--lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_tag.hrl19
-rw-r--r--lib/dialyzer/test/small_SUITE_data/src/ddfs_master/gs_util.hrl16
-rw-r--r--lib/hipe/cerl/erl_types.erl41
8 files changed, 769 insertions, 18 deletions
diff --git a/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/common_types.hrl b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/common_types.hrl
new file mode 100644
index 0000000000..f362a06bca
--- /dev/null
+++ b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/common_types.hrl
@@ -0,0 +1,6 @@
+-type host() :: nonempty_string().
+-type path() :: nonempty_string().
+-type url() :: binary().
+
+% The host portion of a url, if available.
+-type url_host() :: host() | none.
diff --git a/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/config.hrl b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/config.hrl
new file mode 100644
index 0000000000..8cab65fc9c
--- /dev/null
+++ b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/config.hrl
@@ -0,0 +1,148 @@
+
+-define(SECOND, 1000).
+-define(MINUTE, (60 * ?SECOND)).
+-define(HOUR, (60 * ?MINUTE)).
+-define(DAY, (24 * ?HOUR)).
+-define(MB, (1024 * 1024)).
+
+% Maximum length of tag/blob prefix
+-define(NAME_MAX, 511).
+
+% How long ddfs node startup can take. The most time-consuming part
+% is the scanning of the tag objects in the node's DDFS volumes.
+-define(NODE_STARTUP, (1 * ?MINUTE)).
+
+% How long to wait on the master for replies from nodes.
+-define(NODE_TIMEOUT, (10 * ?SECOND)).
+
+% How long to wait for a reply from an operation coordinated by the
+% master that accesses nodes. This value should be larger than
+% NODE_TIMEOUT.
+-define(NODEOP_TIMEOUT, (1 * ?MINUTE)).
+
+% The minimum amount of free space a node must have, to be considered
+% a primary candidate host for a new blob.
+-define(MIN_FREE_SPACE, (1024 * ?MB)).
+
+% The maximum number of active HTTP connections on a system (this
+% applies separately for GET and PUT operations).
+-define(HTTP_MAX_ACTIVE, 3).
+
+% The maximum number of waiting HTTP connections to queue up on a busy system.
+-define(HTTP_QUEUE_LENGTH, 100).
+
+% The maximum number of simultaneous HTTP connections. Note that
+% HTTP_MAX_CONNS * 2 * 2 + 32 < Maximum number of file descriptors, where
+% 2 = Get and put, 2 = two FDs required for each connection (connection
+% itself + a file it accesses), 32 = a guess how many extra fds is needed.
+-define(HTTP_MAX_CONNS, 128).
+
+% How long to keep a PUT request in queue if the system is busy.
+-define(PUT_WAIT_TIMEOUT, (1 * ?MINUTE)).
+
+% How long to keep a GET request in queue if the system is busy.
+-define(GET_WAIT_TIMEOUT, (1 * ?MINUTE)).
+
+% An unused loaded tag expires in TAG_EXPIRES milliseconds. Note that
+% if TAG_EXPIRES is not smaller than GC_INTERVAL, tags will never
+% expire from the memory cache and will always take up memory.
+-define(TAG_EXPIRES, (10 * ?HOUR)).
+
+% How often the master's cache of all known tag names is refreshed.
+% This refresh is only needed to purge deleted tags eventually from
+% the tag cache. It doesn't harm to have a long interval.
+-define(TAG_CACHE_INTERVAL, (10 * ?MINUTE)).
+
+% How soon a tag object initialized in memory expires if it's content
+% cannot be fetched from the cluster.
+-define(TAG_EXPIRES_ONERROR, (1 * ?SECOND)).
+
+% How often a DDFS node should refresh its tag cache from disk.
+-define(FIND_TAGS_INTERVAL, ?DAY).
+
+% How often buffered (delayed) updates to a tag need to be
+% flushed. Tradeoff: The longer the interval, the more updates are
+% bundled in a single commit. On the other hand, in the worst case
+% the requester has to wait for the full interval before getting a
+% reply. A long interval also increases the likelihood that the server
+% crashes before the commit has finished successfully, making requests
+% more unreliable.
+-define(DELAYED_FLUSH_INTERVAL, (1 * ?SECOND)).
+
+% How long to wait between garbage collection runs.
+-define(GC_INTERVAL, ?DAY).
+
+% Max duration for a GC run. This should be smaller than
+% min(ORPHANED_{BLOB,TAG}_EXPIRES).
+-define(GC_MAX_DURATION, (3 * ?DAY)).
+
+% How long to wait after startup for cluster to stabilize before
+% starting the first GC run.
+-define(GC_DEFAULT_INITIAL_WAIT, (5 * ?MINUTE)).
+
+% The longest potential interval between messages in the GC protocol;
+% used to ensure GC makes forward progress. This can be set to the
+% estimated time to traverse all the volumes on a DDFS node.
+-define(GC_PROGRESS_INTERVAL, (30 * ?MINUTE)).
+
+% Number of extra replicas (i.e. lost replicas recovered during GC) to
+% allow before deleting extra replicas.
+-define(NUM_EXTRA_REPLICAS, 1).
+
+% Permissions for files backing blobs and tags.
+-define(FILE_MODE, 8#00400).
+
+% How often to check available disk space in ddfs_node.
+-define(DISKSPACE_INTERVAL, (10 * ?SECOND)).
+
+% The maximum size of payloads of HTTP requests to the /ddfs/tag/
+% prefix.
+-define(MAX_TAG_BODY_SIZE, (512 * ?MB)).
+
+% Tag attribute names and values have a limited size, and there
+% can be only a limited number of them.
+-define(MAX_TAG_ATTRIB_NAME_SIZE, 1024).
+-define(MAX_TAG_ATTRIB_VALUE_SIZE, 1024).
+-define(MAX_NUM_TAG_ATTRIBS, 1000).
+
+% How long HTTP requests that perform tag updates should wait to
+% finish (a long time).
+-define(TAG_UPDATE_TIMEOUT, ?DAY).
+
+% Timeout for re-replicating a single blob over HTTP PUT. This
+% depends on the largest blobs hosted by DDFS, and the speed of the
+% cluster network.
+-define(GC_PUT_TIMEOUT, (180 * ?MINUTE)).
+
+% Delete !partial files after this many milliseconds.
+-define(PARTIAL_EXPIRES, ?DAY).
+
+% When orphaned blob can be deleted. This should be large enough that
+% you can upload all the new blobs of a tag and perform the tag update
+% within this time.
+-define(ORPHANED_BLOB_EXPIRES, (5 * ?DAY)).
+
+% When orphaned tag can be deleted.
+-define(ORPHANED_TAG_EXPIRES, (5 * ?DAY)).
+
+% How long a tag has to stay on the deleted list before
+% we can permanently forget it, after all known instances
+% of the tag object have been removed. This quarantine period
+% ensures that a node that was temporarily unavailable
+% and reactivates can't resurrect deleted tags. You
+% must ensure that all temporarily inactive nodes
+% are reactivated (or cleaned) within the ?DELETED_TAG_EXPIRES
+% time frame.
+%
+% This value _must_ be larger than the other time-related DDFS
+% parameters listed in this file. In particular, it must be larger
+% than ORPHANED_TAG_EXPIRES.
+-define(DELETED_TAG_EXPIRES, (30 * ?DAY)).
+
+% How many times a tag operation should be retried before aborting.
+-define(MAX_TAG_OP_RETRIES, 3).
+
+% How long to wait before timing out a tag retrieval. This should be
+% large enough to read a large tag object off the disk and send it
+% over the network.
+-define(GET_TAG_TIMEOUT, (5 * ?MINUTE)).
diff --git a/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs.hrl b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs.hrl
new file mode 100644
index 0000000000..e43ec23fe1
--- /dev/null
+++ b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs.hrl
@@ -0,0 +1,9 @@
+-type volume_name() :: nonempty_string().
+
+% Diskinfo is {FreeSpace, UsedSpace}.
+-type diskinfo() :: {non_neg_integer(), non_neg_integer()}.
+-type volume() :: {diskinfo(), volume_name()}.
+
+-type object_type() :: 'blob' | 'tag'.
+-type object_name() :: binary().
+-type taginfo() :: {erlang:timestamp(), volume_name()}.
diff --git a/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_gc.hrl b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_gc.hrl
new file mode 100644
index 0000000000..dc43f7586b
--- /dev/null
+++ b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_gc.hrl
@@ -0,0 +1,17 @@
+-type local_object() :: {object_name(), node()}.
+-type phase() :: 'start' | 'build_map' | 'map_wait' | 'gc'
+ | 'rr_blobs' | 'rr_blobs_wait' | 'rr_tags'.
+-type protocol_msg() :: {'check_blob', object_name()} | 'start_gc' | 'end_rr'.
+
+-type blob_update() :: {object_name(), 'filter' | [url()]}.
+
+-type check_blob_result() :: 'false' | {'true', volume_name()}.
+
+% GC statistics
+
+% {Files, Bytes}
+-type gc_stat() :: {non_neg_integer(), non_neg_integer()}.
+% {Kept, Deleted}
+-type obj_stats() :: {gc_stat(), gc_stat()}.
+% {Tags, Blobs}.
+-type gc_run_stats() :: {obj_stats(), obj_stats()}.
diff --git a/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_master.erl b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_master.erl
new file mode 100644
index 0000000000..2be2773dc5
--- /dev/null
+++ b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_master.erl
@@ -0,0 +1,531 @@
+-module(ddfs_master).
+-behaviour(gen_server).
+
+-export([start_link/0]).
+-export([get_tags/1, get_tags/3,
+ get_nodeinfo/1,
+ get_read_nodes/0,
+ get_hosted_tags/1,
+ gc_blacklist/0, gc_blacklist/1,
+ gc_stats/0,
+ choose_write_nodes/3,
+ new_blob/4, new_blob/5,
+ safe_gc_blacklist/0, safe_gc_blacklist/1,
+ refresh_tag_cache/0,
+ tag_notify/2,
+ tag_operation/2, tag_operation/3,
+ update_gc_stats/1,
+ update_nodes/1
+ ]).
+-export([init/1,
+ handle_call/3,
+ handle_cast/2,
+ handle_info/2,
+ terminate/2,
+ code_change/3]).
+
+-define(WEB_PORT, 8011).
+
+-compile(nowarn_deprecated_type).
+
+-include("common_types.hrl").
+-include("gs_util.hrl").
+-include("config.hrl").
+-include("ddfs.hrl").
+-include("ddfs_tag.hrl").
+-include("ddfs_gc.hrl").
+
+-type node_info() :: {node(), {non_neg_integer(), non_neg_integer()}}.
+-type gc_stats() :: none | gc_run_stats().
+
+-record(state, {tags = gb_trees:empty() :: gb_trees:tree(),
+ tag_cache = false :: false | gb_sets:set(),
+ cache_refresher :: pid(),
+
+ nodes = [] :: [node_info()],
+ write_blacklist = [] :: [node()],
+ read_blacklist = [] :: [node()],
+ gc_blacklist = [] :: [node()],
+ safe_gc_blacklist = gb_sets:empty() :: gb_sets:set(),
+ gc_stats = none :: none | {gc_stats(), erlang:timestamp()}}).
+-type state() :: #state{}.
+-type replyto() :: {pid(), reference()}.
+
+-export_type([gc_stats/0, node_info/0]).
+
+%% ===================================================================
+%% API functions
+
+-spec start_link() -> {ok, pid()}.
+start_link() ->
+ lager:info("DDFS master starts"),
+ case gen_server:start_link({local, ?MODULE}, ?MODULE, [], []) of
+ {ok, Server} -> {ok, Server};
+ {error, {already_started, Server}} -> {ok, Server}
+ end.
+
+-spec tag_operation(term(), tagname()) -> term().
+tag_operation(Op, Tag) ->
+ gen_server:call(?MODULE, {tag, Op, Tag}).
+-spec tag_operation(term(), tagname(), non_neg_integer() | infinity) ->
+ term().
+tag_operation(Op, Tag, Timeout) ->
+ gen_server:call(?MODULE, {tag, Op, Tag}, Timeout).
+
+-spec tag_notify(term(), tagname()) -> ok.
+tag_notify(Op, Tag) ->
+ gen_server:cast(?MODULE, {tag_notify, Op, Tag}).
+
+-spec get_nodeinfo(all) -> {ok, [node_info()]}.
+get_nodeinfo(all) ->
+ gen_server:call(?MODULE, {get_nodeinfo, all}).
+
+-spec get_read_nodes() -> {ok, [node()], non_neg_integer()} | {error, term()}.
+get_read_nodes() ->
+ gen_server:call(?MODULE, get_read_nodes, infinity).
+
+-spec gc_blacklist() -> {ok, [node()]}.
+gc_blacklist() ->
+ gen_server:call(?MODULE, gc_blacklist).
+
+-spec gc_blacklist([node()]) -> ok.
+gc_blacklist(Nodes) ->
+ gen_server:cast(?MODULE, {gc_blacklist, Nodes}).
+
+-spec gc_stats() -> {ok, none | {gc_stats(), erlang:timestamp()}} | {error, term()}.
+gc_stats() ->
+ gen_server:call(?MODULE, gc_stats).
+
+-spec get_hosted_tags(host()) -> {ok, [tagname()]} | {error, term()}.
+get_hosted_tags(Host) ->
+ gen_server:call(?MODULE, {get_hosted_tags, Host}).
+
+-spec choose_write_nodes(non_neg_integer(), [node()], [node()]) -> {ok, [node()]}.
+choose_write_nodes(K, Include, Exclude) ->
+ gen_server:call(?MODULE, {choose_write_nodes, K, Include, Exclude}).
+
+-spec get_tags(gc) -> {ok, [tagname()], [node()]} | too_many_failed_nodes;
+ (safe) -> {ok, [binary()]} | too_many_failed_nodes.
+get_tags(Mode) ->
+ get_tags(?MODULE, Mode, ?GET_TAG_TIMEOUT).
+
+-spec get_tags(server(), gc, non_neg_integer()) ->
+ {ok, [tagname()], [node()]} | too_many_failed_nodes;
+ (server(), safe, non_neg_integer()) ->
+ {ok, [binary()]} | too_many_failed_nodes.
+get_tags(Server, Mode, Timeout) ->
+ disco_profile:timed_run(
+ fun() -> gen_server:call(Server, {get_tags, Mode}, Timeout) end,
+ get_tags).
+
+-spec new_blob(string()|object_name(), non_neg_integer(), [node()], [node()]) ->
+ too_many_replicas | {ok, [nonempty_string()]}.
+new_blob(Obj, K, Include, Exclude) ->
+ gen_server:call(?MODULE, {new_blob, Obj, K, Include, Exclude}, infinity).
+
+-spec new_blob(server(), string()|object_name(), non_neg_integer(), [node()], [node()]) ->
+ too_many_replicas | {ok, [nonempty_string()]}.
+new_blob(Master, Obj, K, Include, Exclude) ->
+ gen_server:call(Master, {new_blob, Obj, K, Include, Exclude}, infinity).
+
+-spec safe_gc_blacklist() -> {ok, [node()]} | {error, term()}.
+safe_gc_blacklist() ->
+ gen_server:call(?MODULE, safe_gc_blacklist).
+
+-spec safe_gc_blacklist(gb_sets:set()) -> ok.
+safe_gc_blacklist(SafeGCBlacklist) ->
+ gen_server:cast(?MODULE, {safe_gc_blacklist, SafeGCBlacklist}).
+
+-spec update_gc_stats(gc_run_stats()) -> ok.
+update_gc_stats(Stats) ->
+ gen_server:cast(?MODULE, {update_gc_stats, Stats}).
+
+-type nodes_update() :: [{node(), boolean(), boolean()}].
+-spec update_nodes(nodes_update()) -> ok.
+update_nodes(DDFSNodes) ->
+ gen_server:cast(?MODULE, {update_nodes, DDFSNodes}).
+
+-spec update_nodestats(gb_trees:tree()) -> ok.
+update_nodestats(NewNodes) ->
+ gen_server:cast(?MODULE, {update_nodestats, NewNodes}).
+
+-spec update_tag_cache(gb_sets:set()) -> ok.
+update_tag_cache(TagCache) ->
+ gen_server:cast(?MODULE, {update_tag_cache, TagCache}).
+
+-spec refresh_tag_cache() -> ok.
+refresh_tag_cache() ->
+ gen_server:cast(?MODULE, refresh_tag_cache).
+
+%% ===================================================================
+%% gen_server callbacks
+
+-spec init(_) -> gs_init().
+init(_Args) ->
+ _ = [disco_profile:new_histogram(Name)
+ || Name <- [get_tags, do_get_tags_all, do_get_tags_filter,
+ do_get_tags_safe, do_get_tags_gc]],
+ spawn_link(fun() -> monitor_diskspace() end),
+ spawn_link(fun() -> ddfs_gc:start_gc(disco:get_setting("DDFS_DATA")) end),
+ Refresher = spawn_link(fun() -> refresh_tag_cache_proc() end),
+ put(put_port, disco:get_setting("DDFS_PUT_PORT")),
+ {ok, #state{cache_refresher = Refresher}}.
+
+-type choose_write_nodes_msg() :: {choose_write_nodes, non_neg_integer(), [node()], [node()]}.
+-type new_blob_msg() :: {new_blob, string() | object_name(), non_neg_integer(), [node()]}.
+-type tag_msg() :: {tag, ddfs_tag:call_msg(), tagname()}.
+-spec handle_call(dbg_state_msg(), from(), state()) ->
+ gs_reply(state());
+ ({get_nodeinfo, all}, from(), state()) ->
+ gs_reply({ok, [node_info()]});
+ (get_read_nodes, from(), state()) ->
+ gs_reply({ok, [node()], non_neg_integer});
+ (gc_blacklist, from(), state()) ->
+ gs_reply({ok, [node()]});
+ (gc_stats, from(), state()) ->
+ gs_reply({ok, gc_stats(), erlang:timestamp()});
+ (choose_write_nodes_msg(), from(), state()) ->
+ gs_reply({ok, [node()]});
+ (new_blob_msg(), from(), state()) ->
+ gs_reply(new_blob_result());
+ (tag_msg(), from(), state()) ->
+ gs_reply({error, nonodes}) | gs_noreply();
+ ({get_tags, gc | safe}, from(), state()) ->
+ gs_noreply();
+ ({get_hosted_tags, host()}, from(), state()) ->
+ gs_noreply();
+ (safe_gc_blacklist, from(), state()) ->
+ gs_reply({ok, [node()]}).
+handle_call(dbg_get_state, _, S) ->
+ {reply, S, S};
+
+handle_call({get_nodeinfo, all}, _From, #state{nodes = Nodes} = S) ->
+ {reply, {ok, Nodes}, S};
+
+handle_call(get_read_nodes, _F, #state{nodes = Nodes, read_blacklist = RB} = S) ->
+ {reply, do_get_readable_nodes(Nodes, RB), S};
+
+handle_call(gc_blacklist, _F, #state{gc_blacklist = Nodes} = S) ->
+ {reply, {ok, Nodes}, S};
+
+handle_call(gc_stats, _F, #state{gc_stats = Stats} = S) ->
+ {reply, {ok, Stats}, S};
+
+handle_call({choose_write_nodes, K, Include, Exclude}, _,
+ #state{nodes = N, write_blacklist = WBL, gc_blacklist = GBL} = S) ->
+ BL = lists:umerge(WBL, GBL),
+ {reply, do_choose_write_nodes(N, K, Include, Exclude, BL), S};
+
+handle_call({new_blob, Obj, K, Include, Exclude}, _,
+ #state{nodes = N, gc_blacklist = GBL, write_blacklist = WBL} = S) ->
+ BL = lists:umerge(WBL, GBL),
+ {reply, do_new_blob(Obj, K, Include, Exclude, BL, N), S};
+
+handle_call({tag, _M, _Tag}, _From, #state{nodes = []} = S) ->
+ {reply, {error, no_nodes}, S};
+
+handle_call({tag, M, Tag}, From, S) ->
+ {noreply, do_tag_request(M, Tag, From, S)};
+
+handle_call({get_tags, Mode}, From, #state{nodes = Nodes} = S) ->
+ spawn(fun() ->
+ gen_server:reply(From, do_get_tags(Mode, [N || {N, _} <- Nodes]))
+ end),
+ {noreply, S};
+
+handle_call({get_hosted_tags, Host}, From, S) ->
+ spawn(fun() -> gen_server:reply(From, ddfs_gc:hosted_tags(Host)) end),
+ {noreply, S};
+
+handle_call(safe_gc_blacklist, _From, #state{safe_gc_blacklist = SBL} = S) ->
+ {reply, {ok, gb_sets:to_list(SBL)}, S}.
+
+-spec handle_cast({tag_notify, ddfs_tag:cast_msg(), tagname()}
+ | {gc_blacklist, [node()]}
+ | {safe_gc_blacklist, gb_sets:set()}
+ | {update_gc_stats, gc_stats()}
+ | {update_tag_cache, gb_sets:set()}
+ | refresh_tag_cache
+ | {update_nodes, nodes_update()}
+ | {update_nodestats, gb_trees:tree()},
+ state()) -> gs_noreply().
+handle_cast({tag_notify, M, Tag}, S) ->
+ {noreply, do_tag_notify(M, Tag, S)};
+
+handle_cast({gc_blacklist, Nodes}, #state{safe_gc_blacklist = SBL} = S) ->
+ BLSet = gb_sets:from_list(Nodes),
+ NewSBL = gb_sets:intersection(BLSet, SBL),
+ {noreply, S#state{gc_blacklist = gb_sets:to_list(BLSet),
+ safe_gc_blacklist = NewSBL}};
+
+handle_cast({safe_gc_blacklist, SafeBlacklist}, #state{gc_blacklist = BL} = S) ->
+ SBL = gb_sets:intersection(SafeBlacklist, gb_sets:from_list(BL)),
+ {noreply, S#state{safe_gc_blacklist = SBL}};
+
+handle_cast({update_gc_stats, Stats}, S) ->
+ {noreply, S#state{gc_stats = {Stats, now()}}};
+
+handle_cast({update_tag_cache, TagCache}, S) ->
+ {noreply, S#state{tag_cache = TagCache}};
+
+handle_cast(refresh_tag_cache, #state{cache_refresher = Refresher} = S) ->
+ Refresher ! refresh,
+ {noreply, S};
+
+handle_cast({update_nodes, NewNodes}, S) ->
+ {noreply, do_update_nodes(NewNodes, S)};
+
+handle_cast({update_nodestats, NewNodes}, S) ->
+ {noreply, do_update_nodestats(NewNodes, S)}.
+
+-spec handle_info({'DOWN', _, _, pid(), _}, state()) -> gs_noreply().
+handle_info({'DOWN', _, _, Pid, _}, S) ->
+ {noreply, do_tag_exit(Pid, S)}.
+
+%% ===================================================================
+%% gen_server callback stubs
+
+-spec terminate(term(), state()) -> ok.
+terminate(Reason, _State) ->
+ lager:warning("DDFS master died: ~p", [Reason]).
+
+-spec code_change(term(), state(), term()) -> {ok, state()}.
+code_change(_OldVsn, State, _Extra) -> {ok, State}.
+
+%% ===================================================================
+%% internal functions
+
+-spec do_get_readable_nodes([node_info()], [node()]) ->
+ {ok, [node()], non_neg_integer()}.
+do_get_readable_nodes(Nodes, ReadBlacklist) ->
+ NodeSet = gb_sets:from_ordset(lists:sort([Node || {Node, _} <- Nodes])),
+ BlackSet = gb_sets:from_ordset(ReadBlacklist),
+ ReadableNodeSet = gb_sets:subtract(NodeSet, BlackSet),
+ {ok, gb_sets:to_list(ReadableNodeSet), gb_sets:size(BlackSet)}.
+
+-spec do_choose_write_nodes([node_info()], non_neg_integer(), [node()], [node()], [node()]) ->
+ {ok, [node()]}.
+do_choose_write_nodes(Nodes, K, Include, Exclude, BlackList) ->
+ % Include is the list of nodes that must be included
+ %
+ % Node selection algorithm:
+ % 1. try to choose K nodes randomly from all the nodes which have
+ % more than ?MIN_FREE_SPACE bytes free space available and which
+ % are not excluded or blacklisted.
+ % 2. if K nodes cannot be found this way, choose the K emptiest
+ % nodes which are not excluded or blacklisted.
+ Primary = ([N || {N, {Free, _Total}} <- Nodes, Free > ?MIN_FREE_SPACE / 1024]
+ -- (Exclude ++ BlackList)),
+ if length(Primary) >= K ->
+ {ok, Include ++ disco_util:choose_random(Primary -- Include , K - length(Include))};
+ true ->
+ Preferred = [N || {N, _} <- lists:reverse(lists:keysort(2, Nodes))],
+ Secondary = Include ++ lists:sublist(Preferred -- (Include ++ Exclude ++ BlackList),
+ K - length(Include)),
+ {ok, Secondary}
+ end.
+
+-type new_blob_result() :: too_many_replicas | {ok, [nonempty_string()]}.
+-spec do_new_blob(string()|object_name(), non_neg_integer(), [node()], [node()], [node()], [node_info()]) ->
+ new_blob_result().
+do_new_blob(_Obj, K, _Include, _Exclude, _BlackList, Nodes) when K > length(Nodes) ->
+ too_many_replicas;
+do_new_blob(Obj, K, Include, Exclude, BlackList, Nodes) ->
+ {ok, WriteNodes} = do_choose_write_nodes(Nodes, K, Include, Exclude, BlackList),
+ Urls = [["http://", disco:host(N), ":", get(put_port), "/ddfs/", Obj]
+ || N <- WriteNodes],
+ {ok, Urls}.
+
+% Tag request: Start a new tag server if one doesn't exist already. Forward
+% the request to the tag server.
+
+-spec get_tag_pid(tagname(), gb_trees:tree(), false | gb_sets:set()) ->
+ {pid(), gb_trees:tree()}.
+get_tag_pid(Tag, Tags, Cache) ->
+ case gb_trees:lookup(Tag, Tags) of
+ none ->
+ NotFound = (Cache =/= false
+ andalso not gb_sets:is_element(Tag, Cache)),
+ {ok, Server} = ddfs_tag:start(Tag, NotFound),
+ erlang:monitor(process, Server),
+ {Server, gb_trees:insert(Tag, Server, Tags)};
+ {value, P} ->
+ {P, Tags}
+ end.
+
+-spec do_tag_request(term(), tagname(), replyto(), state()) ->
+ state().
+do_tag_request(M, Tag, From, #state{tags = Tags, tag_cache = Cache} = S) ->
+ {Pid, TagsN} = get_tag_pid(Tag, Tags, Cache),
+ gen_server:cast(Pid, {M, From}),
+ S#state{tags = TagsN,
+ tag_cache = Cache =/= false andalso gb_sets:add(Tag, Cache)}.
+
+-spec do_tag_notify(term(), tagname(), state()) -> state().
+do_tag_notify(M, Tag, #state{tags = Tags, tag_cache = Cache} = S) ->
+ {Pid, TagsN} = get_tag_pid(Tag, Tags, Cache),
+ gen_server:cast(Pid, {notify, M}),
+ S#state{tags = TagsN,
+ tag_cache = Cache =/= false andalso gb_sets:add(Tag, Cache)}.
+
+-spec do_update_nodes(nodes_update(), state()) -> state().
+do_update_nodes(NewNodes, #state{nodes = Nodes, tags = Tags} = S) ->
+ WriteBlacklist = lists:sort([Node || {Node, false, _} <- NewNodes]),
+ ReadBlacklist = lists:sort([Node || {Node, _, false} <- NewNodes]),
+ OldNodes = gb_trees:from_orddict(Nodes),
+ UpdatedNodes = lists:keysort(1, [case gb_trees:lookup(Node, OldNodes) of
+ none ->
+ {Node, {0, 0}};
+ {value, OldStats} ->
+ {Node, OldStats}
+ end || {Node, _WB, _RB} <- NewNodes]),
+ if
+ UpdatedNodes =/= Nodes ->
+ _ = [gen_server:cast(Pid, {die, none}) || Pid <- gb_trees:values(Tags)],
+ spawn(fun() ->
+ {ok, ReadableNodes, RBSize} =
+ do_get_readable_nodes(UpdatedNodes, ReadBlacklist),
+ refresh_tag_cache(ReadableNodes, RBSize)
+ end),
+ S#state{nodes = UpdatedNodes,
+ write_blacklist = WriteBlacklist,
+ read_blacklist = ReadBlacklist,
+ tag_cache = false,
+ tags = gb_trees:empty()};
+ true ->
+ S#state{write_blacklist = WriteBlacklist,
+ read_blacklist = ReadBlacklist}
+ end.
+
+-spec do_update_nodestats(gb_trees:tree(), state()) -> state().
+do_update_nodestats(NewNodes, #state{nodes = Nodes} = S) ->
+ UpdatedNodes = [case gb_trees:lookup(Node, NewNodes) of
+ none ->
+ {Node, Stats};
+ {value, NewStats} ->
+ {Node, NewStats}
+ end || {Node, Stats} <- Nodes],
+ S#state{nodes = UpdatedNodes}.
+
+-spec do_tag_exit(pid(), state()) -> state().
+do_tag_exit(Pid, S) ->
+ NewTags = [X || {_, V} = X <- gb_trees:to_list(S#state.tags), V =/= Pid],
+ S#state{tags = gb_trees:from_orddict(NewTags)}.
+
+-spec do_get_tags(all | filter, [node()]) -> {[node()], [node()], [binary()]};
+ (safe, [node()]) -> {ok, [binary()]} | too_many_failed_nodes;
+ (gc, [node()]) -> {ok, [binary()], [node()]} | too_many_failed_nodes.
+do_get_tags(all, Nodes) ->
+ disco_profile:timed_run(
+ fun() ->
+ {Replies, Failed} =
+ gen_server:multi_call(Nodes, ddfs_node, get_tags, ?NODE_TIMEOUT),
+ {OkNodes, Tags} = lists:unzip(Replies),
+ {OkNodes, Failed, lists:usort(lists:flatten(Tags))}
+ end, do_get_tags_all);
+
+do_get_tags(filter, Nodes) ->
+ disco_profile:timed_run(
+ fun() ->
+ {OkNodes, Failed, Tags} = do_get_tags(all, Nodes),
+ case tag_operation(get_tagnames, <<"+deleted">>, ?NODEOP_TIMEOUT) of
+ {ok, Deleted} ->
+ TagSet = gb_sets:from_ordset(Tags),
+ DelSet = gb_sets:insert(<<"+deleted">>, Deleted),
+ NotDeleted = gb_sets:to_list(gb_sets:subtract(TagSet, DelSet)),
+ {OkNodes, Failed, NotDeleted};
+ E ->
+ E
+ end
+ end, do_get_tags_filter);
+
+do_get_tags(safe, Nodes) ->
+ disco_profile:timed_run(
+ fun() ->
+ TagMinK = list_to_integer(disco:get_setting("DDFS_TAG_MIN_REPLICAS")),
+ case do_get_tags(filter, Nodes) of
+ {_OkNodes, Failed, Tags} when length(Failed) < TagMinK ->
+ {ok, Tags};
+ _ ->
+ too_many_failed_nodes
+ end
+ end, do_get_tags_safe);
+
+% The returned tag list may include +deleted.
+do_get_tags(gc, Nodes) ->
+ disco_profile:timed_run(
+ fun() ->
+ {OkNodes, Failed, Tags} = do_get_tags(all, Nodes),
+ TagMinK = list_to_integer(disco:get_setting("DDFS_TAG_MIN_REPLICAS")),
+ case length(Failed) < TagMinK of
+ false ->
+ too_many_failed_nodes;
+ true ->
+ case tag_operation(get_tagnames, <<"+deleted">>, ?NODEOP_TIMEOUT) of
+ {ok, Deleted} ->
+ TagSet = gb_sets:from_ordset(Tags),
+ NotDeleted = gb_sets:subtract(TagSet, Deleted),
+ {ok, gb_sets:to_list(NotDeleted), OkNodes};
+ E ->
+ E
+ end
+ end
+ end, do_get_tags_gc).
+
+% Timeouts in this call by the below processes can cause ddfs_master
+% itself to crash, since the processes are linked to it.
+-spec safe_get_read_nodes() -> {ok, [node()], non_neg_integer()} | error.
+safe_get_read_nodes() ->
+ try get_read_nodes() of
+ {ok, _ReadableNodes, _RBSize} = RN ->
+ RN;
+ E ->
+ lager:error("unexpected response retrieving readable nodes: ~p", [E]),
+ error
+ catch
+ K:E ->
+ lager:error("error retrieving readable nodes: ~p:~p", [K, E]),
+ error
+ end.
+
+-spec monitor_diskspace() -> no_return().
+monitor_diskspace() ->
+ case safe_get_read_nodes() of
+ {ok, ReadableNodes, _RBSize} ->
+ {Space, _F} = gen_server:multi_call(ReadableNodes,
+ ddfs_node,
+ get_diskspace,
+ ?NODE_TIMEOUT),
+ update_nodestats(gb_trees:from_orddict(lists:keysort(1, Space)));
+ error ->
+ ok
+ end,
+ timer:sleep(?DISKSPACE_INTERVAL),
+ monitor_diskspace().
+
+-spec refresh_tag_cache_proc() -> no_return().
+refresh_tag_cache_proc() ->
+ case safe_get_read_nodes() of
+ {ok, ReadableNodes, RBSize} ->
+ refresh_tag_cache(ReadableNodes, RBSize);
+ error ->
+ ok
+ end,
+ receive
+ refresh ->
+ ok
+ after ?TAG_CACHE_INTERVAL ->
+ ok
+ end,
+ refresh_tag_cache_proc().
+
+-spec refresh_tag_cache([node()], non_neg_integer()) -> ok.
+refresh_tag_cache(Nodes, BLSize) ->
+ TagMinK = list_to_integer(disco:get_setting("DDFS_TAG_MIN_REPLICAS")),
+ {Replies, Failed} =
+ gen_server:multi_call(Nodes, ddfs_node, get_tags, ?NODE_TIMEOUT),
+ if Nodes =/= [], length(Failed) + BLSize < TagMinK ->
+ {_OkNodes, Tags} = lists:unzip(Replies),
+ update_tag_cache(gb_sets:from_list(lists:flatten(Tags)));
+ true -> ok
+ end.
diff --git a/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_tag.hrl b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_tag.hrl
new file mode 100644
index 0000000000..2920b67fc5
--- /dev/null
+++ b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_tag.hrl
@@ -0,0 +1,19 @@
+
+-type tokentype() :: 'read' | 'write'.
+-type user_attr() :: [{binary(), binary()}].
+% An 'internal' token is also used by internal consumers, but never stored.
+-type token() :: 'null' | binary().
+
+-type tagname() :: binary().
+-type tagid() :: binary().
+
+-type attrib() :: 'urls' | 'read_token' | 'write_token' | {'user', binary()}.
+
+-record(tagcontent, {id :: tagid(),
+ last_modified :: binary(),
+ read_token = null :: token(),
+ write_token = null :: token(),
+ urls = [] :: [[binary()]],
+ user = [] :: user_attr()}).
+
+-type tagcontent() :: #tagcontent{}.
diff --git a/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/gs_util.hrl b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/gs_util.hrl
new file mode 100644
index 0000000000..d579e9a7d7
--- /dev/null
+++ b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/gs_util.hrl
@@ -0,0 +1,16 @@
+% This is a set of type utilities to be used when spec-cing the
+% callbacks of a gen_server implementation. It should be included in
+% the impl module, which needs to define the state() type.
+
+-type gs_init() :: {ok, state()}.
+-type gs_reply(T) :: {reply, (T), state()}.
+-type gs_noreply() :: {noreply, state()}.
+-type gs_noreply_t() :: {noreply, state(), non_neg_integer()}.
+-type gs_stop(T) :: {stop, (T), state()}.
+
+% Generic utilities.
+
+-type server() :: pid() | atom() | {atom(), node()}.
+-type from() :: {pid(), term()}.
+
+-type dbg_state_msg() :: dbg_get_state.
diff --git a/lib/hipe/cerl/erl_types.erl b/lib/hipe/cerl/erl_types.erl
index 47b8dc766a..6065b79664 100644
--- a/lib/hipe/cerl/erl_types.erl
+++ b/lib/hipe/cerl/erl_types.erl
@@ -2985,16 +2985,19 @@ inf_union(U1, U2, Opaques) ->
List = [A,B,F,I,L,N,T,M,Map],
inf_union_collect(List, Opaque, InfFun, [], [])
end,
- O1 = OpaqueFun(U1, U2, fun(E, Opaque) -> t_inf(Opaque, E, Opaques) end),
- O2 = OpaqueFun(U2, U1, fun(E, Opaque) -> t_inf(E, Opaque, Opaques) end),
- Union = inf_union(U1, U2, 0, [], Opaques),
- t_sup([O1, O2, Union]).
+ {O1, ThrowList1} =
+ OpaqueFun(U1, U2, fun(E, Opaque) -> t_inf(Opaque, E, Opaques) end),
+ {O2, ThrowList2}
+ = OpaqueFun(U2, U1, fun(E, Opaque) -> t_inf(E, Opaque, Opaques) end),
+ {Union, ThrowList3} = inf_union(U1, U2, 0, [], [], Opaques),
+ ThrowList = lists:merge3(ThrowList1, ThrowList2, ThrowList3),
+ case t_sup([O1, O2, Union]) of
+ ?none when ThrowList =/= [] -> throw(hd(ThrowList));
+ Sup -> Sup
+ end.
inf_union_collect([], _Opaque, _InfFun, InfList, ThrowList) ->
- case t_sup(InfList) of
- ?none when ThrowList =/= [] -> throw(hd(lists:flatten(ThrowList)));
- Sup -> Sup
- end;
+ {t_sup(InfList), lists:usort(ThrowList)};
inf_union_collect([?none|L], Opaque, InfFun, InfList, ThrowList) ->
inf_union_collect(L, Opaque, InfFun, [?none|InfList], ThrowList);
inf_union_collect([E|L], Opaque, InfFun, InfList, ThrowList) ->
@@ -3005,19 +3008,21 @@ inf_union_collect([E|L], Opaque, InfFun, InfList, ThrowList) ->
inf_union_collect(L, Opaque, InfFun, InfList, [N|ThrowList])
end.
-inf_union([?none|Left1], [?none|Left2], N, Acc, Opaques) ->
- inf_union(Left1, Left2, N, [?none|Acc], Opaques);
-inf_union([T1|Left1], [T2|Left2], N, Acc, Opaques) ->
- case t_inf(T1, T2, Opaques) of
- ?none -> inf_union(Left1, Left2, N, [?none|Acc], Opaques);
- T -> inf_union(Left1, Left2, N+1, [T|Acc], Opaques)
+inf_union([?none|Left1], [?none|Left2], N, Acc, ThrowList, Opaques) ->
+ inf_union(Left1, Left2, N, [?none|Acc], ThrowList, Opaques);
+inf_union([T1|Left1], [T2|Left2], N, Acc, ThrowList, Opaques) ->
+ try t_inf(T1, T2, Opaques) of
+ ?none -> inf_union(Left1, Left2, N, [?none|Acc], ThrowList, Opaques);
+ T -> inf_union(Left1, Left2, N+1, [T|Acc], ThrowList, Opaques)
+ catch throw:N when is_integer(N) ->
+ inf_union(Left1, Left2, N, [?none|Acc], [N|ThrowList], Opaques)
end;
-inf_union([], [], N, Acc, _Opaques) ->
- if N =:= 0 -> ?none;
+inf_union([], [], N, Acc, ThrowList, _Opaques) ->
+ if N =:= 0 -> {?none, ThrowList};
N =:= 1 ->
[Type] = [T || T <- Acc, T =/= ?none],
- Type;
- N >= 2 -> ?union(lists:reverse(Acc))
+ {Type, ThrowList};
+ N >= 2 -> {?union(lists:reverse(Acc)), ThrowList}
end.
inf_bitstr(U1, B1, U2, B2) ->