diff options
Diffstat (limited to 'lib')
8 files changed, 769 insertions, 18 deletions
diff --git a/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/common_types.hrl b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/common_types.hrl new file mode 100644 index 0000000000..f362a06bca --- /dev/null +++ b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/common_types.hrl @@ -0,0 +1,6 @@ +-type host() :: nonempty_string(). +-type path() :: nonempty_string(). +-type url() :: binary(). + +% The host portion of a url, if available. +-type url_host() :: host() | none. diff --git a/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/config.hrl b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/config.hrl new file mode 100644 index 0000000000..8cab65fc9c --- /dev/null +++ b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/config.hrl @@ -0,0 +1,148 @@ + +-define(SECOND, 1000). +-define(MINUTE, (60 * ?SECOND)). +-define(HOUR, (60 * ?MINUTE)). +-define(DAY, (24 * ?HOUR)). +-define(MB, (1024 * 1024)). + +% Maximum length of tag/blob prefix +-define(NAME_MAX, 511). + +% How long ddfs node startup can take. The most time-consuming part +% is the scanning of the tag objects in the node's DDFS volumes. +-define(NODE_STARTUP, (1 * ?MINUTE)). + +% How long to wait on the master for replies from nodes. +-define(NODE_TIMEOUT, (10 * ?SECOND)). + +% How long to wait for a reply from an operation coordinated by the +% master that accesses nodes. This value should be larger than +% NODE_TIMEOUT. +-define(NODEOP_TIMEOUT, (1 * ?MINUTE)). + +% The minimum amount of free space a node must have, to be considered +% a primary candidate host for a new blob. +-define(MIN_FREE_SPACE, (1024 * ?MB)). + +% The maximum number of active HTTP connections on a system (this +% applies separately for GET and PUT operations). +-define(HTTP_MAX_ACTIVE, 3). + +% The maximum number of waiting HTTP connections to queue up on a busy system. +-define(HTTP_QUEUE_LENGTH, 100). + +% The maximum number of simultaneous HTTP connections. Note that +% HTTP_MAX_CONNS * 2 * 2 + 32 < Maximum number of file descriptors, where +% 2 = Get and put, 2 = two FDs required for each connection (connection +% itself + a file it accesses), 32 = a guess how many extra fds is needed. +-define(HTTP_MAX_CONNS, 128). + +% How long to keep a PUT request in queue if the system is busy. +-define(PUT_WAIT_TIMEOUT, (1 * ?MINUTE)). + +% How long to keep a GET request in queue if the system is busy. +-define(GET_WAIT_TIMEOUT, (1 * ?MINUTE)). + +% An unused loaded tag expires in TAG_EXPIRES milliseconds. Note that +% if TAG_EXPIRES is not smaller than GC_INTERVAL, tags will never +% expire from the memory cache and will always take up memory. +-define(TAG_EXPIRES, (10 * ?HOUR)). + +% How often the master's cache of all known tag names is refreshed. +% This refresh is only needed to purge deleted tags eventually from +% the tag cache. It doesn't harm to have a long interval. +-define(TAG_CACHE_INTERVAL, (10 * ?MINUTE)). + +% How soon a tag object initialized in memory expires if it's content +% cannot be fetched from the cluster. +-define(TAG_EXPIRES_ONERROR, (1 * ?SECOND)). + +% How often a DDFS node should refresh its tag cache from disk. +-define(FIND_TAGS_INTERVAL, ?DAY). + +% How often buffered (delayed) updates to a tag need to be +% flushed. Tradeoff: The longer the interval, the more updates are +% bundled in a single commit. On the other hand, in the worst case +% the requester has to wait for the full interval before getting a +% reply. A long interval also increases the likelihood that the server +% crashes before the commit has finished successfully, making requests +% more unreliable. +-define(DELAYED_FLUSH_INTERVAL, (1 * ?SECOND)). + +% How long to wait between garbage collection runs. +-define(GC_INTERVAL, ?DAY). + +% Max duration for a GC run. This should be smaller than +% min(ORPHANED_{BLOB,TAG}_EXPIRES). +-define(GC_MAX_DURATION, (3 * ?DAY)). + +% How long to wait after startup for cluster to stabilize before +% starting the first GC run. +-define(GC_DEFAULT_INITIAL_WAIT, (5 * ?MINUTE)). + +% The longest potential interval between messages in the GC protocol; +% used to ensure GC makes forward progress. This can be set to the +% estimated time to traverse all the volumes on a DDFS node. +-define(GC_PROGRESS_INTERVAL, (30 * ?MINUTE)). + +% Number of extra replicas (i.e. lost replicas recovered during GC) to +% allow before deleting extra replicas. +-define(NUM_EXTRA_REPLICAS, 1). + +% Permissions for files backing blobs and tags. +-define(FILE_MODE, 8#00400). + +% How often to check available disk space in ddfs_node. +-define(DISKSPACE_INTERVAL, (10 * ?SECOND)). + +% The maximum size of payloads of HTTP requests to the /ddfs/tag/ +% prefix. +-define(MAX_TAG_BODY_SIZE, (512 * ?MB)). + +% Tag attribute names and values have a limited size, and there +% can be only a limited number of them. +-define(MAX_TAG_ATTRIB_NAME_SIZE, 1024). +-define(MAX_TAG_ATTRIB_VALUE_SIZE, 1024). +-define(MAX_NUM_TAG_ATTRIBS, 1000). + +% How long HTTP requests that perform tag updates should wait to +% finish (a long time). +-define(TAG_UPDATE_TIMEOUT, ?DAY). + +% Timeout for re-replicating a single blob over HTTP PUT. This +% depends on the largest blobs hosted by DDFS, and the speed of the +% cluster network. +-define(GC_PUT_TIMEOUT, (180 * ?MINUTE)). + +% Delete !partial files after this many milliseconds. +-define(PARTIAL_EXPIRES, ?DAY). + +% When orphaned blob can be deleted. This should be large enough that +% you can upload all the new blobs of a tag and perform the tag update +% within this time. +-define(ORPHANED_BLOB_EXPIRES, (5 * ?DAY)). + +% When orphaned tag can be deleted. +-define(ORPHANED_TAG_EXPIRES, (5 * ?DAY)). + +% How long a tag has to stay on the deleted list before +% we can permanently forget it, after all known instances +% of the tag object have been removed. This quarantine period +% ensures that a node that was temporarily unavailable +% and reactivates can't resurrect deleted tags. You +% must ensure that all temporarily inactive nodes +% are reactivated (or cleaned) within the ?DELETED_TAG_EXPIRES +% time frame. +% +% This value _must_ be larger than the other time-related DDFS +% parameters listed in this file. In particular, it must be larger +% than ORPHANED_TAG_EXPIRES. +-define(DELETED_TAG_EXPIRES, (30 * ?DAY)). + +% How many times a tag operation should be retried before aborting. +-define(MAX_TAG_OP_RETRIES, 3). + +% How long to wait before timing out a tag retrieval. This should be +% large enough to read a large tag object off the disk and send it +% over the network. +-define(GET_TAG_TIMEOUT, (5 * ?MINUTE)). diff --git a/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs.hrl b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs.hrl new file mode 100644 index 0000000000..e43ec23fe1 --- /dev/null +++ b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs.hrl @@ -0,0 +1,9 @@ +-type volume_name() :: nonempty_string(). + +% Diskinfo is {FreeSpace, UsedSpace}. +-type diskinfo() :: {non_neg_integer(), non_neg_integer()}. +-type volume() :: {diskinfo(), volume_name()}. + +-type object_type() :: 'blob' | 'tag'. +-type object_name() :: binary(). +-type taginfo() :: {erlang:timestamp(), volume_name()}. diff --git a/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_gc.hrl b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_gc.hrl new file mode 100644 index 0000000000..dc43f7586b --- /dev/null +++ b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_gc.hrl @@ -0,0 +1,17 @@ +-type local_object() :: {object_name(), node()}. +-type phase() :: 'start' | 'build_map' | 'map_wait' | 'gc' + | 'rr_blobs' | 'rr_blobs_wait' | 'rr_tags'. +-type protocol_msg() :: {'check_blob', object_name()} | 'start_gc' | 'end_rr'. + +-type blob_update() :: {object_name(), 'filter' | [url()]}. + +-type check_blob_result() :: 'false' | {'true', volume_name()}. + +% GC statistics + +% {Files, Bytes} +-type gc_stat() :: {non_neg_integer(), non_neg_integer()}. +% {Kept, Deleted} +-type obj_stats() :: {gc_stat(), gc_stat()}. +% {Tags, Blobs}. +-type gc_run_stats() :: {obj_stats(), obj_stats()}. diff --git a/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_master.erl b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_master.erl new file mode 100644 index 0000000000..2be2773dc5 --- /dev/null +++ b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_master.erl @@ -0,0 +1,531 @@ +-module(ddfs_master). +-behaviour(gen_server). + +-export([start_link/0]). +-export([get_tags/1, get_tags/3, + get_nodeinfo/1, + get_read_nodes/0, + get_hosted_tags/1, + gc_blacklist/0, gc_blacklist/1, + gc_stats/0, + choose_write_nodes/3, + new_blob/4, new_blob/5, + safe_gc_blacklist/0, safe_gc_blacklist/1, + refresh_tag_cache/0, + tag_notify/2, + tag_operation/2, tag_operation/3, + update_gc_stats/1, + update_nodes/1 + ]). +-export([init/1, + handle_call/3, + handle_cast/2, + handle_info/2, + terminate/2, + code_change/3]). + +-define(WEB_PORT, 8011). + +-compile(nowarn_deprecated_type). + +-include("common_types.hrl"). +-include("gs_util.hrl"). +-include("config.hrl"). +-include("ddfs.hrl"). +-include("ddfs_tag.hrl"). +-include("ddfs_gc.hrl"). + +-type node_info() :: {node(), {non_neg_integer(), non_neg_integer()}}. +-type gc_stats() :: none | gc_run_stats(). + +-record(state, {tags = gb_trees:empty() :: gb_trees:tree(), + tag_cache = false :: false | gb_sets:set(), + cache_refresher :: pid(), + + nodes = [] :: [node_info()], + write_blacklist = [] :: [node()], + read_blacklist = [] :: [node()], + gc_blacklist = [] :: [node()], + safe_gc_blacklist = gb_sets:empty() :: gb_sets:set(), + gc_stats = none :: none | {gc_stats(), erlang:timestamp()}}). +-type state() :: #state{}. +-type replyto() :: {pid(), reference()}. + +-export_type([gc_stats/0, node_info/0]). + +%% =================================================================== +%% API functions + +-spec start_link() -> {ok, pid()}. +start_link() -> + lager:info("DDFS master starts"), + case gen_server:start_link({local, ?MODULE}, ?MODULE, [], []) of + {ok, Server} -> {ok, Server}; + {error, {already_started, Server}} -> {ok, Server} + end. + +-spec tag_operation(term(), tagname()) -> term(). +tag_operation(Op, Tag) -> + gen_server:call(?MODULE, {tag, Op, Tag}). +-spec tag_operation(term(), tagname(), non_neg_integer() | infinity) -> + term(). +tag_operation(Op, Tag, Timeout) -> + gen_server:call(?MODULE, {tag, Op, Tag}, Timeout). + +-spec tag_notify(term(), tagname()) -> ok. +tag_notify(Op, Tag) -> + gen_server:cast(?MODULE, {tag_notify, Op, Tag}). + +-spec get_nodeinfo(all) -> {ok, [node_info()]}. +get_nodeinfo(all) -> + gen_server:call(?MODULE, {get_nodeinfo, all}). + +-spec get_read_nodes() -> {ok, [node()], non_neg_integer()} | {error, term()}. +get_read_nodes() -> + gen_server:call(?MODULE, get_read_nodes, infinity). + +-spec gc_blacklist() -> {ok, [node()]}. +gc_blacklist() -> + gen_server:call(?MODULE, gc_blacklist). + +-spec gc_blacklist([node()]) -> ok. +gc_blacklist(Nodes) -> + gen_server:cast(?MODULE, {gc_blacklist, Nodes}). + +-spec gc_stats() -> {ok, none | {gc_stats(), erlang:timestamp()}} | {error, term()}. +gc_stats() -> + gen_server:call(?MODULE, gc_stats). + +-spec get_hosted_tags(host()) -> {ok, [tagname()]} | {error, term()}. +get_hosted_tags(Host) -> + gen_server:call(?MODULE, {get_hosted_tags, Host}). + +-spec choose_write_nodes(non_neg_integer(), [node()], [node()]) -> {ok, [node()]}. +choose_write_nodes(K, Include, Exclude) -> + gen_server:call(?MODULE, {choose_write_nodes, K, Include, Exclude}). + +-spec get_tags(gc) -> {ok, [tagname()], [node()]} | too_many_failed_nodes; + (safe) -> {ok, [binary()]} | too_many_failed_nodes. +get_tags(Mode) -> + get_tags(?MODULE, Mode, ?GET_TAG_TIMEOUT). + +-spec get_tags(server(), gc, non_neg_integer()) -> + {ok, [tagname()], [node()]} | too_many_failed_nodes; + (server(), safe, non_neg_integer()) -> + {ok, [binary()]} | too_many_failed_nodes. +get_tags(Server, Mode, Timeout) -> + disco_profile:timed_run( + fun() -> gen_server:call(Server, {get_tags, Mode}, Timeout) end, + get_tags). + +-spec new_blob(string()|object_name(), non_neg_integer(), [node()], [node()]) -> + too_many_replicas | {ok, [nonempty_string()]}. +new_blob(Obj, K, Include, Exclude) -> + gen_server:call(?MODULE, {new_blob, Obj, K, Include, Exclude}, infinity). + +-spec new_blob(server(), string()|object_name(), non_neg_integer(), [node()], [node()]) -> + too_many_replicas | {ok, [nonempty_string()]}. +new_blob(Master, Obj, K, Include, Exclude) -> + gen_server:call(Master, {new_blob, Obj, K, Include, Exclude}, infinity). + +-spec safe_gc_blacklist() -> {ok, [node()]} | {error, term()}. +safe_gc_blacklist() -> + gen_server:call(?MODULE, safe_gc_blacklist). + +-spec safe_gc_blacklist(gb_sets:set()) -> ok. +safe_gc_blacklist(SafeGCBlacklist) -> + gen_server:cast(?MODULE, {safe_gc_blacklist, SafeGCBlacklist}). + +-spec update_gc_stats(gc_run_stats()) -> ok. +update_gc_stats(Stats) -> + gen_server:cast(?MODULE, {update_gc_stats, Stats}). + +-type nodes_update() :: [{node(), boolean(), boolean()}]. +-spec update_nodes(nodes_update()) -> ok. +update_nodes(DDFSNodes) -> + gen_server:cast(?MODULE, {update_nodes, DDFSNodes}). + +-spec update_nodestats(gb_trees:tree()) -> ok. +update_nodestats(NewNodes) -> + gen_server:cast(?MODULE, {update_nodestats, NewNodes}). + +-spec update_tag_cache(gb_sets:set()) -> ok. +update_tag_cache(TagCache) -> + gen_server:cast(?MODULE, {update_tag_cache, TagCache}). + +-spec refresh_tag_cache() -> ok. +refresh_tag_cache() -> + gen_server:cast(?MODULE, refresh_tag_cache). + +%% =================================================================== +%% gen_server callbacks + +-spec init(_) -> gs_init(). +init(_Args) -> + _ = [disco_profile:new_histogram(Name) + || Name <- [get_tags, do_get_tags_all, do_get_tags_filter, + do_get_tags_safe, do_get_tags_gc]], + spawn_link(fun() -> monitor_diskspace() end), + spawn_link(fun() -> ddfs_gc:start_gc(disco:get_setting("DDFS_DATA")) end), + Refresher = spawn_link(fun() -> refresh_tag_cache_proc() end), + put(put_port, disco:get_setting("DDFS_PUT_PORT")), + {ok, #state{cache_refresher = Refresher}}. + +-type choose_write_nodes_msg() :: {choose_write_nodes, non_neg_integer(), [node()], [node()]}. +-type new_blob_msg() :: {new_blob, string() | object_name(), non_neg_integer(), [node()]}. +-type tag_msg() :: {tag, ddfs_tag:call_msg(), tagname()}. +-spec handle_call(dbg_state_msg(), from(), state()) -> + gs_reply(state()); + ({get_nodeinfo, all}, from(), state()) -> + gs_reply({ok, [node_info()]}); + (get_read_nodes, from(), state()) -> + gs_reply({ok, [node()], non_neg_integer}); + (gc_blacklist, from(), state()) -> + gs_reply({ok, [node()]}); + (gc_stats, from(), state()) -> + gs_reply({ok, gc_stats(), erlang:timestamp()}); + (choose_write_nodes_msg(), from(), state()) -> + gs_reply({ok, [node()]}); + (new_blob_msg(), from(), state()) -> + gs_reply(new_blob_result()); + (tag_msg(), from(), state()) -> + gs_reply({error, nonodes}) | gs_noreply(); + ({get_tags, gc | safe}, from(), state()) -> + gs_noreply(); + ({get_hosted_tags, host()}, from(), state()) -> + gs_noreply(); + (safe_gc_blacklist, from(), state()) -> + gs_reply({ok, [node()]}). +handle_call(dbg_get_state, _, S) -> + {reply, S, S}; + +handle_call({get_nodeinfo, all}, _From, #state{nodes = Nodes} = S) -> + {reply, {ok, Nodes}, S}; + +handle_call(get_read_nodes, _F, #state{nodes = Nodes, read_blacklist = RB} = S) -> + {reply, do_get_readable_nodes(Nodes, RB), S}; + +handle_call(gc_blacklist, _F, #state{gc_blacklist = Nodes} = S) -> + {reply, {ok, Nodes}, S}; + +handle_call(gc_stats, _F, #state{gc_stats = Stats} = S) -> + {reply, {ok, Stats}, S}; + +handle_call({choose_write_nodes, K, Include, Exclude}, _, + #state{nodes = N, write_blacklist = WBL, gc_blacklist = GBL} = S) -> + BL = lists:umerge(WBL, GBL), + {reply, do_choose_write_nodes(N, K, Include, Exclude, BL), S}; + +handle_call({new_blob, Obj, K, Include, Exclude}, _, + #state{nodes = N, gc_blacklist = GBL, write_blacklist = WBL} = S) -> + BL = lists:umerge(WBL, GBL), + {reply, do_new_blob(Obj, K, Include, Exclude, BL, N), S}; + +handle_call({tag, _M, _Tag}, _From, #state{nodes = []} = S) -> + {reply, {error, no_nodes}, S}; + +handle_call({tag, M, Tag}, From, S) -> + {noreply, do_tag_request(M, Tag, From, S)}; + +handle_call({get_tags, Mode}, From, #state{nodes = Nodes} = S) -> + spawn(fun() -> + gen_server:reply(From, do_get_tags(Mode, [N || {N, _} <- Nodes])) + end), + {noreply, S}; + +handle_call({get_hosted_tags, Host}, From, S) -> + spawn(fun() -> gen_server:reply(From, ddfs_gc:hosted_tags(Host)) end), + {noreply, S}; + +handle_call(safe_gc_blacklist, _From, #state{safe_gc_blacklist = SBL} = S) -> + {reply, {ok, gb_sets:to_list(SBL)}, S}. + +-spec handle_cast({tag_notify, ddfs_tag:cast_msg(), tagname()} + | {gc_blacklist, [node()]} + | {safe_gc_blacklist, gb_sets:set()} + | {update_gc_stats, gc_stats()} + | {update_tag_cache, gb_sets:set()} + | refresh_tag_cache + | {update_nodes, nodes_update()} + | {update_nodestats, gb_trees:tree()}, + state()) -> gs_noreply(). +handle_cast({tag_notify, M, Tag}, S) -> + {noreply, do_tag_notify(M, Tag, S)}; + +handle_cast({gc_blacklist, Nodes}, #state{safe_gc_blacklist = SBL} = S) -> + BLSet = gb_sets:from_list(Nodes), + NewSBL = gb_sets:intersection(BLSet, SBL), + {noreply, S#state{gc_blacklist = gb_sets:to_list(BLSet), + safe_gc_blacklist = NewSBL}}; + +handle_cast({safe_gc_blacklist, SafeBlacklist}, #state{gc_blacklist = BL} = S) -> + SBL = gb_sets:intersection(SafeBlacklist, gb_sets:from_list(BL)), + {noreply, S#state{safe_gc_blacklist = SBL}}; + +handle_cast({update_gc_stats, Stats}, S) -> + {noreply, S#state{gc_stats = {Stats, now()}}}; + +handle_cast({update_tag_cache, TagCache}, S) -> + {noreply, S#state{tag_cache = TagCache}}; + +handle_cast(refresh_tag_cache, #state{cache_refresher = Refresher} = S) -> + Refresher ! refresh, + {noreply, S}; + +handle_cast({update_nodes, NewNodes}, S) -> + {noreply, do_update_nodes(NewNodes, S)}; + +handle_cast({update_nodestats, NewNodes}, S) -> + {noreply, do_update_nodestats(NewNodes, S)}. + +-spec handle_info({'DOWN', _, _, pid(), _}, state()) -> gs_noreply(). +handle_info({'DOWN', _, _, Pid, _}, S) -> + {noreply, do_tag_exit(Pid, S)}. + +%% =================================================================== +%% gen_server callback stubs + +-spec terminate(term(), state()) -> ok. +terminate(Reason, _State) -> + lager:warning("DDFS master died: ~p", [Reason]). + +-spec code_change(term(), state(), term()) -> {ok, state()}. +code_change(_OldVsn, State, _Extra) -> {ok, State}. + +%% =================================================================== +%% internal functions + +-spec do_get_readable_nodes([node_info()], [node()]) -> + {ok, [node()], non_neg_integer()}. +do_get_readable_nodes(Nodes, ReadBlacklist) -> + NodeSet = gb_sets:from_ordset(lists:sort([Node || {Node, _} <- Nodes])), + BlackSet = gb_sets:from_ordset(ReadBlacklist), + ReadableNodeSet = gb_sets:subtract(NodeSet, BlackSet), + {ok, gb_sets:to_list(ReadableNodeSet), gb_sets:size(BlackSet)}. + +-spec do_choose_write_nodes([node_info()], non_neg_integer(), [node()], [node()], [node()]) -> + {ok, [node()]}. +do_choose_write_nodes(Nodes, K, Include, Exclude, BlackList) -> + % Include is the list of nodes that must be included + % + % Node selection algorithm: + % 1. try to choose K nodes randomly from all the nodes which have + % more than ?MIN_FREE_SPACE bytes free space available and which + % are not excluded or blacklisted. + % 2. if K nodes cannot be found this way, choose the K emptiest + % nodes which are not excluded or blacklisted. + Primary = ([N || {N, {Free, _Total}} <- Nodes, Free > ?MIN_FREE_SPACE / 1024] + -- (Exclude ++ BlackList)), + if length(Primary) >= K -> + {ok, Include ++ disco_util:choose_random(Primary -- Include , K - length(Include))}; + true -> + Preferred = [N || {N, _} <- lists:reverse(lists:keysort(2, Nodes))], + Secondary = Include ++ lists:sublist(Preferred -- (Include ++ Exclude ++ BlackList), + K - length(Include)), + {ok, Secondary} + end. + +-type new_blob_result() :: too_many_replicas | {ok, [nonempty_string()]}. +-spec do_new_blob(string()|object_name(), non_neg_integer(), [node()], [node()], [node()], [node_info()]) -> + new_blob_result(). +do_new_blob(_Obj, K, _Include, _Exclude, _BlackList, Nodes) when K > length(Nodes) -> + too_many_replicas; +do_new_blob(Obj, K, Include, Exclude, BlackList, Nodes) -> + {ok, WriteNodes} = do_choose_write_nodes(Nodes, K, Include, Exclude, BlackList), + Urls = [["http://", disco:host(N), ":", get(put_port), "/ddfs/", Obj] + || N <- WriteNodes], + {ok, Urls}. + +% Tag request: Start a new tag server if one doesn't exist already. Forward +% the request to the tag server. + +-spec get_tag_pid(tagname(), gb_trees:tree(), false | gb_sets:set()) -> + {pid(), gb_trees:tree()}. +get_tag_pid(Tag, Tags, Cache) -> + case gb_trees:lookup(Tag, Tags) of + none -> + NotFound = (Cache =/= false + andalso not gb_sets:is_element(Tag, Cache)), + {ok, Server} = ddfs_tag:start(Tag, NotFound), + erlang:monitor(process, Server), + {Server, gb_trees:insert(Tag, Server, Tags)}; + {value, P} -> + {P, Tags} + end. + +-spec do_tag_request(term(), tagname(), replyto(), state()) -> + state(). +do_tag_request(M, Tag, From, #state{tags = Tags, tag_cache = Cache} = S) -> + {Pid, TagsN} = get_tag_pid(Tag, Tags, Cache), + gen_server:cast(Pid, {M, From}), + S#state{tags = TagsN, + tag_cache = Cache =/= false andalso gb_sets:add(Tag, Cache)}. + +-spec do_tag_notify(term(), tagname(), state()) -> state(). +do_tag_notify(M, Tag, #state{tags = Tags, tag_cache = Cache} = S) -> + {Pid, TagsN} = get_tag_pid(Tag, Tags, Cache), + gen_server:cast(Pid, {notify, M}), + S#state{tags = TagsN, + tag_cache = Cache =/= false andalso gb_sets:add(Tag, Cache)}. + +-spec do_update_nodes(nodes_update(), state()) -> state(). +do_update_nodes(NewNodes, #state{nodes = Nodes, tags = Tags} = S) -> + WriteBlacklist = lists:sort([Node || {Node, false, _} <- NewNodes]), + ReadBlacklist = lists:sort([Node || {Node, _, false} <- NewNodes]), + OldNodes = gb_trees:from_orddict(Nodes), + UpdatedNodes = lists:keysort(1, [case gb_trees:lookup(Node, OldNodes) of + none -> + {Node, {0, 0}}; + {value, OldStats} -> + {Node, OldStats} + end || {Node, _WB, _RB} <- NewNodes]), + if + UpdatedNodes =/= Nodes -> + _ = [gen_server:cast(Pid, {die, none}) || Pid <- gb_trees:values(Tags)], + spawn(fun() -> + {ok, ReadableNodes, RBSize} = + do_get_readable_nodes(UpdatedNodes, ReadBlacklist), + refresh_tag_cache(ReadableNodes, RBSize) + end), + S#state{nodes = UpdatedNodes, + write_blacklist = WriteBlacklist, + read_blacklist = ReadBlacklist, + tag_cache = false, + tags = gb_trees:empty()}; + true -> + S#state{write_blacklist = WriteBlacklist, + read_blacklist = ReadBlacklist} + end. + +-spec do_update_nodestats(gb_trees:tree(), state()) -> state(). +do_update_nodestats(NewNodes, #state{nodes = Nodes} = S) -> + UpdatedNodes = [case gb_trees:lookup(Node, NewNodes) of + none -> + {Node, Stats}; + {value, NewStats} -> + {Node, NewStats} + end || {Node, Stats} <- Nodes], + S#state{nodes = UpdatedNodes}. + +-spec do_tag_exit(pid(), state()) -> state(). +do_tag_exit(Pid, S) -> + NewTags = [X || {_, V} = X <- gb_trees:to_list(S#state.tags), V =/= Pid], + S#state{tags = gb_trees:from_orddict(NewTags)}. + +-spec do_get_tags(all | filter, [node()]) -> {[node()], [node()], [binary()]}; + (safe, [node()]) -> {ok, [binary()]} | too_many_failed_nodes; + (gc, [node()]) -> {ok, [binary()], [node()]} | too_many_failed_nodes. +do_get_tags(all, Nodes) -> + disco_profile:timed_run( + fun() -> + {Replies, Failed} = + gen_server:multi_call(Nodes, ddfs_node, get_tags, ?NODE_TIMEOUT), + {OkNodes, Tags} = lists:unzip(Replies), + {OkNodes, Failed, lists:usort(lists:flatten(Tags))} + end, do_get_tags_all); + +do_get_tags(filter, Nodes) -> + disco_profile:timed_run( + fun() -> + {OkNodes, Failed, Tags} = do_get_tags(all, Nodes), + case tag_operation(get_tagnames, <<"+deleted">>, ?NODEOP_TIMEOUT) of + {ok, Deleted} -> + TagSet = gb_sets:from_ordset(Tags), + DelSet = gb_sets:insert(<<"+deleted">>, Deleted), + NotDeleted = gb_sets:to_list(gb_sets:subtract(TagSet, DelSet)), + {OkNodes, Failed, NotDeleted}; + E -> + E + end + end, do_get_tags_filter); + +do_get_tags(safe, Nodes) -> + disco_profile:timed_run( + fun() -> + TagMinK = list_to_integer(disco:get_setting("DDFS_TAG_MIN_REPLICAS")), + case do_get_tags(filter, Nodes) of + {_OkNodes, Failed, Tags} when length(Failed) < TagMinK -> + {ok, Tags}; + _ -> + too_many_failed_nodes + end + end, do_get_tags_safe); + +% The returned tag list may include +deleted. +do_get_tags(gc, Nodes) -> + disco_profile:timed_run( + fun() -> + {OkNodes, Failed, Tags} = do_get_tags(all, Nodes), + TagMinK = list_to_integer(disco:get_setting("DDFS_TAG_MIN_REPLICAS")), + case length(Failed) < TagMinK of + false -> + too_many_failed_nodes; + true -> + case tag_operation(get_tagnames, <<"+deleted">>, ?NODEOP_TIMEOUT) of + {ok, Deleted} -> + TagSet = gb_sets:from_ordset(Tags), + NotDeleted = gb_sets:subtract(TagSet, Deleted), + {ok, gb_sets:to_list(NotDeleted), OkNodes}; + E -> + E + end + end + end, do_get_tags_gc). + +% Timeouts in this call by the below processes can cause ddfs_master +% itself to crash, since the processes are linked to it. +-spec safe_get_read_nodes() -> {ok, [node()], non_neg_integer()} | error. +safe_get_read_nodes() -> + try get_read_nodes() of + {ok, _ReadableNodes, _RBSize} = RN -> + RN; + E -> + lager:error("unexpected response retrieving readable nodes: ~p", [E]), + error + catch + K:E -> + lager:error("error retrieving readable nodes: ~p:~p", [K, E]), + error + end. + +-spec monitor_diskspace() -> no_return(). +monitor_diskspace() -> + case safe_get_read_nodes() of + {ok, ReadableNodes, _RBSize} -> + {Space, _F} = gen_server:multi_call(ReadableNodes, + ddfs_node, + get_diskspace, + ?NODE_TIMEOUT), + update_nodestats(gb_trees:from_orddict(lists:keysort(1, Space))); + error -> + ok + end, + timer:sleep(?DISKSPACE_INTERVAL), + monitor_diskspace(). + +-spec refresh_tag_cache_proc() -> no_return(). +refresh_tag_cache_proc() -> + case safe_get_read_nodes() of + {ok, ReadableNodes, RBSize} -> + refresh_tag_cache(ReadableNodes, RBSize); + error -> + ok + end, + receive + refresh -> + ok + after ?TAG_CACHE_INTERVAL -> + ok + end, + refresh_tag_cache_proc(). + +-spec refresh_tag_cache([node()], non_neg_integer()) -> ok. +refresh_tag_cache(Nodes, BLSize) -> + TagMinK = list_to_integer(disco:get_setting("DDFS_TAG_MIN_REPLICAS")), + {Replies, Failed} = + gen_server:multi_call(Nodes, ddfs_node, get_tags, ?NODE_TIMEOUT), + if Nodes =/= [], length(Failed) + BLSize < TagMinK -> + {_OkNodes, Tags} = lists:unzip(Replies), + update_tag_cache(gb_sets:from_list(lists:flatten(Tags))); + true -> ok + end. diff --git a/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_tag.hrl b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_tag.hrl new file mode 100644 index 0000000000..2920b67fc5 --- /dev/null +++ b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/ddfs_tag.hrl @@ -0,0 +1,19 @@ + +-type tokentype() :: 'read' | 'write'. +-type user_attr() :: [{binary(), binary()}]. +% An 'internal' token is also used by internal consumers, but never stored. +-type token() :: 'null' | binary(). + +-type tagname() :: binary(). +-type tagid() :: binary(). + +-type attrib() :: 'urls' | 'read_token' | 'write_token' | {'user', binary()}. + +-record(tagcontent, {id :: tagid(), + last_modified :: binary(), + read_token = null :: token(), + write_token = null :: token(), + urls = [] :: [[binary()]], + user = [] :: user_attr()}). + +-type tagcontent() :: #tagcontent{}. diff --git a/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/gs_util.hrl b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/gs_util.hrl new file mode 100644 index 0000000000..d579e9a7d7 --- /dev/null +++ b/lib/dialyzer/test/small_SUITE_data/src/ddfs_master/gs_util.hrl @@ -0,0 +1,16 @@ +% This is a set of type utilities to be used when spec-cing the +% callbacks of a gen_server implementation. It should be included in +% the impl module, which needs to define the state() type. + +-type gs_init() :: {ok, state()}. +-type gs_reply(T) :: {reply, (T), state()}. +-type gs_noreply() :: {noreply, state()}. +-type gs_noreply_t() :: {noreply, state(), non_neg_integer()}. +-type gs_stop(T) :: {stop, (T), state()}. + +% Generic utilities. + +-type server() :: pid() | atom() | {atom(), node()}. +-type from() :: {pid(), term()}. + +-type dbg_state_msg() :: dbg_get_state. diff --git a/lib/hipe/cerl/erl_types.erl b/lib/hipe/cerl/erl_types.erl index 47b8dc766a..6065b79664 100644 --- a/lib/hipe/cerl/erl_types.erl +++ b/lib/hipe/cerl/erl_types.erl @@ -2985,16 +2985,19 @@ inf_union(U1, U2, Opaques) -> List = [A,B,F,I,L,N,T,M,Map], inf_union_collect(List, Opaque, InfFun, [], []) end, - O1 = OpaqueFun(U1, U2, fun(E, Opaque) -> t_inf(Opaque, E, Opaques) end), - O2 = OpaqueFun(U2, U1, fun(E, Opaque) -> t_inf(E, Opaque, Opaques) end), - Union = inf_union(U1, U2, 0, [], Opaques), - t_sup([O1, O2, Union]). + {O1, ThrowList1} = + OpaqueFun(U1, U2, fun(E, Opaque) -> t_inf(Opaque, E, Opaques) end), + {O2, ThrowList2} + = OpaqueFun(U2, U1, fun(E, Opaque) -> t_inf(E, Opaque, Opaques) end), + {Union, ThrowList3} = inf_union(U1, U2, 0, [], [], Opaques), + ThrowList = lists:merge3(ThrowList1, ThrowList2, ThrowList3), + case t_sup([O1, O2, Union]) of + ?none when ThrowList =/= [] -> throw(hd(ThrowList)); + Sup -> Sup + end. inf_union_collect([], _Opaque, _InfFun, InfList, ThrowList) -> - case t_sup(InfList) of - ?none when ThrowList =/= [] -> throw(hd(lists:flatten(ThrowList))); - Sup -> Sup - end; + {t_sup(InfList), lists:usort(ThrowList)}; inf_union_collect([?none|L], Opaque, InfFun, InfList, ThrowList) -> inf_union_collect(L, Opaque, InfFun, [?none|InfList], ThrowList); inf_union_collect([E|L], Opaque, InfFun, InfList, ThrowList) -> @@ -3005,19 +3008,21 @@ inf_union_collect([E|L], Opaque, InfFun, InfList, ThrowList) -> inf_union_collect(L, Opaque, InfFun, InfList, [N|ThrowList]) end. -inf_union([?none|Left1], [?none|Left2], N, Acc, Opaques) -> - inf_union(Left1, Left2, N, [?none|Acc], Opaques); -inf_union([T1|Left1], [T2|Left2], N, Acc, Opaques) -> - case t_inf(T1, T2, Opaques) of - ?none -> inf_union(Left1, Left2, N, [?none|Acc], Opaques); - T -> inf_union(Left1, Left2, N+1, [T|Acc], Opaques) +inf_union([?none|Left1], [?none|Left2], N, Acc, ThrowList, Opaques) -> + inf_union(Left1, Left2, N, [?none|Acc], ThrowList, Opaques); +inf_union([T1|Left1], [T2|Left2], N, Acc, ThrowList, Opaques) -> + try t_inf(T1, T2, Opaques) of + ?none -> inf_union(Left1, Left2, N, [?none|Acc], ThrowList, Opaques); + T -> inf_union(Left1, Left2, N+1, [T|Acc], ThrowList, Opaques) + catch throw:N when is_integer(N) -> + inf_union(Left1, Left2, N, [?none|Acc], [N|ThrowList], Opaques) end; -inf_union([], [], N, Acc, _Opaques) -> - if N =:= 0 -> ?none; +inf_union([], [], N, Acc, ThrowList, _Opaques) -> + if N =:= 0 -> {?none, ThrowList}; N =:= 1 -> [Type] = [T || T <- Acc, T =/= ?none], - Type; - N >= 2 -> ?union(lists:reverse(Acc)) + {Type, ThrowList}; + N >= 2 -> {?union(lists:reverse(Acc)), ThrowList} end. inf_bitstr(U1, B1, U2, B2) -> |