diff options
author | Erlang/OTP <[email protected]> | 2009-11-20 14:54:40 +0000 |
---|---|---|
committer | Erlang/OTP <[email protected]> | 2009-11-20 14:54:40 +0000 |
commit | 84adefa331c4159d432d22840663c38f155cd4c1 (patch) | |
tree | bff9a9c66adda4df2106dfd0e5c053ab182a12bd /lib/stdlib/src/gb_sets.erl | |
download | otp-84adefa331c4159d432d22840663c38f155cd4c1.tar.gz otp-84adefa331c4159d432d22840663c38f155cd4c1.tar.bz2 otp-84adefa331c4159d432d22840663c38f155cd4c1.zip |
The R13B03 release.OTP_R13B03
Diffstat (limited to 'lib/stdlib/src/gb_sets.erl')
-rw-r--r-- | lib/stdlib/src/gb_sets.erl | 812 |
1 files changed, 812 insertions, 0 deletions
diff --git a/lib/stdlib/src/gb_sets.erl b/lib/stdlib/src/gb_sets.erl new file mode 100644 index 0000000000..086dc79b46 --- /dev/null +++ b/lib/stdlib/src/gb_sets.erl @@ -0,0 +1,812 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2001-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% +%% ===================================================================== +%% Ordered Sets implemented as General Balanced Trees +%% +%% Copyright (C) 1999-2001 Richard Carlsson +%% +%% An implementation of ordered sets using Prof. Arne Andersson's +%% General Balanced Trees. This can be much more efficient than using +%% ordered lists, for larger sets, but depends on the application. See +%% notes below for details. +%% --------------------------------------------------------------------- +%% Notes: +%% +%% The complexity on set operations is bounded by either O(|S|) or O(|T| +%% * log(|S|)), where S is the largest given set, depending on which is +%% fastest for any particular function call. For operating on sets of +%% almost equal size, this implementation is about 3 times slower than +%% using ordered-list sets directly. For sets of very different sizes, +%% however, this solution can be arbitrarily much faster; in practical +%% cases, often between 10 and 100 times. This implementation is +%% particularly suited for ackumulating elements a few at a time, +%% building up a large set (more than 100-200 elements), and repeatedly +%% testing for membership in the current set. +%% +%% As with normal tree structures, lookup (membership testing), +%% insertion and deletion have logarithmic complexity. +%% +%% Operations: +%% +%% - empty(): returns empty set. +%% +%% Alias: new(), for compatibility with `sets'. +%% +%% - is_empty(S): returns 'true' if S is an empty set, and 'false' +%% otherwise. +%% +%% - size(S): returns the number of nodes in the set as an integer. +%% Returns 0 (zero) if the set is empty. +%% +%% - singleton(X): returns a set containing only the element X. +%% +%% - is_member(X, S): returns `true' if element X is a member of set S, +%% and `false' otherwise. +%% +%% Alias: is_element(), for compatibility with `sets'. +%% +%% - insert(X, S): inserts element X into set S; returns the new set. +%% *Assumes that the element is not present in S.* +%% +%% - add(X, S): adds element X to set S; returns the new set. If X is +%% already an element in S, nothing is changed. +%% +%% Alias: add_element(), for compatibility with `sets'. +%% +%% - delete(X, S): removes element X from set S; returns new set. +%% Assumes that the element exists in the set. +%% +%% - delete_any(X, S): removes key X from set S if the key is present +%% in the set, otherwise does nothing; returns new set. +%% +%% Alias: del_element(), for compatibility with `sets'. +%% +%% - balance(S): rebalances the tree representation of S. Note that this +%% is rarely necessary, but may be motivated when a large number of +%% elements have been deleted from the tree without further +%% insertions. Rebalancing could then be forced in order to minimise +%% lookup times, since deletion only does not rebalance the tree. +%% +%% - union(S1, S2): returns a new set that contains each element that is +%% in either S1 or S2 or both, and no other elements. +%% +%% - union(Ss): returns a new set that contains each element that is in +%% at least one of the sets in the list Ss, and no other elements. +%% +%% - intersection(S1, S2): returns a new set that contains each element +%% that is in both S1 and S2, and no other elements. +%% +%% - intersection(Ss): returns a new set that contains each element that +%% is in all of the sets in the list Ss, and no other elements. +%% +%% - is_disjoint(S1, S2): returns `true' if none of the elements in S1 +%% occurs in S2. +%% +%% - difference(S1, S2): returns a new set that contains each element in +%% S1 that is not also in S2, and no other elements. +%% +%% Alias: subtract(), for compatibility with `sets'. +%% +%% - is_subset(S1, S2): returns `true' if each element in S1 is also a +%% member of S2, and `false' otherwise. +%% +%% - to_list(S): returns an ordered list of all elements in set S. The +%% list never contains duplicates. +%% +%% - from_list(List): creates a set containing all elements in List, +%% where List may be unordered and contain duplicates. +%% +%% - from_ordset(L): turns an ordered-set list L into a set. The list +%% must not contain duplicates. +%% +%% - smallest(S): returns the smallest element in set S. Assumes that +%% the set S is nonempty. +%% +%% - largest(S): returns the largest element in set S. Assumes that the +%% set S is nonempty. +%% +%% - take_smallest(S): returns {X, S1}, where X is the smallest element +%% in set S, and S1 is the set S with element X deleted. Assumes that +%% the set S is nonempty. +%% +%% - take_largest(S): returns {X, S1}, where X is the largest element in +%% set S, and S1 is the set S with element X deleted. Assumes that the +%% set S is nonempty. +%% +%% - iterator(S): returns an iterator that can be used for traversing +%% the entries of set S; see `next'. The implementation of this is +%% very efficient; traversing the whole set using `next' is only +%% slightly slower than getting the list of all elements using +%% `to_list' and traversing that. The main advantage of the iterator +%% approach is that it does not require the complete list of all +%% elements to be built in memory at one time. +%% +%% - next(T): returns {X, T1} where X is the smallest element referred +%% to by the iterator T, and T1 is the new iterator to be used for +%% traversing the remaining elements, or the atom `none' if no +%% elements remain. +%% +%% - filter(P, S): Filters set S using predicate function P. Included +%% for compatibility with `sets'. +%% +%% - fold(F, A, S): Folds function F over set S with A as the initial +%% ackumulator. Included for compatibility with `sets'. +%% +%% - is_set(S): returns 'true' if S appears to be a set, and 'false' +%% otherwise. Not recommended; included for compatibility with `sets'. + +-module(gb_sets). + +-export([empty/0, is_empty/1, size/1, singleton/1, is_member/2, + insert/2, add/2, delete/2, delete_any/2, balance/1, union/2, + union/1, intersection/2, intersection/1, is_disjoint/2, difference/2, + is_subset/2, to_list/1, from_list/1, from_ordset/1, smallest/1, + largest/1, take_smallest/1, take_largest/1, iterator/1, next/1, + filter/2, fold/3, is_set/1]). + +%% `sets' compatibility aliases: + +-export([new/0, is_element/2, add_element/2, del_element/2, + subtract/2]). + +%% GB-trees adapted from Sven-Olof Nystr�m's implementation for +%% representation of sets. +%% +%% Data structures: +%% - {Size, Tree}, where `Tree' is composed of nodes of the form: +%% - {Key, Smaller, Bigger}, and the "empty tree" node: +%% - nil. +%% +%% No attempt is made to balance trees after deletions. Since deletions +%% don't increase the height of a tree, this should be OK. +%% +%% Original balance condition h(T) <= ceil(c * log(|T|)) has been +%% changed to the similar (but not quite equivalent) condition 2 ^ h(T) +%% <= |T| ^ c. This should also be OK. +%% +%% Behaviour is logarithmic (as it should be). + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% Some macros. + +-define(p, 2). % It seems that p = 2 is optimal for sorted keys + +-define(pow(A, _), A * A). % correct with exponent as defined above. + +-define(div2(X), X bsr 1). + +-define(mul2(X), X bsl 1). + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% Some types. + +-type gb_set_node() :: 'nil' | {term(), _, _}. + +%% A declaration equivalent to the following is currently hard-coded +%% in erl_types.erl +%% +%% -opaque gb_set() :: {non_neg_integer(), gb_set_node()}. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-spec empty() -> gb_set(). + +empty() -> + {0, nil}. + +-spec new() -> gb_set(). + +new() -> empty(). + +-spec is_empty(gb_set()) -> boolean(). + +is_empty({0, nil}) -> + true; +is_empty(_) -> + false. + +-spec size(gb_set()) -> non_neg_integer(). + +size({Size, _}) -> + Size. + +-spec singleton(term()) -> gb_set(). + +singleton(Key) -> + {1, {Key, nil, nil}}. + +-spec is_element(term(), gb_set()) -> boolean(). + +is_element(Key, S) -> + is_member(Key, S). + +-spec is_member(term(), gb_set()) -> boolean(). + +is_member(Key, {_, T}) -> + is_member_1(Key, T). + +is_member_1(Key, {Key1, Smaller, _}) when Key < Key1 -> + is_member_1(Key, Smaller); +is_member_1(Key, {Key1, _, Bigger}) when Key > Key1 -> + is_member_1(Key, Bigger); +is_member_1(_, {_, _, _}) -> + true; +is_member_1(_, nil) -> + false. + +-spec insert(term(), gb_set()) -> gb_set(). + +insert(Key, {S, T}) -> + S1 = S + 1, + {S1, insert_1(Key, T, ?pow(S1, ?p))}. + +insert_1(Key, {Key1, Smaller, Bigger}, S) when Key < Key1 -> + case insert_1(Key, Smaller, ?div2(S)) of + {T1, H1, S1} when is_integer(H1) -> + T = {Key1, T1, Bigger}, + {H2, S2} = count(Bigger), + H = ?mul2(erlang:max(H1, H2)), + SS = S1 + S2 + 1, + P = ?pow(SS, ?p), + if + H > P -> + balance(T, SS); + true -> + {T, H, SS} + end; + T1 -> + {Key1, T1, Bigger} + end; +insert_1(Key, {Key1, Smaller, Bigger}, S) when Key > Key1 -> + case insert_1(Key, Bigger, ?div2(S)) of + {T1, H1, S1} when is_integer(H1) -> + T = {Key1, Smaller, T1}, + {H2, S2} = count(Smaller), + H = ?mul2(erlang:max(H1, H2)), + SS = S1 + S2 + 1, + P = ?pow(SS, ?p), + if + H > P -> + balance(T, SS); + true -> + {T, H, SS} + end; + T1 -> + {Key1, Smaller, T1} + end; +insert_1(Key, nil, 0) -> + {{Key, nil, nil}, 1, 1}; +insert_1(Key, nil, _) -> + {Key, nil, nil}; +insert_1(Key, _, _) -> + erlang:error({key_exists, Key}). + +count({_, nil, nil}) -> + {1, 1}; +count({_, Sm, Bi}) -> + {H1, S1} = count(Sm), + {H2, S2} = count(Bi), + {?mul2(erlang:max(H1, H2)), S1 + S2 + 1}; +count(nil) -> + {1, 0}. + +-spec balance(gb_set()) -> gb_set(). + +balance({S, T}) -> + {S, balance(T, S)}. + +balance(T, S) -> + balance_list(to_list_1(T), S). + +balance_list(L, S) -> + {T, _} = balance_list_1(L, S), + T. + +balance_list_1(L, S) when S > 1 -> + Sm = S - 1, + S2 = Sm div 2, + S1 = Sm - S2, + {T1, [K | L1]} = balance_list_1(L, S1), + {T2, L2} = balance_list_1(L1, S2), + T = {K, T1, T2}, + {T, L2}; +balance_list_1([Key | L], 1) -> + {{Key, nil, nil}, L}; +balance_list_1(L, 0) -> + {nil, L}. + +-spec add_element(term(), gb_set()) -> gb_set(). + +add_element(X, S) -> + add(X, S). + +-spec add(term(), gb_set()) -> gb_set(). + +add(X, S) -> + case is_member(X, S) of + true -> + S; % we don't have to do anything here + false -> + insert(X, S) + end. + +-spec from_list([term()]) -> gb_set(). + +from_list(L) -> + from_ordset(ordsets:from_list(L)). + +-spec from_ordset([term()]) -> gb_set(). + +from_ordset(L) -> + S = length(L), + {S, balance_list(L, S)}. + +-spec del_element(term(), gb_set()) -> gb_set(). + +del_element(Key, S) -> + delete_any(Key, S). + +-spec delete_any(term(), gb_set()) -> gb_set(). + +delete_any(Key, S) -> + case is_member(Key, S) of + true -> + delete(Key, S); + false -> + S + end. + +-spec delete(term(), gb_set()) -> gb_set(). + +delete(Key, {S, T}) -> + {S - 1, delete_1(Key, T)}. + +delete_1(Key, {Key1, Smaller, Larger}) when Key < Key1 -> + Smaller1 = delete_1(Key, Smaller), + {Key1, Smaller1, Larger}; +delete_1(Key, {Key1, Smaller, Bigger}) when Key > Key1 -> + Bigger1 = delete_1(Key, Bigger), + {Key1, Smaller, Bigger1}; +delete_1(_, {_, Smaller, Larger}) -> + merge(Smaller, Larger). + +merge(Smaller, nil) -> + Smaller; +merge(nil, Larger) -> + Larger; +merge(Smaller, Larger) -> + {Key, Larger1} = take_smallest1(Larger), + {Key, Smaller, Larger1}. + +-spec take_smallest(gb_set()) -> {term(), gb_set()}. + +take_smallest({S, T}) -> + {Key, Larger} = take_smallest1(T), + {Key, {S - 1, Larger}}. + +take_smallest1({Key, nil, Larger}) -> + {Key, Larger}; +take_smallest1({Key, Smaller, Larger}) -> + {Key1, Smaller1} = take_smallest1(Smaller), + {Key1, {Key, Smaller1, Larger}}. + +-spec smallest(gb_set()) -> term(). + +smallest({_, T}) -> + smallest_1(T). + +smallest_1({Key, nil, _Larger}) -> + Key; +smallest_1({_Key, Smaller, _Larger}) -> + smallest_1(Smaller). + +-spec take_largest(gb_set()) -> {term(), gb_set()}. + +take_largest({S, T}) -> + {Key, Smaller} = take_largest1(T), + {Key, {S - 1, Smaller}}. + +take_largest1({Key, Smaller, nil}) -> + {Key, Smaller}; +take_largest1({Key, Smaller, Larger}) -> + {Key1, Larger1} = take_largest1(Larger), + {Key1, {Key, Smaller, Larger1}}. + +-spec largest(gb_set()) -> term(). + +largest({_, T}) -> + largest_1(T). + +largest_1({Key, _Smaller, nil}) -> + Key; +largest_1({_Key, _Smaller, Larger}) -> + largest_1(Larger). + +-spec to_list(gb_set()) -> [term()]. + +to_list({_, T}) -> + to_list(T, []). + +to_list_1(T) -> to_list(T, []). + +to_list({Key, Small, Big}, L) -> + to_list(Small, [Key | to_list(Big, L)]); +to_list(nil, L) -> L. + +-spec iterator(gb_set()) -> [term()]. + +iterator({_, T}) -> + iterator(T, []). + +%% The iterator structure is really just a list corresponding to the +%% call stack of an in-order traversal. This is quite fast. + +iterator({_, nil, _} = T, As) -> + [T | As]; +iterator({_, L, _} = T, As) -> + iterator(L, [T | As]); +iterator(nil, As) -> + As. + +-spec next([term()]) -> {term(), [term()]} | 'none'. + +next([{X, _, T} | As]) -> + {X, iterator(T, As)}; +next([]) -> + none. + + +%% Set operations: + + +%% If |X| < |Y|, then we traverse the elements of X. The cost for +%% testing a single random element for membership in a tree S is +%% proportional to log(|S|); thus, if |Y| / |X| < c * log(|Y|), for some +%% c, it is more efficient to scan the ordered sequence of elements of Y +%% while traversing X (under the same ordering) in order to test whether +%% elements of X are already in Y. Since the `math' module does not have +%% a `log2'-function, we rewrite the condition to |X| < |Y| * c1 * +%% ln(|X|), where c1 = c / ln 2. + +-define(c, 1.46). % 1 / ln 2; this appears to be best + +%% If the sets are not very different in size, i.e., if |Y| / |X| >= c * +%% log(|Y|), then the fastest way to do union (and the other similar set +%% operations) is to build the lists of elements, traverse these lists +%% in parallel while building a reversed ackumulator list, and finally +%% rebuild the tree directly from the ackumulator. Other methods of +%% traversing the elements can be devised, but they all have higher +%% overhead. + +-spec union(gb_set(), gb_set()) -> gb_set(). + +union({N1, T1}, {N2, T2}) when N2 < N1 -> + union(to_list_1(T2), N2, T1, N1); +union({N1, T1}, {N2, T2}) -> + union(to_list_1(T1), N1, T2, N2). + +%% We avoid the expensive mathematical computations if there is little +%% chance at saving at least the same amount of time by making the right +%% choice of strategy. Recall that N1 < N2 here. + +union(L, N1, T2, N2) when N2 < 10 -> + %% Break even is about 7 for N1 = 1 and 10 for N1 = 2 + union_2(L, to_list_1(T2), N1 + N2); +union(L, N1, T2, N2) -> + X = N1 * round(?c * math:log(N2)), + if N2 < X -> + union_2(L, to_list_1(T2), N1 + N2); + true -> + union_1(L, mk_set(N2, T2)) + end. + +-spec mk_set(non_neg_integer(), gb_set_node()) -> gb_set(). + +mk_set(N, T) -> + {N, T}. + +%% If the length of the list is in proportion with the size of the +%% target set, this version spends too much time doing lookups, compared +%% to the below version. + +union_1([X | Xs], S) -> + union_1(Xs, add(X, S)); +union_1([], S) -> + S. + + +%% If the length of the first list is too small in comparison with the +%% size of the target set, this version spends too much time scanning +%% the element list of the target set for possible membership, compared +%% with the above version. + +%% Some notes on sequential scanning of ordered lists +%% +%% 1) We want to put the equality case last, if we can assume that the +%% probability for overlapping elements is relatively low on average. +%% Doing this also allows us to completely skip the (arithmetic) +%% equality test, since the term order is arithmetically total. +%% +%% 2) We always test for `smaller than' first, i.e., whether the head of +%% the left list is smaller than the head of the right list, and if the +%% `greater than' test should instead turn out to be true, we switch +%% left and right arguments in the recursive call under the assumption +%% that the same is likely to apply to the next element also, +%% statistically reducing the number of failed tests and automatically +%% adapting to cases of lists having very different lengths. This saves +%% 10-40% of the traversation time compared to a "fixed" strategy, +%% depending on the sizes and contents of the lists. +%% +%% 3) A tail recursive version using `lists:reverse/2' is about 5-10% +%% faster than a plain recursive version using the stack, for lists of +%% more than about 20 elements and small stack frames. For very short +%% lists, however (length < 10), the stack version can be several times +%% faster. As stack frames grow larger, the advantages of using +%% `reverse' could get greater. + +union_2(Xs, Ys, S) -> + union_2(Xs, Ys, [], S). % S is the sum of the sizes here + +union_2([X | Xs1], [Y | _] = Ys, As, S) when X < Y -> + union_2(Xs1, Ys, [X | As], S); +union_2([X | _] = Xs, [Y | Ys1], As, S) when X > Y -> + union_2(Ys1, Xs, [Y | As], S); +union_2([X | Xs1], [_ | Ys1], As, S) -> + union_2(Xs1, Ys1, [X | As], S - 1); +union_2([], Ys, As, S) -> + {S, balance_revlist(push(Ys, As), S)}; +union_2(Xs, [], As, S) -> + {S, balance_revlist(push(Xs, As), S)}. + +push([X | Xs], As) -> + push(Xs, [X | As]); +push([], As) -> + As. + +balance_revlist(L, S) -> + {T, _} = balance_revlist_1(L, S), + T. + +balance_revlist_1(L, S) when S > 1 -> + Sm = S - 1, + S2 = Sm div 2, + S1 = Sm - S2, + {T2, [K | L1]} = balance_revlist_1(L, S1), + {T1, L2} = balance_revlist_1(L1, S2), + T = {K, T1, T2}, + {T, L2}; +balance_revlist_1([Key | L], 1) -> + {{Key, nil, nil}, L}; +balance_revlist_1(L, 0) -> + {nil, L}. + +-spec union([gb_set()]) -> gb_set(). + +union([S | Ss]) -> + union_list(S, Ss); +union([]) -> empty(). + +union_list(S, [S1 | Ss]) -> + union_list(union(S, S1), Ss); +union_list(S, []) -> S. + + +%% The rest is modelled on the above. + +-spec intersection(gb_set(), gb_set()) -> gb_set(). + +intersection({N1, T1}, {N2, T2}) when N2 < N1 -> + intersection(to_list_1(T2), N2, T1, N1); +intersection({N1, T1}, {N2, T2}) -> + intersection(to_list_1(T1), N1, T2, N2). + +intersection(L, _N1, T2, N2) when N2 < 10 -> + intersection_2(L, to_list_1(T2)); +intersection(L, N1, T2, N2) -> + X = N1 * round(?c * math:log(N2)), + if N2 < X -> + intersection_2(L, to_list_1(T2)); + true -> + intersection_1(L, T2) + end. + +%% We collect the intersecting elements in an accumulator list and count +%% them at the same time so we can balance the list afterwards. + +intersection_1(Xs, T) -> + intersection_1(Xs, T, [], 0). + +intersection_1([X | Xs], T, As, N) -> + case is_member_1(X, T) of + true -> + intersection_1(Xs, T, [X | As], N + 1); + false -> + intersection_1(Xs, T, As, N) + end; +intersection_1([], _, As, N) -> + {N, balance_revlist(As, N)}. + + +intersection_2(Xs, Ys) -> + intersection_2(Xs, Ys, [], 0). + +intersection_2([X | Xs1], [Y | _] = Ys, As, S) when X < Y -> + intersection_2(Xs1, Ys, As, S); +intersection_2([X | _] = Xs, [Y | Ys1], As, S) when X > Y -> + intersection_2(Ys1, Xs, As, S); +intersection_2([X | Xs1], [_ | Ys1], As, S) -> + intersection_2(Xs1, Ys1, [X | As], S + 1); +intersection_2([], _, As, S) -> + {S, balance_revlist(As, S)}; +intersection_2(_, [], As, S) -> + {S, balance_revlist(As, S)}. + +-spec intersection([gb_set()]) -> gb_set(). + +intersection([S | Ss]) -> + intersection_list(S, Ss). + +intersection_list(S, [S1 | Ss]) -> + intersection_list(intersection(S, S1), Ss); +intersection_list(S, []) -> S. + +-spec is_disjoint(gb_set(), gb_set()) -> boolean(). + +is_disjoint({N1, T1}, {N2, T2}) when N1 < N2 -> + is_disjoint_1(T1, T2); +is_disjoint({_, T1}, {_, T2}) -> + is_disjoint_1(T2, T1). + +is_disjoint_1({K1, Smaller1, Bigger}, {K2, Smaller2, _}=Tree) when K1 < K2 -> + not is_member_1(K1, Smaller2) andalso + is_disjoint_1(Smaller1, Smaller2) andalso + is_disjoint_1(Bigger, Tree); +is_disjoint_1({K1, Smaller, Bigger1}, {K2, _, Bigger2}=Tree) when K1 > K2 -> + not is_member_1(K1, Bigger2) andalso + is_disjoint_1(Bigger1, Bigger2) andalso + is_disjoint_1(Smaller, Tree); +is_disjoint_1({_K1, _, _}, {_K2, _, _}) -> %K1 == K2 + false; +is_disjoint_1(nil, _) -> + true; +is_disjoint_1(_, nil) -> + true. + +%% Note that difference is not symmetric. We don't use `delete' here, +%% since the GB-trees implementation does not rebalance after deletion +%% and so we could end up with very unbalanced trees indeed depending on +%% the sets. Therefore, we always build a new tree, and thus we need to +%% traverse the whole element list of the left operand. + +-spec subtract(gb_set(), gb_set()) -> gb_set(). + +subtract(S1, S2) -> + difference(S1, S2). + +-spec difference(gb_set(), gb_set()) -> gb_set(). + +difference({N1, T1}, {N2, T2}) -> + difference(to_list_1(T1), N1, T2, N2). + +difference(L, N1, T2, N2) when N2 < 10 -> + difference_2(L, to_list_1(T2), N1); +difference(L, N1, T2, N2) -> + X = N1 * round(?c * math:log(N2)), + if N2 < X -> + difference_2(L, to_list_1(T2), N1); + true -> + difference_1(L, T2) + end. + + +difference_1(Xs, T) -> + difference_1(Xs, T, [], 0). + +difference_1([X | Xs], T, As, N) -> + case is_member_1(X, T) of + true -> + difference_1(Xs, T, As, N); + false -> + difference_1(Xs, T, [X | As], N + 1) + end; +difference_1([], _, As, N) -> + {N, balance_revlist(As, N)}. + + +difference_2(Xs, Ys, S) -> + difference_2(Xs, Ys, [], S). % S is the size of the left set + +difference_2([X | Xs1], [Y | _] = Ys, As, S) when X < Y -> + difference_2(Xs1, Ys, [X | As], S); +difference_2([X | _] = Xs, [Y | Ys1], As, S) when X > Y -> + difference_2(Xs, Ys1, As, S); +difference_2([_X | Xs1], [_Y | Ys1], As, S) -> + difference_2(Xs1, Ys1, As, S - 1); +difference_2([], _Ys, As, S) -> + {S, balance_revlist(As, S)}; +difference_2(Xs, [], As, S) -> + {S, balance_revlist(push(Xs, As), S)}. + + +%% Subset testing is much the same thing as set difference, but +%% without the construction of a new set. + +-spec is_subset(gb_set(), gb_set()) -> boolean(). + +is_subset({N1, T1}, {N2, T2}) -> + is_subset(to_list_1(T1), N1, T2, N2). + +is_subset(L, _N1, T2, N2) when N2 < 10 -> + is_subset_2(L, to_list_1(T2)); +is_subset(L, N1, T2, N2) -> + X = N1 * round(?c * math:log(N2)), + if N2 < X -> + is_subset_2(L, to_list_1(T2)); + true -> + is_subset_1(L, T2) + end. + + +is_subset_1([X | Xs], T) -> + case is_member_1(X, T) of + true -> + is_subset_1(Xs, T); + false -> + false + end; +is_subset_1([], _) -> + true. + + +is_subset_2([X | _], [Y | _]) when X < Y -> + false; +is_subset_2([X | _] = Xs, [Y | Ys1]) when X > Y -> + is_subset_2(Xs, Ys1); +is_subset_2([_ | Xs1], [_ | Ys1]) -> + is_subset_2(Xs1, Ys1); +is_subset_2([], _) -> + true; +is_subset_2(_, []) -> + false. + + +%% For compatibility with `sets': + +-spec is_set(term()) -> boolean(). + +is_set({0, nil}) -> true; +is_set({N, {_, _, _}}) when is_integer(N), N >= 0 -> true; +is_set(_) -> false. + +-spec filter(fun((term()) -> boolean()), gb_set()) -> gb_set(). + +filter(F, S) -> + from_ordset([X || X <- to_list(S), F(X)]). + +-spec fold(fun((term(), term()) -> term()), term(), gb_set()) -> term(). + +fold(F, A, {_, T}) when is_function(F, 2) -> + fold_1(F, A, T). + +fold_1(F, Acc0, {Key, Small, Big}) -> + Acc1 = fold_1(F, Acc0, Small), + Acc = F(Key, Acc1), + fold_1(F, Acc, Big); +fold_1(_, Acc, _) -> + Acc. |