diff options
Diffstat (limited to 'lib/stdlib/src/gb_trees.erl')
-rw-r--r-- | lib/stdlib/src/gb_trees.erl | 515 |
1 files changed, 515 insertions, 0 deletions
diff --git a/lib/stdlib/src/gb_trees.erl b/lib/stdlib/src/gb_trees.erl new file mode 100644 index 0000000000..d37786a100 --- /dev/null +++ b/lib/stdlib/src/gb_trees.erl @@ -0,0 +1,515 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2001-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% +%% ===================================================================== +%% General Balanced Trees - highly efficient dictionaries. +%% +%% Copyright (C) 1999-2001 Sven-Olof Nystr�m, Richard Carlsson +%% +%% An efficient implementation of Prof. Arne Andersson's General +%% Balanced Trees. These have no storage overhead compared to plain +%% unbalanced binary trees, and their performance is in general better +%% than AVL trees. +%% --------------------------------------------------------------------- +%% Operations: +%% +%% - empty(): returns empty tree. +%% +%% - is_empty(T): returns 'true' if T is an empty tree, and 'false' +%% otherwise. +%% +%% - size(T): returns the number of nodes in the tree as an integer. +%% Returns 0 (zero) if the tree is empty. +%% +%% - lookup(X, T): looks up key X in tree T; returns {value, V}, or +%% `none' if the key is not present. +%% +%% - get(X, T): retreives the value stored with key X in tree T. Assumes +%% that the key is present in the tree. +%% +%% - insert(X, V, T): inserts key X with value V into tree T; returns +%% the new tree. Assumes that the key is *not* present in the tree. +%% +%% - update(X, V, T): updates key X to value V in tree T; returns the +%% new tree. Assumes that the key is present in the tree. +%% +%% - enter(X, V, T): inserts key X with value V into tree T if the key +%% is not present in the tree, otherwise updates key X to value V in +%% T. Returns the new tree. +%% +%% - delete(X, T): removes key X from tree T; returns new tree. Assumes +%% that the key is present in the tree. +%% +%% - delete_any(X, T): removes key X from tree T if the key is present +%% in the tree, otherwise does nothing; returns new tree. +%% +%% - balance(T): rebalances tree T. Note that this is rarely necessary, +%% but may be motivated when a large number of entries have been +%% deleted from the tree without further insertions. Rebalancing could +%% then be forced in order to minimise lookup times, since deletion +%% only does not rebalance the tree. +%% +%% - is_defined(X, T): returns `true' if key X is present in tree T, and +%% `false' otherwise. +%% +%% - keys(T): returns an ordered list of all keys in tree T. +%% +%% - values(T): returns the list of values for all keys in tree T, +%% sorted by their corresponding keys. Duplicates are not removed. +%% +%% - to_list(T): returns an ordered list of {Key, Value} pairs for all +%% keys in tree T. +%% +%% - from_orddict(L): turns an ordered list L of {Key, Value} pairs into +%% a tree. The list must not contain duplicate keys. +%% +%% - smallest(T): returns {X, V}, where X is the smallest key in tree T, +%% and V is the value associated with X in T. Assumes that the tree T +%% is nonempty. +%% +%% - largest(T): returns {X, V}, where X is the largest key in tree T, +%% and V is the value associated with X in T. Assumes that the tree T +%% is nonempty. +%% +%% - take_smallest(T): returns {X, V, T1}, where X is the smallest key +%% in tree T, V is the value associated with X in T, and T1 is the +%% tree T with key X deleted. Assumes that the tree T is nonempty. +%% +%% - take_largest(T): returns {X, V, T1}, where X is the largest key +%% in tree T, V is the value associated with X in T, and T1 is the +%% tree T with key X deleted. Assumes that the tree T is nonempty. +%% +%% - iterator(T): returns an iterator that can be used for traversing +%% the entries of tree T; see `next'. The implementation of this is +%% very efficient; traversing the whole tree using `next' is only +%% slightly slower than getting the list of all elements using +%% `to_list' and traversing that. The main advantage of the iterator +%% approach is that it does not require the complete list of all +%% elements to be built in memory at one time. +%% +%% - next(S): returns {X, V, S1} where X is the smallest key referred to +%% by the iterator S, and S1 is the new iterator to be used for +%% traversing the remaining entries, or the atom `none' if no entries +%% remain. +%% +%% - map(F, T): maps the function F(K, V) -> V' to all key-value pairs +%% of the tree T and returns a new tree T' with the same set of keys +%% as T and the new set of values V'. + +-module(gb_trees). + +-export([empty/0, is_empty/1, size/1, lookup/2, get/2, insert/3, + update/3, enter/3, delete/2, delete_any/2, balance/1, + is_defined/2, keys/1, values/1, to_list/1, from_orddict/1, + smallest/1, largest/1, take_smallest/1, take_largest/1, + iterator/1, next/1, map/2]). + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% Data structure: +%% - {Size, Tree}, where `Tree' is composed of nodes of the form: +%% - {Key, Value, Smaller, Bigger}, and the "empty tree" node: +%% - nil. +%% +%% I make no attempt to balance trees after deletions. Since deletions +%% don't increase the height of a tree, I figure this is OK. +%% +%% Original balance condition h(T) <= ceil(c * log(|T|)) has been +%% changed to the similar (but not quite equivalent) condition 2 ^ h(T) +%% <= |T| ^ c. I figure this should also be OK. +%% +%% Performance is comparable to the AVL trees in the Erlang book (and +%% faster in general due to less overhead); the difference is that +%% deletion works for my trees, but not for the book's trees. Behaviour +%% is logaritmic (as it should be). + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% Some macros. + +-define(p, 2). % It seems that p = 2 is optimal for sorted keys + +-define(pow(A, _), A * A). % correct with exponent as defined above. + +-define(div2(X), X bsr 1). + +-define(mul2(X), X bsl 1). + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% Some types. + +-type gb_tree_node() :: 'nil' | {_, _, _, _}. + +%% A declaration equivalent to the following is currently hard-coded +%% in erl_types.erl +%% +%% -opaque gb_tree() :: {non_neg_integer(), gb_tree_node()}. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-spec empty() -> gb_tree(). + +empty() -> + {0, nil}. + +-spec is_empty(gb_tree()) -> boolean(). + +is_empty({0, nil}) -> + true; +is_empty(_) -> + false. + +-spec size(gb_tree()) -> non_neg_integer(). + +size({Size, _}) when is_integer(Size), Size >= 0 -> + Size. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-spec lookup(term(), gb_tree()) -> 'none' | {'value', term()}. + +lookup(Key, {_, T}) -> + lookup_1(Key, T). + +%% The term order is an arithmetic total order, so we should not +%% test exact equality for the keys. (If we do, then it becomes +%% possible that neither `>', `<', nor `=:=' matches.) Testing '<' +%% and '>' first is statistically better than testing for +%% equality, and also allows us to skip the test completely in the +%% remaining case. + +lookup_1(Key, {Key1, _, Smaller, _}) when Key < Key1 -> + lookup_1(Key, Smaller); +lookup_1(Key, {Key1, _, _, Bigger}) when Key > Key1 -> + lookup_1(Key, Bigger); +lookup_1(_, {_, Value, _, _}) -> + {value, Value}; +lookup_1(_, nil) -> + none. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +%% This is a specialized version of `lookup'. + +-spec is_defined(term(), gb_tree()) -> boolean(). + +is_defined(Key, {_, T}) -> + is_defined_1(Key, T). + +is_defined_1(Key, {Key1, _, Smaller, _}) when Key < Key1 -> + is_defined_1(Key, Smaller); +is_defined_1(Key, {Key1, _, _, Bigger}) when Key > Key1 -> + is_defined_1(Key, Bigger); +is_defined_1(_, {_, _, _, _}) -> + true; +is_defined_1(_, nil) -> + false. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +%% This is a specialized version of `lookup'. + +-spec get(term(), gb_tree()) -> term(). + +get(Key, {_, T}) -> + get_1(Key, T). + +get_1(Key, {Key1, _, Smaller, _}) when Key < Key1 -> + get_1(Key, Smaller); +get_1(Key, {Key1, _, _, Bigger}) when Key > Key1 -> + get_1(Key, Bigger); +get_1(_, {_, Value, _, _}) -> + Value. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-spec update(term(), term(), gb_tree()) -> gb_tree(). + +update(Key, Val, {S, T}) -> + T1 = update_1(Key, Val, T), + {S, T1}. + +%% See `lookup' for notes on the term comparison order. + +update_1(Key, Value, {Key1, V, Smaller, Bigger}) when Key < Key1 -> + {Key1, V, update_1(Key, Value, Smaller), Bigger}; +update_1(Key, Value, {Key1, V, Smaller, Bigger}) when Key > Key1 -> + {Key1, V, Smaller, update_1(Key, Value, Bigger)}; +update_1(Key, Value, {_, _, Smaller, Bigger}) -> + {Key, Value, Smaller, Bigger}. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-spec insert(term(), term(), gb_tree()) -> gb_tree(). + +insert(Key, Val, {S, T}) when is_integer(S) -> + S1 = S+1, + {S1, insert_1(Key, Val, T, ?pow(S1, ?p))}. + +insert_1(Key, Value, {Key1, V, Smaller, Bigger}, S) when Key < Key1 -> + case insert_1(Key, Value, Smaller, ?div2(S)) of + {T1, H1, S1} -> + T = {Key1, V, T1, Bigger}, + {H2, S2} = count(Bigger), + H = ?mul2(erlang:max(H1, H2)), + SS = S1 + S2 + 1, + P = ?pow(SS, ?p), + if + H > P -> + balance(T, SS); + true -> + {T, H, SS} + end; + T1 -> + {Key1, V, T1, Bigger} + end; +insert_1(Key, Value, {Key1, V, Smaller, Bigger}, S) when Key > Key1 -> + case insert_1(Key, Value, Bigger, ?div2(S)) of + {T1, H1, S1} -> + T = {Key1, V, Smaller, T1}, + {H2, S2} = count(Smaller), + H = ?mul2(erlang:max(H1, H2)), + SS = S1 + S2 + 1, + P = ?pow(SS, ?p), + if + H > P -> + balance(T, SS); + true -> + {T, H, SS} + end; + T1 -> + {Key1, V, Smaller, T1} + end; +insert_1(Key, Value, nil, S) when S =:= 0 -> + {{Key, Value, nil, nil}, 1, 1}; +insert_1(Key, Value, nil, _S) -> + {Key, Value, nil, nil}; +insert_1(Key, _, _, _) -> + erlang:error({key_exists, Key}). + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-spec enter(term(), term(), gb_tree()) -> gb_tree(). + +enter(Key, Val, T) -> + case is_defined(Key, T) of + true -> + update(Key, Val, T); + false -> + insert(Key, Val, T) + end. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +count({_, _, nil, nil}) -> + {1, 1}; +count({_, _, Sm, Bi}) -> + {H1, S1} = count(Sm), + {H2, S2} = count(Bi), + {?mul2(erlang:max(H1, H2)), S1 + S2 + 1}; +count(nil) -> + {1, 0}. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-spec balance(gb_tree()) -> gb_tree(). + +balance({S, T}) -> + {S, balance(T, S)}. + +balance(T, S) -> + balance_list(to_list_1(T), S). + +balance_list(L, S) -> + {T, []} = balance_list_1(L, S), + T. + +balance_list_1(L, S) when S > 1 -> + Sm = S - 1, + S2 = Sm div 2, + S1 = Sm - S2, + {T1, [{K, V} | L1]} = balance_list_1(L, S1), + {T2, L2} = balance_list_1(L1, S2), + T = {K, V, T1, T2}, + {T, L2}; +balance_list_1([{Key, Val} | L], 1) -> + {{Key, Val, nil, nil}, L}; +balance_list_1(L, 0) -> + {nil, L}. + +-spec from_orddict([{_,_}]) -> gb_tree(). + +from_orddict(L) -> + S = length(L), + {S, balance_list(L, S)}. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-spec delete_any(term(), gb_tree()) -> gb_tree(). + +delete_any(Key, T) -> + case is_defined(Key, T) of + true -> + delete(Key, T); + false -> + T + end. + +%%% delete. Assumes that key is present. + +-spec delete(term(), gb_tree()) -> gb_tree(). + +delete(Key, {S, T}) when is_integer(S), S >= 0 -> + {S - 1, delete_1(Key, T)}. + +%% See `lookup' for notes on the term comparison order. + +delete_1(Key, {Key1, Value, Smaller, Larger}) when Key < Key1 -> + Smaller1 = delete_1(Key, Smaller), + {Key1, Value, Smaller1, Larger}; +delete_1(Key, {Key1, Value, Smaller, Bigger}) when Key > Key1 -> + Bigger1 = delete_1(Key, Bigger), + {Key1, Value, Smaller, Bigger1}; +delete_1(_, {_, _, Smaller, Larger}) -> + merge(Smaller, Larger). + +merge(Smaller, nil) -> + Smaller; +merge(nil, Larger) -> + Larger; +merge(Smaller, Larger) -> + {Key, Value, Larger1} = take_smallest1(Larger), + {Key, Value, Smaller, Larger1}. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-spec take_smallest(gb_tree()) -> {term(), term(), gb_tree()}. + +take_smallest({Size, Tree}) when is_integer(Size), Size >= 0 -> + {Key, Value, Larger} = take_smallest1(Tree), + {Key, Value, {Size - 1, Larger}}. + +take_smallest1({Key, Value, nil, Larger}) -> + {Key, Value, Larger}; +take_smallest1({Key, Value, Smaller, Larger}) -> + {Key1, Value1, Smaller1} = take_smallest1(Smaller), + {Key1, Value1, {Key, Value, Smaller1, Larger}}. + +-spec smallest(gb_tree()) -> {term(), term()}. + +smallest({_, Tree}) -> + smallest_1(Tree). + +smallest_1({Key, Value, nil, _Larger}) -> + {Key, Value}; +smallest_1({_Key, _Value, Smaller, _Larger}) -> + smallest_1(Smaller). + +-spec take_largest(gb_tree()) -> {term(), term(), gb_tree()}. + +take_largest({Size, Tree}) when is_integer(Size), Size >= 0 -> + {Key, Value, Smaller} = take_largest1(Tree), + {Key, Value, {Size - 1, Smaller}}. + +take_largest1({Key, Value, Smaller, nil}) -> + {Key, Value, Smaller}; +take_largest1({Key, Value, Smaller, Larger}) -> + {Key1, Value1, Larger1} = take_largest1(Larger), + {Key1, Value1, {Key, Value, Smaller, Larger1}}. + +-spec largest(gb_tree()) -> {term(), term()}. + +largest({_, Tree}) -> + largest_1(Tree). + +largest_1({Key, Value, _Smaller, nil}) -> + {Key, Value}; +largest_1({_Key, _Value, _Smaller, Larger}) -> + largest_1(Larger). + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-spec to_list(gb_tree()) -> [{term(), term()}]. + +to_list({_, T}) -> + to_list(T, []). + +to_list_1(T) -> to_list(T, []). + +to_list({Key, Value, Small, Big}, L) -> + to_list(Small, [{Key, Value} | to_list(Big, L)]); +to_list(nil, L) -> L. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-spec keys(gb_tree()) -> [term()]. + +keys({_, T}) -> + keys(T, []). + +keys({Key, _Value, Small, Big}, L) -> + keys(Small, [Key | keys(Big, L)]); +keys(nil, L) -> L. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-spec values(gb_tree()) -> [term()]. + +values({_, T}) -> + values(T, []). + +values({_Key, Value, Small, Big}, L) -> + values(Small, [Value | values(Big, L)]); +values(nil, L) -> L. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-spec iterator(gb_tree()) -> [gb_tree_node()]. + +iterator({_, T}) -> + iterator_1(T). + +iterator_1(T) -> + iterator(T, []). + +%% The iterator structure is really just a list corresponding to +%% the call stack of an in-order traversal. This is quite fast. + +iterator({_, _, nil, _} = T, As) -> + [T | As]; +iterator({_, _, L, _} = T, As) -> + iterator(L, [T | As]); +iterator(nil, As) -> + As. + +-spec next([gb_tree_node()]) -> 'none' | {term(), term(), [gb_tree_node()]}. + +next([{X, V, _, T} | As]) -> + {X, V, iterator(T, As)}; +next([]) -> + none. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +-spec map(fun((term(), term()) -> term()), gb_tree()) -> gb_tree(). + +map(F, {Size, Tree}) when is_function(F, 2) -> + {Size, map_1(F, Tree)}. + +map_1(_, nil) -> nil; +map_1(F, {K, V, Smaller, Larger}) -> + {K, F(K, V), map_1(F, Smaller), map_1(F, Larger)}. |