From 97ab480df55cf574ab42a87b6927ef5bba83000e Mon Sep 17 00:00:00 2001 From: Patrik Nyblom Date: Mon, 10 May 2010 16:27:58 +0200 Subject: Add documentation for binary module Correct behaviour of copy/2 witn 0 copies. --- lib/stdlib/doc/src/Makefile | 1 + lib/stdlib/doc/src/binary.xml | 729 ++++++++++++++++++++++++++++++++ lib/stdlib/doc/src/ref_man.xml | 1 + lib/stdlib/test/binary_module_SUITE.erl | 5 +- lib/stdlib/test/binref.erl | 2 +- 5 files changed, 735 insertions(+), 3 deletions(-) create mode 100644 lib/stdlib/doc/src/binary.xml (limited to 'lib') diff --git a/lib/stdlib/doc/src/Makefile b/lib/stdlib/doc/src/Makefile index 13b9b2ff18..353c1b90b9 100644 --- a/lib/stdlib/doc/src/Makefile +++ b/lib/stdlib/doc/src/Makefile @@ -40,6 +40,7 @@ XML_REF3_FILES = \ array.xml \ base64.xml \ beam_lib.xml \ + binary.xml \ c.xml \ calendar.xml \ dets.xml \ diff --git a/lib/stdlib/doc/src/binary.xml b/lib/stdlib/doc/src/binary.xml new file mode 100644 index 0000000000..05ec4406c6 --- /dev/null +++ b/lib/stdlib/doc/src/binary.xml @@ -0,0 +1,729 @@ + + + + +
+ + 2009 + 2010 + Ericsson AB, All Rights Reserved + + + The contents of this file are subject to the Erlang Public License, + Version 1.1, (the "License"); you may not use this file except in + compliance with the License. You should have received a copy of the + Erlang Public License along with this software. If not, it can be + retrieved on line at http://www.erlang.org/. + + Software distributed under the License is distributed on an "AS IS" + basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See + the License for the specific language governing rights and limitations + under the License. + + The Initial Developer of the Original Code is Ericsson AB. + + + binary + Patrik Nyblom + Kenneth Lundin + 1 + + + 2010-05-05 + A + binary.xml +
+ binary + Library for handling binary data + + +

This module contains functions for manipulating byte-oriented + binaries. Although the majority of functions could be implemented + using bit-syntax, the functions in this library are highly + optimized and are expected to either execute faster or consume + less memory (or both) than a counterpart written in pure Erlang.

+ +

The module is implemented according to the EEP (Erlang Enhancement Proposal) 31.

+ + +

+ The library handles byte-oriented data. Bitstrings that are not + binaries (do not contain whole octets of bits) will result in a badarg + exception being thrown from any of the functions in this + module. +

+
+ + +
+
+ DATA TYPES + + cp() + - Opaque data-type representing a compiled search-pattern. Guaranteed to be a tuple() + to allow programs to distinguish it from non precompiled search patterns. + + + part() = {Start,Length} + Start = int() + Length = int() + - A representation of a part (or range) in a binary. Start is a + zero-based offset into a binary() and Length is the length of + that part. As input to functions in this module, a reverse + part specification is allowed, constructed with a negative + Length, so that the part of the binary begins at Start + + Length and is -Length long. This is useful for referencing the + last N bytes of a binary as {size(Binary), -N}. The functions + in this module always return part()'s with positive Length. +
+ + + at(Subject, Pos) -> int() + Returns the byte at a specific position in a binary + + Subject = binary() + Pos = int() >= 0 + + + +

Returns the byte at position Pos (zero-based) in the binary + Subject as an integer. If Pos >= byte_size(Subject), + a badarg + exception is raised.

+ +
+
+ + bin_to_list(Subject) -> list() + Convert a binary to a list of integers + + Subject = binary() + + +

The same as bin_to_list(Subject,{0,byte_size(Subject)}).

+
+
+ + bin_to_list(Subject, PosLen) -> list() + Convert a binary to a list of integers + + Subject = binary() + PosLen = part() + + + +

Converts Subject to a list of int()s, each representing + the value of one byte. The part() denotes which part of the + binary() to convert. Example:

+ + +1> binary:bin_to_list(<<"erlang">>,{1,3}). +"rla" +%% or [114,108,97] in list notation. + +

If PosLen in any way references outside the binary, a badarg exception is raised.

+
+
+ + bin_to_list(Subject, Pos, Len) -> list() + Convert a binary to a list of integers + + Subject = binary() + Pos = int() + Len = int() + + +

The same as bin_to_list(Subject,{Pos,Len}).

+
+
+ + compile_pattern(Pattern) -> cp() + Pre-compiles a binary search pattern + + Pattern = binary() | [ binary() ] + + + +

Builds an internal structure representing a compilation of a + search-pattern, later to be used in the match/3, + matches/3, + split/3 or + replace/4 + functions. The cp() returned is guaranteed to be a + tuple() to allow programs to distinguish it from non + pre-compiled search patterns.

+ +

When a list of binaries is given, it denotes a set of + alternative binaries to search for. I.e., if + [<<"functional">>,<<"programming">>] + is given as Pattern, this + means "either <<"functional">> or + <<"programming">>". The pattern is a set of + alternatives; when only a single binary is given, the set has + only one element. The order of alternatives in a pattern is not significant.

+ +

The list of binaries used for search alternatives shall be flat and proper.

+ +

If Pattern is not a binary or a flat proper list of binaries with length > 0, + a badarg exception will be raised.

+ +
+
+ + copy(Subject) -> binary() + Creates a duplicate of a binary + + Subject = binary() + + +

The same as copy(Subject, 1).

+
+
+ + copy(Subject,N) -> binary() + Duplicates a binary N times and creates a new binary + + Subject = binary() + N = int() >= 0 + + +

Creates a binary with the content of Subject duplicated N times.

+ +

This function will always create a new binary, even if N = + 1. By using copy/1 on a binary referencing a larger binary, one + might free up the larger binary for garbage collection.

+ + +

By deliberately copying a single binary to avoid referencing + a larger binary, one might, instead of freeing up the larger + binary for later garbage collection, create much more binary + data than needed. Sharing binary data is usually good. Only in + special cases, when small parts reference large binaries and the + large binaries are no longer used in any process, deliberate + copying might be a good idea.

+ +

If N < 0, a badarg exception is raised.

+
+
+ + decode_unsigned(Subject) -> Unsigned + Decode a whole binary into an integer of arbitrary size + + Subject = binary() + Unsigned = int() >= 0 + + +

The same as decode_unsigned(Subject,big).

+
+
+ + decode_unsigned(Subject, Endianess) -> Unsigned + Decode a whole binary into an integer of arbitrary size + + Subject = binary() + Endianess = big | little + Unsigned = int() >= 0 + + + +

Converts the binary digit representation, in big or little + endian, of a positive integer in Subject to an Erlang int().

+ +

Example:

+ + +1> binary:decode_unsigned(<<169,138,199>>,big). +11111111 + +
+
+ + encode_unsigned(Unsigned) -> binary() + Encodes an unsigned integer into the minimal binary + + Unsigned = int() >= 0 + + +

The same as encode_unsigned(Unsigned,big).

+
+
+ + encode_unsigned(Unsigned,Endianess) -> binary() + Encodes an unsigned integer into the minimal binary + + Unsigned = int() >= 0 + Endianess = big | little + + + +

Converts a positive integer to the smallest possible + representation in a binary digit representation, either big + or little endian.

+ +

Example:

+ + +1> binary:encode_unsigned(11111111,big). +<<169,138,199>> + +
+
+ + first(Subject) -> int() + Returns the first byte of a binary + + Subject = binary() + + + +

Returns the first byte of the binary Subject as an integer. If the + size of Subject is zero, a badarg exception is raised.

+ +
+
+ + last(Subject) -> int() + Returns the last byte of a binary + + Subject = binary() + + + +

Returns the last byte of the binary Subject as an integer. If the + size of Subject is zero, a badarg exception is raised.

+ +
+
+ + list_to_bin(ByteList) -> binary() + Convert a list of integers and binaries to a binary + + ByteList = iodata() (see module erlang) + + +

Works exactly as erlang:list_to_binary/1, added for completeness.

+
+
+ + longest_common_prefix(Binaries) -> int() + Returns length of longest common prefix for a set of binaries + + Binaries = [ binary() ] + + + +

Returns the length of the longest common prefix of the + binaries in the list Binaries. Example:

+ + +1> binary:longest_common_prefix([<<"erlang">>,<<"ergonomy">>]). +2 +2> binary:longest_common_prefix([<<"erlang">>,<<"perl">>]). +0 + + +

If Binaries is not a flat list of binaries, a badarg exception is raised.

+
+
+ + longest_common_suffix(Binaries) -> int() + Returns length of longest common suffix for a set of binaries + + Binaries = [ binary() ] + + + +

Returns the length of the longest common suffix of the + binaries in the list Binaries. Example:

+ + +1> binary:longest_common_suffix([<<"erlang">>,<<"fang">>]). +3 +2> binary:longest_common_suffix([<<"erlang">>,<<"perl">>]). +0 + + +

If Binaries is not a flat list of binaries, a badarg exception is raised.

+ +
+
+ + match(Subject, Pattern) -> Found | nomatch + Searches for the first match of a pattern in a binary + + Subject = binary() + Pattern = binary() | [ binary() ] | cp() + Found = part() + + +

The same as match(Subject, Pattern, []).

+
+
+ + match(Subject,Pattern,Options) -> Found | nomatch + Searches for the first match of a pattern in a binary + + Subject = binary() + Pattern = binary() | [ binary() ] | cp() + Found = part() + Options = [ Option ] + Option = {scope, part()} + + + +

Searches for the first occurrence of Pattern in Subject and + returns the position and length.

+ +

The function will return {Pos,Length} for the binary + in Pattern starting at the lowest position in + Subject. Example:

+ + +1> binary:match(<<"abcde">>, [<<"bcde">>,<<"cd">>],[]). +{1,4} + + +

Even though <<"cd">> ends before + <<"bcde">>, <<"bcde">> + begins first and is therefore the first match. If two + overlapping matches begin at the same position, the longest is + returned.

+ +

Summary of the options:

+ + + {scope, {Start, Length}} +

Only the given part is searched. Return values still have + offsets from the beginning of Subject. A negative Length is + allowed as described in the DATA TYPES section of this manual.

+
+ +

If none of the strings in + Pattern is found, the atom nomatch is returned.

+ +

For a description of Pattern, see + compile_pattern/1.

+ +

If {scope, {Start,Length}} is given in the options + such that Start is larger than the size of + Subject, Start + Length is less than zero or + Start + Length is larger than the size of + Subject, a badarg exception is raised.

+ +
+
+ + matches(Subject, Pattern) -> Found + Searches for all matches of a pattern in a binary + + Subject = binary() + Pattern = binary() | [ binary() ] | cp() + Found = [ part() ] | [] + + +

The same as matches(Subject, Pattern, []).

+
+
+ + matches(Subject,Pattern,Options) -> Found + Searches for all matches of a pattern in a binary + + Subject = binary() + Pattern = binary() | [ binary() ] | cp() + Found = [ part() ] | [] + Options = [ Option ] + Option = {scope, part()} + + + +

Works like match, but the Subject is searched until + exhausted and a list of all non-overlapping parts matching + Pattern is returned (in order).

+ +

The first and longest match is preferred to a shorter, + which is illustrated by the following example:

+ + +1> binary:matches(<<"abcde">>, + [<<"bcde">>,<<"bc">>,<<"de">>],[]). +[{1,4}] + + +

The result shows that <<"bcde">> is selected instead of the + shorter match <<"bc">> (which would have given rise to one + more match, <<"de">>). This corresponds to the behavior of POSIX + regular expressions (and programs like awk), but is not + consistent with alternative matches in re (and Perl), where + instead lexical ordering in the search pattern selects which + string matches.

+ +

If none of the strings in pattern is found, an empty list is returned.

+ +

For a description of Pattern, see compile_pattern/1 and for a + description of available options, see match/3.

+ +

If {scope, {Start,Length}} is given in the options such that + Start is larger than the size of Subject, Start + Length is + less than zero or Start + Length is larger than the size of + Subject, a badarg exception is raised.

+ +
+
+ + part(Subject, PosLen) -> binary() + Extracts a part of a binary + + Subject = binary() + PosLen = part() + + + +

Extracts the part of the binary Subject described by PosLen.

+ +

Negative length can be used to extract bytes at the end of a binary:

+ + +1> Bin = <<1,2,3,4,5,6,7,8,9,10>>. +2> binary:part(Bin,{byte_size(Bin), -5}). +<<6,7,8,9,10>> + + + +

part/2 and part/3 are also available in the + erlang module under the names binary_part/2 and + binary_part/3. Those BIFs are allowed in guard tests.

+
+ +

If PosLen in any way references outside the binary, a badarg exception + is raised.

+ +
+
+ + part(Subject, Pos, Len) -> binary() + Extracts a part of a binary + + Subject = binary() + Pos = int() + Len = int() + + +

The same as part(Subject, {Pos, Len}).

+
+
+ + referenced_byte_size(binary()) -> int() + Determines the size of the actual binary pointed out by a sub-binary + + +

If a binary references a larger binary (often described as + being a sub-binary), it can be useful to get the size of the + actual referenced binary. This function can be used in a program + to trigger the use of copy/1. By copying a binary, one might + dereference the original, possibly large, binary which a smaller + binary is a reference to.

+ +

Example:

+ + +store(Binary, GBSet) -> + NewBin = + case binary:referenced_byte_size(Binary) of + Large when Large > 2 * byte_size(Binary) -> + binary:copy(Binary); + _ -> + Binary + end, + gb_sets:insert(NewBin,GBSet). + + +

In this example, we chose to copy the binary content before + inserting it in the gb_set() if it references a binary more than + twice the size of the data we're going to keep. Of course + different rules for when copying will apply to different + programs.

+ +

Binary sharing will occur whenever binaries are taken apart. + This is the fundamental reason why binaries are fast: + decomposition can always be done with O(1) complexity. In rare + circumstances this data sharing is however undesirable, which is why this + function together with copy/1 might be useful when optimizing + for memory use.

+ +

Example of binary sharing:

+ + +1> A = binary:copy(<<1>>,100). +<<1,1,1,1,1 ... +2> byte_size(A). +100 +3> binary:referenced_byte_size(A). +100 +4> <<_:10/binary,B:10/binary,_/binary>> = A. +<<1,1,1,1,1 ... +5> byte_size(B). +10 +6> binary:referenced_byte_size(B). +100 + + + +

Binary data is shared among processes. If another process + still references the larger binary, copying the part this + process uses only consumes more memory and will not free up the + larger binary for garbage collection. Use this kind of intrusive + functions with extreme care, and only if a real problem is + detected.

+
+ +
+
+ + replace(Subject,Pattern,Replacement) -> Result + Replaces bytes in a binary according to a pattern + + Subject = binary() + Pattern = binary() | [ binary() ] | cp() + Replacement = binary() + Result = binary() + + +

The same as replace(Subject,Pattern,Replacement,[]).

+
+
+ + replace(Subject,Pattern,Replacement,Options) -> Result + Replaces bytes in a binary according to a pattern + + Subject = binary() + Pattern = binary() | [ binary() ] | cp() + Replacement = binary() + Result = binary() + Options = [ Option ] + Option = global | {scope, part()} | {insert_replaced, InsPos} + InsPos = OnePos | [ OnePos ] + OnePos = int() =< byte_size(Replacement) + + + +

Constructs a new binary by replacing the parts in + Subject matching Pattern with the content of + Replacement.

+ +

If the matching sub-part of Subject giving rise to the + replacement is to be inserted in the result, the option + {insert_replaced, InsPos} will insert the matching part into + Replacement at the given position (or positions) before actually + inserting Replacement into the Subject. Example:

+ + +1> binary:replace(<<"abcde">>,<<"b">>,<<"[]">>,[{insert_replaced,1}]). +<<"a[b]cde">> +2> binary:replace(<<"abcde">>,[<<"b">>,<<"d">>],<<"[]">>, + [global,{insert_replaced,1}]). +<<"a[b]c[d]e">> +3> binary:replace(<<"abcde">>,[<<"b">>,<<"d">>],<<"[]">>, + [global,{insert_replaced,[1,1]}]). +<<"a[bb]c[dd]e">> +4> binary:replace(<<"abcde">>,[<<"b">>,<<"d">>],<<"[-]">>, + [global,{insert_replaced,[1,2]}]). +<<"a[b-b]c[d-d]e">> + + +

If any position given in InsPos is greater than the size of the replacement binary, a badarg exception is raised.

+ +

The options global and {scope, part()} work as for split/3. The return type is always a binary().

+ +

For a description of Pattern, see compile_pattern/1.

+
+
+ + split(Subject,Pattern) -> Parts + Splits a binary according to a pattern + + Subject = binary() + Pattern = binary() | [ binary() ] | cp() + Parts = [ binary() ] + + +

The same as split(Subject, Pattern, []).

+
+
+ + split(Subject,Pattern,Options) -> Parts + Splits a binary according to a pattern + + Subject = binary() + Pattern = binary() | [ binary() ] | cp() + Parts = [ binary() ] + Options = [ Option ] + Option = {scope, part()} | trim | global + + + +

Splits Subject into a list of binaries based on Pattern. If + the option global is not given, only the first occurrence of + Pattern in Subject will give rise to a split.

+ +

The parts of Pattern actually found in Subject are not included in the result.

+ +

Example:

+ + +1> binary:split(<<1,255,4,0,0,0,2,3>>, [<<0,0,0>>,<<2>>],[]). +[<<1,255,4>>, <<2,3>>] +2> binary:split(<<0,1,0,0,4,255,255,9>>, [<<0,0>>, <<255,255>>],[global]). +[<<0,1>>,<<4>>,<<9>>] + + +

Summary of options:

+ + + {scope, part()} + +

Works as in match/3 and + matches/3. Note that + this only defines the scope of the search for matching strings, + it does not cut the binary before splitting. The bytes before + and after the scope will be kept in the result. See example + below.

+ + trim + +

Removes trailing empty parts of the result (as does trim in re:split/3)

+ + global + +

Repeats the split until the Subject is + exhausted. Conceptually the global option makes split work on + the positions returned by matches/3, + while it normally + works on the position returned by + match/3.

+ +
+ +

Example of the difference between a scope and taking the + binary apart before splitting:

+ + +1> binary:split(<<"banana">>,[<<"a">>],[{scope,{2,3}}]). +[<<"ban">>,<<"na">>] +2> binary:split(binary:part(<<"banana">>,{2,3}),[<<"a">>],[]). +[<<"n">>,<<"n">>] + + +

The return type is always a list of binaries that are all + referencing Subject. This means that the data in Subject is not + actually copied to new binaries and that Subject cannot be + garbage collected until the results of the split are no longer + referenced.

+ +

For a description of Pattern, see compile_pattern/1.

+ +
+
+
+
diff --git a/lib/stdlib/doc/src/ref_man.xml b/lib/stdlib/doc/src/ref_man.xml index f6ae368e92..de7aeb2274 100644 --- a/lib/stdlib/doc/src/ref_man.xml +++ b/lib/stdlib/doc/src/ref_man.xml @@ -37,6 +37,7 @@ + diff --git a/lib/stdlib/test/binary_module_SUITE.erl b/lib/stdlib/test/binary_module_SUITE.erl index 028b7f0f17..16ed9a2c26 100644 --- a/lib/stdlib/test/binary_module_SUITE.erl +++ b/lib/stdlib/test/binary_module_SUITE.erl @@ -727,9 +727,10 @@ copy(Config) when is_list(Config) -> ?line RS = random_string({1,10000}), ?line RS = RS2 = binary:copy(RS), ?line false = erts_debug:same(RS,RS2), - ?line badarg = ?MASK_ERROR(binary:copy(<<1,2,3>>,0)), + ?line <<>> = ?MASK_ERROR(binary:copy(<<1,2,3>>,0)), ?line badarg = ?MASK_ERROR(binary:copy(<<1,2,3:3>>,2)), - ?line badarg = ?MASK_ERROR(binary:copy(<<>>,0)), + ?line badarg = ?MASK_ERROR(binary:copy([],0)), + ?line <<>> = ?MASK_ERROR(binary:copy(<<>>,0)), ?line badarg = ?MASK_ERROR(binary:copy(<<1,2,3>>,1.0)), ?line badarg = ?MASK_ERROR(binary:copy(<<1,2,3>>, 16#FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF)), diff --git a/lib/stdlib/test/binref.erl b/lib/stdlib/test/binref.erl index af79c8fa09..6d96736ef3 100644 --- a/lib/stdlib/test/binref.erl +++ b/lib/stdlib/test/binref.erl @@ -465,7 +465,7 @@ copy(Subject) -> copy(Subject,1). copy(Subject,N) -> try - true = is_integer(N) and (N > 0) and is_binary(Subject), % Badarg, not function clause + true = is_integer(N) and (N >= 0) and is_binary(Subject), % Badarg, not function clause erlang:list_to_binary(lists:duplicate(N,Subject)) catch _:_ -> -- cgit v1.2.3