From 2c72e662bad11a41839780f86680d4bb05367c78 Mon Sep 17 00:00:00 2001
From: Dan Gudmundsson This module provides functions for string processing. A string in this module is represented by
+ This module operates on grapheme clusters. A grapheme cluster
+ is a user-perceived character, which can be represented by several
+ codepoints.
+
+ The string length of "ß↑e̊" is 3, even though it is represented by the
+ codepoints [223,8593,101,778] or the UTF-8 binary <<195,159,226,134,145,101,204,138>>.
+ Grapheme clusters for codepoints of class
+ Splitting and appending strings is to be done on grapheme clusters
+ borders.
+ There is no verification that the results of appending strings are
+ valid or normalized.
+
+ Most of the functions expect all input to be normalized to one form,
+ see for example
+ Language or locale specific handling of input is not considered
+ in any function.
+
+ The functions can crash for non-valid input strings. For example,
+ the functions expect UTF-8 binaries but not all functions
+ verify that all binaries are encoded correctly.
+
+ Unless otherwise specified the return value type is the same as
+ the input type. That is, binary input returns binary output,
+ list input returns a list output, and mixed input can return a
+ mixed output. This module has been reworked in Erlang/OTP 20 to
+ handle A user-perceived character, consisting of one or more
+ codepoints.
+ Converts Example:
+ Returns a string where any trailing Example:
+ Returns
+ If If By default,
+ Example:
+ Removes anything before
+ By default, Example: Returns Example:
+ Returns the number of grapheme clusters in Example:
+ Returns a list of lexemes in
+ Notice that, as shown in this example, two or more
+ adjacent separator grapheme clusters in Notice that Example:
+ Converts
+ Notice that function Example:
+ Returns the first codepoint in Example:
+ Returns the first grapheme cluster in Example: Returns lexeme number Example:
+ Pads By default, Example:
+ If Example:
+ Replaces Can be implemented as: Example:
+ Returns the reverse list of the grapheme clusters in Example: Returns a substring of By default, Example:
+ Splits Example: Takes characters from Example:
+ Converts Example: Argument Example: Argument Example:
+ Converts Example:
+ Returns a string, where leading or trailing, or both,
+ Default
+ Notice that Example:
+ Converts See also Example: Here follow the functions of the old API.
+ These functions only work on a list of Latin-1 characters.
+
+ The functions are kept for backward compatibility, but are
+ not recommended.
+ They will be deprecated in Erlang/OTP 21.
+ Any undocumented functions in Returns a string, where This function is Returns a string consisting of This function is Returns the index of the first occurrence of
This function is Concatenates
+ This function is Returns a string containing This function is Returns the length of the maximum initial segment of
This function is Example: Returns Returns a string with the elements of This function is Example:
+"abcd" is a valid string
+<<"abcd">> is a valid string
+["abcd"] is a valid string
+<<"abc..åäö"/utf8>> is a valid string
+<<"abc..åäö">> is NOT a valid string,
+ but a binary with Latin-1-encoded codepoints
+[<<"abc">>, "..åäö"] is a valid string
+[atom] is NOT a valid string
+
+"å" [229] or [97, 778]
+"e̊" [101, 778]
+
+1> string:trim(" sarah ").
+"sarah"
+2> string:trim(<<" sarah ">>).
+<<"sarah">>
+3> string:lexemes("foo bar", " ").
+["foo","bar"]
+4> string:lexemes(<<"foo bar">>, " ").
+[<<"foo">>,<<"bar">>]
+
+1> string:casefold("Ω and ẞ SHARP S").
+"ω and ss sharp s"
+
+182> string:chomp(<<"\nHello\n\n">>).
+<<"\nHello">>
+183> string:chomp("\nHello\r\r\n").
+"\nHello\r"
+
+1> string:equal("åäö", <<"åäö"/utf8>>).
+true
+2> string:equal("åäö", unicode:characters_to_nfd_binary("åäö")).
+false
+3> string:equal("åäö", unicode:characters_to_nfd_binary("ÅÄÖ"), true, nfc).
+true
+
+1> string:find("ab..cd..ef", ".").
+"..cd..ef"
+2> string:find(<<"ab..cd..ef">>, "..", trailing).
+<<"..ef">>
+3> string:find(<<"ab..cd..ef">>, "x", leading).
+nomatch
+4> string:find("ab..cd..ef", "x", trailing).
+nomatch
+
+1> string:is_empty("foo").
+false
+2> string:is_empty(["",<<>>]).
+true
+
+1> string:length("ß↑e̊").
+3
+2> string:length(<<195,159,226,134,145,101,204,138>>).
+3
+
+1> string:lexemes("abc de̊fxxghix jkl\r\nfoo", "x e" ++ [[$\r,$\n]]).
+["abc","de̊f","ghi","jkl","foo"]
+2> string:lexemes(<<"abc de̊fxxghix jkl\r\nfoo"/utf8>>, "x e" ++ [$\r,$\n]).
+[<<"abc">>,<<"de̊f"/utf8>>,<<"ghi">>,<<"jkl\r\nfoo">>]
+
+2> string:lowercase(string:uppercase("Michał")).
+"michał"
+
+1> string:next_codepoint(unicode:characters_to_binary("e̊fg")).
+[101|<<"̊fg"/utf8>>]
+
+1> string:next_grapheme(unicode:characters_to_binary("e̊fg")).
+["e̊"|<<"fg">>]
+
+1> string:nth_lexeme("abc.de̊f.ghiejkl", 3, ".e").
+"ghi"
+
+1> string:pad(<<"He̊llö"/utf8>>, 8).
+[<<72,101,204,138,108,108,195,182>>,32,32,32]
+2> io:format("'~ts'~n",[string:pad("He̊llö", 8, leading)]).
+' He̊llö'
+3> io:format("'~ts'~n",[string:pad("He̊llö", 8, both)]).
+' He̊llö '
+
+1> string:prefix(<<"prefix of string">>, "pre").
+<<"fix of string">>
+2> string:prefix("pre", "prefix").
+nomatch
+ lists:join(Replacement, split(String, SearchPattern, Where)).
+
+1> string:replace(<<"ab..cd..ef">>, "..", "*").
+[<<"ab">>,"*",<<"cd..ef">>]
+2> string:replace(<<"ab..cd..ef">>, "..", "*", all).
+[<<"ab">>,"*",<<"cd">>,"*",<<"ef">>]
+
+1> Reverse = string:reverse(unicode:characters_to_nfd_binary("ÅÄÖ")).
+[[79,776],[65,776],[65,778]]
+2> io:format("~ts~n",[Reverse]).
+ÖÄÅ
+
+1> string:slice(<<"He̊llö Wörld"/utf8>>, 4).
+<<"ö Wörld"/utf8>>
+2> string:slice(["He̊llö ", <<"Wörld"/utf8>>], 4,4).
+"ö Wö"
+3> string:slice(["He̊llö ", <<"Wörld"/utf8>>], 4,50).
+"ö Wörld"
+
+0> string:split("ab..bc..cd", "..").
+["ab","bc..cd"]
+1> string:split(<<"ab..bc..cd">>, "..", trailing).
+[<<"ab..bc">>,<<"cd">>]
+2> string:split(<<"ab..bc....cd">>, "..", all).
+[<<"ab">>,<<"bc">>,<<>>,<<"cd">>]
+
+5> string:take("abc0z123", lists:seq($a,$z)).
+{"abc","0z123"}
+6> string:take(<<"abc0z123">>, lists:seq($0,$9), true, leading).
+{<<"abc">>,<<"0z123">>}
+7> string:take("abc0z123", lists:seq($0,$9), false, trailing).
+{"abc0z","123"}
+8> string:take(<<"abc0z123">>, lists:seq($a,$z), true, trailing).
+{<<"abc0z">>,<<"123">>}
+
+1> string:titlecase("ß is a SHARP s").
+"Ss is a SHARP s"
+
+> {F1,Fs} = string:to_float("1.0-1.0e-1"),
+> {F2,[]} = string:to_float(Fs),
+> F1+F2.
+0.9
+> string:to_float("3/2=1.5").
+{error,no_float}
+> string:to_float("-1.5eX").
+{-1.5,"eX"}
+
+> {I1,Is} = string:to_integer("33+22"),
+> {I2,[]} = string:to_integer(Is),
+> I1-I2.
+11
+> string:to_integer("0.5").
+{0,".5"}
+> string:to_integer("x=2").
+{error,no_integer}
+
+1> string:to_graphemes("ß↑e̊").
+[223,8593,[101,778]]
+2> string:to_graphemes(<<"ß↑e̊"/utf8>>).
+[223,8593,[101,778]]
+
+1> string:trim("\t Hello \n").
+"Hello"
+2> string:trim(<<"\t Hello \n">>, leading).
+<<"Hello \n">>
+3> string:trim(<<".Hello.\n">>, trailing, "\n.").
+<<".Hello">>
+
+1> string:uppercase("Michał").
+"MICHAŁ"
+
> string:cspan("\t abcdef", " \t").
@@ -105,21 +736,15 @@
-
> join(["one", "two", "three"], ", ").
@@ -137,6 +762,10 @@
fixed. If
This function is
Example:
> string:left("Hello",10,$.).
@@ -149,6 +778,9 @@
Return the length of a string.
Returns the number of characters in String .
+ This function is obsolete .
+ Use
+ length/1 .
@@ -160,6 +792,9 @@
Returns the index of the last occurrence of
Character in String . Returns
0 if Character does not occur.
+ This function is obsolete .
+ Use
+ find/3 .
@@ -173,6 +808,9 @@
fixed. If the length of (String ) <
Number , then String is padded
with blanks or Character s.
+ This function is obsolete .
+ Use
+ pad/3 .
Example:
> string:right("Hello", 10, $.).
@@ -188,6 +826,9 @@
SubString begins in String .
Returns 0 if SubString
does not exist in String .
+ This function is obsolete .
+ Use
+ find/3 .
Example:
> string:rstr(" Hello Hello World World ", "Hello World").
@@ -202,6 +843,9 @@
Returns the length of the maximum initial segment of
String , which consists entirely of characters
from Chars .
+ This function is obsolete .
+ Use
+ take/2 .
Example:
> string:span("\t abcdef", " \t").
@@ -217,6 +861,9 @@
SubString begins in String .
Returns 0 if SubString
does not exist in String .
+ This function is obsolete .
+ Use
+ find/2 .
Example:
> string:str(" Hello Hello World World ", "Hello World").
@@ -230,12 +877,15 @@
Strip leading or trailing characters.
- Returns a string, where leading and/or trailing blanks or a
+
Returns a string, where leading or trailing, or both, blanks or a
number of Character have been removed.
Direction , which can be left , right ,
or both , indicates from which direction blanks are to be
removed. strip/1 is equivalent to
strip(String, both) .
+ This function is obsolete .
+ Use
+ trim/3 .
Example:
> string:strip("...Hello.....", both, $.).
@@ -251,6 +901,9 @@
Returns a substring of String , starting at
position Start to the end of the string, or to
and including position Stop .
+ This function is obsolete .
+ Use
+ slice/3 .
Example:
sub_string("Hello World", 4, 8).
@@ -266,6 +919,9 @@ sub_string("Hello World", 4, 8).
Returns a substring of String , starting at
position Start , and ending at the end of the
string or at length Length .
+ This function is obsolete .
+ Use
+ slice/3 .
Example:
> substr("Hello World", 4, 5).
@@ -281,6 +937,9 @@ sub_string("Hello World", 4, 8).
Returns the word in position Number of
String . Words are separated by blanks or
Character s.
+ This function is obsolete .
+ Use
+ nth_lexeme/3 .
Example:
> string:sub_word(" Hello old boy !",3,$o).
@@ -288,50 +947,6 @@ sub_string("Hello World", 4, 8).
-
-
- Returns a float whose text representation is the integers
- (ASCII values) in a string.
-
- Argument String is expected to start with a
- valid text represented float (the digits are ASCII values).
- Remaining characters in the string after the float are returned in
- Rest .
- Example:
-
-> {F1,Fs} = string:to_float("1.0-1.0e-1"),
-> {F2,[]} = string:to_float(Fs),
-> F1+F2.
-0.9
-> string:to_float("3/2=1.5").
-{error,no_float}
-> string:to_float("-1.5eX").
-{-1.5,"eX"}
-
-
-
-
-
- Returns an integer whose text representation is the integers
- (ASCII values) in a string.
-
- Argument String is expected to start with a
- valid text represented integer (the digits are ASCII values).
- Remaining characters in the string after the integer are returned in
- Rest .
- Example:
-
-> {I1,Is} = string:to_integer("33+22"),
-> {I2,[]} = string:to_integer(Is),
-> I1-I2.
-11
-> string:to_integer("0.5").
-{0,".5"}
-> string:to_integer("x=2").
-{error,no_integer}
-
-
-
@@ -346,6 +961,11 @@ sub_string("Hello World", 4, 8).
The specified string or character is case-converted. Notice that
the supported character set is ISO/IEC 8859-1 (also called Latin 1);
all values outside this set are unchanged
+ This function is obsolete. Use
+ lowercase/1 ,
+ uppercase/1 ,
+ titlecase/1 or
+ casefold/1 .
@@ -363,6 +983,9 @@ sub_string("Hello World", 4, 8).
adjacent separator characters in String
are treated as one. That is, there are no empty
strings in the resulting list of tokens.
+ This function is obsolete .
+ Use
+ lexemes/2 .
@@ -373,6 +996,9 @@ sub_string("Hello World", 4, 8).
Returns the number of words in String , separated
by blanks or Character .
+ This function is obsolete .
+ Use
+ lexemes/2 .
Example:
> words(" Hello old boy!", $o).
@@ -387,10 +1013,7 @@ sub_string("Hello World", 4, 8).
other. The reason is that this string package is the
combination of two earlier packages and all functions of
both packages have been retained.
-
-
- Any undocumented functions in string are not to be used.
-
+
diff --git a/lib/stdlib/doc/src/unicode_usage.xml b/lib/stdlib/doc/src/unicode_usage.xml
index a8ef8ff5c5..11b84f552a 100644
--- a/lib/stdlib/doc/src/unicode_usage.xml
+++ b/lib/stdlib/doc/src/unicode_usage.xml
@@ -65,7 +65,10 @@
In Erlang/OTP 20.0, atoms and function can contain
Unicode characters. Module names are still restricted to
- the ISO-Latin-1 range.
+ the ISO-Latin-1 range.
+ Support was added for normalization forms in
+ unicode and the string module now handles
+ UTF-8-encoded binaries.
This section outlines the current Unicode support and gives some
@@ -110,23 +113,27 @@
-
So, a conversion function must know not only one character at a time,
- but possibly the whole sentence, the natural language to translate to,
- the differences in input and output string length, and so on.
- Erlang/OTP has currently no Unicode to_upper /to_lower
- functionality, but publicly available libraries address these issues.
-
- Another example is the accented characters, where the same glyph has two
- different representations. The Swedish letter "ö" is one example.
- The Unicode standard has a code point for it, but you can also write it
- as "o" followed by "U+0308" (Combining Diaeresis, with the simplified
- meaning that the last letter is to have "¨" above). They have the same
- glyph. They are for most purposes the same, but have different
- representations. For example, MacOS X converts all filenames to use
- Combining Diaeresis, while most other programs (including Erlang) try to
- hide that by doing the opposite when, for example, listing directories.
- However it is done, it is usually important to normalize such
- characters to avoid confusion.
+ So, a conversion function must know not only one character at a
+ time, but possibly the whole sentence, the natural language to
+ translate to, the differences in input and output string length,
+ and so on. Erlang/OTP has currently no Unicode
+ uppercase /lowercase functionality with language
+ specific handling, but publicly available libraries address these
+ issues.
+
+ Another example is the accented characters, where the same
+ glyph has two different representations. The Swedish letter "ö" is
+ one example. The Unicode standard has a code point for it, but
+ you can also write it as "o" followed by "U+0308" (Combining
+ Diaeresis, with the simplified meaning that the last letter is to
+ have "¨" above). They have the same glyph (user-perceived
+ character). They are for most purposes the same, but have different
+ representations. For example, MacOS X converts all filenames to
+ use Combining Diaeresis, while most other programs (including
+ Erlang) try to hide that by doing the opposite when, for example,
+ listing directories. However it is done, it is usually important
+ to normalize such characters to avoid confusion.
+
The list of examples can be made long. One need a kind of knowledge that
was not needed when programs only considered one or two languages. The
@@ -273,7 +280,7 @@
them. In some cases functionality has been added to already
existing interfaces (as the string module now can
- handle lists with any code points). In some cases new
+ handle strings with any code points). In some cases new
functionality or options have been added (as in the io module, the file
handling, the Fortunately, most textual data has been stored in lists and range
checking has been sparse, so modules like string work well for
- Unicode lists with little need for conversion or extension.
+ Unicode strings with little need for conversion or extension.
Some modules are, however, changed to be explicitly Unicode-aware. These
modules include:
@@ -1028,18 +1035,17 @@ Eshell V5.10.1 (abort with ^G)
has extensive support for Unicode text.
- The string module works
- perfectly for Unicode strings and ISO Latin-1 strings, except the
- language-dependent functions
- string:to_upper/1
- and
- string:to_lower/1 ,
- which are only correct for the ISO Latin-1 character set. These two
- functions can never function correctly for Unicode characters in their
- current form, as there are language and locale issues as well as
- multi-character mappings to consider when converting text between cases.
- Converting case in an international environment is a large subject not
- yet addressed in OTP.
+ The string
+ module works perfectly for Unicode strings and ISO Latin-1
+ strings, except the language-dependent functions string:uppercase/1
+ and string:lowercase/1 .
+ These two functions can never function correctly for Unicode
+ characters in their current form, as there are language and locale
+ issues to consider when converting text between cases. Converting
+ case in an international environment is a large subject not yet
+ addressed in OTP.
diff --git a/lib/stdlib/src/string.erl b/lib/stdlib/src/string.erl
index c659db78bd..4fdfe99b66 100644
--- a/lib/stdlib/src/string.erl
+++ b/lib/stdlib/src/string.erl
@@ -17,22 +17,72 @@
%%
%% %CopyrightEnd%
%%
+%% A string library that works on grapheme clusters, with the exception
+%% of codepoints of class 'prepend' and non modern (or decomposed) Hangul.
+%% If these codepoints appear, functions like 'find/2' may return a string
+%% which starts inside a grapheme cluster.
+%% These exceptions are made because these codepoint classes are
+%% seldom used and require that we are able to look at previous codepoints
+%% in the stream, and are thus hard to implement effectively.
+%%
+%% GC (grapheme cluster) implies that the length of string 'ß↑e̊' is 3 though
+%% it is represented by the codepoints [223,8593,101,778] or the
+%% utf8 binary <<195,159,226,134,145,101,204,138>>
+%%
+%% And that searching for strings or graphemes finds the correct positions:
+%%
+%% find("eeeee̊eee", "e̊") -> "e̊ee"
+%% find("1£4e̊abcdef", "e") -> "ef"
+%%
+%% Most functions expect all input to be normalized to one form,
+%% see unicode:characters_to_nfc and unicode:characters_to_nfd functions.
+%% When appending strings no checking is done to verify that the
+%% result is a valid unicode string.
+%%
+%% The functions may crash for invalid utf-8 input.
+%%
+%% Return value should be kept consistent when return type is
+%% unicode:chardata() i.e. binary input => binary output,
+%% list input => list output, mixed input => mixed output
+%%
-module(string).
--export([len/1,equal/2,concat/2,chr/2,rchr/2,str/2,rstr/2,
- span/2,cspan/2,substr/2,substr/3,tokens/2,chars/2,chars/3]).
+-export([is_empty/1, length/1, to_graphemes/1,
+ reverse/1,
+ equal/2, equal/3, equal/4,
+ slice/2, slice/3,
+ pad/2, pad/3, pad/4, trim/1, trim/2, trim/3, chomp/1,
+ take/2, take/3, take/4,
+ lexemes/2, nth_lexeme/3,
+ uppercase/1, lowercase/1, titlecase/1,casefold/1,
+ prefix/2,
+ split/2,split/3,replace/3,replace/4,
+ find/2,find/3,
+ next_codepoint/1, next_grapheme/1
+ ]).
+
+%% Old (will be deprecated) lists/string API kept for backwards compatibility
+-export([len/1, concat/2, % equal/2, (extended in the new api)
+ chr/2,rchr/2,str/2,rstr/2,
+ span/2,cspan/2,substr/2,substr/3, tokens/2,
+ chars/2,chars/3]).
-export([copies/2,words/1,words/2,strip/1,strip/2,strip/3,
sub_word/2,sub_word/3,left/2,left/3,right/2,right/3,
sub_string/2,sub_string/3,centre/2,centre/3, join/2]).
-export([to_upper/1, to_lower/1]).
+%%
+-import(lists,[member/2]).
--import(lists,[reverse/1,member/2]).
+-compile({no_auto_import,[length/1]}).
-%%---------------------------------------------------------------------------
+-export_type([grapheme_cluster/0]).
-%%% BIFs
+-type grapheme_cluster() :: char() | [char()].
+-type direction() :: 'leading' | 'trailing'.
+%%% BIFs
-export([to_float/1, to_integer/1]).
+-dialyzer({no_improper_lists, stack/2}).
-spec to_float(String) -> {Float, Rest} | {error, Reason} when
String :: string(),
@@ -54,6 +104,1180 @@ to_integer(_) ->
%%% End of BIFs
+%% Check if string is the empty string
+-spec is_empty(String::unicode:chardata()) -> boolean().
+is_empty([]) -> true;
+is_empty(<<>>) -> true;
+is_empty([L|R]) -> is_empty(L) andalso is_empty(R);
+is_empty(_) -> false.
+
+%% Count the number of grapheme clusters in chardata
+-spec length(String::unicode:chardata()) -> non_neg_integer().
+length(CD) ->
+ length_1(unicode_util:gc(CD), 0).
+
+%% Convert a string to a list of grapheme clusters
+-spec to_graphemes(String::unicode:chardata()) -> [grapheme_cluster()].
+to_graphemes(CD0) ->
+ case unicode_util:gc(CD0) of
+ [GC|CD] -> [GC|to_graphemes(CD)];
+ [] -> []
+ end.
+
+%% Compare two strings and return a boolean; assumes that the inputs are
+%% normalized to the same form, see unicode:characters_to_nfX_xxx(..)
+-spec equal(A, B) -> boolean() when
+ A::unicode:chardata(),
+ B::unicode:chardata().
+equal(A,B) when is_binary(A), is_binary(B) ->
+ A =:= B;
+equal(A,B) ->
+ equal_1(A,B).
+
+%% Compare two strings and return a boolean; assumes that the inputs are
+%% normalized to the same form, see unicode:characters_to_nfX_xxx(..).
+%% Does casefold on the fly when IgnoreCase is true.
+-spec equal(A, B, IgnoreCase) -> boolean() when
+ A::unicode:chardata(),
+ B::unicode:chardata(),
+ IgnoreCase :: boolean().
+equal(A, B, false) ->
+ equal(A,B);
+equal(A, B, true) ->
+ equal_nocase(A,B).
+
+%% Compare two strings return boolean
+%% if specified does casefold and normalization on the fly
+-spec equal(A, B, IgnoreCase, Norm) -> boolean() when
+ A :: unicode:chardata(),
+ B :: unicode:chardata(),
+ IgnoreCase :: boolean(),
+ Norm :: 'none' | 'nfc' | 'nfd' | 'nfkc' | 'nfkd'.
+equal(A, B, Case, none) ->
+ equal(A,B,Case);
+equal(A, B, false, Norm) ->
+ equal_norm(A, B, Norm);
+equal(A, B, true, Norm) ->
+ equal_norm_nocase(A, B, Norm).
+
+%% Reverse grapheme clusters
+-spec reverse(String::unicode:chardata()) -> [grapheme_cluster()].
+reverse(CD) ->
+ reverse_1(CD, []).
+
+%% Slice a string and return rest of string
+%% Note: counts grapheme_clusters
+-spec slice(String, Start) -> Slice when
+ String::unicode:chardata(),
+ Start :: non_neg_integer(),
+ Slice :: unicode:chardata().
+slice(CD, N) when is_integer(N), N >= 0 ->
+ slice_l(CD, N, is_binary(CD)).
+
+-spec slice(String, Start, Length) -> Slice when
+ String::unicode:chardata(),
+ Start :: non_neg_integer(),
+ Length :: 'infinity' | non_neg_integer(),
+ Slice :: unicode:chardata().
+slice(CD, N, Length)
+ when is_integer(N), N >= 0, is_integer(Length), Length > 0 ->
+ slice_trail(slice_l(CD, N, is_binary(CD)), Length);
+slice(CD, N, infinity) ->
+ slice_l(CD, N, is_binary(CD));
+slice(CD, _, 0) ->
+ case is_binary(CD) of
+ true -> <<>>;
+ false -> []
+ end.
+
+%% Pad a string to desired length
+-spec pad(String, Length) -> unicode:charlist() when
+ String ::unicode:chardata(),
+ Length :: integer().
+pad(CD, Length) ->
+ pad(CD, Length, trailing, $\s).
+
+-spec pad(String, Length, Dir) -> unicode:charlist() when
+ String ::unicode:chardata(),
+ Length :: integer(),
+ Dir :: direction() | 'both'.
+pad(CD, Length, Dir) ->
+ pad(CD, Length, Dir, $\s).
+
+-spec pad(String, Length, Dir, Char) -> unicode:charlist() when
+ String ::unicode:chardata(),
+ Length :: integer(),
+ Dir :: direction() | 'both',
+ Char :: grapheme_cluster().
+pad(CD, Length, leading, Char) when is_integer(Length) ->
+ Len = length(CD),
+ [lists:duplicate(max(0, Length-Len), Char), CD];
+pad(CD, Length, trailing, Char) when is_integer(Length) ->
+ Len = length(CD),
+ [CD|lists:duplicate(max(0, Length-Len), Char)];
+pad(CD, Length, both, Char) when is_integer(Length) ->
+ Len = length(CD),
+ Size = max(0, Length-Len),
+ Pre = lists:duplicate(Size div 2, Char),
+ Post = case Size rem 2 of
+ 1 -> [Char];
+ _ -> []
+ end,
+ [Pre, CD, Pre|Post].
+
+%% Strip whitespace (default) or the given separator characters from Str in direction Dir
+-spec trim(String) -> unicode:chardata() when
+ String :: unicode:chardata().
+trim(Str) ->
+ trim(Str, both, unicode_util:whitespace()).
+
+-spec trim(String, Dir) -> unicode:chardata() when
+ String :: unicode:chardata(),
+ Dir :: direction() | 'both'.
+trim(Str, Dir) ->
+ trim(Str, Dir, unicode_util:whitespace()).
+
+-spec trim(String, Dir, Characters) -> unicode:chardata() when
+ String :: unicode:chardata(),
+ Dir :: direction() | 'both',
+ Characters :: [grapheme_cluster()].
+trim(Str, _, []) -> Str;
+trim(Str, leading, Sep) when is_list(Sep) ->
+ trim_l(Str, search_pattern(Sep));
+trim(Str, trailing, Sep) when is_list(Sep) ->
+ trim_t(Str, 0, search_pattern(Sep));
+trim(Str, both, Sep0) when is_list(Sep0) ->
+ Sep = search_pattern(Sep0),
+ trim_t(trim_l(Str,Sep), 0, Sep).
+
+%% Delete trailing newlines or \r\n
+-spec chomp(String::unicode:chardata()) -> unicode:chardata().
+chomp(Str) ->
+ trim_t(Str,0, {[[$\r,$\n],$\n], [$\r,$\n], [<<$\r>>,<<$\n>>]}).
+
+%% Split String into two parts where the leading part consists of Characters
+-spec take(String, Characters) -> {Leading, Trailing} when
+ String::unicode:chardata(),
+ Characters::[grapheme_cluster()],
+ Leading::unicode:chardata(),
+ Trailing::unicode:chardata().
+take(Str, Sep) ->
+ take(Str, Sep, false, leading).
+-spec take(String, Characters, Complement) -> {Leading, Trailing} when
+ String::unicode:chardata(),
+ Characters::[grapheme_cluster()],
+ Complement::boolean(),
+ Leading::unicode:chardata(),
+ Trailing::unicode:chardata().
+take(Str, Sep, Complement) ->
+ take(Str, Sep, Complement, leading).
+-spec take(String, Characters, Complement, Dir) -> {Leading, Trailing} when
+ String::unicode:chardata(),
+ Characters::[grapheme_cluster()],
+ Complement::boolean(),
+ Dir::direction(),
+ Leading::unicode:chardata(),
+ Trailing::unicode:chardata().
+take(Str, [], Complement, Dir) ->
+ Empty = case is_binary(Str) of true -> <<>>; false -> [] end,
+ case {Complement,Dir} of
+ {false, leading} -> {Empty, Str};
+ {false, trailing} -> {Str, Empty};
+ {true, leading} -> {Str, Empty};
+ {true, trailing} -> {Empty, Str}
+ end;
+take(Str, Sep0, false, leading) ->
+ Sep = search_pattern(Sep0),
+ take_l(Str, Sep, []);
+take(Str, Sep0, true, leading) ->
+ Sep = search_pattern(Sep0),
+ take_lc(Str, Sep, []);
+take(Str, Sep0, false, trailing) ->
+ Sep = search_pattern(Sep0),
+ take_t(Str, 0, Sep);
+take(Str, Sep0, true, trailing) ->
+ Sep = search_pattern(Sep0),
+ take_tc(Str, 0, Sep).
+
+%% Uppercase all chars in Str
+-spec uppercase(String::unicode:chardata()) -> unicode:chardata().
+uppercase(CD) when is_list(CD) ->
+ uppercase_list(CD);
+uppercase(CD) when is_binary(CD) ->
+ uppercase_bin(CD,<<>>).
+
+%% Lowercase all chars in Str
+-spec lowercase(String::unicode:chardata()) -> unicode:chardata().
+lowercase(CD) when is_list(CD) ->
+ lowercase_list(CD);
+lowercase(CD) when is_binary(CD) ->
+ lowercase_bin(CD,<<>>).
+
+%% Make a titlecase of the first char in Str
+-spec titlecase(String::unicode:chardata()) -> unicode:chardata().
+titlecase(CD) when is_list(CD) ->
+ case unicode_util:titlecase(CD) of
+ [GC|Tail] -> append(GC,Tail);
+ Empty -> Empty
+ end;
+titlecase(CD) when is_binary(CD) ->
+ case unicode_util:titlecase(CD) of
+ [CP|Chars] when is_integer(CP) -> <<CP/utf8,Chars/binary>>;
+ [CPs|Chars] ->
+ << << <<CP/utf8>> || CP <- CPs>>/binary, Chars/binary>>;
+ [] -> <<>>
+ end.
+
+%% Make a comparable string of Str; the result should be used for equality tests only
+-spec casefold(String::unicode:chardata()) -> unicode:chardata().
+casefold(CD) when is_list(CD) ->
+ casefold_list(CD);
+casefold(CD) when is_binary(CD) ->
+ casefold_bin(CD,<<>>).
+
+%% Return the remaining string with prefix removed or else nomatch
+-spec prefix(String::unicode:chardata(), Prefix::unicode:chardata()) ->
+ 'nomatch' | unicode:chardata().
+prefix(Str, []) -> Str;
+prefix(Str, Prefix0) ->
+ Prefix = unicode:characters_to_list(Prefix0),
+ case prefix_1(Str, Prefix) of
+ [] when is_binary(Str) -> <<>>;
+ Res -> Res
+ end.
+
+%% split String with the first occurrence of SearchPattern, return list of splits
+-spec split(String, SearchPattern) -> [unicode:chardata()] when
+ String :: unicode:chardata(),
+ SearchPattern :: unicode:chardata().
+split(String, SearchPattern) ->
+ split(String, SearchPattern, leading).
+
+%% split String with SearchPattern, return list of splits
+-spec split(String, SearchPattern, Where) -> [unicode:chardata()] when
+ String :: unicode:chardata(),
+ SearchPattern :: unicode:chardata(),
+ Where :: direction() | 'all'.
+split(String, SearchPattern, Where) ->
+ case is_empty(SearchPattern) of
+ true -> [String];
+ false ->
+ SearchPatternCPs = unicode:characters_to_list(SearchPattern),
+ case split_1(String, SearchPatternCPs, 0, Where, [], []) of
+ {_Curr, []} -> [String];
+ {_Curr, Acc} when Where =:= trailing -> Acc;
+ {Curr, Acc} when Where =:= all -> lists:reverse([Curr|Acc]);
+ Acc when is_list(Acc) -> Acc
+ end
+ end.
+
+%% Replace the first SearchPattern in String with Replacement
+-spec replace(String, SearchPattern, Replacement) ->
+ [unicode:chardata()] when
+ String :: unicode:chardata(),
+ SearchPattern :: unicode:chardata(),
+ Replacement :: unicode:chardata().
+replace(String, SearchPattern, Replacement) ->
+ lists:join(Replacement, split(String, SearchPattern)).
+
+%% Replace Where SearchPattern in String with Replacement
+-spec replace(String, SearchPattern, Replacement, Where) ->
+ [unicode:chardata()] when
+ String :: unicode:chardata(),
+ SearchPattern :: unicode:chardata(),
+ Replacement :: unicode:chardata(),
+ Where :: direction() | 'all'.
+replace(String, SearchPattern, Replacement, Where) ->
+ lists:join(Replacement, split(String, SearchPattern, Where)).
+
+%% Split Str into a list of chardata separated by one of the grapheme
+%% clusters in Seps
+-spec lexemes(String::unicode:chardata(),
+ SeparatorList::[grapheme_cluster()]) ->
+ [unicode:chardata()].
+lexemes([], _) -> [];
+lexemes(Str, Seps0) when is_list(Seps0) ->
+ Seps = search_pattern(Seps0),
+ lexemes_m(Str, Seps, []).
+
+-spec nth_lexeme(String, N, SeparatorList) -> unicode:chardata() when
+ String::unicode:chardata(),
+ N::non_neg_integer(),
+ SeparatorList::[grapheme_cluster()].
+
+nth_lexeme(Str, 1, []) -> Str;
+nth_lexeme(Str, N, Seps0) when is_list(Seps0), is_integer(N), N > 0 ->
+ Seps = search_pattern(Seps0),
+ nth_lexeme_m(Str, Seps, N).
+
+%% find first SearchPattern in String return rest of string
+-spec find(String, SearchPattern) -> unicode:chardata() | 'nomatch' when
+ String::unicode:chardata(),
+ SearchPattern::unicode:chardata().
+find(String, SearchPattern) ->
+ find(String, SearchPattern, leading).
+
+%% find SearchPattern in String (search in Dir direction) return rest of string
+-spec find(String, SearchPattern, Dir) -> unicode:chardata() | 'nomatch' when
+ String::unicode:chardata(),
+ SearchPattern::unicode:chardata(),
+ Dir::direction().
+find(String, "", _) -> String;
+find(String, <<>>, _) -> String;
+find(String, SearchPattern, leading) ->
+ find_l(String, unicode:characters_to_list(SearchPattern));
+find(String, SearchPattern, trailing) ->
+ find_r(String, unicode:characters_to_list(SearchPattern), nomatch).
+
+%% Fetch first grapheme cluster and return rest in tail
+-spec next_grapheme(String::unicode:chardata()) ->
+ maybe_improper_list(grapheme_cluster(),unicode:chardata()).
+next_grapheme(CD) -> unicode_util:gc(CD).
+
+%% Fetch first codepoint and return rest in tail
+-spec next_codepoint(String::unicode:chardata()) ->
+ maybe_improper_list(char(),unicode:chardata()).
+next_codepoint(CD) -> unicode_util:cp(CD).
+
+%% Internals
+
+length_1([_|Rest], N) ->
+ length_1(unicode_util:gc(Rest), N+1);
+length_1([], N) ->
+ N.
+
+equal_1([A|AR], [B|BR]) when is_integer(A), is_integer(B) ->
+ A =:= B andalso equal_1(AR, BR);
+equal_1([], BR) -> is_empty(BR);
+equal_1(A0,B0) ->
+ case {unicode_util:cp(A0), unicode_util:cp(B0)} of
+ {[CP|A],[CP|B]} -> equal_1(A,B);
+ {[], []} -> true;
+ _ -> false
+ end.
+
+equal_nocase(A, A) -> true;
+equal_nocase(A0, B0) ->
+ case {unicode_util:cp(unicode_util:casefold(A0)),
+ unicode_util:cp(unicode_util:casefold(B0))} of
+ {[CP|A],[CP|B]} -> equal_nocase(A,B);
+ {[], []} -> true;
+ _ -> false
+ end.
+
+equal_norm(A, A, _Norm) -> true;
+equal_norm(A0, B0, Norm) ->
+ case {unicode_util:cp(unicode_util:Norm(A0)),
+ unicode_util:cp(unicode_util:Norm(B0))} of
+ {[CP|A],[CP|B]} -> equal_norm(A,B, Norm);
+ {[], []} -> true;
+ _ -> false
+ end.
+
+%% Compares two chardata terms after normalising (unicode_util:Norm/1)
+%% and then case-folding each side, one codepoint at a time.
+equal_norm_nocase(Same, Same, _Norm) ->
+    true;
+equal_norm_nocase(Left, Right, Norm) ->
+    PreparedL = unicode_util:cp(unicode_util:casefold(unicode_util:Norm(Left))),
+    PreparedR = unicode_util:cp(unicode_util:casefold(unicode_util:Norm(Right))),
+    case {PreparedL, PreparedR} of
+        {[], []} -> true;
+        {[CP|RestL], [CP|RestR]} -> equal_norm_nocase(RestL, RestR, Norm);
+        _ -> false
+    end.
+
+%% Pushes each grapheme cluster of CD onto Reversed, so the accumulator
+%% ends up holding the clusters in reverse order.
+reverse_1(CD, Reversed) ->
+    case unicode_util:gc(CD) of
+        [] -> Reversed;
+        [GC|Tail] -> reverse_1(Tail, [GC|Reversed])
+    end.
+
+%% Drops N leading grapheme clusters. IsBin is a boolean: when true, an
+%% exhausted or empty remainder is returned as <<>>.
+slice_l(Remainder, 0, IsBin) ->
+    case is_empty(Remainder) of
+        true when IsBin -> <<>>;
+        _ -> Remainder
+    end;
+slice_l(CD, N, IsBin) when N > 0 ->
+    case unicode_util:gc(CD) of
+        [] when IsBin -> <<>>;
+        [] -> [];
+        [_Skipped|Cont] -> slice_l(Cont, N-1, IsBin)
+    end.
+
+%% Keeps at most N leading grapheme clusters, dispatching on the input
+%% representation (binary or list).
+slice_trail(Bin, N) when is_binary(Bin) ->
+    slice_bin(Bin, N, Bin);
+slice_trail(List, N) when is_list(List) ->
+    slice_list(List, N).
+
+%% Builds a list holding at most N leading grapheme clusters of CD.
+slice_list(_CD, 0) ->
+    [];
+slice_list(CD, N) when N > 0 ->
+    case unicode_util:gc(CD) of
+        [] -> [];
+        [GC|Cont] -> append(GC, slice_list(Cont, N-1))
+    end.
+
+%% Returns the prefix of the binary Orig that holds at most N grapheme
+%% clusters. CD is the not-yet-consumed remainder of Orig; once N
+%% clusters are skipped, the prefix length is the byte-size difference
+%% between Orig and that remainder.
+slice_bin(CD, N, Orig) when N > 0 ->
+    case unicode_util:gc(CD) of
+        [_|Cont] -> slice_bin(Cont, N-1, Orig);
+        [] -> Orig
+    end;
+slice_bin([], 0, Orig) ->
+    Orig;
+slice_bin(CD, 0, Orig) ->
+    Sz = byte_size(Orig) - byte_size(CD),
+    %% Restored binary pattern: keep the first Sz bytes of Orig.
+    <<Keep:Sz/binary, _/binary>> = Orig,
+    Keep.
+
+%% Uppercases chardata into a list, converting one element per step via
+%% unicode_util:uppercase/1.
+uppercase_list(Input) ->
+    case unicode_util:uppercase(Input) of
+        [] -> [];
+        [Upper|Rest] -> append(Upper, uppercase_list(Rest))
+    end.
+
+%% Uppercases chardata into a UTF-8 binary accumulator. Each step of
+%% unicode_util:uppercase/1 yields either one codepoint or a list of
+%% codepoints; both are appended to Acc UTF-8 encoded.
+uppercase_bin(CPs0, Acc) ->
+    case unicode_util:uppercase(CPs0) of
+        [Char|CPs] when is_integer(Char) ->
+            uppercase_bin(CPs, <<Acc/binary, Char/utf8>>);
+        [Chars|CPs] ->
+            uppercase_bin(CPs, <<Acc/binary,
+                                 << <<CP/utf8>> || CP <- Chars>>/binary >>);
+        [] -> Acc
+    end.
+
+%% Lowercases chardata into a list, converting one element per step via
+%% unicode_util:lowercase/1.
+lowercase_list(Input) ->
+    case unicode_util:lowercase(Input) of
+        [] -> [];
+        [Lower|Rest] -> append(Lower, lowercase_list(Rest))
+    end.
+
+%% Lowercases chardata into a UTF-8 binary accumulator. Each step of
+%% unicode_util:lowercase/1 yields either one codepoint or a list of
+%% codepoints; both are appended to Acc UTF-8 encoded.
+lowercase_bin(CPs0, Acc) ->
+    case unicode_util:lowercase(CPs0) of
+        [Char|CPs] when is_integer(Char) ->
+            lowercase_bin(CPs, <<Acc/binary, Char/utf8>>);
+        [Chars|CPs] ->
+            lowercase_bin(CPs, <<Acc/binary,
+                                 << <<CP/utf8>> || CP <- Chars>>/binary >>);
+        [] -> Acc
+    end.
+
+%% Case-folds chardata into a list, converting one element per step via
+%% unicode_util:casefold/1.
+casefold_list(Input) ->
+    case unicode_util:casefold(Input) of
+        [] -> [];
+        [Folded|Rest] -> append(Folded, casefold_list(Rest))
+    end.
+
+%% Case-folds chardata into a UTF-8 binary accumulator. Each step of
+%% unicode_util:casefold/1 yields either one codepoint or a list of
+%% codepoints; both are appended to Acc UTF-8 encoded.
+casefold_bin(CPs0, Acc) ->
+    case unicode_util:casefold(CPs0) of
+        [Char|CPs] when is_integer(Char) ->
+            casefold_bin(CPs, <<Acc/binary, Char/utf8>>);
+        [Chars|CPs] ->
+            casefold_bin(CPs, <<Acc/binary,
+                                << <<CP/utf8>> || CP <- Chars>>/binary >>);
+        [] -> Acc
+    end.
+
+
+%% Strips leading separator grapheme clusters from chardata.
+%% Sep is the {GCs, CPs, BinPattern} tuple built by search_pattern/1.
+%% Three input shapes are handled: a list headed by a binary, any other
+%% list, and a bare binary (which yields <<>> when nothing is left).
+trim_l([Bin|Cont0], Sep) when is_binary(Bin) ->
+ case bin_search_inv(Bin, Cont0, Sep) of
+ %% The whole binary consisted of separators: keep trimming the rest.
+ {nomatch, Cont} -> trim_l(Cont, Sep);
+ Keep -> Keep
+ end;
+trim_l(Str, {GCs, _, _}=Sep) when is_list(Str) ->
+ case unicode_util:gc(Str) of
+ [C|Cs] ->
+ case lists:member(C, GCs) of
+ true -> trim_l(Cs, Sep);
+ false -> Str
+ end;
+ [] -> []
+ end;
+trim_l(Bin, Sep) when is_binary(Bin) ->
+ case bin_search_inv(Bin, [], Sep) of
+ {nomatch,_} -> <<>>;
+ [Keep] -> Keep
+ end.
+
+%% Strips trailing separator grapheme clusters from chardata.
+%% Sep is the {GCs, CPs, BinPattern} tuple built by search_pattern/1.
+%% N is a byte offset into a leading binary: the first N bytes are
+%% already known to be kept, so the separator search resumes after them.
+trim_t([Bin|Cont0], N, Sep) when is_binary(Bin) ->
+    <<_:N/binary, Rest/binary>> = Bin,
+    case bin_search(Rest, Cont0, Sep) of
+        {nomatch,_} ->
+            stack(Bin, trim_t(Cont0, 0, Sep));
+        [SepStart|Cont1] ->
+            case bin_search_inv(SepStart, Cont1, Sep) of
+                {nomatch, Cont} ->
+                    %% Separators run to the end of this binary; whether
+                    %% they are trailing depends on what is left after it.
+                    Tail = trim_t(Cont, 0, Sep),
+                    case is_empty(Tail) of
+                        true ->
+                            KeepSz = byte_size(Bin) - byte_size(SepStart),
+                            %% Restored binary pattern: keep the bytes
+                            %% that precede the separator run.
+                            <<Keep:KeepSz/binary, _/binary>> = Bin,
+                            Keep;
+                        false ->
+                            Used = cp_prefix(Cont0, Cont),
+                            stack(Bin, stack(Used, Tail))
+                    end;
+                [NonSep|Cont] when is_binary(NonSep) ->
+                    %% A non-separator follows inside Bin: resume the
+                    %% search from that byte offset.
+                    KeepSz = byte_size(Bin) - byte_size(NonSep),
+                    trim_t([Bin|Cont], KeepSz, Sep)
+            end
+    end;
+trim_t(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) ->
+    case unicode_util:cp(Str) of
+        [CP|Cs] ->
+            %% Cheap codepoint test first; the full grapheme cluster is
+            %% only fetched when the codepoint could start a separator.
+            case lists:member(CP, CPs) of
+                true ->
+                    [GC|Cs1] = unicode_util:gc(Str),
+                    case lists:member(GC, GCs) of
+                        true ->
+                            Tail = trim_t(Cs1, 0, Sep),
+                            case is_empty(Tail) of
+                                true -> [];
+                                false -> append(GC,Tail)
+                            end;
+                        false ->
+                            append(GC,trim_t(Cs1, 0, Sep))
+                    end;
+                false ->
+                    append(CP,trim_t(Cs, 0, Sep))
+            end;
+        [] -> []
+    end;
+trim_t(Bin, N, Sep) when is_binary(Bin) ->
+    <<_:N/binary, Rest/binary>> = Bin,
+    case bin_search(Rest, Sep) of
+        {nomatch,_} -> Bin;
+        [SepStart] ->
+            case bin_search_inv(SepStart, [], Sep) of
+                {nomatch,_} ->
+                    KeepSz = byte_size(Bin) - byte_size(SepStart),
+                    %% Restored binary pattern: drop the trailing
+                    %% separator run.
+                    <<Keep:KeepSz/binary, _/binary>> = Bin,
+                    Keep;
+                [NonSep] ->
+                    KeepSz = byte_size(Bin) - byte_size(NonSep),
+                    trim_t(Bin, KeepSz, Sep)
+            end
+    end.
+
+%% Splits off the leading run of separator grapheme clusters.
+%% Returns {Taken, Rest}. Sep is the {GCs, CPs, BinPattern} tuple built
+%% by search_pattern/1; Acc collects the taken part in reverse.
+take_l([Bin|Cont0], Sep, Acc) when is_binary(Bin) ->
+    case bin_search_inv(Bin, Cont0, Sep) of
+        {nomatch, Cont} ->
+            %% Everything scanned so far was separators; Used is the part
+            %% of Cont0 consumed to complete the last grapheme cluster.
+            Used = cp_prefix(Cont0, Cont),
+            take_l(Cont, Sep, [unicode:characters_to_binary([Bin|Used])|Acc]);
+        [Bin1|_]=After when is_binary(Bin1) ->
+            First = byte_size(Bin) - byte_size(Bin1),
+            %% Restored binary pattern: the separator prefix of Bin.
+            <<Keep:First/binary, _/binary>> = Bin,
+            {btoken(Keep,Acc), After}
+    end;
+take_l(Str, {GCs, _, _}=Sep, Acc) when is_list(Str) ->
+    case unicode_util:gc(Str) of
+        [C|Cs] ->
+            case lists:member(C, GCs) of
+                true -> take_l(Cs, Sep, append(rev(C),Acc));
+                false -> {rev(Acc), Str}
+            end;
+        [] -> {rev(Acc), []}
+    end;
+take_l(Bin, Sep, Acc) when is_binary(Bin) ->
+    case bin_search_inv(Bin, [], Sep) of
+        {nomatch,_} ->
+            {btoken(Bin, Acc), <<>>};
+        [After] ->
+            First = byte_size(Bin) - byte_size(After),
+            %% Restored binary pattern: the separator prefix of Bin.
+            <<Keep:First/binary, _/binary>> = Bin,
+            {btoken(Keep, Acc), After}
+    end.
+
+%% Complement of take_l/3: splits off the leading run of grapheme
+%% clusters that are NOT separators. Returns {Taken, Rest}, where Rest
+%% starts at the first separator found.
+take_lc([Bin|Cont0], Sep, Acc) when is_binary(Bin) ->
+    case bin_search(Bin, Cont0, Sep) of
+        {nomatch, Cont} ->
+            Used = cp_prefix(Cont0, Cont),
+            take_lc(Cont, Sep, [unicode:characters_to_binary([Bin|Used])|Acc]);
+        [Bin1|_]=After when is_binary(Bin1) ->
+            First = byte_size(Bin) - byte_size(Bin1),
+            %% Restored binary pattern: bytes of Bin before the separator.
+            <<Keep:First/binary, _/binary>> = Bin,
+            {btoken(Keep,Acc), After}
+    end;
+take_lc(Str, {GCs, _, _}=Sep, Acc) when is_list(Str) ->
+    case unicode_util:gc(Str) of
+        [C|Cs] ->
+            case lists:member(C, GCs) of
+                false -> take_lc(Cs, Sep, append(rev(C),Acc));
+                true -> {rev(Acc), Str}
+            end;
+        [] -> {rev(Acc), []}
+    end;
+take_lc(Bin, Sep, Acc) when is_binary(Bin) ->
+    case bin_search(Bin, [], Sep) of
+        {nomatch,_} ->
+            {btoken(Bin, Acc), <<>>};
+        [After] ->
+            First = byte_size(Bin) - byte_size(After),
+            %% Restored binary pattern: bytes of Bin before the separator.
+            <<Keep:First/binary, _/binary>> = Bin,
+            {btoken(Keep, Acc), After}
+    end.
+
+%% Splits off the trailing run of separator grapheme clusters.
+%% Returns {Head, Tail} where Tail is the trailing separator run.
+%% Sep is the {GCs, CPs, BinPattern} tuple built by search_pattern/1.
+%% N is a byte offset into a leading binary from which the separator
+%% search resumes.
+take_t([Bin|Cont0], N, Sep) when is_binary(Bin) ->
+    <<_:N/binary, Rest/binary>> = Bin,
+    case bin_search(Rest, Cont0, Sep) of
+        {nomatch,Cont} ->
+            Used = cp_prefix(Cont0, Cont),
+            {Head, Tail} = take_t(Cont, 0, Sep),
+            {stack(unicode:characters_to_binary([Bin|Used]), Head), Tail};
+        [SepStart|Cont1] ->
+            case bin_search_inv(SepStart, Cont1, Sep) of
+                {nomatch, Cont} ->
+                    {Head, Tail} = take_t(Cont, 0, Sep),
+                    Used = cp_prefix(Cont0, Cont),
+                    %% If all of Cont ended up in Tail, the separator run
+                    %% found in Bin is part of the trailing run as well.
+                    case equal(Tail, Cont) of
+                        true ->
+                            KeepSz = byte_size(Bin) - byte_size(SepStart),
+                            %% Restored binary pattern: split Bin at the
+                            %% start of the separator run.
+                            <<Keep:KeepSz/binary, End/binary>> = Bin,
+                            {stack(Keep,Head), stack(stack(End,Used),Tail)};
+                        false ->
+                            {stack(unicode:characters_to_binary([Bin|Used]),Head), Tail}
+                    end;
+                [NonSep|Cont] when is_binary(NonSep) ->
+                    KeepSz = byte_size(Bin) - byte_size(NonSep),
+                    take_t([Bin|Cont], KeepSz, Sep)
+            end
+    end;
+take_t(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) ->
+    case unicode_util:cp(Str) of
+        [CP|Cs] ->
+            case lists:member(CP, CPs) of
+                true ->
+                    [GC|Cs1] = unicode_util:gc(Str),
+                    case lists:member(GC, GCs) of
+                        true ->
+                            {Head, Tail} = take_t(Cs1, 0, Sep),
+                            case equal(Tail, Cs1) of
+                                true -> {Head, append(GC,Tail)};
+                                false -> {append(GC,Head), Tail}
+                            end;
+                        false ->
+                            {Head, Tail} = take_t(Cs, 0, Sep),
+                            {append(CP,Head), Tail}
+                    end;
+                false ->
+                    {Head, Tail} = take_t(Cs, 0, Sep),
+                    {append(CP,Head), Tail}
+            end;
+        [] -> {[],[]}
+    end;
+take_t(Bin, N, Sep) when is_binary(Bin) ->
+    <<_:N/binary, Rest/binary>> = Bin,
+    case bin_search(Rest, Sep) of
+        {nomatch,_} -> {Bin, <<>>};
+        [SepStart] ->
+            case bin_search_inv(SepStart, [], Sep) of
+                {nomatch,_} ->
+                    KeepSz = byte_size(Bin) - byte_size(SepStart),
+                    %% Restored binary pattern: split Bin at the start of
+                    %% the trailing separator run.
+                    <<Before:KeepSz/binary, End/binary>> = Bin,
+                    {Before, End};
+                [NonSep] ->
+                    KeepSz = byte_size(Bin) - byte_size(NonSep),
+                    take_t(Bin, KeepSz, Sep)
+            end
+    end.
+
+%% Complement of take_t/3: splits off the trailing run of grapheme
+%% clusters that are NOT separators. Returns {Head, Tail}.
+%% Sep is the {GCs, CPs, BinPattern} tuple built by search_pattern/1.
+%% N is a byte offset into a leading binary from which the search
+%% resumes.
+take_tc([Bin|Cont0], N, Sep) when is_binary(Bin) ->
+    <<_:N/binary, Rest/binary>> = Bin,
+    case bin_search_inv(Rest, Cont0, Sep) of
+        {nomatch,Cont} ->
+            Used = cp_prefix(Cont0, Cont),
+            {Head, Tail} = take_tc(Cont, 0, Sep),
+            {stack(unicode:characters_to_binary([Bin|Used]), Head), Tail};
+        [SepStart|Cont1] ->
+            case bin_search(SepStart, Cont1, Sep) of
+                {nomatch, Cont} ->
+                    {Head, Tail} = take_tc(Cont, 0, Sep),
+                    Used = cp_prefix(Cont0, Cont),
+                    %% If all of Cont ended up in Tail, the run found in
+                    %% Bin belongs to the trailing run as well.
+                    case equal(Tail, Cont) of
+                        true ->
+                            KeepSz = byte_size(Bin) - byte_size(SepStart),
+                            %% Restored binary pattern: split Bin where
+                            %% the non-separator run starts.
+                            <<Keep:KeepSz/binary, End/binary>> = Bin,
+                            {stack(Keep,Head), stack(stack(End,Used),Tail)};
+                        false ->
+                            {stack(unicode:characters_to_binary([Bin|Used]),Head), Tail}
+                    end;
+                [NonSep|Cont] when is_binary(NonSep) ->
+                    KeepSz = byte_size(Bin) - byte_size(NonSep),
+                    take_tc([Bin|Cont], KeepSz, Sep)
+            end
+    end;
+take_tc(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) ->
+    case unicode_util:cp(Str) of
+        [CP|Cs] ->
+            case lists:member(CP, CPs) of
+                true ->
+                    [GC|Cs1] = unicode_util:gc(Str),
+                    case lists:member(GC, GCs) of
+                        false ->
+                            {Head, Tail} = take_tc(Cs1, 0, Sep),
+                            case equal(Tail, Cs1) of
+                                true -> {Head, append(GC,Tail)};
+                                false -> {append(GC,Head), Tail}
+                            end;
+                        true ->
+                            {Head, Tail} = take_tc(Cs1, 0, Sep),
+                            {append(GC,Head), Tail}
+                    end;
+                false ->
+                    {Head, Tail} = take_tc(Cs, 0, Sep),
+                    case equal(Tail, Cs) of
+                        true -> {Head, append(CP,Tail)};
+                        false -> {append(CP,Head), Tail}
+                    end
+            end;
+        [] -> {[],[]}
+    end;
+take_tc(Bin, N, Sep) when is_binary(Bin) ->
+    <<_:N/binary, Rest/binary>> = Bin,
+    case bin_search_inv(Rest, [], Sep) of
+        {nomatch,_} -> {Bin, <<>>};
+        [SepStart] ->
+            case bin_search(SepStart, [], Sep) of
+                {nomatch,_} ->
+                    KeepSz = byte_size(Bin) - byte_size(SepStart),
+                    %% Restored binary pattern: split Bin where the
+                    %% trailing non-separator run starts.
+                    <<Before:KeepSz/binary, End/binary>> = Bin,
+                    {Before, End};
+                [NonSep] ->
+                    KeepSz = byte_size(Bin) - byte_size(NonSep),
+                    take_tc(Bin, KeepSz, Sep)
+            end
+    end.
+
+%% Checks whether Cs starts with the codepoint list Pre. Returns the
+%% remainder of Cs on success, 'nomatch' otherwise. The last prefix
+%% element is compared against a whole grapheme cluster, the earlier
+%% ones against single codepoints.
+prefix_1(Cs, Pre) ->
+    case Pre of
+        [] -> Cs;
+        [_] -> prefix_2(unicode_util:gc(Cs), Pre);
+        _ -> prefix_2(unicode_util:cp(Cs), Pre)
+    end.
+
+%% One comparison step of prefix_1/2: the heads must be exactly equal
+%% for the match to continue.
+prefix_2([Head|Cs], [Expected|Pre]) when Head =:= Expected ->
+    prefix_1(Cs, Pre);
+prefix_2(_, _) ->
+    nomatch.
+
+%% Splits chardata at occurrences of Needle (a codepoint list).
+%%   Start - byte offset into a leading binary where the search resumes
+%%   Where - leading | trailing | all
+%%   Curr  - reversed pieces of the part before the current position
+%%   Acc   - result collected so far
+%% Returns [Before, After] directly on a 'leading' hit. When the input
+%% is exhausted it returns {LastPart, Acc} for the caller to assemble.
+split_1([Bin|Cont0], Needle, Start, Where, Curr0, Acc)
+  when is_binary(Bin) ->
+    case bin_search_str(Bin, Start, Cont0, Needle) of
+        {nomatch,Sz,Cont} ->
+            %% Restored binary pattern: the first Sz bytes of Bin hold
+            %% no match and go into the current part.
+            <<Keep:Sz/binary, _/binary>> = Bin,
+            split_1(Cont, Needle, 0, Where, [Keep|Curr0], Acc);
+        {Before, [Cs0|Cont], After} ->
+            Curr = add_non_empty(Before,Curr0),
+            case Where of
+                leading ->
+                    [rev(Curr),After];
+                trailing ->
+                    %% Remember this hit and search again one codepoint
+                    %% further on; a later hit replaces it.
+                    <<_/utf8, Cs/binary>> = Cs0,
+                    Next = byte_size(Bin) - byte_size(Cs),
+                    split_1([Bin|Cont], Needle, Next, Where,
+                            Curr0, [rev(Curr),After]);
+                all ->
+                    split_1(After, Needle, 0, Where, [], [rev(Curr)|Acc])
+            end
+    end;
+split_1(Cs0, [C|_]=Needle, _, Where, Curr, Acc) when is_list(Cs0) ->
+    case unicode_util:cp(Cs0) of
+        [C|Cs] ->
+            %% First codepoint matches: check the complete needle.
+            case prefix_1(Cs0, Needle) of
+                nomatch -> split_1(Cs, Needle, 0, Where, append(C,Curr), Acc);
+                Rest when Where =:= leading ->
+                    [rev(Curr), Rest];
+                Rest when Where =:= trailing ->
+                    split_1(Cs, Needle, 0, Where, [C|Curr], [rev(Curr), Rest]);
+                Rest when Where =:= all ->
+                    split_1(Rest, Needle, 0, Where, [], [rev(Curr)|Acc])
+            end;
+        [Other|Cs] ->
+            split_1(Cs, Needle, 0, Where, append(Other,Curr), Acc);
+        [] ->
+            {rev(Curr), Acc}
+    end;
+split_1(Bin, [_C|_]=Needle, Start, Where, Curr0, Acc) ->
+    case bin_search_str(Bin, Start, [], Needle) of
+        {nomatch,_,_} ->
+            <<_:Start/binary, Keep/binary>> = Bin,
+            {rev([Keep|Curr0]), Acc};
+        {Before, [Cs0], After} ->
+            case Where of
+                leading ->
+                    [rev([Before|Curr0]),After];
+                trailing ->
+                    <<_/utf8, Cs/binary>> = Cs0,
+                    Next = byte_size(Bin) - byte_size(Cs),
+                    split_1(Bin, Needle, Next, Where, Curr0,
+                            [btoken(Before,Curr0),After]);
+                all ->
+                    Next = byte_size(Bin) - byte_size(After),
+                    <<_:Start/binary, Keep/binary>> = Before,
+                    Curr = [Keep|Curr0],
+                    split_1(Bin, Needle, Next, Where, [], [rev(Curr)|Acc])
+            end
+    end.
+
+%% Collects the lexemes of chardata, i.e. the parts between separator
+%% grapheme clusters. Seps is the {GCs, CPs, BinPattern} tuple built by
+%% search_pattern/1. Ts accumulates lexemes in reverse and is reversed
+%% when the input is exhausted.
+lexemes_m([Bin|Cont0], Seps, Ts) when is_binary(Bin) ->
+ %% Skip leading separators, then pick one lexeme.
+ case bin_search_inv(Bin, Cont0, Seps) of
+ {nomatch,Cont} ->
+ lexemes_m(Cont, Seps, Ts);
+ Cs ->
+ {Lexeme,Rest} = lexeme_pick(Cs, Seps, []),
+ lexemes_m(Rest, Seps, [Lexeme|Ts])
+ end;
+lexemes_m(Cs0, {GCs, _, _}=Seps, Ts) when is_list(Cs0) ->
+ case unicode_util:gc(Cs0) of
+ [C|Cs] ->
+ case lists:member(C, GCs) of
+ true ->
+ lexemes_m(Cs, Seps, Ts);
+ false ->
+ {Lexeme,Rest} = lexeme_pick(Cs0, Seps, []),
+ lexemes_m(Rest, Seps, [Lexeme|Ts])
+ end;
+ [] ->
+ lists:reverse(Ts)
+ end;
+lexemes_m(Bin, Seps, Ts) when is_binary(Bin) ->
+ case bin_search_inv(Bin, [], Seps) of
+ {nomatch,_} ->
+ lists:reverse(Ts);
+ [Cs] ->
+ {Lexeme,Rest} = lexeme_pick(Cs, Seps, []),
+ lexemes_m(Rest, Seps, add_non_empty(Lexeme,Ts))
+ end.
+
+%% Reads one lexeme from the front of chardata, stopping at the first
+%% separator grapheme cluster. Returns {Lexeme, Rest}. Seps is the
+%% {GCs, CPs, BinPattern} tuple built by search_pattern/1; Tkn collects
+%% the lexeme in reverse.
+lexeme_pick([CP|Cs1]=Cs0, {GCs,CPs,_}=Seps, Tkn) when is_integer(CP) ->
+    %% Cheap codepoint test first; the grapheme cluster is only fetched
+    %% when the codepoint could start a separator.
+    case lists:member(CP, CPs) of
+        true ->
+            [GC|Cs2] = unicode_util:gc(Cs0),
+            case lists:member(GC, GCs) of
+                true -> {rev(Tkn), Cs2};
+                false -> lexeme_pick(Cs2, Seps, append(rev(GC),Tkn))
+            end;
+        false -> lexeme_pick(Cs1, Seps, [CP|Tkn])
+    end;
+lexeme_pick([Bin|Cont0], Seps, Tkn) when is_binary(Bin) ->
+    case bin_search(Bin, Cont0, Seps) of
+        {nomatch,_} ->
+            lexeme_pick(Cont0, Seps, [Bin|Tkn]);
+        [Left|_Cont] = Cs ->
+            Bytes = byte_size(Bin) - byte_size(Left),
+            %% Restored binary pattern: bytes of Bin before the separator.
+            <<Lexeme:Bytes/binary, _/binary>> = Bin,
+            {btoken(Lexeme, Tkn), Cs}
+    end;
+lexeme_pick(Cs0, {GCs, CPs, _} = Seps, Tkn) when is_list(Cs0) ->
+    case unicode_util:cp(Cs0) of
+        [CP|Cs] ->
+            case lists:member(CP, CPs) of
+                true ->
+                    [GC|Cs2] = unicode_util:gc(Cs0),
+                    case lists:member(GC, GCs) of
+                        true -> {rev(Tkn), Cs0};
+                        false -> lexeme_pick(Cs2, Seps, append(rev(GC),Tkn))
+                    end;
+                false ->
+                    lexeme_pick(Cs, Seps, append(CP,Tkn))
+            end;
+        [] ->
+            {rev(Tkn), []}
+    end;
+lexeme_pick(Bin, Seps, Tkn) when is_binary(Bin) ->
+    case bin_search(Bin, Seps) of
+        {nomatch,_} ->
+            {btoken(Bin,Tkn), []};
+        [Left] ->
+            Bytes = byte_size(Bin) - byte_size(Left),
+            %% Restored binary pattern: bytes of Bin before the separator.
+            <<Lexeme:Bytes/binary, _/binary>> = Bin,
+            {btoken(Lexeme, Tkn), Left}
+    end.
+
+%% Returns lexeme number N (counting from 1). While N > 1 a lexeme is
+%% skipped with lexeme_skip/2 and N is decremented; the next lexeme is
+%% then read with lexeme_pick/3. Returns [] (list input) or <<>> (binary
+%% input) when the input ends first. Seps is the {GCs, CPs, BinPattern}
+%% tuple built by search_pattern/1.
+nth_lexeme_m([Bin|Cont0], Seps, N) when is_binary(Bin) ->
+ case bin_search_inv(Bin, Cont0, Seps) of
+ {nomatch,Cont} ->
+ nth_lexeme_m(Cont, Seps, N);
+ Cs when N > 1 ->
+ Rest = lexeme_skip(Cs, Seps),
+ nth_lexeme_m(Rest, Seps, N-1);
+ Cs ->
+ {Lexeme,_} = lexeme_pick(Cs, Seps, []),
+ Lexeme
+ end;
+nth_lexeme_m(Cs0, {GCs, _, _}=Seps, N) when is_list(Cs0) ->
+ case unicode_util:gc(Cs0) of
+ [C|Cs] ->
+ case lists:member(C, GCs) of
+ true ->
+ nth_lexeme_m(Cs, Seps, N);
+ false when N > 1 ->
+ Cs1 = lexeme_skip(Cs, Seps),
+ nth_lexeme_m(Cs1, Seps, N-1);
+ false ->
+ {Lexeme,_} = lexeme_pick(Cs0, Seps, []),
+ Lexeme
+ end;
+ [] ->
+ []
+ end;
+nth_lexeme_m(Bin, Seps, N) when is_binary(Bin) ->
+ case bin_search_inv(Bin, [], Seps) of
+ [Cs] when N > 1 ->
+ Cs1 = lexeme_skip(Cs, Seps),
+ nth_lexeme_m(Cs1, Seps, N-1);
+ [Cs] ->
+ {Lexeme,_} = lexeme_pick(Cs, Seps, []),
+ Lexeme;
+ {nomatch,_} ->
+ <<>>
+ end.
+
+%% Skips over one lexeme without building it: advances to the first
+%% separator grapheme cluster and returns the chardata starting at that
+%% separator. Returns [] or <<>> when no separator is found. Seps is the
+%% {GCs, CPs, BinPattern} tuple built by search_pattern/1.
+lexeme_skip([CP|Cs1]=Cs0, {GCs,CPs,_}=Seps) when is_integer(CP) ->
+ case lists:member(CP, CPs) of
+ true ->
+ [GC|Cs2] = unicode_util:gc(Cs0),
+ case lists:member(GC, GCs) of
+ true -> Cs0;
+ false -> lexeme_skip(Cs2, Seps)
+ end;
+ false ->
+ lexeme_skip(Cs1, Seps)
+ end;
+lexeme_skip([Bin|Cont0], Seps) when is_binary(Bin) ->
+ case bin_search(Bin, Cont0, Seps) of
+ {nomatch,_} -> lexeme_skip(Cont0, Seps);
+ Cs -> Cs
+ end;
+lexeme_skip(Cs0, {GCs, CPs, _} = Seps) when is_list(Cs0) ->
+ case unicode_util:cp(Cs0) of
+ [CP|Cs] ->
+ case lists:member(CP, CPs) of
+ true ->
+ [GC|Cs2] = unicode_util:gc(Cs0),
+ case lists:member(GC, GCs) of
+ true -> Cs0;
+ false -> lexeme_skip(Cs2, Seps)
+ end;
+ false ->
+ lexeme_skip(Cs, Seps)
+ end;
+ [] ->
+ []
+ end;
+lexeme_skip(Bin, Seps) when is_binary(Bin) ->
+ case bin_search(Bin, Seps) of
+ {nomatch,_} -> <<>>;
+ [Left] -> Left
+ end.
+
+%% Finds the first occurrence of Needle (a non-empty codepoint list) and
+%% returns the chardata starting at that occurrence, or 'nomatch'.
+find_l([Bin|Cont0], Needle) when is_binary(Bin) ->
+ case bin_search_str(Bin, 0, Cont0, Needle) of
+ {nomatch, _, Cont} ->
+ find_l(Cont, Needle);
+ {_Before, Cs, _After} ->
+ Cs
+ end;
+find_l(Cs0, [C|_]=Needle) when is_list(Cs0) ->
+ case unicode_util:cp(Cs0) of
+ %% C is already bound: this clause only matches when the next
+ %% codepoint equals the first codepoint of the needle.
+ [C|Cs] ->
+ case prefix_1(Cs0, Needle) of
+ nomatch -> find_l(Cs, Needle);
+ _ -> Cs0
+ end;
+ [_C|Cs] ->
+ find_l(Cs, Needle);
+ [] -> nomatch
+ end;
+find_l(Bin, Needle) ->
+ case bin_search_str(Bin, 0, [], Needle) of
+ {nomatch,_,_} -> nomatch;
+ {_Before, [Cs], _After} -> Cs
+ end.
+
+%% Finds the last occurrence of Needle (a non-empty codepoint list).
+%% Res holds the best result so far (initially 'nomatch'); every hit
+%% replaces it and the search continues past the start of that hit.
+%% Returns Res when the input is exhausted.
+find_r([Bin|Cont0], Needle, Res) when is_binary(Bin) ->
+ case bin_search_str(Bin, 0, Cont0, Needle) of
+ {nomatch,_,Cont} ->
+ find_r(Cont, Needle, Res);
+ {_, Cs0, _} ->
+ %% Step one grapheme cluster past the start of the hit.
+ [_|Cs] = unicode_util:gc(Cs0),
+ find_r(Cs, Needle, Cs0)
+ end;
+find_r(Cs0, [C|_]=Needle, Res) when is_list(Cs0) ->
+ case unicode_util:cp(Cs0) of
+ [C|Cs] ->
+ case prefix_1(Cs0, Needle) of
+ nomatch -> find_r(Cs, Needle, Res);
+ _ -> find_r(Cs, Needle, Cs0)
+ end;
+ [_C|Cs] ->
+ find_r(Cs, Needle, Res);
+ [] -> Res
+ end;
+find_r(Bin, Needle, Res) ->
+ case bin_search_str(Bin, 0, [], Needle) of
+ {nomatch,_,_} -> Res;
+ {_Before, [Cs0], _After} ->
+ %% Step one codepoint past the start of the hit.
+ <<_/utf8, Cs/binary>> = Cs0,
+ find_r(Cs, Needle, Cs0)
+ end.
+
+%% These are used to avoid creating lists around binaries
+%% might be unnecessary, is there a better solution?
+%%
+%% btoken(BinPart, RevPrefix) joins a reversed prefix with a trailing
+%% binary part:
+%%   - no prefix: the binary is returned as is
+%%   - a single codepoint prefix: it is prepended UTF-8 encoded, so the
+%%     result stays a binary
+%%   - empty binary part: the prefix is returned in order
+%%   - otherwise: [Prefix, BinPart] chardata
+btoken(Token, []) -> Token;
+btoken(BinPart, [C]) when is_integer(C) -> <<C/utf8, BinPart/binary>>;
+btoken(<<>>, Tkn) -> lists:reverse(Tkn);
+btoken(BinPart, Cs) -> [lists:reverse(Cs),BinPart].
+
+%% Reverses an accumulated token: a single codepoint or a one-element
+%% list holding a binary needs no reversal, any other list is reversed.
+rev(CP) when is_integer(CP) ->
+    CP;
+rev([Bin]) when is_binary(Bin) ->
+    Bin;
+rev(List) when is_list(List) ->
+    lists:reverse(List).
+
+%% Puts a codepoint or a grapheme cluster list in front of chardata
+%% without wrapping an empty binary tail.
+append(Elem, Tail) when is_binary(Tail) ->
+    case Tail of
+        <<>> when is_integer(Elem) -> [Elem];
+        <<>> when is_list(Elem) -> Elem;
+        _ -> [Elem, Tail]
+    end;
+append(Elem, Tail) when is_integer(Elem) ->
+    [Elem|Tail];
+append(Elem, Tail) when is_list(Elem) ->
+    Elem ++ Tail.
+
+%% Conses Top onto Rest, except that an empty Rest yields Top itself and
+%% an empty Top (<<>> or []) yields Rest.
+stack(Top, []) ->
+    Top;
+stack(Top, Rest) when Top =:= <<>>; Top =:= [] ->
+    Rest;
+stack(Top, Rest) ->
+    [Top|Rest].
+
+%% Prepends Token to L unless Token is the empty binary.
+add_non_empty(Token, L) ->
+    case Token of
+        <<>> -> L;
+        _ -> [Token|L]
+    end.
+
+%% Returns the codepoints of Orig that come before the point where the
+%% remainder equals Cont. When Cont is empty, Orig is returned as is.
+cp_prefix(Orig, Cont) ->
+    case unicode_util:cp(Cont) of
+        [First|Rest] -> cp_prefix_1(Orig, First, Rest);
+        [] -> Orig
+    end.
+
+%% Collects codepoints of Orig until one equal to Until is found whose
+%% remainder equals Cont; that codepoint is not included.
+cp_prefix_1(Orig, Until, Cont) ->
+    case unicode_util:cp(Orig) of
+        [CP|Rest] ->
+            case CP =:= Until andalso equal(Rest, Cont) of
+                true -> [];
+                false -> [CP|cp_prefix_1(Rest, Until, Cont)]
+            end
+    end.
+
+
+%% Binary special
+%% Separator search in a bare binary, i.e. with no continuation.
+bin_search(Bin, SearchPattern) ->
+    bin_search(Bin, [], SearchPattern).
+
+%% Searches Bin (followed by the continuation Cont) for the first
+%% separator grapheme cluster. With no separators the answer is always
+%% {nomatch, Cont}; otherwise the scan starts at byte offset 0.
+bin_search(_Bin, Cont, {[], _CPs, _BinPattern}) ->
+    {nomatch, Cont};
+bin_search(Bin, Cont, {GCs, _CPs, BinPattern}) ->
+    bin_search_loop(Bin, 0, BinPattern, Cont, GCs).
+
+%% Need to work with [<<$a>>, <<778/utf8>>],
+%% i.e. å in nfd form $a "COMBINING RING ABOVE"
+%% and PREPEND characters like "ARABIC NUMBER SIGN" 1536 <<216,128>>
+%% combined with other characters are currently ignored.
+%%
+%% Builds the {Seps, FirstCPs, BinPatterns} tuple used by the search
+%% helpers: the separators as given, the first codepoint of each one,
+%% and those codepoints as UTF-8 binaries.
+search_pattern(Seps) ->
+    FirstCPs = search_cp(Seps),
+    {Seps, FirstCPs, bin_pattern(FirstCPs)}.
+
+%% Maps every separator to its first codepoint. A separator is either a
+%% codepoint already or chardata read with unicode_util:cp/1.
+search_cp([]) ->
+    [];
+search_cp([CP|Rest]) when is_integer(CP) ->
+    [CP|search_cp(Rest)];
+search_cp([Cluster|Rest]) ->
+    [First|_] = unicode_util:cp(Cluster),
+    [First|search_cp(Rest)].
+
+%% Encodes each codepoint as a UTF-8 binary, giving a pattern list that
+%% binary:match/2 accepts.
+bin_pattern([CP|Seps]) ->
+    [<<CP/utf8>>|bin_pattern(Seps)];
+bin_pattern([]) -> [].
+
+%% Scans Bin0 from byte offset Start for a separator grapheme cluster.
+%% BinSeps is the binary:match/2 pattern for the first codepoints of the
+%% separators, Seps the separators themselves and Cont the chardata that
+%% follows Bin0.
+%% Returns {nomatch, Rest} when no separator is found, otherwise a list
+%% whose head is the binary starting at the separator.
+bin_search_loop(Bin0, Start, _, Cont, _Seps)
+ when byte_size(Bin0) =< Start; Start < 0 ->
+ {nomatch, Cont};
+bin_search_loop(Bin0, Start, BinSeps, Cont, Seps) ->
+ <<_:Start/binary, Bin/binary>> = Bin0,
+ case binary:match(Bin, BinSeps) of
+ nomatch ->
+ {nomatch,Cont};
+ {Where, _CL} ->
+ <<_:Where/binary, Cont0/binary>> = Bin,
+ Cont1 = stack(Cont0, Cont),
+ %% A first-codepoint hit is only a candidate: the complete grapheme
+ %% cluster has to be one of the separators.
+ [GC|Cont2] = unicode_util:gc(Cont1),
+ case lists:member(GC, Seps) of
+ false ->
+ case Cont2 of
+ %% Cont is already bound, so this only matches when the
+ %% continuation is untouched, i.e. the cluster ended inside Bin0.
+ [BinR|Cont] when is_binary(BinR) ->
+ Next = byte_size(Bin0) - byte_size(BinR),
+ bin_search_loop(Bin0, Next, BinSeps, Cont, Seps);
+ BinR when is_binary(BinR), Cont =:= [] ->
+ Next = byte_size(Bin0) - byte_size(BinR),
+ bin_search_loop(Bin0, Next, BinSeps, Cont, Seps);
+ _ ->
+ {nomatch, Cont2}
+ end;
+ true when is_list(Cont1) ->
+ Cont1;
+ true ->
+ [Cont1]
+ end
+ end.
+
+%% Inverse search: finds the first grapheme cluster that is NOT a
+%% separator. With no separators the input is returned unchanged; a
+%% single separator uses the specialised bin_search_inv_1/2.
+bin_search_inv(Bin, Cont, {Seps, _, _}) ->
+    Input = [Bin|Cont],
+    case Seps of
+        [] -> Input;
+        [Single] -> bin_search_inv_1(Input, Single);
+        _ -> bin_search_inv_n(Input, Seps)
+    end.
+
+%% Skips leading grapheme clusters equal to the single separator Sep.
+%% Returns the chardata list starting at the first other cluster, or
+%% {nomatch, Rest} when the leading binary is used up first.
+bin_search_inv_1([<<>>|CPs], _) ->
+ {nomatch, CPs};
+bin_search_inv_1(CPs = [Bin0|Cont], Sep) when is_binary(Bin0) ->
+ case unicode_util:gc(CPs) of
+ [Sep|Bin] when is_binary(Bin), Cont =:= [] ->
+ bin_search_inv_1([Bin], Sep);
+ %% Cont is already bound: matches only while the continuation is
+ %% untouched, i.e. the cluster ended inside the leading binary.
+ [Sep|[Bin|Cont]=Cs] when is_binary(Bin) ->
+ bin_search_inv_1(Cs, Sep);
+ [Sep|Cs] ->
+ {nomatch, Cs};
+ _ -> CPs
+ end.
+
+%% Same as bin_search_inv_1/2 for a list of separators: skips leading
+%% grapheme clusters that are members of Seps. Returns the chardata list
+%% starting at the first non-separator, or {nomatch, Rest} when the
+%% leading binary is used up first.
+bin_search_inv_n([<<>>|CPs], _) ->
+ {nomatch, CPs};
+bin_search_inv_n([Bin0|Cont]=CPs, Seps) when is_binary(Bin0) ->
+ [C|Cs0] = unicode_util:gc(CPs),
+ case {lists:member(C, Seps), Cs0} of
+ {true, Cs} when is_binary(Cs), Cont =:= [] ->
+ bin_search_inv_n([Cs], Seps);
+ %% Cont is already bound: matches only while the continuation is
+ %% untouched, i.e. the cluster ended inside the leading binary.
+ {true, [Bin|Cont]=Cs} when is_binary(Bin) ->
+ bin_search_inv_n(Cs, Seps);
+ {true, Cs} -> {nomatch, Cs};
+ {false, _} -> CPs
+ end.
+
+%% Searches Bin0 from byte offset Start for the codepoint list
+%% SearchCPs; Cont is the chardata that follows Bin0.
+%% Returns {Before, [MatchStart|Cont], After} on a hit, where Before is
+%% the part of Bin0 preceding the hit, or {nomatch, Size, Rest}
+%% otherwise.
+bin_search_str(Bin0, Start, Cont, [CP|_]=SearchCPs) ->
+    <<_:Start/binary, Bin/binary>> = Bin0,
+    %% Restored binary construction: look for the UTF-8 encoding of the
+    %% first codepoint of the needle.
+    case binary:match(Bin, <<CP/utf8>>) of
+        nomatch -> {nomatch, byte_size(Bin0), Cont};
+        {Where0, _} ->
+            Where = Start+Where0,
+            %% Restored binary pattern: split Bin0 at the candidate.
+            <<Keep:Where/binary, Cs0/binary>> = Bin0,
+            [GC|Cs]=unicode_util:gc(Cs0),
+            case prefix_1(stack(Cs0,Cont), SearchCPs) of
+                nomatch when is_binary(Cs) ->
+                    %% Candidate failed: retry after its grapheme cluster.
+                    KeepSz = byte_size(Bin0) - byte_size(Cs),
+                    bin_search_str(Bin0, KeepSz, Cont, SearchCPs);
+                nomatch ->
+                    {nomatch, Where, stack([GC|Cs],Cont)};
+                [] ->
+                    {Keep, [Cs0|Cont], <<>>};
+                Rest ->
+                    {Keep, [Cs0|Cont], Rest}
+            end
+    end.
+
+
+%%---------------------------------------------------------------------------
+%% OLD lists API kept for backwards compatibility
+%%---------------------------------------------------------------------------
+
%% Robert's bit
%% len(String)
@@ -68,12 +1292,12 @@ len(S) -> length(S).
%% equal(String1, String2)
%% Test if 2 strings are equal.
--spec equal(String1, String2) -> boolean() when
- String1 :: string(),
- String2 :: string().
+%% -spec equal(String1, String2) -> boolean() when
+%% String1 :: string(),
+%% String2 :: string().
-equal(S, S) -> true;
-equal(_, _) -> false.
+%% equal(S, S) -> true;
+%% equal(_, _) -> false.
%% concat(String1, String2)
%% Concatenate 2 strings.
@@ -127,7 +1351,7 @@ rchr([], _C, _I, L) -> L.
str(S, Sub) when is_list(Sub) -> str(S, Sub, 1).
str([C|S], [C|Sub], I) ->
- case prefix(Sub, S) of
+ case l_prefix(Sub, S) of
true -> I;
false -> str(S, [C|Sub], I+1)
end;
@@ -142,16 +1366,16 @@ str([], _Sub, _I) -> 0.
rstr(S, Sub) when is_list(Sub) -> rstr(S, Sub, 1, 0).
rstr([C|S], [C|Sub], I, L) ->
- case prefix(Sub, S) of
+ case l_prefix(Sub, S) of
true -> rstr(S, [C|Sub], I+1, I);
false -> rstr(S, [C|Sub], I+1, L)
end;
rstr([_|S], Sub, I, L) -> rstr(S, Sub, I+1, L);
rstr([], _Sub, _I, L) -> L.
-prefix([C|Pre], [C|String]) -> prefix(Pre, String);
-prefix([], String) when is_list(String) -> true;
-prefix(Pre, String) when is_list(Pre), is_list(String) -> false.
+l_prefix([C|Pre], [C|String]) -> l_prefix(Pre, String);
+l_prefix([], String) when is_list(String) -> true;
+l_prefix(Pre, String) when is_list(Pre), is_list(String) -> false.
%% span(String, Chars) -> Length.
%% cspan(String, Chars) -> Length.
@@ -229,9 +1453,9 @@ tokens(S, Seps) ->
[_|_] -> [S]
end;
[C] ->
- tokens_single_1(reverse(S), C, []);
+ tokens_single_1(lists:reverse(S), C, []);
[_|_] ->
- tokens_multiple_1(reverse(S), Seps, [])
+ tokens_multiple_1(lists:reverse(S), Seps, [])
end.
tokens_single_1([Sep|S], Sep, Toks) ->
@@ -342,8 +1566,8 @@ sub_word(String, Index, Char) when is_integer(Index), is_integer(Char) ->
s_word(strip(String, left, Char), Index, Char, 1, [])
end.
-s_word([], _, _, _,Res) -> reverse(Res);
-s_word([Char|_],Index,Char,Index,Res) -> reverse(Res);
+s_word([], _, _, _,Res) -> lists:reverse(Res);
+s_word([Char|_],Index,Char,Index,Res) -> lists:reverse(Res);
s_word([H|T],Index,Char,Index,Res) -> s_word(T,Index,Char,Index,[H|Res]);
s_word([Char|T],Stop,Char,Index,Res) when Index < Stop ->
s_word(strip(T,left,Char),Stop,Char,Index+1,Res);
@@ -359,7 +1583,7 @@ strip(String) -> strip(String, both).
-spec strip(String, Direction) -> Stripped when
String :: string(),
Stripped :: string(),
- Direction :: left | right | both.
+ Direction :: 'left' | 'right' | 'both'.
strip(String, left) -> strip_left(String, $\s);
strip(String, right) -> strip_right(String, $\s);
@@ -369,7 +1593,7 @@ strip(String, both) ->
-spec strip(String, Direction, Character) -> Stripped when
String :: string(),
Stripped :: string(),
- Direction :: left | right | both,
+ Direction :: 'left' | 'right' | 'both',
Character :: char().
strip(String, right, Char) -> strip_right(String, Char);
diff --git a/lib/stdlib/test/string_SUITE.erl b/lib/stdlib/test/string_SUITE.erl
index 836f9e5142..a78ddf761b 100644
--- a/lib/stdlib/test/string_SUITE.erl
+++ b/lib/stdlib/test/string_SUITE.erl
@@ -29,25 +29,46 @@
-export([init_per_testcase/2, end_per_testcase/2]).
%% Test cases must be exported.
--export([len/1,equal/1,concat/1,chr_rchr/1,str_rstr/1]).
--export([span_cspan/1,substr/1,tokens/1,chars/1]).
+-export([is_empty/1, length/1, to_graphemes/1,
+ reverse/1, slice/1,
+ equal/1,
+ pad/1, trim/1, chomp/1, take/1,
+ uppercase/1, lowercase/1, titlecase/1, casefold/1,
+ prefix/1, split/1, replace/1, find/1,
+ lexemes/1, nth_lexeme/1, cd_gc/1, meas/1
+ ]).
+
+-export([len/1,old_equal/1,old_concat/1,chr_rchr/1,str_rstr/1]).
+-export([span_cspan/1,substr/1,old_tokens/1,chars/1]).
-export([copies/1,words/1,strip/1,sub_word/1,left_right/1]).
-export([sub_string/1,centre/1, join/1]).
-export([to_integer/1,to_float/1]).
-export([to_upper_to_lower/1]).
+%% Run tests when debugging them
+-export([debug/0]).
+
suite() ->
[{ct_hooks,[ts_install_cth]},
{timetrap,{minutes,1}}].
-all() ->
- [len, equal, concat, chr_rchr, str_rstr, span_cspan,
- substr, tokens, chars, copies, words, strip, sub_word,
- left_right, sub_string, centre, join, to_integer,
- to_float, to_upper_to_lower].
+%% Runs both test groups defined in groups/0.
+all() ->
+ [{group, chardata}, {group, list_string}].
-groups() ->
- [].
+%% Two groups: 'chardata' holds the test cases named after the new
+%% chardata functions, 'list_string' the test cases for the old API
+%% (with equal, concat and tokens renamed to old_*).
+groups() ->
+ [{chardata,
+ [is_empty, length, to_graphemes,
+ equal, reverse, slice,
+ pad, trim, chomp, take,
+ lexemes, nth_lexeme,
+ uppercase, lowercase, titlecase, casefold,
+ prefix, find, split, replace, cd_gc,
+ meas]},
+ {list_string,
+ [len, old_equal, old_concat, chr_rchr, str_rstr, span_cspan,
+ substr, old_tokens, chars, copies, words, strip, sub_word,
+ left_right, sub_string, centre, join, to_integer,
+ to_float, to_upper_to_lower]}].
init_per_suite(Config) ->
Config.
@@ -68,8 +89,839 @@ init_per_testcase(_Case, Config) ->
end_per_testcase(_Case, _Config) ->
ok.
+debug() -> %% Call every test case listed in groups/0 directly and print each return value.
+ Config = [{data_dir, ?MODULE_STRING++"_data"}], %% meas/1 looks up data_dir in this list
+ [io:format("~p:~p~n",[Test,?MODULE:Test(Config)]) ||
+ {_,Tests} <- groups(), Test <- Tests].
+
+-define(TEST(B,C,D), test(?LINE,?FUNCTION_NAME,B,C,D, true)).
+-define(TEST_EQ(B,C,D), %% Run the check, then once more with B and hd(C) swapped.
+ test(?LINE,?FUNCTION_NAME,B,C,D, true),
+ test(?LINE,?FUNCTION_NAME,hd(C),[B|tl(C)],D, true)).
+
+-define(TEST_NN(B,C,D),
+ test(?LINE,?FUNCTION_NAME,B,C,D, false),
+ test(?LINE,?FUNCTION_NAME,hd(C),[B|tl(C)],D, false)).
+
+
+is_empty(_) ->
+ ?TEST("", [], true),
+ ?TEST([""|<<>>], [], true),
+ ?TEST("a", [], false),
+ ?TEST([""|<<$a>>], [], false),
+ ?TEST(["",[<<>>]], [], true),
+ ok.
+
+length(_) -> %% string:length/1 counts grapheme clusters and crashes on non-chardata input.
+ %% invalid arg type
+ {'EXIT',_} = (catch string:length({})),
+ {'EXIT',_} = (catch string:length(foo)),
+ %% Valid chardata input
+ ?TEST("", [], 0),
+ ?TEST([""|<<>>], [], 0),
+ L = tuple_size(list_to_tuple(atom_to_list(?MODULE))), %% reference length computed without string:length/1
+ ?TEST(atom_to_list(?MODULE), [], L),
+ ?TEST("Hello", [], 5),
+ ?TEST("UC Ω ßð", [], 7),
+ ?TEST(["abc"|<<"abc">>], [], 6),
+ ?TEST(["abc",["def"]], [], 6),
+ ?TEST([<<97/utf8, 778/utf8, 98/utf8>>, [776,111,776]], [], 3), %% three decomposed clusters: a+778, b+776, o+776
+ ok.
+
+equal(_) ->
+ %% invalid arg type
+ {'EXIT',_} = (catch string:equal(1, 2)),
+ {'EXIT',_} = (catch string:equal(1, 2, foo)),
+ {'EXIT',_} = (catch string:equal(1, 2, true, foo)),
+
+ ?TEST("", [<<"">>], true),
+ ?TEST("Hello", ["Hello"], true),
+ ?TEST("Hello", ["Hell"], false),
+ ?TEST("Hello", ["Hello!"], false),
+ ?TEST("Hello", [<<"Hello"/utf8>>], true),
+ ?TEST("Hello", [<<"Mello"/utf8>>], false),
+ ?TEST("Hello", [<<"Hello!"/utf8>>], false),
+ ?TEST(["Hello",[" deep"]], ["Hello deep"], true),
+ ?TEST(["Hello",[<<" deep"/utf8>>]], ["Hello deep"], true),
+ ?TEST("Hello deep", [["Hello", [" deep"]]], true),
+ ?TEST("Hello deep", [["Hello", [" d!eep"]]], false),
+ ?TEST("Hello deep", [["Hello", [<<" deep"/utf8>>]]], true),
+ false = string:equal("Åäö", [<<97/utf8, 778/utf8, 98/utf8>>, [776,111,776]]), %% nfc vs nfd
+
+ %% case_insensitive_equal()
+ ?TEST("", ["", true], true),
+ ?TEST("a", ["b", true], false),
+ ?TEST("", [<<>>, true], true),
+ ?TEST("", [[<<>>,[]], true], true),
+ ?TEST("", [[<<>>,[$a]], true], false),
+ ?TEST("123", ["123", true], true),
+ ?TEST("abc", ["abc", true], true),
+ ?TEST([[],<<>>,"ABC"|<<>>], [["abc",[]], true], true),
+ ?TEST("ABCa", ["abcå", true], false),
+ ?TEST("åäö", [{norm,"åäö"}, true], true),
+ ?TEST("ÅÄÖ", [{norm,"åäö"}, true], true),
+ ?TEST("MICHAŁ", ["michał", true], true),
+ ?TEST(["Mic",<<"HAŁ"/utf8>>], ["michał", true], true),
+ ?TEST("ß SHARP S", ["ss sharp s", true], true),
+ ?TEST("ẞ SHARP S", [[<<$ß/utf8, $\s>>,"SHARP S"], true], true),
+ ?TEST("ẞ SHARP ß", ["ss sharp s", true], false),
+ ?TEST(<<"İ I WITH DOT ABOVE"/utf8>>, ["i̇ i with dot above", true], true),
+ %% These should be equivalent with the above
+ true = string:equal(string:casefold(["Mic",<<"HAŁ"/utf8>>]), string:casefold("michał")),
+ true = string:equal(string:casefold("ẞ SHARP S"), string:casefold([<<$ß/utf8, $\s>>,"SHARP S"])),
+ false = string:equal(string:casefold("ẞ SHARP ß"), string:casefold("ss sharp s")),
+
+ %% Normalization
+ ?TEST_NN("", ["", true, none], true),
+ ?TEST_NN("a", ["b", true, nfc], false),
+ ?TEST_NN("a", ["b", true, nfd], false),
+ ?TEST_NN("a", ["b", true, nfkc], false),
+ ?TEST_NN("a", ["b", true, nfkd], false),
+
+ ?TEST_NN("a", ["A", false, nfc], false),
+ ?TEST_NN("a", ["A", false, nfd], false),
+ ?TEST_NN([<<>>,"a"|<<>>], ["A", true, nfkc], true),
+ ?TEST_NN(<<"a">>, ["A", true, nfkd], true),
+
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, none], false),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfc], true),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfd], true),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfkc], true),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfkd], true),
+
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, none], false),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", false, nfc], false),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfc], true),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfd], true),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfkc], true),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfkd], true),
+
+ ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, none], false),
+ ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, nfc], false),
+ ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, nfd], false),
+ ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, nfkc], true),
+ ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, nfkd], true),
+
+ ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, none], false),
+ ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, nfc], false),
+ ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, nfd], false),
+ ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, nfkc], true),
+ ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, nfkd], true),
+
+ %% Coverage.
+ ?TEST("", [<<"">>, false, nfc], true),
+ ?TEST("", [<<"">>, true, nfc], true),
+
+ ok.
+
+to_graphemes(_) ->
+ %% More tests are in unicode_util_SUITE.erl
+ {'EXIT', _} = (catch unicode:characters_to_nfd_binary(["asdåäö", an_atom])),
+ String = ["abc..åäö", $e, 788, <<"Ωµe`è"/utf8>>, "œŒþæÆħ§ß"],
+ NFD = unicode:characters_to_nfd_list(String),
+ [] = string:to_graphemes([]),
+ [] = string:to_graphemes(<<>>),
+ GCs = string:to_graphemes(String),
+ true = erlang:length(GCs) =:= string:length(String),
+ true = erlang:length(GCs) =:= erlang:length(string:to_graphemes(NFD)),
+ true = erlang:length(GCs) =:=
+ erlang:length(string:to_graphemes(unicode:characters_to_nfc_list(String))),
+ ok.
+
+reverse(_) ->
+ {'EXIT',_} = (catch string:reverse(2)),
+ Str1 = "Hello ",
+ Str2 = "Ω ßð",
+ Str3 = "åäö",
+ ?TEST("", [], ""),
+ ?TEST(Str1, [], lists:reverse(Str1)),
+ ?TEST(Str2, [], lists:reverse(Str2)),
+ ?TEST(Str3, [], lists:reverse(Str3)),
+ true = string:reverse(Str3) =:= lists:reverse(string:to_graphemes(Str3)),
+ ok.
+
+slice(_) ->
+ {'EXIT',_} = (catch string:slice(2, 2, 2)),
+ {'EXIT',_} = (catch string:slice("asd", foo, 2)),
+ {'EXIT',_} = (catch string:slice("asd", 2, -1)),
+ ?TEST("", [3], ""),
+ ?TEST("aåä", [1, 0], ""),
+ ?TEST("aåä", [3], ""),
+ ?TEST("aåäöbcd", [3], "öbcd"),
+ ?TEST([<<"aå"/utf8>>,"äöbcd"], [3], "öbcd"),
+ ?TEST([<<"aåä"/utf8>>,"öbcd"], [3], "öbcd"),
+ ?TEST([<<"aåä"/utf8>>,"öbcd"], [3, infinity], "öbcd"),
+
+ ?TEST("", [3, 2], ""),
+ ?TEST("aåä", [3, 2], ""),
+ ?TEST("aåäöbcd", [3,2], "öb"),
+ ?TEST([<<"aå"/utf8>>,"äöbcd"], [3,3], "öbc"),
+ ?TEST([<<"aåä"/utf8>>,"öbcd"], [3,10], "öbcd"),
+
+ ok.
+
+pad(_) -> %% string:pad/2,3,4: pad with grapheme clusters up to the requested length.
+ Str = "Hallå",
+ ?TEST(Str, [7], "Hallå  "), %% 5 graphemes padded to 7: two trailing spaces
+ ?TEST(Str, [7, leading], "  Hallå"), %% 5 graphemes padded to 7: two leading spaces
+ ?TEST(Str, [4, both, $.], "Hallå"), %% already longer than 4: returned unchanged
+ ?TEST(Str, [10, both, $.], "..Hallå..."),
+ ?TEST(Str, [10, leading, $.], ".....Hallå"),
+ ?TEST(Str, [10, trailing, $.], "Hallå....."),
+ ?TEST(Str++["f"], [10, trailing, $.], "Hallåf...."),
+ ?TEST(Str++[" flåwer"], [10, trailing, $.], "Hallå flåwer"),
+ ok.
+
+trim(_) ->
+ Str = "\t\s..Ha\s.llå..\t\n\r",
+ ?TEST("", [], ""),
+ ?TEST(Str, [both, "x"], Str),
+ ?TEST(Str, [leading], "..Ha\s.llå..\t\n\r"),
+ ?TEST(Str, [trailing], "\t\s..Ha\s.llå.."),
+ ?TEST(Str, [], "..Ha .llå.."),
+ ?TEST(".. ", [both, ""], ".. "),
+ ?TEST([<<".. ">>], [both, ". "], ""),
+ ?TEST(".. h.ej ..", [leading, ". "], "h.ej .."),
+ ?TEST(".. h.ej ..", [trailing, ". "], ".. h.ej"),
+ ?TEST(".. h.ej ..", [both, ". "], "h.ej"),
+ ?TEST(["..", <<"h.ej">>, ".."], [both, ". "], "h.ej"),
+ ?TEST([[], "..", " h.ej ", <<"..">>], [both, ". "], "h.ej"),
+ ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [both, ". "], "h.ej"),
+ ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [trailing, ". "], ".. h.ej"),
+ ?TEST([<<".. h.ej .">>, <<"..">>], [both, ". "], "h.ej"),
+ ?TEST(["..h", ".e", <<"j..">>], [both, ". "], "h.ej"),
+ ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [both, ". "], "h.ejsan"),
+ %% Test that it behaves with graphemes (i.e. nfd tests are the hard part)
+ ?TEST("aaåaa", [both, "a"], "å"),
+ ?TEST(["aaa",778,"äöoo"], [both, "ao"], "åäö"),
+ ?TEST([<<"aaa">>,778,"äöoo"], [both, "ao"], "åäö"),
+ ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [both, [[$e,778]]], "åäö"),
+ ?TEST([[<<"!v">>|<<204,128,$v,204,129>>]],[trailing, [[$v,769]]], [$!,$v,768]),
+ ?TEST([[[<<"v">>|<<204,129,118,204,128,118>>],769,118,769]], [trailing, [[118,769]]], [$v,769,$v,768]),
+ ?TEST([<<"vv">>|<<204,128,118,204,128>>], [trailing, [[118,768]]], "v"),
+ ok.
+
+chomp(_) ->
+ Str = "åäö\na\r\nsd\n",
+ Res = "åäö\na\r\nsd",
+ ?TEST("", [], ""),
+ ?TEST("\n", [], ""),
+ ?TEST("str \t", [], "str \t"),
+ ?TEST("str \t\n\r", [], "str \t\n\r"),
+ ?TEST(Str, [], Res),
+ ?TEST([Str,$\n], [], Res),
+ ?TEST([Str|"\n"], [], Res),
+ ?TEST([Str|<<"\n">>], [], Res),
+ ?TEST([Str,$\r|<<"\n">>], [], Res),
+ ?TEST([Str, <<$\r>>|"\n"], [], Res),
+ ?TEST([<<$a,$\r>>,"\na\n"], [], "a\r\na"),
+ ok.
+
+take(_) ->
+ Str = "\t\s..Ha\s.llå..\t\n\r",
+ WS = "\t\s\n\r",
+ Chars = lists:seq($a,$z)++lists:seq($A,$Z),
+ %% complement=false, dir=leading
+ ?TEST("", ["abc"], {"",""}),
+ ?TEST(Str, ["x"], {[], Str}),
+ ?TEST(Str, [WS], {"\t\s","..Ha\s.llå..\t\n\r"}),
+ ?TEST(".. ", ["", false], {"", ".. "}),
+ ?TEST([<<".. ">>], [". ", false, leading], {".. ", ""}),
+ ?TEST(".. h.ej ..", [". ", false, leading], {".. ", "h.ej .."}),
+ ?TEST(["..", <<"h.ej">>, ".."], [". ", false, leading], {"..", "h.ej.."}),
+ ?TEST([[], "..", " h.ej ", <<"..">>], [". ", false, leading], {".. ","h.ej .."}),
+ ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [". ", false, leading], {".. ", "h.ej .."}),
+ ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [". ", false, leading], {"..", "h.ejsan.."}),
+ ?TEST([[<<101,204,138,33>>]], [[[$e,778]]], {[$e,778], "!"}),
+ %% Test that it behaves with graphemes (i.e. nfd tests are the hard part)
+ ?TEST("aaåaa", ["a", false, leading], {"aa", "åaa"}),
+ ?TEST(["aaa",778,"äöoo"], ["ao", false, leading], {"aa", "åäöoo"}),
+ ?TEST([<<"aaa">>,778,"äöoo"], ["ao",false,leading], {"aa", "åäöoo"}),
+ ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [[[$e,778]], false, leading], {[$e,778],"åäöe"++[778]}),
+
+ %% complement=true, dir=leading
+ ?TEST("", ["abc", true], {"",""}),
+ ?TEST(Str, ["x", true], {Str, []}),
+ ?TEST(Str, [Chars, true], {"\t\s..","Ha\s.llå..\t\n\r"}),
+ ?TEST(".. ", ["",true], {".. ", ""}),
+ ?TEST([<<".. ">>], [Chars, true, leading], {".. ", ""}),
+ ?TEST(".. h.ej ..", [Chars, true, leading], {".. ", "h.ej .."}),
+ ?TEST(["..", <<"h.ej">>, ".."], [Chars, true, leading], {"..", "h.ej.."}),
+ ?TEST([[], "..", " h.ej ", <<"..">>], [Chars, true, leading], {".. ","h.ej .."}),
+ ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [Chars, true, leading], {".. ", "h.ej .."}),
+ ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [Chars, true, leading], {"..", "h.ejsan.."}),
+ %% Test that it behaves with graphemes (i.e. nfd tests are the hard part)
+ ?TEST(["aaee",778,"äöoo"], [[[$e,778]], true, leading], {"aae", [$e,778|"äöoo"]}),
+ ?TEST([<<"aae">>,778,"äöoo"], [[[$e,778]],true,leading], {"aa", [$e,778|"äöoo"]}),
+ ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [[[$e,778]], true, leading], {[], [$e,778]++"åäöe"++[778]}),
+
+ %% complement=false, dir=trailing
+ ?TEST(Str, ["", false, trailing], {Str, []}),
+ ?TEST(Str, ["x", false, trailing], {Str, []}),
+ ?TEST(Str, [WS, false,trailing], {"\t\s..Ha\s.llå..", "\t\n\r"}),
+ ?TEST(".. h.ej ..", [". ", false, trailing], {".. h.ej", " .."}),
+ ?TEST(["..", <<"h.ej">>, ".."], [". ", false, trailing], {"..h.ej", ".."}),
+ ?TEST([[], "..", " h.ej ", <<"..">>], [". ", false, trailing], {".. h.ej", " .."}),
+ ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [". ", false, trailing], {".. h.ej", " .."}),
+ ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [". ", false, trailing], {"..h.ejsan", ".."}),
+ ?TEST("aaåaa", ["a", false, trailing], {"aaå", "aa"}),
+ ?TEST([<<"KMШ"/utf8>>], [[1064], false, trailing], {"KMШ",[]}),
+ ?TEST([[<<"!\"">>|<<"\"">>]], ["\"", false, trailing], {"!", "\"\""}),
+ ?TEST([<<$v>>, 769], [[[$v,769]], false, trailing], {"", [$v,769]}),
+ ?TEST(["aaa",778,"äöoo"], ["ao", false, trailing], {"aaåäö", "oo"}),
+ ?TEST([<<"aaa">>,778,"äöoo"], ["ao", false, trailing], {"aaåäö", "oo"}),
+ ?TEST([<<"e">>,778,"åäöee", <<778/utf8>>], [[[$e,778]], false, trailing], {[$e,778|"åäöe"], [$e,778]}),
+
+ %% complement=true, dir=trailing
+ ?TEST("", ["abc", true, trailing], {"",""}),
+ ?TEST(Str, ["x", true, trailing], {[], Str}),
+ %?TEST(Str, [{norm,Chars}, true, trailing], {"\t\s..Ha\s.ll","å..\t\n\r"}),
+ ?TEST(".. ", ["", true, trailing], {"", ".. "}),
+ ?TEST([<<".. ">>], [Chars, true, trailing], {"", ".. "}),
+ ?TEST(".. h.ej ..", [Chars, true, trailing], {".. h.ej", " .."}),
+ ?TEST(["..", <<"h.ej">>, ".."], [Chars, true, trailing], {"..h.ej", ".."}),
+ ?TEST([[], "..", " h.ej ", <<"..">>], [Chars, true, trailing], {".. h.ej"," .."}),
+ ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [Chars, true, trailing], {".. h.ej"," .."}),
+ ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [Chars, true, trailing], {"..h.ejsan", ".."}),
+ ?TEST([[<<101,204,138,33>>]], [[[$e,778]], true, trailing], {[$e,778], "!"}),
+ ?TEST([<<"Fa">>], [[$F], true, trailing], {"F", "a"}),
+ ?TEST([[<<101,101,204,138>>,1045,778]], ["e", true, trailing], {"e", [101,778,1045,778]}),
+ ?TEST([[<<101,101,204,138>>,<<1045/utf8,778/utf8>>]], ["e", true, trailing], {"e", [101,778,1045,778]}),
+ ?TEST([[[118,769,118],<<204,129,118,204,129,120,204,128,118>>,768,120,768]],
+ [[[118,769]], true, trailing], {[118,769,118,769,118,769],[120,768,118,768,120,768]}),
+ ?TEST([[<<118,204,128,118>>|<<204,128,118,204,128,118,204,128,206,132,204,129,206,132,204,129>>]],
+ [[[118,768]], true, trailing], {[118,768,118,768,118,768,118,768], [900,769,900,769]}),
+ %% Test that it behaves with graphemes (i.e. nfd tests are the hard part)
+ ?TEST(["aaee",778,"äöoo"], [[[$e,778]], true, trailing], {"aae"++[$e,778], "äöoo"}),
+ ?TEST([<<"aae">>,778,"äöoo"], [[[$e,778]],true,trailing], {"aa"++[$e,778], "äöoo"}),
+ ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [[[$e,778]], true, trailing], {[$e,778]++"åäöe"++[778], []}),
+ ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>, $e, 779], [[[$e,778]], true, trailing],
+ {[$e,778]++"åäöe"++[778], [$e,779]}),
+
+ ok.
+
+
+uppercase(_) ->
+ ?TEST("", [], ""),
+ ?TEST("123", [], "123"),
+ ?TEST("abc", [], "ABC"),
+ ?TEST("ABC", [], "ABC"),
+ ?TEST("abcdefghiljklmnopqrstvxyzåäö",[], "ABCDEFGHILJKLMNOPQRSTVXYZÅÄÖ"),
+ ?TEST("åäö", [], "ÅÄÖ"),
+ ?TEST("ÅÄÖ", [], "ÅÄÖ"),
+ ?TEST("Michał", [], "MICHAŁ"),
+ ?TEST(["Mic",<<"hał"/utf8>>], [], "MICHAŁ"),
+ ?TEST("ljLJ", [], "LJLJ"),
+ ?TEST("LJlj", [], "LJLJ"),
+ ?TEST("ß sharp s", [], "SS SHARP S"),
+ ok.
+
+lowercase(_) ->
+ ?TEST("", [], ""),
+ ?TEST("123", [], "123"),
+ ?TEST("abc", [], "abc"),
+ ?TEST("ABC", [], "abc"),
+ ?TEST("åäö", [], "åäö"),
+ ?TEST("ÅÄÖ", [], "åäö"),
+ ?TEST("MICHAŁ", [], "michał"),
+ ?TEST(["Mic",<<"HAŁ"/utf8>>], [], "michał"),
+ ?TEST("ß SHARP S", [], "ß sharp s"),
+ ?TEST("İ I WITH DOT ABOVE", [], "i̇ i with dot above"),
+ ok.
+
+titlecase(_) ->
+ ?TEST("", [], ""),
+ ?TEST("123", [], "123"),
+ %% Titlecase is the same as uppercase for most chars
+ [?TEST([C,$x], [], string:uppercase([C])++[$x]) ||
+ C <-"abcdefghiljklmnopqrstvxyzåäö"],
+ %% Example of a different mapping
+ ?TEST("ljusad", [],"Ljusad"),
+ ?TEST("ljLJ", [], "LjLJ"),
+ ?TEST("LJlj", [], "Ljlj"),
+ ?TEST("ß sharp s", [], "Ss sharp s"),
+ ok.
+
+casefold(_) ->
+ ?TEST("", [], ""),
+ ?TEST("123", [], "123"),
+ ?TEST("abc", [], "abc"),
+ ?TEST("ABC", [], "abc"),
+ ?TEST("åäö", [], "åäö"),
+ ?TEST("ÅÄÖ", [], "åäö"),
+ ?TEST("MICHAŁ", [], "michał"),
+ ?TEST(["Mic",<<"HAŁ"/utf8>>], [], "michał"),
+ ?TEST("ß SHARP S", [], "ss sharp s"),
+ ?TEST("ẞ SHARP S", [], "ss sharp s"),
+ ?TEST("İ I WITH DOT ABOVE", [], "i̇ i with dot above"),
+ ok.
+
+prefix(_) ->
+ ?TEST("", ["a"], nomatch),
+ ?TEST("a", [""], "a"),
+ ?TEST("b", ["a"], nomatch),
+ ?TEST("a", ["a"], ""),
+ ?TEST("å", ["a"], nomatch),
+ ?TEST(["a",<<778/utf8>>], ["a"], nomatch),
+ ?TEST([<<"a"/utf8>>,778], ["a"], nomatch),
+ ?TEST("hejsan", [""], "hejsan"),
+ ?TEST("hejsan", ["hej"], "san"),
+ ?TEST("hejsan", ["hes"], nomatch),
+ ?TEST(["h", "ejsan"], ["hej"], "san"),
+ ?TEST(["h", "e", "jsan"], ["hej"], "san"),
+ ?TEST(["h", "e", "san"], ["hej"], nomatch),
+ ?TEST(["h", <<"ejsan">>], ["hej"], "san"),
+ ?TEST(["h", <<"e">>, "jsan"], ["hej"], "san"),
+ ?TEST(["h", "e", <<"jsan">>], ["hej"], "san"),
+ ok.
+
+split(_) ->
+ Mod = fun(Res) ->
+ [lists:flatten(unicode:characters_to_nfc_list(io_lib:format("~ts", [Str])))
+ || Str <- Res] end,
+ ?TEST("..", ["", leading], {Mod, [".."]}),
+ ?TEST("..", ["..", leading], {Mod, [[],[]]}),
+ ?TEST("abcd", ["..", leading], {Mod, ["abcd"]}),
+ ?TEST("ab..bc", ["..", leading], {Mod, ["ab","bc"]}),
+ ?TEST("ab..bc..cd", ["..", leading], {Mod, ["ab","bc..cd"]}),
+ ?TEST("..ab", [".."], {Mod, [[],"ab"]}),
+ ?TEST("ab..", ["..", leading], {Mod, ["ab",[]]}),
+ ?TEST(["ab..bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}),
+ ?TEST(["ab","..bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}),
+ ?TEST(["ab",<<"..bc..cd">>], ["..", leading], {Mod, ["ab","bc..cd"]}),
+ ?TEST(["ab.",".bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}),
+ ?TEST(["ab.",<<".bc..cd">>], ["..", leading], {Mod, ["ab","bc..cd"]}),
+ ?TEST(["ab..","bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}),
+ ?TEST(["ab..",<<"bc..cd">>], ["..", leading], {Mod, ["ab","bc..cd"]}),
+ ?TEST(["ab.","bc..cd"], ["..", leading], {Mod, ["ab.bc","cd"]}),
+ ?TEST("ab...bc", ["..", leading], {Mod, ["ab",".bc"]}),
+
+ ?TEST("..", ["", trailing], {Mod, [".."]}),
+ ?TEST("..", ["..", trailing], {Mod, [[],[]]}),
+ ?TEST("abcd", ["..", trailing], {Mod, ["abcd"]}),
+ ?TEST("ab..bc", ["..", trailing], {Mod, ["ab","bc"]}),
+ ?TEST("ab..bc..cd", ["..", trailing], {Mod, ["ab..bc","cd"]}),
+ ?TEST("..ab", ["..", trailing], {Mod, [[],"ab"]}),
+ ?TEST("ab..", ["..", trailing], {Mod, ["ab",[]]}),
+ ?TEST(["ab..bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+ ?TEST(["ab","..bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+ ?TEST(["ab"|<<"a">>], ["a", trailing], {Mod, ["ab",[]]}),
+ ?TEST(["ab",<<"..bc..cd">>], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+ ?TEST([<<"ab.">>,".bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+ ?TEST(["ab.",<<".bc..cd">>], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+ ?TEST(["ab..","bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+ ?TEST(["ab..",<<"bc..cd">>], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+ ?TEST(["ab.","bc..cd"], ["..", trailing], {Mod, ["ab.bc","cd"]}),
+ ?TEST("ab...bc", ["..", trailing], {Mod, ["ab.","bc"]}),
+
+ ?TEST("..", ["..", all], {Mod, [[],[]]}),
+ ?TEST("abcd", ["..", all], {Mod, ["abcd"]}),
+ ?TEST("a..b", ["..", all], {Mod, ["a","b"]}),
+ ?TEST("a..b..c", ["..", all], {Mod, ["a","b","c"]}),
+ ?TEST("a..", ["..", all], {Mod, ["a",[]]}),
+ ?TEST(["a..b..c"], ["..", all], {Mod, ["a","b","c"]}),
+ ?TEST(["a","..b..c"], ["..", all], {Mod, ["a","b","c"]}),
+ ?TEST(["a",<<"..b..c">>], ["..", all], {Mod, ["a","b","c"]}),
+ ?TEST(["a.",".b..c"], ["..", all], {Mod, ["a","b","c"]}),
+ ?TEST(["a.",<<".b..c">>], ["..", all], {Mod, ["a","b","c"]}),
+ ?TEST(["a..","b..c"], ["..", all], {Mod, ["a","b","c"]}),
+ ?TEST(["a..",<<"b..c">>], ["..", all], {Mod, ["a","b","c"]}),
+ ?TEST(["a.","b..c"], ["..", all], {Mod, ["a.b","c"]}),
+ ?TEST("a...b", ["..", all], {Mod, ["a",".b"]}),
+
+ %% Grapheme (split) tests
+ ?TEST("aΩΩb", ["Ω", all], {Mod, ["a","","b"]}),
+ ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], leading], {Mod, ["aa","äöoo"]}),
+ ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], trailing], {Mod, ["aa","äöoo"]}),
+ ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], all], {Mod, ["aa","äöoo"]}),
+ ?TEST([<<"aae">>,778,"öeeåäö"], ["e", leading], {Mod, [[$a, $a, $e,778,$ö],"eåäö"]}),
+ ?TEST([<<"aae">>,778,"öeeåäö"], ["e", trailing], {Mod, [[$a, $a, $e,778,$ö, $e],"åäö"]}),
+ ?TEST([<<"aae">>,778,"öeeåäö"], ["e", all], {Mod, [[$a, $a, $e,778,$ö],"", "åäö"]}),
+
+ ok.
+
+replace(_) ->
+ ?TEST(["a..b.", [".c"]], ["xxx", "::"], "a..b..c"),
+ ?TEST(["a..b.", [".c"]], ["..", "::"], "a::b..c"),
+ ?TEST([<<"a..b.">>, [".c"]], ["..", "::", trailing], "a..b::c"),
+ ?TEST(["a..b.", [".c"]], ["..", "::", all], "a::b::c"),
+ ok.
+
+cd_gc(_) ->
+ [] = string:next_codepoint(""),
+ [] = string:next_codepoint(<<>>),
+ [] = string:next_codepoint([<<>>]),
+ "abcd" = string:next_codepoint("abcd"),
+ [$e,778] = string:next_codepoint([$e,778]),
+ [$e|<<204,138>>] = string:next_codepoint(<<$e,778/utf8>>),
+ [778|_] = string:next_codepoint(tl(string:next_codepoint(<<$e,778/utf8>>))),
+
+ [] = string:next_grapheme(""),
+ [] = string:next_grapheme(<<>>),
+ [] = string:next_grapheme([<<>>]),
+ "abcd" = string:next_grapheme("abcd"),
+ [[$e,778]] = string:next_grapheme([$e,778]),
+ [[$e,778]] = string:next_grapheme(<<$e,778/utf8>>),
+
+ ok.
+
+
+find(_) ->
+ ?TEST(["h", "ejsan"], [""], "hejsan"),
+ ?TEST(["h", "ejsan"], [<<>>], "hejsan"),
+ ?TEST([], [""], ""),
+ ?TEST([], ["hej"], nomatch),
+ ?TEST(["h", "ejsan"], ["hej"], "hejsan"),
+ ?TEST(["h", "e", "jsan"], ["hej"], "hejsan"),
+ ?TEST(["xh", "e", "san"], ["hej"], nomatch),
+ ?TEST([<<"xh">>, <<"ejsan">>], ["hej"], "hejsan"),
+ ?TEST(["xh", <<"ejsan">>], ["hej"], "hejsan"),
+ ?TEST(["xh", <<"e">>, "jsan"], ["hej"], "hejsan"),
+ ?TEST(["xh", "e", <<"jsan">>], ["hej"], "hejsan"),
+ ?TEST(["xh", "er", <<"ljsane">>, "rlang"], ["erl", leading], "erljsanerlang"),
+ ?TEST("aΩΩb", ["Ω", leading], "ΩΩb"),
+ ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], leading], [$e,778]++"äöoo"),
+ ?TEST([<<"aae">>,778,"öeeåäö"], ["e", leading], "eeåäö"),
+
+ ?TEST(["h", "ejsan"], ["", trailing], "hejsan"),
+ ?TEST([], ["", trailing], ""),
+ ?TEST([], ["hej", trailing], nomatch),
+ ?TEST(["h", "ejsan"], ["hej", trailing], "hejsan"),
+ ?TEST(["h", "e", "jsan"], ["hej", trailing], "hejsan"),
+ ?TEST(["xh", "e", "san"], ["hej", trailing], nomatch),
+ ?TEST([<<"xh">>, <<"ejsan">>], ["hej", trailing], "hejsan"),
+ ?TEST(["xh", <<"ejsan">>], ["hej", trailing], "hejsan"),
+ ?TEST(["xh", <<"e">>, "jsan"], ["hej", trailing], "hejsan"),
+ ?TEST(["xh", "e", <<"jsan">>], ["hej", trailing], "hejsan"),
+ ?TEST(["xh", "er", <<"ljsane">>, "rlang"], ["erl", trailing], "erlang"),
+ ?TEST("aΩΩb", ["Ω", trailing], "Ωb"),
+ ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], trailing], [$e,778]++"äöoo"),
+ ?TEST([<<"aeae">>,778,"äö"], ["e", trailing], "eae"++[778,$ä,$ö]),
+
+ ok.
+
+lexemes(_) ->
+ Mod = fun(Res) ->
+ [unicode:characters_to_nfc_list(io_lib:format("~ts", [Str]))|| Str <- Res]
+ end,
+ Res = ["Hej", "san", "Hopp", "san"],
+ ?TEST("", [" ,."], {Mod, []}),
+ ?TEST("Hej san", [""], {Mod, ["Hej san"]}),
+ ?TEST(" ,., ", [" ,."], {Mod, []}),
+ ?TEST( "Hej san Hopp san", [" ,."], {Mod, Res}),
+ ?TEST(" Hej san Hopp san ", [" ,."], {Mod, Res}),
+ ?TEST(" Hej san, .Hopp san ", [" ,."], {Mod, Res}),
+
+ ?TEST([" Hej san",", .Hopp san "], [" ,."], {Mod, Res}),
+ ?TEST([" Hej sa","n, .Hopp san "], [" ,."], {Mod, Res}),
+ ?TEST([" Hej san,"," .Hopp san "], [" ,."], {Mod, Res}),
+
+ ?TEST([" Hej san",[", .Hopp san "]], [" ,."], {Mod, Res}),
+ ?TEST([" Hej sa",["n, .Hopp san "]], [" ,."], {Mod, Res}),
+ ?TEST([" Hej san,",[" .Hopp san "]], [" ,."], {Mod, Res}),
+
+ ?TEST([" H",<<"ej san, .Hopp "/utf8>>, "san"], [" ,."], {Mod, Res}),
+ ?TEST([" Hej san",<<", .Hopp "/utf8>>, "san"], [" ,."], {Mod, Res}),
+ ?TEST([" Hej sa",<<"n, .Hopp"/utf8>>, " san"], [" ,."], {Mod, Res}),
+ ?TEST([" Hej san,",<<" .Hopp s"/utf8>>, "an"], [" ,."], {Mod, Res}),
+ ?TEST([" Hej san",[<<", .Hopp san "/utf8>>]], [" ,."], {Mod, Res}),
+ ?TEST([" Hej sa",[<<"n, .Hopp san "/utf8>>]], [" ,."], {Mod, Res}),
+ ?TEST([" Hej san,",[<<" .Hopp san "/utf8>>], <<" ">>], [" ,."], {Mod, Res}),
+
+ ?TEST(" Hej\r\nsan\nnl", ["\r\n\s"], {Mod, ["Hej\r\nsan", "nl"]}),
+
+ ?TEST(["b1ec1e",778,"äöo21"], ["eo"], {Mod, ["b1",[$c,$1,$e,778,$ä,$ö],"21"]}),
+ ?TEST([<<"b1ec1e">>,778,"äöo21"], ["eo"], {Mod, ["b1",[$c,$1,$e,778,$ä,$ö],"21"]}),
+ %% Grapheme (split) tests
+ Str10 = [[[<<"÷"/utf8>>,1101],<<"ë"/utf8>>|<<"\"">>]],
+ ?TEST(Str10, [[1076]], {Mod, [unicode:characters_to_nfc_list(Str10)]}),
+ ?TEST("a1Ωb1Ωc1", ["Ω"], {Mod, ["a1","b1","c1"]}),
+ ?TEST([<<"aae">>,778,"äöoo"], [[[$e,778]]], {Mod, ["aa","äöoo"]}),
+ ?TEST([<<"aae">>,778,"äöo21"], [[[$e,778],$o]], {Mod, ["aa","äö","21"]}),
+ ?TEST([<<"aae">>,778,"öeeåäö"], ["e"], {Mod, [[$a, $a, $e,778,$ö],"åäö"]}),
+ ok.
+
+nth_lexeme(_) ->
+ {'EXIT', _} = (catch string:nth_lexeme("test test", 0, [])),
+ {'EXIT', _} = (catch string:nth_lexeme(<<"test test">>, 0, [])),
+ ?TEST( "", [1, " ,."], []),
+ ?TEST( "Hej san", [1, ""], "Hej san"),
+ ?TEST( " ,., ", [1, " ,."], []),
+ ?TEST( " ,., ", [3, " ,."], []),
+ ?TEST("Hej san Hopp san", [1, " ,."], "Hej"),
+ ?TEST("...Hej san Hopp san", [1, " ,."], "Hej"),
+ ?TEST("Hej san Hopp san", [3, " ,."], "Hopp"),
+ ?TEST(" Hej san Hopp san ", [3, " ,."], "Hopp"),
+ ?TEST(" Hej san, .Hopp san ", [3, " ,."], "Hopp"),
+ ?TEST("ab cd", [3, " "], ""),
+
+ ?TEST([" Hej san",", .Hopp san "], [3, " ,."], "Hopp"),
+ ?TEST([" Hej sa","n, .Hopp san "], [3, " ,."], "Hopp"),
+ ?TEST([" Hej san,"," .Hopp san "], [3, " ,."], "Hopp"),
+
+ ?TEST([" Hej san",[", .Hopp san "]], [3," ,."], "Hopp"),
+ ?TEST([" Hej sa",["n, .Hopp san "]], [3, " ,."], "Hopp"),
+ ?TEST([" Hej san,",[" .Hopp san "]], [3, " ,."], "Hopp"),
+
+ ?TEST([" Hej san",<<", .Hopp "/utf8>>, "san"], [3, " ,."], "Hopp"),
+ ?TEST([" Hej sa",<<"n, .Hopp"/utf8>>, " san"], [3, " ,."], "Hopp"),
+ ?TEST([" Hej san,",<<" .Hopp s"/utf8>>, "an"], [3, " ,."], "Hopp"),
+ ?TEST([" Hej san,",<<" .Hopp s"/utf8>>, "an"], [4, " ,."], "san"),
+ ?TEST([" Hej san",[<<", .Hopp san "/utf8>>]], [3, " ,."], "Hopp"),
+ ?TEST([" Hej sa",[<<"n, .Hopp san "/utf8>>]], [3, " ,."], "Hopp"),
+ ?TEST([" Hej san,",[<<" .Hopp san "/utf8>>], <<" ">>], [3, " ,."], "Hopp"),
+
+ ?TEST(["b1ec1e",778,"äöo21"], [3,"eo"], "21"),
+ ?TEST([<<"b1ec1e">>,778,"äöo21"], [3, "eo"], "21"),
+ %% Grapheme (split) tests
+ ?TEST("a1Ωb1Ωc1", [1, "Ω"], "a1"),
+ ?TEST([<<"aae">>,778,"äöoo"], [2,[[$e,778]]], "äöoo"),
+ ?TEST([<<"aae">>,778,"äöo21"], [2,[[$e,778],$o]], "äö"),
+ ?TEST([<<"aae">>,778,"öeeåäö"], [2,"e"], "åäö"),
+ ok.
+
+
+meas(Config) -> %% Timing run via do_measure/1; returns a skip tuple when the timetrap scale is above 1.
+ case ct:get_timetrap_info() of
+ {_,{_,Scale}} when Scale > 1 ->
+ {skip,{will_not_run_in_debug,Scale}};
+ _ -> % No scaling
+ DataDir = proplists:get_value(data_dir, Config),
+ TestDir = filename:dirname(string:trim(DataDir, trailing, "/")), %% parent of data_dir; do_measure/1 reads this suite's .erl file from there
+ do_measure(TestDir)
+ end.
+
+do_measure(TestDir) ->
+ File = filename:join(TestDir, ?MODULE_STRING ++ ".erl"),
+ io:format("File ~s ",[File]),
+ {ok, Bin} = file:read_file(File),
+ io:format("~p~n",[byte_size(Bin)]),
+ Do = fun(Name, Func, Mode) ->
+ {N, Mean, Stddev, _} = time_func(Func, Mode, Bin),
+ io:format("~10w ~6w ~6.2fms ±~4.2fms #~.2w gc included~n",
+ [Name, Mode, Mean/1000, Stddev/1000, N])
+ end,
+ io:format("----------------------~n"),
+ Do(tokens, fun(Str) -> string:tokens(Str, [$\n,$\r]) end, list),
+ Tokens = {lexemes, fun(Str) -> string:lexemes(Str, [$\n,$\r]) end},
+ [Do(Name,Fun,Mode) || {Name,Fun} <- [Tokens], Mode <- [list, binary]],
+ ok.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% internal functions
+
+test(Line, Func, Str, Args, Res, Norm) ->
+ %% Apply string:Func to Str as given, then to Str converted to a list and to a binary.
+ test_1(Line, Func, Str, [Str|norm(none,Args)], Res),
+ %% When Norm is true the NFC and NFD forms of Str are tried too; {norm,_} args are normalized to match.
+ test_1({Line,list}, Func, Str,
+ [unicode:characters_to_list(Str)|norm(none,Args)], Res),
+ Norm andalso
+ test_1({Line,clist}, Func, Str,
+ [unicode:characters_to_nfc_list(Str)|norm(nfc,Args)], Res),
+ Norm andalso
+ test_1({Line,dlist}, Func, Str,
+ [unicode:characters_to_nfd_list(Str)|norm(nfd,Args)], Res),
+ test_1({Line,bin}, Func, Str,
+ [unicode:characters_to_binary(Str)|norm(none, Args)], Res),
+ Norm andalso
+ test_1({Line,cbin}, Func, Str,
+ [unicode:characters_to_nfc_binary(Str)|norm(nfc,Args)], Res),
+ Norm andalso
+ test_1({Line,dbin}, Func, Str,
+ [unicode:characters_to_nfd_binary(Str)|norm(nfd,Args)], Res),
+ %% A mismatch exits inside test_1/5, so reaching this line means every variant passed.
+ ok.
+
+test_1(Line, Func, Str, Args, Exp) -> %% Run string:Func(Args...) once and compare with Exp; exits with {error,Func} on mismatch.
+ try
+ Res = apply(string, Func, Args),
+ check_types(Line, Func, Args, Res), %% NOTE(review): return value is discarded, so a type mismatch is only printed
+ case res(Res, Exp) of
+ true -> ok;
+ {Res1,Exp1} when is_tuple(Exp1) -> %% expected value is a tuple, so it is printed with ~w only
+ io:format("~p~n",[Args]),
+ io:format("~p:~p: ~ts~w =>~n :~w:~w~n",
+ [Func,Line, Str,Str,Res1,Exp1]),
+ exit({error, Func});
+ {Res1,Exp1} ->
+ io:format("~p:~p: ~ts~w =>~n :~ts~w:~ts~w~n",
+ [Func,Line, Str,Str, Res1,Res1, Exp1,Exp1]),
+ exit({error, Func})
+ end
+ catch
+ error:Exp -> %% Exp is already bound: an error whose reason equals the expected value counts as a pass
+ ok;
+ error:Reason ->
+ io:format("~p:~p: Crash ~p ~p~n",
+ [?MODULE,Line, Reason, erlang:get_stacktrace()]),
+ exit({error, Func})
+ end.
+
+norm(Type, Args) -> %% Unwrap {norm,Chars} arguments, normalizing them per Type; other arguments pass through.
+ Normalize = case Type of
+ none -> fun(Chars) -> Chars end;
+ nfd -> fun unicode:characters_to_nfd_list/1;
+ nfc -> fun unicode:characters_to_nfc_list/1
+ end,
+ Unwrap = fun({norm,Chars}) -> Normalize(Chars);
+ (Plain) -> Plain end,
+ lists:map(Unwrap, Args).
+
+res(Str, Str) -> true; %% Returns true on a match, otherwise a {Got,Expected} pair for the failure report.
+res(Str, Exp) when is_list(Str), is_list(Exp) -> %% list result: compared after conversion to an NFC list
+ A = unicode:characters_to_nfc_list(Str),
+ A==Exp orelse {A,Exp};
+res(Str, Exp) when is_binary(Str), is_list(Exp) -> %% binary result against a list expectation
+ A = unicode:characters_to_nfc_list(Str),
+ A==Exp orelse {A,Exp};
+res(What, {Fun, Exp}) when is_function(Fun) -> %% the expectation carries a fun that is applied to the result first
+ Fun(What) == Exp orelse {Fun(What), Exp};
+res({S1,S2}=S, {Exp1,Exp2}=E) -> %% For take
+ case {res(S1,Exp1), res(S2,Exp2)} of
+ {true, true} -> true;
+ _ -> {S, E}
+ end;
+res(Int, Exp) -> %% anything else: plain == comparison
+ Int == Exp orelse {Int, Exp}.
+
+
+check_types(_Line, _Func, _Str, Res)
+ when is_integer(Res); is_boolean(Res); Res =:= nomatch ->
+ %% length or equal
+ ok;
+check_types(Line, Func, [S1,S2], Res)
+ when Func =:= concat ->
+ case check_types_1(type(S1),type(S2)) of
+ ok ->
+ case check_types_1(type(S1),type(Res)) of
+ ok -> ok;
+ {T1,T2} ->
+ io:format("Failed: ~p ~p ~p ~p~n",[Line, Func, T1, T2]),
+ io:format(" ~p ~p => ~p~n", [S1, S2, Res]),
+ error
+ end;
+ _ -> ok
+ end;
+check_types(Line, Func, [Str|_], Res) ->
+ AddList = fun(mixed) -> mixed;
+ ({list,{list,_}}) -> {list, deep};
+ (R) ->
+ case lists:member(Func, [lexemes, tokens, split]) of
+ true -> {list, R};
+ false -> R
+ end
+ end,
+ try needs_check(Func) andalso (ok = check_types_1(AddList(type(Str)), type(Res))) of
+ ok -> ok;
+ false -> ok
+ catch _:{badmatch, {T1,T2}} ->
+ io:format("Failed: ~p ~p: ~p ~p~n",[Line, Func, T1, T2]),
+ io:format(" ~p => ~p~n", [Str, Res]),
+ error;
+ _:Reason ->
+ io:format("Crash: ~p in~n ~p~n",[Reason, erlang:get_stacktrace()]),
+ io:format("Failed: ~p ~p: ~p => ~p~n", [Line, Func, Str, Res]),
+ exit({Reason, erlang:get_stacktrace()})
+ end.
+
+check_types_1(T, T) ->
+ ok;
+check_types_1(Str, Res)
+ when is_binary(Str), is_binary(Res) ->
+ ok;
+check_types_1({list, _},{list, undefined}) ->
+ ok;
+check_types_1({list, _},{list, codepoints}) ->
+ ok;
+check_types_1({list, _},{list, {list, codepoints}}) ->
+ ok;
+check_types_1({list, {list, _}},{list, {list, codepoints}}) ->
+ ok;
+check_types_1(mixed,_) ->
+ ok;
+check_types_1({list, binary}, binary) ->
+ ok;
+check_types_1({list, binary}, {other, _, _}) -> %% take
+ ok;
+check_types_1({list, deep}, _) ->
+ ok;
+check_types_1({list, {list, deep}}, _) ->
+ ok;
+check_types_1(T1,T2) ->
+ {T1,T2}.
+
+type(Bin) when is_binary(Bin) ->
+ binary;
+type([]) ->
+ {list, undefined};
+type(List) when is_list(List) ->
+ Deep = fun(L) when is_list(L) ->
+ lists:any(fun(C) -> is_list(C) orelse is_binary(C) end, L);
+ (_) -> false
+ end,
+ case all(fun(C) -> not is_binary(C) end, List) of
+ true ->
+ case all(fun(C) -> is_integer(C) end, List) of
+ true -> {list, codepoints};
+ false ->
+ case [deep || L <- List, Deep(L)] of
+ [] -> {list, {list, codepoints}};
+ _ -> {list, deep}
+ end
+ end;
+ false ->
+ case all(fun(C) -> is_binary(C) end, List) of
+ true -> {list, binary};
+ false -> mixed
+ end
+ end;
+type({R1,R2}) ->
+ case {type(R1),type(R2)} of
+ {T,T} -> T;
+ {{list,undefined}, {list,codepoints}} -> {list,codepoints};
+ {{list,codepoints}, {list,undefined}} -> {list,codepoints};
+ {T1,T2} -> {other, T1,T2}
+ end;
+type(Other) ->
+ {other, Other}.
+
+all(Check, Bin) when is_binary(Bin) -> %% a binary (e.g. the tail of an improper list) is checked as one element
+ Check(Bin);
+all(Check, [Head|Rest]) ->
+ Check(Head) andalso all(Check, Rest);
+all(_Check, []) ->
+ true.
+
+needs_check(reverse) -> false; %% false: check_types/4 skips the input/result type comparison for this function
+needs_check(pad) -> false;
+needs_check(replace) -> false;
+needs_check(_) -> true.
+
+%%%% Timer stuff
+
+time_func(Fun, Mode, Bin) -> %% Time Fun on Bin (converted per Mode) in a freshly spawned linked process.
+ timer:sleep(100), %% Let emulator catch up and clean things before test runs
+ Self = self(),
+ Pid = spawn_link(fun() ->
+ Str = mode(Mode, Bin),
+ Self ! {self(),time_func(0,0,0, Fun, Str, undefined)}
+ end),
+ receive {Pid,Msg} -> Msg end.
+
+time_func(N,Sum,SumSq, Fun, Str, _) when N < 50 -> %% 50 timed runs; accumulates the sum and sum of squares of the timer:tc/1 times
+ {Time, Res} = timer:tc(fun() -> Fun(Str) end),
+ time_func(N+1,Sum+Time,SumSq+Time*Time, Fun, Str, Res);
+time_func(N,Sum,SumSq, _, _, Res) -> %% returns {Runs, Mean, SampleStdDev, LastResult}, mean and deviation rounded
+ Mean = round(Sum / N),
+ Stdev = round(math:sqrt((SumSq - (Sum*Sum/N))/(N - 1))),
+ {N, Mean, Stdev, Res}.
+
+mode(binary, Bin) -> Bin;
+mode(list, Bin) -> unicode:characters_to_list(Bin).
+
%%
-%% Test cases starts here.
+%% Test cases for the old list-based string functions start here.
%%
len(Config) when is_list(Config) ->
@@ -80,16 +932,14 @@ len(Config) when is_list(Config) ->
{'EXIT',_} = (catch string:len({})),
ok.
-equal(Config) when is_list(Config) ->
+old_equal(Config) when is_list(Config) ->
true = string:equal("", ""),
false = string:equal("", " "),
true = string:equal("laban", "laban"),
false = string:equal("skvimp", "skvump"),
- %% invalid arg type
- true = string:equal(2, 2), % not good, should crash
ok.
-concat(Config) when is_list(Config) ->
+old_concat(Config) when is_list(Config) ->
"erlang rules" = string:concat("erlang ", "rules"),
"" = string:concat("", ""),
"x" = string:concat("x", ""),
@@ -130,6 +980,7 @@ str_rstr(Config) when is_list(Config) ->
3 = string:rstr("xxxx", "xx"),
3 = string:str("xy z yx", " z"),
3 = string:rstr("xy z yx", " z"),
+ 3 = string:str("aaab", "ab"),
%% invalid arg type
{'EXIT',_} = (catch string:str(hello, "he")),
%% invalid arg type
@@ -184,7 +1035,7 @@ substr(Config) when is_list(Config) ->
{'EXIT',_} = (catch string:substr("1234", "1")),
ok.
-tokens(Config) when is_list(Config) ->
+old_tokens(Config) when is_list(Config) ->
[] = string:tokens("",""),
[] = string:tokens("abc","abc"),
["abc"] = string:tokens("abc", ""),
@@ -221,7 +1072,7 @@ replace_sep(C, Seps, New) ->
chars(Config) when is_list(Config) ->
[] = string:chars($., 0),
[] = string:chars($., 0, []),
- 10 = length(string:chars(32, 10, [])),
+ 10 = erlang:length(string:chars(32, 10, [])),
"aaargh" = string:chars($a, 3, "rgh"),
%% invalid arg type
{'EXIT',_} = (catch string:chars($x, [])),
@@ -231,7 +1082,7 @@ copies(Config) when is_list(Config) ->
"" = string:copies("", 10),
"" = string:copies(".", 0),
"." = string:copies(".", 1),
- 30 = length(string:copies("123", 10)),
+ 30 = erlang:length(string:copies("123", 10)),
%% invalid arg type
{'EXIT',_} = (catch string:copies("hej", -1)),
{'EXIT',_} = (catch string:copies("hej", 2.0)),
@@ -360,7 +1211,7 @@ to_integer(Config) when is_list(Config) ->
ok.
test_to_integer(Str) ->
- io:format("Checking ~p~n", [Str]),
+ %% io:format("Checking ~p~n", [Str]),
case string:to_integer(Str) of
{error,_Reason} = Bad ->
{'EXIT',_} = (catch list_to_integer(Str)),
@@ -403,7 +1254,7 @@ to_float(Config) when is_list(Config) ->
ok.
test_to_float(Str) ->
- io:format("Checking ~p~n", [Str]),
+ %% io:format("Checking ~p~n", [Str]),
case string:to_float(Str) of
{error,_Reason} = Bad ->
{'EXIT',_} = (catch list_to_float(Str)),
@@ -419,7 +1270,7 @@ to_upper_to_lower(Config) when is_list(Config) ->
All = lists:seq(0, 255),
UC = string:to_upper(All),
- 256 = length(UC),
+ 256 = erlang:length(UC),
all_upper_latin1(UC, 0),
LC = string:to_lower(All),
@@ -450,7 +1301,7 @@ all_lower_latin1([C|T], C) when 0 =< C, C < $A;
all_lower_latin1([H|T], C) when $A =< C, C =< $Z;
16#C0 =< C, C =< 16#F6;
16#C8 =< C, C =< 16#DE ->
- io:format("~p\n", [{H,C}]),
+ % io:format("~p\n", [{H,C}]),
H = C + 32,
all_lower_latin1(T, C+1);
all_lower_latin1([], 256) -> ok.
--
cgit v1.2.3