From 2c72e662bad11a41839780f86680d4bb05367c78 Mon Sep 17 00:00:00 2001 From: Dan Gudmundsson Date: Mon, 3 Apr 2017 12:19:21 +0200 Subject: New unicode aware string module that works with unicode:chardata() Works with unicode:chardata() as input as was decided on OTP board meeting as response to EEP-35 a long time ago. Works on graphemes clusters as base, with a few exceptions, does not handle classic (nor nfd'ified) Hangul nor the extended grapheme clusters such as the prepend class. That would make handling binaries as input/output very slow. List input => list output, binary input => binary output and mixed input => mixed output for all find/split functions. So that results can be post-processed without the need to invoke unicode:characters_to_list|binary for intermediate data. pad functions return lists of unicode:chardata() for performance. --- lib/stdlib/doc/src/string.xml | 741 ++++++++++++++++++-- lib/stdlib/doc/src/unicode_usage.xml | 70 +- lib/stdlib/src/string.erl | 1266 +++++++++++++++++++++++++++++++++- lib/stdlib/test/string_SUITE.erl | 893 +++++++++++++++++++++++- 4 files changed, 2837 insertions(+), 133 deletions(-) (limited to 'lib/stdlib') diff --git a/lib/stdlib/doc/src/string.xml b/lib/stdlib/doc/src/string.xml index dddedf1132..dc83c40a9a 100644 --- a/lib/stdlib/doc/src/string.xml +++ b/lib/stdlib/doc/src/string.xml @@ -36,8 +36,613 @@ String processing functions.

This module provides functions for string processing.

+

A string in this module is represented by + unicode:chardata(), that is, a list of codepoints, + binaries with UTF-8-encoded codepoints + (UTF-8 binaries), or a mix of the two.

+ +"abcd" is a valid string +<<"abcd">> is a valid string +["abcd"] is a valid string +<<"abc..åäö"/utf8>> is a valid string +<<"abc..åäö">> is NOT a valid string, + but a binary with Latin-1-encoded codepoints +[<<"abc">>, "..åäö"] is a valid string +[atom] is NOT a valid string +

+ This module operates on grapheme clusters. A grapheme cluster + is a user-perceived character, which can be represented by several + codepoints. +

+ +"å" [229] or [97, 778] +"e̊" [101, 778] +

+ The string length of "ß↑e̊" is 3, even though it is represented by the + codepoints [223,8593,101,778] or the UTF-8 binary + <<195,159,226,134,145,101,204,138>>. +

+

+ Grapheme clusters for codepoints of class prepend + and non-modern (or decomposed) Hangul are not handled for performance + reasons in + find/3, + replace/3, + split/2, + lexemes/2 and + trim/3. +

+

+ Splitting and appending strings are to be done on grapheme cluster + borders. + There is no verification that the results of appending strings are + valid or normalized. +

+

+ Most of the functions expect all input to be normalized to one form, + see for example + unicode:characters_to_nfc_list/1. +

+

+ Language or locale specific handling of input is not considered + in any function. +

+

+ The functions can crash for non-valid input strings. For example, + the functions expect UTF-8 binaries but not all functions + verify that all binaries are encoded correctly. +

+

+ Unless otherwise specified the return value type is the same as + the input type. That is, binary input returns binary output, + list input returns a list output, and mixed input can return a + mixed output.

+ +1> string:trim(" sarah "). +"sarah" +2> string:trim(<<" sarah ">>). +<<"sarah">> +3> string:lexemes("foo bar", " "). +["foo","bar"] +4> string:lexemes(<<"foo bar">>, " "). +[<<"foo">>,<<"bar">>] +

This module has been reworked in Erlang/OTP 20 to + handle + unicode:chardata() and operate on grapheme + clusters. The old + functions that only work on Latin-1 lists as input + are still available but should not be + used. They will be deprecated in Erlang/OTP 21. +

+ + + + + +

A user-perceived character, consisting of one or more + codepoints.

+
+
+
+ + + + + + Convert a string to a comparable string. + +

+ Converts String to a case-agnostic + comparable string. Function casefold/1 is preferred + over lowercase/1 when two strings are to be compared + for equality. See also equal/4. +

+

Example:

+
+1> string:casefold("Ω and ẞ SHARP S").
+"ω and ss sharp s"
+
+
+ + + + Remove trailing end of line control characters. + +

+ Returns a string where any trailing \n or + \r\n have been removed from String. +

+

Example:

+
+182> string:chomp(<<"\nHello\n\n">>).
+<<"\nHello">>
+183> string:chomp("\nHello\r\r\n").
+"\nHello\r"
+
+
+ + + + + + Test string equality. + +

+ Returns true if A and + B are equal, otherwise false. +

+

+ If IgnoreCase is true + the function does + casefolding on the fly before the equality test. +

+

If Norm is not none + the function applies normalization on the fly before the equality test. + There are four available normalization forms: + nfc, + nfd, + nfkc, and + nfkd. +

+

By default, + IgnoreCase is false and + Norm is none.

+

Example:

+
+1> string:equal("åäö", <<"åäö"/utf8>>).
+true
+2> string:equal("åäö", unicode:characters_to_nfd_binary("åäö")).
+false
+3> string:equal("åäö", unicode:characters_to_nfd_binary("ÅÄÖ"), true, nfc).
+true
+
+
+ + + + + Find start of substring. + +

+ Removes anything before SearchPattern in String + and returns the remainder of the string or nomatch if SearchPattern is not + found. + Dir, which can be leading or + trailing, indicates from which direction characters + are to be searched. +

+

+ By default, Dir is leading. +

+

Example:

+
+1> string:find("ab..cd..ef", ".").
+"..cd..ef"
+2> string:find(<<"ab..cd..ef">>, "..", trailing).
+<<"..ef">>
+3> string:find(<<"ab..cd..ef">>, "x", leading).
+nomatch
+4> string:find("ab..cd..ef", "x", trailing).
+nomatch
+
+
+ + + + Check if the string is empty. + +

Returns true if String is the + empty string, otherwise false.

+

Example:

+
+1> string:is_empty("foo").
+false
+2> string:is_empty(["",<<>>]).
+true
+
+
+ + + + Calculate length of the string. + +

+ Returns the number of grapheme clusters in String. +

+

Example:

+
+1> string:length("ß↑e̊").
+3
+2> string:length(<<195,159,226,134,145,101,204,138>>).
+3
+
+
+ + + + Split string into lexemes. + +

+ Returns a list of lexemes in String, separated + by the grapheme clusters in SeparatorList. +

+

+ Notice that, as shown in this example, two or more + adjacent separator grapheme clusters in String + are treated as one. That is, there are no empty + strings in the resulting list of lexemes. + See also split/3 which returns + empty strings. +

+

Notice that [$\r,$\n] is one grapheme cluster.

+

Example:

+
+1> string:lexemes("abc de̊fxxghix jkl\r\nfoo", "x e" ++ [[$\r,$\n]]).
+["abc","de̊f","ghi","jkl","foo"]
+2> string:lexemes(<<"abc de̊fxxghix jkl\r\nfoo"/utf8>>, "x e" ++ [$\r,$\n]).
+[<<"abc">>,<<"de̊f"/utf8>>,<<"ghi">>,<<"jkl\r\nfoo">>]
+
+
+ + + + Convert a string to lowercase + +

+ Converts String to lowercase. +

+

+ Notice that function casefold/1 + should be used when converting a string to + be tested for equality. +

+

Example:

+
+2> string:lowercase(string:uppercase("Michał")).
+"michał"
+
+
+ + + + Pick the first codepoint. + +

+ Returns the first codepoint in String + and the rest of String in the tail. +

+

Example:

+
+1> string:next_codepoint(unicode:characters_to_binary("e̊fg")).
+[101|<<"̊fg"/utf8>>]
+
+
+ + + + Pick the first grapheme cluster. + +

+ Returns the first grapheme cluster in String + and the rest of String in the tail. +

+

Example:

+
+1> string:next_grapheme(unicode:characters_to_binary("e̊fg")).
+["e̊"|<<"fg">>]
+
+
+ + + + Pick the nth lexeme. + +

Returns lexeme number N in + String, where lexemes are separated by + the grapheme clusters in SeparatorList. +

+

Example:

+
+1> string:nth_lexeme("abc.de̊f.ghiejkl", 3, ".e").
+"ghi"
+
+
+ + + + + + Pad a string to given length. + +

+ Pads String to Length with + grapheme cluster Char. + Dir, which can be leading, trailing, + or both, indicates where the padding should be added. +

+

By default, Char is $\s and + Dir is trailing. +

+

Example:

+
+1> string:pad(<<"He̊llö"/utf8>>, 8).
+[<<72,101,204,138,108,108,195,182>>,32,32,32]
+2> io:format("'~ts'~n",[string:pad("He̊llö", 8, leading)]).
+'   He̊llö'
+3> io:format("'~ts'~n",[string:pad("He̊llö", 8, both)]).
+' He̊llö  '
+
+
+ + + + Remove prefix from string. + +

+ If Prefix is the prefix of + String, removes it and returns the + remainder of String, otherwise returns + nomatch. +

+

Example:

+
+1> string:prefix(<<"prefix of string">>, "pre").
+<<"fix of string">>
+2> string:prefix("pre", "prefix").
+nomatch
+
+
+ + + + + Replace a pattern in string. + +

+ Replaces SearchPattern in String + with Replacement. + Where, default leading, indicates whether + the leading, the trailing or all encounters of + SearchPattern are to be replaced. +

+

Can be implemented as:

+
lists:join(Replacement, split(String, SearchPattern, Where)).
+

Example:

+
+1> string:replace(<<"ab..cd..ef">>, "..", "*").
+[<<"ab">>,"*",<<"cd..ef">>]
+2> string:replace(<<"ab..cd..ef">>, "..", "*", all).
+[<<"ab">>,"*",<<"cd">>,"*",<<"ef">>]
+
+
+ + + + Reverses a string + +

+ Returns the reverse list of the grapheme clusters in String. +

+

Example:

+
+1> Reverse = string:reverse(unicode:characters_to_nfd_binary("ÅÄÖ")).
+[[79,776],[65,776],[65,778]]
+2> io:format("~ts~n",[Reverse]).
+ÖÄÅ
+
+
+ + + + + Extract a part of string + +

Returns a substring of String of + at most Length grapheme clusters, starting at position + Start.

+

By default, Length is infinity.

+

Example:

+
+1> string:slice(<<"He̊llö Wörld"/utf8>>, 4).
+<<"ö Wörld"/utf8>>
+2> string:slice(["He̊llö ", <<"Wörld"/utf8>>], 4,4).
+"ö Wö"
+3> string:slice(["He̊llö ", <<"Wörld"/utf8>>], 4,50).
+"ö Wörld"
+
+
+ + + + + Split a string into substrings. + +

+ Splits String where SearchPattern + is encountered and returns the remaining parts. + Where, default leading, indicates whether + the leading, the trailing or all encounters of + SearchPattern will split String. +

+

Example:

+
+0> string:split("ab..bc..cd", "..").
+["ab","bc..cd"]
+1> string:split(<<"ab..bc..cd">>, "..", trailing).
+[<<"ab..bc">>,<<"cd">>]
+2> string:split(<<"ab..bc....cd">>, "..", all).
+[<<"ab">>,<<"bc">>,<<>>,<<"cd">>]
+
+
+ + + + + + Take leading or trailing parts. + +

Takes characters from String as long as + the characters are members of set Characters + or the complement of set Characters. + Dir, + which can be leading or trailing, indicates from + which direction characters are to be taken. +

+

Example:

+
+5> string:take("abc0z123", lists:seq($a,$z)).
+{"abc","0z123"}
+6> string:take(<<"abc0z123">>, lists:seq($0,$9), true, leading).
+{<<"abc">>,<<"0z123">>}
+7> string:take("abc0z123", lists:seq($0,$9), false, trailing).
+{"abc0z","123"}
+8> string:take(<<"abc0z123">>, lists:seq($a,$z), true, trailing).
+{<<"abc0z">>,<<"123">>}
+
+
+ + + + Convert a string to titlecase. + +

+ Converts String to titlecase. +

+

Example:

+
+1> string:titlecase("ß is a SHARP s").
+"Ss is a SHARP s"
+
+
+ + + + Return a float whose text representation is the integers + (ASCII values) of a string. + +

Argument String is expected to start with a + valid text represented float (the digits are ASCII values). + Remaining characters in the string after the float are returned in + Rest.

+

Example:

+
+> {F1,Fs} = string:to_float("1.0-1.0e-1"),
+> {F2,[]} = string:to_float(Fs),
+> F1+F2.
+0.9
+> string:to_float("3/2=1.5").
+{error,no_float}
+> string:to_float("-1.5eX").
+{-1.5,"eX"}
+
+
+ + + + Return an integer whose text representation is the integers + (ASCII values) of a string. + +

Argument String is expected to start with a + valid text represented integer (the digits are ASCII values). + Remaining characters in the string after the integer are returned in + Rest.

+

Example:

+
+> {I1,Is} = string:to_integer("33+22"),
+> {I2,[]} = string:to_integer(Is),
+> I1-I2.
+11
+> string:to_integer("0.5").
+{0,".5"}
+> string:to_integer("x=2").
+{error,no_integer}
+
+
+ + + + Convert a string to a list of grapheme clusters. + +

+ Converts String to a list of grapheme clusters. +

+

Example:

+
+1> string:to_graphemes("ß↑e̊").
+[223,8593,[101,778]]
+2> string:to_graphemes(<<"ß↑e̊"/utf8>>).
+[223,8593,[101,778]]
+
+
+ + + + + + Trim leading or trailing, or both, characters. + +

+ Returns a string, where leading or trailing, or both, + Characters have been removed. + Dir which can be leading, trailing, + or both, indicates from which direction characters + are to be removed. +

+

Default Characters are the set of + nonbreakable whitespace codepoints, defined as + Pattern_White_Space in + Unicode Standard Annex #31. + By default, Dir is both. +

+

+ Notice that [$\r,$\n] is one grapheme cluster according + to the Unicode Standard. +

+

Example:

+
+1> string:trim("\t  Hello  \n").
+"Hello"
+2> string:trim(<<"\t  Hello  \n">>, leading).
+<<"Hello  \n">>
+3> string:trim(<<".Hello.\n">>, trailing, "\n.").
+<<".Hello">>
+
+
+ + + + Convert a string to uppercase. + +

+ Converts String to uppercase. +

+

See also titlecase/1.

+

Example:

+
+1> string:uppercase("Michał").
+"MICHAŁ"
+
+
+ +
+ +
+ + Obsolete API functions +

Here follows the functions of the old API. + These functions only work on a list of Latin-1 characters. +

+

+ The functions are kept for backward compatibility, but are + not recommended. + They will be deprecated in Erlang/OTP 21. +

+

Any undocumented functions in string are not to be used.

+
+
+ @@ -47,17 +652,24 @@

Returns a string, where String is centered in the string and surrounded by blanks or Character. The resulting string has length Number.

+

This function is obsolete. + Use + pad/3. +

- Returns a string consisting of numbers of characters. + Return a string consisting of numbers of characters.

Returns a string consisting of Number characters Character. Optionally, the string can end with string Tail.

+

This function is obsolete. + Use + lists:duplicate/2.

@@ -69,6 +681,9 @@

Returns the index of the first occurrence of Character in String. Returns 0 if Character does not occur.

+

This function is obsolete. + Use + find/2.

@@ -79,6 +694,16 @@

Concatenates String1 and String2 to form a new string String3, which is returned.

+

+ This function is obsolete. + Use [String1, String2] as + Data argument, and call + + unicode:characters_to_list/2 or + + unicode:characters_to_binary/2 + to flatten the output. +

@@ -88,6 +713,9 @@

Returns a string containing String repeated Number times.

+

This function is obsolete. + Use + lists:duplicate/2.

@@ -98,6 +726,9 @@

Returns the length of the maximum initial segment of String, which consists entirely of characters not from Chars.

+

This function is obsolete. + Use + take/3.

Example:

> string:cspan("\t abcdef", " \t"). @@ -105,21 +736,15 @@ - - - Test string equality. - -

Returns true if String1 and - String2 are equal, otherwise false.

-
-
- Join a list of strings with separator.

Returns a string with the elements of StringList separated by the string in Separator.

+

This function is obsolete. + Use + lists:join/2.

Example:

> join(["one", "two", "three"], ", "). @@ -137,6 +762,10 @@ fixed. If length(String) < Number, then String is padded with blanks or Characters.

+

This function is obsolete. + Use + pad/2 or + pad/3.

Example:

> string:left("Hello",10,$.). @@ -149,6 +778,9 @@ Return the length of a string.

Returns the number of characters in String.

+

This function is obsolete. + Use + length/1.

@@ -160,6 +792,9 @@

Returns the index of the last occurrence of Character in String. Returns 0 if Character does not occur.

+

This function is obsolete. + Use + find/3.

@@ -173,6 +808,9 @@ fixed. If the length of (String) < Number, then String is padded with blanks or Characters.

+

This function is obsolete. + Use + pad/3.

Example:

> string:right("Hello", 10, $.). @@ -188,6 +826,9 @@ SubString begins in String. Returns 0 if SubString does not exist in String.

+

This function is obsolete. + Use + find/3.

Example:

> string:rstr(" Hello Hello World World ", "Hello World"). @@ -202,6 +843,9 @@

Returns the length of the maximum initial segment of String, which consists entirely of characters from Chars.

+

This function is obsolete. + Use + take/2.

Example:

> string:span("\t abcdef", " \t"). @@ -217,6 +861,9 @@ SubString begins in String. Returns 0 if SubString does not exist in String.

+

This function is obsolete. + Use + find/2.

Example:

> string:str(" Hello Hello World World ", "Hello World"). @@ -230,12 +877,15 @@ Strip leading or trailing characters. -

Returns a string, where leading and/or trailing blanks or a +

Returns a string, where leading or trailing, or both, blanks or a number of Character have been removed. Direction, which can be left, right, or both, indicates from which direction blanks are to be removed. strip/1 is equivalent to strip(String, both).

+

This function is obsolete. + Use + trim/3.

Example:

> string:strip("...Hello.....", both, $.). @@ -251,6 +901,9 @@

Returns a substring of String, starting at position Start to the end of the string, or to and including position Stop.

+

This function is obsolete. + Use + slice/3.

Example:

sub_string("Hello World", 4, 8). @@ -266,6 +919,9 @@ sub_string("Hello World", 4, 8).

Returns a substring of String, starting at position Start, and ending at the end of the string or at length Length.

+

This function is obsolete. + Use + slice/3.

Example:

> substr("Hello World", 4, 5). @@ -281,6 +937,9 @@ sub_string("Hello World", 4, 8).

Returns the word in position Number of String. Words are separated by blanks or Characters.

+

This function is obsolete. + Use + nth_lexeme/3.

Example:

> string:sub_word(" Hello old boy !",3,$o). @@ -288,50 +947,6 @@ sub_string("Hello World", 4, 8).
- - - Returns a float whose text representation is the integers - (ASCII values) in a string. - -

Argument String is expected to start with a - valid text represented float (the digits are ASCII values). - Remaining characters in the string after the float are returned in - Rest.

-

Example:

- -> {F1,Fs} = string:to_float("1.0-1.0e-1"), -> {F2,[]} = string:to_float(Fs), -> F1+F2. -0.9 -> string:to_float("3/2=1.5"). -{error,no_float} -> string:to_float("-1.5eX"). -{-1.5,"eX"} -
-
- - - - Returns an integer whose text representation is the integers - (ASCII values) in a string. - -

Argument String is expected to start with a - valid text represented integer (the digits are ASCII values). - Remaining characters in the string after the integer are returned in - Rest.

-

Example:

- -> {I1,Is} = string:to_integer("33+22"), -> {I2,[]} = string:to_integer(Is), -> I1-I2. -11 -> string:to_integer("0.5"). -{0,".5"} -> string:to_integer("x=2"). -{error,no_integer} -
-
- @@ -346,6 +961,11 @@ sub_string("Hello World", 4, 8).

The specified string or character is case-converted. Notice that the supported character set is ISO/IEC 8859-1 (also called Latin 1); all values outside this set are unchanged

+

This function is obsolete. Use + lowercase/1, + uppercase/1, + titlecase/1 or + casefold/1.

@@ -363,6 +983,9 @@ sub_string("Hello World", 4, 8). adjacent separator characters in String are treated as one. That is, there are no empty strings in the resulting list of tokens.

+

This function is obsolete. + Use + lexemes/2.

@@ -373,6 +996,9 @@ sub_string("Hello World", 4, 8).

Returns the number of words in String, separated by blanks or Character.

+

This function is obsolete. + Use + lexemes/2.

Example:

> words(" Hello old boy!", $o). @@ -387,10 +1013,7 @@ sub_string("Hello World", 4, 8). other. The reason is that this string package is the combination of two earlier packages and all functions of both packages have been retained.

- - -

Any undocumented functions in string are not to be used.

-
+ diff --git a/lib/stdlib/doc/src/unicode_usage.xml b/lib/stdlib/doc/src/unicode_usage.xml index a8ef8ff5c5..11b84f552a 100644 --- a/lib/stdlib/doc/src/unicode_usage.xml +++ b/lib/stdlib/doc/src/unicode_usage.xml @@ -65,7 +65,10 @@

In Erlang/OTP 20.0, atoms and function can contain Unicode characters. Module names are still restricted to - the ISO-Latin-1 range.

+ the ISO-Latin-1 range.

+

Support was added for normalization forms in + unicode, and the string module now handles + utf8-encoded binaries.

This section outlines the current Unicode support and gives some @@ -110,23 +113,27 @@ -

So, a conversion function must know not only one character at a time, - but possibly the whole sentence, the natural language to translate to, - the differences in input and output string length, and so on. - Erlang/OTP has currently no Unicode to_upper/to_lower - functionality, but publicly available libraries address these issues.

- -

Another example is the accented characters, where the same glyph has two - different representations. The Swedish letter "ö" is one example. - The Unicode standard has a code point for it, but you can also write it - as "o" followed by "U+0308" (Combining Diaeresis, with the simplified - meaning that the last letter is to have "¨" above). They have the same - glyph. They are for most purposes the same, but have different - representations. For example, MacOS X converts all filenames to use - Combining Diaeresis, while most other programs (including Erlang) try to - hide that by doing the opposite when, for example, listing directories. - However it is done, it is usually important to normalize such - characters to avoid confusion.

+

So, a conversion function must know not only one character at a + time, but possibly the whole sentence, the natural language to + translate to, the differences in input and output string length, + and so on. Erlang/OTP has currently no Unicode + uppercase/lowercase functionality with language + specific handling, but publicly available libraries address these + issues.

+ +

Another example is the accented characters, where the same + glyph has two different representations. The Swedish letter "ö" is + one example. The Unicode standard has a code point for it, but + you can also write it as "o" followed by "U+0308" (Combining + Diaeresis, with the simplified meaning that the last letter is to + have "¨" above). They have the same glyph, that is, the same user-perceived + character. They are for most purposes the same, but have different + representations. For example, MacOS X converts all filenames to + use Combining Diaeresis, while most other programs (including + Erlang) try to hide that by doing the opposite when, for example, + listing directories. However it is done, it is usually important + to normalize such characters to avoid confusion. +

The list of examples can be made long. One need a kind of knowledge that was not needed when programs only considered one or two languages. The @@ -273,7 +280,7 @@ them. In some cases functionality has been added to already existing interfaces (as the string module now can - handle lists with any code points). In some cases new + handle strings with any code points). In some cases new functionality or options have been added (as in the io module, the file handling, the Fortunately, most textual data has been stored in lists and range checking has been sparse, so modules like string work well for - Unicode lists with little need for conversion or extension.

+ Unicode strings with little need for conversion or extension.

Some modules are, however, changed to be explicitly Unicode-aware. These modules include:

@@ -1028,18 +1035,17 @@ Eshell V5.10.1 (abort with ^G) has extensive support for Unicode text.

-

The string module works - perfectly for Unicode strings and ISO Latin-1 strings, except the - language-dependent functions - string:to_upper/1 - and - string:to_lower/1, - which are only correct for the ISO Latin-1 character set. These two - functions can never function correctly for Unicode characters in their - current form, as there are language and locale issues as well as - multi-character mappings to consider when converting text between cases. - Converting case in an international environment is a large subject not - yet addressed in OTP.

+

The string + module works perfectly for Unicode strings and ISO Latin-1 + strings, except the language-dependent functions string:uppercase/1 + and string:lowercase/1. + These two functions can never function correctly for Unicode + characters in their current form, as there are language and locale + issues to consider when converting text between cases. Converting + case in an international environment is a large subject not yet + addressed in OTP.

diff --git a/lib/stdlib/src/string.erl b/lib/stdlib/src/string.erl index c659db78bd..4fdfe99b66 100644 --- a/lib/stdlib/src/string.erl +++ b/lib/stdlib/src/string.erl @@ -17,22 +17,72 @@ %% %% %CopyrightEnd% %% +%% A string library that works on grapheme clusters, with the exception +%% of codepoints of class 'prepend' and non modern (or decomposed) Hangul. +%% If these codepoints appear, functions like 'find/2' may return a string +%% which starts inside a grapheme cluster. +%% These exceptions are made because these codepoint classes are +%% seldom used and require that we are able to look at previous codepoints in +%% the stream, and are thus hard to implement efficiently. +%% +%% GC (grapheme cluster) implies that the length of string 'ß↑e̊' is 3 though +%% it is represented by the codepoints [223,8593,101,778] or the +%% utf8 binary <<195,159,226,134,145,101,204,138>> +%% +%% And that searching for strings or graphemes finds the correct positions: +%% +%% find("eeeee̊eee", "e̊") -> "e̊ee" +%% find("1£4e̊abcdef", "e") -> "ef" +%% +%% Most functions expect all input to be normalized to one form, +%% see unicode:characters_to_nfc and unicode:characters_to_nfd functions. +%% When appending strings no checking is done to verify that the +%% result is a valid unicode string. +%% +%% The functions may crash for invalid utf-8 input. +%% +%% Return value should be kept consistent when return type is +%% unicode:chardata() i.e. binary input => binary output, +%% list input => list output, mixed input => mixed output +%% -module(string). --export([len/1,equal/2,concat/2,chr/2,rchr/2,str/2,rstr/2, - span/2,cspan/2,substr/2,substr/3,tokens/2,chars/2,chars/3]). 
+-export([is_empty/1, length/1, to_graphemes/1, + reverse/1, + equal/2, equal/3, equal/4, + slice/2, slice/3, + pad/2, pad/3, pad/4, trim/1, trim/2, trim/3, chomp/1, + take/2, take/3, take/4, + lexemes/2, nth_lexeme/3, + uppercase/1, lowercase/1, titlecase/1,casefold/1, + prefix/2, + split/2,split/3,replace/3,replace/4, + find/2,find/3, + next_codepoint/1, next_grapheme/1 + ]). + +%% Old (will be deprecated) lists/string API kept for backwards compatibility +-export([len/1, concat/2, % equal/2, (extended in the new api) + chr/2,rchr/2,str/2,rstr/2, + span/2,cspan/2,substr/2,substr/3, tokens/2, + chars/2,chars/3]). -export([copies/2,words/1,words/2,strip/1,strip/2,strip/3, sub_word/2,sub_word/3,left/2,left/3,right/2,right/3, sub_string/2,sub_string/3,centre/2,centre/3, join/2]). -export([to_upper/1, to_lower/1]). +%% +-import(lists,[member/2]). --import(lists,[reverse/1,member/2]). +-compile({no_auto_import,[length/1]}). -%%--------------------------------------------------------------------------- +-export_type([grapheme_cluster/0]). -%%% BIFs +-type grapheme_cluster() :: char() | [char()]. +-type direction() :: 'leading' | 'trailing'. +%%% BIFs -export([to_float/1, to_integer/1]). +-dialyzer({no_improper_lists, stack/2}). -spec to_float(String) -> {Float, Rest} | {error, Reason} when String :: string(), @@ -54,6 +104,1180 @@ to_integer(_) -> %%% End of BIFs +%% Check if string is the empty string +-spec is_empty(String::unicode:chardata()) -> boolean(). +is_empty([]) -> true; +is_empty(<<>>) -> true; +is_empty([L|R]) -> is_empty(L) andalso is_empty(R); +is_empty(_) -> false. + +%% Count the number of grapheme clusters in chardata +-spec length(String::unicode:chardata()) -> non_neg_integer(). +length(CD) -> + length_1(unicode_util:gc(CD), 0). + +%% Convert a string to a list of grapheme clusters +-spec to_graphemes(String::unicode:chardata()) -> [grapheme_cluster()]. 
+to_graphemes(CD0) -> + case unicode_util:gc(CD0) of + [GC|CD] -> [GC|to_graphemes(CD)]; + [] -> [] + end. + +%% Compare two strings return boolean, assumes that the input are +%% normalized to same form, see unicode:characters_to_nfX_xxx(..) +-spec equal(A, B) -> boolean() when + A::unicode:chardata(), + B::unicode:chardata(). +equal(A,B) when is_binary(A), is_binary(B) -> + A =:= B; +equal(A,B) -> + equal_1(A,B). + +%% Compare two strings return boolean, assumes that the input are +%% normalized to same form, see unicode:characters_to_nfX_xxx(..) +%% does casefold on the fly +-spec equal(A, B, IgnoreCase) -> boolean() when + A::unicode:chardata(), + B::unicode:chardata(), + IgnoreCase :: boolean(). +equal(A, B, false) -> + equal(A,B); +equal(A, B, true) -> + equal_nocase(A,B). + +%% Compare two strings return boolean +%% if specified does casefold and normalization on the fly +-spec equal(A, B, IgnoreCase, Norm) -> boolean() when + A :: unicode:chardata(), + B :: unicode:chardata(), + IgnoreCase :: boolean(), + Norm :: 'none' | 'nfc' | 'nfd' | 'nfkc' | 'nfkd'. +equal(A, B, Case, none) -> + equal(A,B,Case); +equal(A, B, false, Norm) -> + equal_norm(A, B, Norm); +equal(A, B, true, Norm) -> + equal_norm_nocase(A, B, Norm). + +%% Reverse grapheme clusters +-spec reverse(String::unicode:chardata()) -> [grapheme_cluster()]. +reverse(CD) -> + reverse_1(CD, []). + +%% Slice a string and return rest of string +%% Note: counts grapheme_clusters +-spec slice(String, Start) -> Slice when + String::unicode:chardata(), + Start :: non_neg_integer(), + Slice :: unicode:chardata(). +slice(CD, N) when is_integer(N), N >= 0 -> + slice_l(CD, N, is_binary(CD)). + +-spec slice(String, Start, Length) -> Slice when + String::unicode:chardata(), + Start :: non_neg_integer(), + Length :: 'infinity' | non_neg_integer(), + Slice :: unicode:chardata(). 
+slice(CD, N, Length) + when is_integer(N), N >= 0, is_integer(Length), Length > 0 -> + slice_trail(slice_l(CD, N, is_binary(CD)), Length); +slice(CD, N, infinity) -> + slice_l(CD, N, is_binary(CD)); +slice(CD, _, 0) -> + case is_binary(CD) of + true -> <<>>; + false -> [] + end. + +%% Pad a string to desired length +-spec pad(String, Length) -> unicode:charlist() when + String ::unicode:chardata(), + Length :: integer(). +pad(CD, Length) -> + pad(CD, Length, trailing, $\s). + +-spec pad(String, Length, Dir) -> unicode:charlist() when + String ::unicode:chardata(), + Length :: integer(), + Dir :: direction() | 'both'. +pad(CD, Length, Dir) -> + pad(CD, Length, Dir, $\s). + +-spec pad(String, Length, Dir, Char) -> unicode:charlist() when + String ::unicode:chardata(), + Length :: integer(), + Dir :: direction() | 'both', + Char :: grapheme_cluster(). +pad(CD, Length, leading, Char) when is_integer(Length) -> + Len = length(CD), + [lists:duplicate(max(0, Length-Len), Char), CD]; +pad(CD, Length, trailing, Char) when is_integer(Length) -> + Len = length(CD), + [CD|lists:duplicate(max(0, Length-Len), Char)]; +pad(CD, Length, both, Char) when is_integer(Length) -> + Len = length(CD), + Size = max(0, Length-Len), + Pre = lists:duplicate(Size div 2, Char), + Post = case Size rem 2 of + 1 -> [Char]; + _ -> [] + end, + [Pre, CD, Pre|Post]. + +%% Strip characters from whitespace or Separator in Direction +-spec trim(String) -> unicode:chardata() when + String :: unicode:chardata(). +trim(Str) -> + trim(Str, both, unicode_util:whitespace()). + +-spec trim(String, Dir) -> unicode:chardata() when + String :: unicode:chardata(), + Dir :: direction() | 'both'. +trim(Str, Dir) -> + trim(Str, Dir, unicode_util:whitespace()). + +-spec trim(String, Dir, Characters) -> unicode:chardata() when + String :: unicode:chardata(), + Dir :: direction() | 'both', + Characters :: [grapheme_cluster()]. 
+trim(Str, _, []) -> Str; +trim(Str, leading, Sep) when is_list(Sep) -> + trim_l(Str, search_pattern(Sep)); +trim(Str, trailing, Sep) when is_list(Sep) -> + trim_t(Str, 0, search_pattern(Sep)); +trim(Str, both, Sep0) when is_list(Sep0) -> + Sep = search_pattern(Sep0), + trim_t(trim_l(Str,Sep), 0, Sep). + +%% Delete trailing newlines or \r\n +-spec chomp(String::unicode:chardata()) -> unicode:chardata(). +chomp(Str) -> + trim_t(Str,0, {[[$\r,$\n],$\n], [$\r,$\n], [<<$\r>>,<<$\n>>]}). + +%% Split String into two parts where the leading part consists of Characters +-spec take(String, Characters) -> {Leading, Trailing} when + String::unicode:chardata(), + Characters::[grapheme_cluster()], + Leading::unicode:chardata(), + Trailing::unicode:chardata(). +take(Str, Sep) -> + take(Str, Sep, false, leading). +-spec take(String, Characters, Complement) -> {Leading, Trailing} when + String::unicode:chardata(), + Characters::[grapheme_cluster()], + Complement::boolean(), + Leading::unicode:chardata(), + Trailing::unicode:chardata(). +take(Str, Sep, Complement) -> + take(Str, Sep, Complement, leading). +-spec take(String, Characters, Complement, Dir) -> {Leading, Trailing} when + String::unicode:chardata(), + Characters::[grapheme_cluster()], + Complement::boolean(), + Dir::direction(), + Leading::unicode:chardata(), + Trailing::unicode:chardata(). +take(Str, [], Complement, Dir) -> + Empty = case is_binary(Str) of true -> <<>>; false -> [] end, + case {Complement,Dir} of + {false, leading} -> {Empty, Str}; + {false, trailing} -> {Str, Empty}; + {true, leading} -> {Str, Empty}; + {true, trailing} -> {Empty, Str} + end; +take(Str, Sep0, false, leading) -> + Sep = search_pattern(Sep0), + take_l(Str, Sep, []); +take(Str, Sep0, true, leading) -> + Sep = search_pattern(Sep0), + take_lc(Str, Sep, []); +take(Str, Sep0, false, trailing) -> + Sep = search_pattern(Sep0), + take_t(Str, 0, Sep); +take(Str, Sep0, true, trailing) -> + Sep = search_pattern(Sep0), + take_tc(Str, 0, Sep). 
+ +%% Uppercase all chars in Str +-spec uppercase(String::unicode:chardata()) -> unicode:chardata(). +uppercase(CD) when is_list(CD) -> + uppercase_list(CD); +uppercase(CD) when is_binary(CD) -> + uppercase_bin(CD,<<>>). + +%% Lowercase all chars in Str +-spec lowercase(String::unicode:chardata()) -> unicode:chardata(). +lowercase(CD) when is_list(CD) -> + lowercase_list(CD); +lowercase(CD) when is_binary(CD) -> + lowercase_bin(CD,<<>>). + +%% Make a titlecase of the first char in Str +-spec titlecase(String::unicode:chardata()) -> unicode:chardata(). +titlecase(CD) when is_list(CD) -> + case unicode_util:titlecase(CD) of + [GC|Tail] -> append(GC,Tail); + Empty -> Empty + end; +titlecase(CD) when is_binary(CD) -> + case unicode_util:titlecase(CD) of + [CP|Chars] when is_integer(CP) -> <>; + [CPs|Chars] -> + << << <> || CP <- CPs>>/binary, Chars/binary>>; + [] -> <<>> + end. + +%% Make a comparable string of the Str should be used for equality tests only +-spec casefold(String::unicode:chardata()) -> unicode:chardata(). +casefold(CD) when is_list(CD) -> + casefold_list(CD); +casefold(CD) when is_binary(CD) -> + casefold_bin(CD,<<>>). + +%% Return the remaining string with prefix removed or else nomatch +-spec prefix(String::unicode:chardata(), Prefix::unicode:chardata()) -> + 'nomatch' | unicode:chardata(). +prefix(Str, []) -> Str; +prefix(Str, Prefix0) -> + Prefix = unicode:characters_to_list(Prefix0), + case prefix_1(Str, Prefix) of + [] when is_binary(Str) -> <<>>; + Res -> Res + end. + +%% split String with the first occurrence of SearchPattern, return list of splits +-spec split(String, SearchPattern) -> [unicode:chardata()] when + String :: unicode:chardata(), + SearchPattern :: unicode:chardata(). +split(String, SearchPattern) -> + split(String, SearchPattern, leading). 
+ +%% split String with SearchPattern, return list of splits +-spec split(String, SearchPattern, Where) -> [unicode:chardata()] when + String :: unicode:chardata(), + SearchPattern :: unicode:chardata(), + Where :: direction() | 'all'. +split(String, SearchPattern, Where) -> + case is_empty(SearchPattern) of + true -> [String]; + false -> + SearchPatternCPs = unicode:characters_to_list(SearchPattern), + case split_1(String, SearchPatternCPs, 0, Where, [], []) of + {_Curr, []} -> [String]; + {_Curr, Acc} when Where =:= trailing -> Acc; + {Curr, Acc} when Where =:= all -> lists:reverse([Curr|Acc]); + Acc when is_list(Acc) -> Acc + end + end. + +%% Replace the first SearchPattern in String with Replacement +-spec replace(String, SearchPattern, Replacement) -> + [unicode:chardata()] when + String :: unicode:chardata(), + SearchPattern :: unicode:chardata(), + Replacement :: unicode:chardata(). +replace(String, SearchPattern, Replacement) -> + lists:join(Replacement, split(String, SearchPattern)). + +%% Replace Where SearchPattern in String with Replacement +-spec replace(String, SearchPattern, Replacement, Where) -> + [unicode:chardata()] when + String :: unicode:chardata(), + SearchPattern :: unicode:chardata(), + Replacement :: unicode:chardata(), + Where :: direction() | 'all'. +replace(String, SearchPattern, Replacement, Where) -> + lists:join(Replacement, split(String, SearchPattern, Where)). + +%% Split Str into a list of chardata separated by one of the grapheme +%% clusters in Seps +-spec lexemes(String::unicode:chardata(), + SeparatorList::[grapheme_cluster()]) -> + [unicode:chardata()]. +lexemes([], _) -> []; +lexemes(Str, Seps0) when is_list(Seps0) -> + Seps = search_pattern(Seps0), + lexemes_m(Str, Seps, []). + +-spec nth_lexeme(String, N, SeparatorList) -> unicode:chardata() when + String::unicode:chardata(), + N::non_neg_integer(), + SeparatorList::[grapheme_cluster()]. 
+ +nth_lexeme(Str, 1, []) -> Str; +nth_lexeme(Str, N, Seps0) when is_list(Seps0), is_integer(N), N > 0 -> + Seps = search_pattern(Seps0), + nth_lexeme_m(Str, Seps, N). + +%% find first SearchPattern in String return rest of string +-spec find(String, SearchPattern) -> unicode:chardata() | 'nomatch' when + String::unicode:chardata(), + SearchPattern::unicode:chardata(). +find(String, SearchPattern) -> + find(String, SearchPattern, leading). + +%% find SearchPattern in String (search in Dir direction) return rest of string +-spec find(String, SearchPattern, Dir) -> unicode:chardata() | 'nomatch' when + String::unicode:chardata(), + SearchPattern::unicode:chardata(), + Dir::direction(). +find(String, "", _) -> String; +find(String, <<>>, _) -> String; +find(String, SearchPattern, leading) -> + find_l(String, unicode:characters_to_list(SearchPattern)); +find(String, SearchPattern, trailing) -> + find_r(String, unicode:characters_to_list(SearchPattern), nomatch). + +%% Fetch first grapheme cluster and return rest in tail +-spec next_grapheme(String::unicode:chardata()) -> + maybe_improper_list(grapheme_cluster(),unicode:chardata()). +next_grapheme(CD) -> unicode_util:gc(CD). + +%% Fetch first codepoint and return rest in tail +-spec next_codepoint(String::unicode:chardata()) -> + maybe_improper_list(char(),unicode:chardata()). +next_codepoint(CD) -> unicode_util:cp(CD). + +%% Internals + +length_1([_|Rest], N) -> + length_1(unicode_util:gc(Rest), N+1); +length_1([], N) -> + N. + +equal_1([A|AR], [B|BR]) when is_integer(A), is_integer(B) -> + A =:= B andalso equal_1(AR, BR); +equal_1([], BR) -> is_empty(BR); +equal_1(A0,B0) -> + case {unicode_util:cp(A0), unicode_util:cp(B0)} of + {[CP|A],[CP|B]} -> equal_1(A,B); + {[], []} -> true; + _ -> false + end. 
+ +equal_nocase(A, A) -> true; +equal_nocase(A0, B0) -> + case {unicode_util:cp(unicode_util:casefold(A0)), + unicode_util:cp(unicode_util:casefold(B0))} of + {[CP|A],[CP|B]} -> equal_nocase(A,B); + {[], []} -> true; + _ -> false + end. + +equal_norm(A, A, _Norm) -> true; +equal_norm(A0, B0, Norm) -> + case {unicode_util:cp(unicode_util:Norm(A0)), + unicode_util:cp(unicode_util:Norm(B0))} of + {[CP|A],[CP|B]} -> equal_norm(A,B, Norm); + {[], []} -> true; + _ -> false + end. + +equal_norm_nocase(A, A, _Norm) -> true; +equal_norm_nocase(A0, B0, Norm) -> + case {unicode_util:cp(unicode_util:casefold(unicode_util:Norm(A0))), + unicode_util:cp(unicode_util:casefold(unicode_util:Norm(B0)))} of + {[CP|A],[CP|B]} -> equal_norm_nocase(A,B, Norm); + {[], []} -> true; + _ -> false + end. + +reverse_1(CD, Acc) -> + case unicode_util:gc(CD) of + [GC|Rest] -> reverse_1(Rest, [GC|Acc]); + [] -> Acc + end. + +slice_l(CD, N, Binary) when N > 0 -> + case unicode_util:gc(CD) of + [_|Cont] -> slice_l(Cont, N-1, Binary); + [] when Binary -> <<>>; + [] -> [] + end; +slice_l(Cont, 0, Binary) -> + case is_empty(Cont) of + true when Binary -> <<>>; + _ -> Cont + end. + +slice_trail(CD, N) when is_list(CD) -> + slice_list(CD, N); +slice_trail(CD, N) when is_binary(CD) -> + slice_bin(CD, N, CD). + +slice_list(CD, N) when N > 0 -> + case unicode_util:gc(CD) of + [GC|Cont] -> append(GC, slice_list(Cont, N-1)); + [] -> [] + end; +slice_list(_, 0) -> + []. + +slice_bin(CD, N, Orig) when N > 0 -> + case unicode_util:gc(CD) of + [_|Cont] -> slice_bin(Cont, N-1, Orig); + [] -> Orig + end; +slice_bin([], 0, Orig) -> + Orig; +slice_bin(CD, 0, Orig) -> + Sz = byte_size(Orig) - byte_size(CD), + <> = Orig, + Keep. + +uppercase_list(CPs0) -> + case unicode_util:uppercase(CPs0) of + [Char|CPs] -> append(Char,uppercase_list(CPs)); + [] -> [] + end. 
+ +uppercase_bin(CPs0, Acc) -> + case unicode_util:uppercase(CPs0) of + [Char|CPs] when is_integer(Char) -> + uppercase_bin(CPs, <>); + [Chars|CPs] -> + uppercase_bin(CPs, <> || CP <- Chars>>/binary >>); + [] -> Acc + end. + +lowercase_list(CPs0) -> + case unicode_util:lowercase(CPs0) of + [Char|CPs] -> append(Char,lowercase_list(CPs)); + [] -> [] + end. + +lowercase_bin(CPs0, Acc) -> + case unicode_util:lowercase(CPs0) of + [Char|CPs] when is_integer(Char) -> + lowercase_bin(CPs, <>); + [Chars|CPs] -> + lowercase_bin(CPs, <> || CP <- Chars>>/binary >>); + [] -> Acc + end. + +casefold_list(CPs0) -> + case unicode_util:casefold(CPs0) of + [Char|CPs] -> append(Char, casefold_list(CPs)); + [] -> [] + end. + +casefold_bin(CPs0, Acc) -> + case unicode_util:casefold(CPs0) of + [Char|CPs] when is_integer(Char) -> + casefold_bin(CPs, <>); + [Chars|CPs] -> + casefold_bin(CPs, <> || CP <- Chars>>/binary >>); + [] -> Acc + end. + + +trim_l([Bin|Cont0], Sep) when is_binary(Bin) -> + case bin_search_inv(Bin, Cont0, Sep) of + {nomatch, Cont} -> trim_l(Cont, Sep); + Keep -> Keep + end; +trim_l(Str, {GCs, _, _}=Sep) when is_list(Str) -> + case unicode_util:gc(Str) of + [C|Cs] -> + case lists:member(C, GCs) of + true -> trim_l(Cs, Sep); + false -> Str + end; + [] -> [] + end; +trim_l(Bin, Sep) when is_binary(Bin) -> + case bin_search_inv(Bin, [], Sep) of + {nomatch,_} -> <<>>; + [Keep] -> Keep + end. 
+ +trim_t([Bin|Cont0], N, Sep) when is_binary(Bin) -> + <<_:N/binary, Rest/binary>> = Bin, + case bin_search(Rest, Cont0, Sep) of + {nomatch,_} -> + stack(Bin, trim_t(Cont0, 0, Sep)); + [SepStart|Cont1] -> + case bin_search_inv(SepStart, Cont1, Sep) of + {nomatch, Cont} -> + Tail = trim_t(Cont, 0, Sep), + case is_empty(Tail) of + true -> + KeepSz = byte_size(Bin) - byte_size(SepStart), + <> = Bin, + Keep; + false -> + Used = cp_prefix(Cont0, Cont), + stack(Bin, stack(Used, Tail)) + end; + [NonSep|Cont] when is_binary(NonSep) -> + KeepSz = byte_size(Bin) - byte_size(NonSep), + trim_t([Bin|Cont], KeepSz, Sep) + end + end; +trim_t(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) -> + case unicode_util:cp(Str) of + [CP|Cs] -> + case lists:member(CP, CPs) of + true -> + [GC|Cs1] = unicode_util:gc(Str), + case lists:member(GC, GCs) of + true -> + Tail = trim_t(Cs1, 0, Sep), + case is_empty(Tail) of + true -> []; + false -> append(GC,Tail) + end; + false -> + append(GC,trim_t(Cs1, 0, Sep)) + end; + false -> + append(CP,trim_t(Cs, 0, Sep)) + end; + [] -> [] + end; +trim_t(Bin, N, Sep) when is_binary(Bin) -> + <<_:N/binary, Rest/binary>> = Bin, + case bin_search(Rest, Sep) of + {nomatch,_} -> Bin; + [SepStart] -> + case bin_search_inv(SepStart, [], Sep) of + {nomatch,_} -> + KeepSz = byte_size(Bin) - byte_size(SepStart), + <> = Bin, + Keep; + [NonSep] -> + KeepSz = byte_size(Bin) - byte_size(NonSep), + trim_t(Bin, KeepSz, Sep) + end + end. 
+ +take_l([Bin|Cont0], Sep, Acc) when is_binary(Bin) -> + case bin_search_inv(Bin, Cont0, Sep) of + {nomatch, Cont} -> + Used = cp_prefix(Cont0, Cont), + take_l(Cont, Sep, [unicode:characters_to_binary([Bin|Used])|Acc]); + [Bin1|_]=After when is_binary(Bin1) -> + First = byte_size(Bin) - byte_size(Bin1), + <> = Bin, + {btoken(Keep,Acc), After} + end; +take_l(Str, {GCs, _, _}=Sep, Acc) when is_list(Str) -> + case unicode_util:gc(Str) of + [C|Cs] -> + case lists:member(C, GCs) of + true -> take_l(Cs, Sep, append(rev(C),Acc)); + false -> {rev(Acc), Str} + end; + [] -> {rev(Acc), []} + end; +take_l(Bin, Sep, Acc) when is_binary(Bin) -> + case bin_search_inv(Bin, [], Sep) of + {nomatch,_} -> + {btoken(Bin, Acc), <<>>}; + [After] -> + First = byte_size(Bin) - byte_size(After), + <> = Bin, + {btoken(Keep, Acc), After} + end. + +take_lc([Bin|Cont0], Sep, Acc) when is_binary(Bin) -> + case bin_search(Bin, Cont0, Sep) of + {nomatch, Cont} -> + Used = cp_prefix(Cont0, Cont), + take_lc(Cont, Sep, [unicode:characters_to_binary([Bin|Used])|Acc]); + [Bin1|_]=After when is_binary(Bin1) -> + First = byte_size(Bin) - byte_size(Bin1), + <> = Bin, + {btoken(Keep,Acc), After} + end; +take_lc(Str, {GCs, _, _}=Sep, Acc) when is_list(Str) -> + case unicode_util:gc(Str) of + [C|Cs] -> + case lists:member(C, GCs) of + false -> take_lc(Cs, Sep, append(rev(C),Acc)); + true -> {rev(Acc), Str} + end; + [] -> {rev(Acc), []} + end; +take_lc(Bin, Sep, Acc) when is_binary(Bin) -> + case bin_search(Bin, [], Sep) of + {nomatch,_} -> + {btoken(Bin, Acc), <<>>}; + [After] -> + First = byte_size(Bin) - byte_size(After), + <> = Bin, + {btoken(Keep, Acc), After} + end. 
+ +take_t([Bin|Cont0], N, Sep) when is_binary(Bin) -> + <<_:N/binary, Rest/binary>> = Bin, + case bin_search(Rest, Cont0, Sep) of + {nomatch,Cont} -> + Used = cp_prefix(Cont0, Cont), + {Head, Tail} = take_t(Cont, 0, Sep), + {stack(unicode:characters_to_binary([Bin|Used]), Head), Tail}; + [SepStart|Cont1] -> + case bin_search_inv(SepStart, Cont1, Sep) of + {nomatch, Cont} -> + {Head, Tail} = take_t(Cont, 0, Sep), + Used = cp_prefix(Cont0, Cont), + case equal(Tail, Cont) of + true -> + KeepSz = byte_size(Bin) - byte_size(SepStart), + <> = Bin, + {stack(Keep,Head), stack(stack(End,Used),Tail)}; + false -> + {stack(unicode:characters_to_binary([Bin|Used]),Head), Tail} + end; + [NonSep|Cont] when is_binary(NonSep) -> + KeepSz = byte_size(Bin) - byte_size(NonSep), + take_t([Bin|Cont], KeepSz, Sep) + end + end; +take_t(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) -> + case unicode_util:cp(Str) of + [CP|Cs] -> + case lists:member(CP, CPs) of + true -> + [GC|Cs1] = unicode_util:gc(Str), + case lists:member(GC, GCs) of + true -> + {Head, Tail} = take_t(Cs1, 0, Sep), + case equal(Tail, Cs1) of + true -> {Head, append(GC,Tail)}; + false -> {append(GC,Head), Tail} + end; + false -> + {Head, Tail} = take_t(Cs, 0, Sep), + {append(CP,Head), Tail} + end; + false -> + {Head, Tail} = take_t(Cs, 0, Sep), + {append(CP,Head), Tail} + end; + [] -> {[],[]} + end; +take_t(Bin, N, Sep) when is_binary(Bin) -> + <<_:N/binary, Rest/binary>> = Bin, + case bin_search(Rest, Sep) of + {nomatch,_} -> {Bin, <<>>}; + [SepStart] -> + case bin_search_inv(SepStart, [], Sep) of + {nomatch,_} -> + KeepSz = byte_size(Bin) - byte_size(SepStart), + <> = Bin, + {Before, End}; + [NonSep] -> + KeepSz = byte_size(Bin) - byte_size(NonSep), + take_t(Bin, KeepSz, Sep) + end + end. 
+ +take_tc([Bin|Cont0], N, Sep) when is_binary(Bin) -> + <<_:N/binary, Rest/binary>> = Bin, + case bin_search_inv(Rest, Cont0, Sep) of + {nomatch,Cont} -> + Used = cp_prefix(Cont0, Cont), + {Head, Tail} = take_tc(Cont, 0, Sep), + {stack(unicode:characters_to_binary([Bin|Used]), Head), Tail}; + [SepStart|Cont1] -> + case bin_search(SepStart, Cont1, Sep) of + {nomatch, Cont} -> + {Head, Tail} = take_tc(Cont, 0, Sep), + Used = cp_prefix(Cont0, Cont), + case equal(Tail, Cont) of + true -> + KeepSz = byte_size(Bin) - byte_size(SepStart), + <> = Bin, + {stack(Keep,Head), stack(stack(End,Used),Tail)}; + false -> + {stack(unicode:characters_to_binary([Bin|Used]),Head), Tail} + end; + [NonSep|Cont] when is_binary(NonSep) -> + KeepSz = byte_size(Bin) - byte_size(NonSep), + take_tc([Bin|Cont], KeepSz, Sep) + end + end; +take_tc(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) -> + case unicode_util:cp(Str) of + [CP|Cs] -> + case lists:member(CP, CPs) of + true -> + [GC|Cs1] = unicode_util:gc(Str), + case lists:member(GC, GCs) of + false -> + {Head, Tail} = take_tc(Cs1, 0, Sep), + case equal(Tail, Cs1) of + true -> {Head, append(GC,Tail)}; + false -> {append(GC,Head), Tail} + end; + true -> + {Head, Tail} = take_tc(Cs1, 0, Sep), + {append(GC,Head), Tail} + end; + false -> + {Head, Tail} = take_tc(Cs, 0, Sep), + case equal(Tail, Cs) of + true -> {Head, append(CP,Tail)}; + false -> {append(CP,Head), Tail} + end + end; + [] -> {[],[]} + end; +take_tc(Bin, N, Sep) when is_binary(Bin) -> + <<_:N/binary, Rest/binary>> = Bin, + case bin_search_inv(Rest, [], Sep) of + {nomatch,_} -> {Bin, <<>>}; + [SepStart] -> + case bin_search(SepStart, [], Sep) of + {nomatch,_} -> + KeepSz = byte_size(Bin) - byte_size(SepStart), + <> = Bin, + {Before, End}; + [NonSep] -> + KeepSz = byte_size(Bin) - byte_size(NonSep), + take_tc(Bin, KeepSz, Sep) + end + end. 
+ +prefix_1(Cs, []) -> Cs; +prefix_1(Cs, [_]=Pre) -> + prefix_2(unicode_util:gc(Cs), Pre); +prefix_1(Cs, Pre) -> + prefix_2(unicode_util:cp(Cs), Pre). + +prefix_2([C|Cs], [C|Pre]) -> + prefix_1(Cs, Pre); +prefix_2(_, _) -> + nomatch. + +split_1([Bin|Cont0], Needle, Start, Where, Curr0, Acc) + when is_binary(Bin) -> + case bin_search_str(Bin, Start, Cont0, Needle) of + {nomatch,Sz,Cont} -> + <> = Bin, + split_1(Cont, Needle, 0, Where, [Keep|Curr0], Acc); + {Before, [Cs0|Cont], After} -> + Curr = add_non_empty(Before,Curr0), + case Where of + leading -> + [rev(Curr),After]; + trailing -> + <<_/utf8, Cs/binary>> = Cs0, + Next = byte_size(Bin) - byte_size(Cs), + split_1([Bin|Cont], Needle, Next, Where, + Curr0, [rev(Curr),After]); + all -> + split_1(After, Needle, 0, Where, [], [rev(Curr)|Acc]) + end + end; +split_1(Cs0, [C|_]=Needle, _, Where, Curr, Acc) when is_list(Cs0) -> + case unicode_util:cp(Cs0) of + [C|Cs] -> + case prefix_1(Cs0, Needle) of + nomatch -> split_1(Cs, Needle, 0, Where, append(C,Curr), Acc); + Rest when Where =:= leading -> + [rev(Curr), Rest]; + Rest when Where =:= trailing -> + split_1(Cs, Needle, 0, Where, [C|Curr], [rev(Curr), Rest]); + Rest when Where =:= all -> + split_1(Rest, Needle, 0, Where, [], [rev(Curr)|Acc]) + end; + [Other|Cs] -> + split_1(Cs, Needle, 0, Where, append(Other,Curr), Acc); + [] -> + {rev(Curr), Acc} + end; +split_1(Bin, [_C|_]=Needle, Start, Where, Curr0, Acc) -> + case bin_search_str(Bin, Start, [], Needle) of + {nomatch,_,_} -> + <<_:Start/binary, Keep/binary>> = Bin, + {rev([Keep|Curr0]), Acc}; + {Before, [Cs0], After} -> + case Where of + leading -> + [rev([Before|Curr0]),After]; + trailing -> + <<_/utf8, Cs/binary>> = Cs0, + Next = byte_size(Bin) - byte_size(Cs), + split_1(Bin, Needle, Next, Where, Curr0, + [btoken(Before,Curr0),After]); + all -> + Next = byte_size(Bin) - byte_size(After), + <<_:Start/binary, Keep/binary>> = Before, + Curr = [Keep|Curr0], + split_1(Bin, Needle, Next, Where, [], [rev(Curr)|Acc]) + 
end + end. + +lexemes_m([Bin|Cont0], Seps, Ts) when is_binary(Bin) -> + case bin_search_inv(Bin, Cont0, Seps) of + {nomatch,Cont} -> + lexemes_m(Cont, Seps, Ts); + Cs -> + {Lexeme,Rest} = lexeme_pick(Cs, Seps, []), + lexemes_m(Rest, Seps, [Lexeme|Ts]) + end; +lexemes_m(Cs0, {GCs, _, _}=Seps, Ts) when is_list(Cs0) -> + case unicode_util:gc(Cs0) of + [C|Cs] -> + case lists:member(C, GCs) of + true -> + lexemes_m(Cs, Seps, Ts); + false -> + {Lexeme,Rest} = lexeme_pick(Cs0, Seps, []), + lexemes_m(Rest, Seps, [Lexeme|Ts]) + end; + [] -> + lists:reverse(Ts) + end; +lexemes_m(Bin, Seps, Ts) when is_binary(Bin) -> + case bin_search_inv(Bin, [], Seps) of + {nomatch,_} -> + lists:reverse(Ts); + [Cs] -> + {Lexeme,Rest} = lexeme_pick(Cs, Seps, []), + lexemes_m(Rest, Seps, add_non_empty(Lexeme,Ts)) + end. + +lexeme_pick([CP|Cs1]=Cs0, {GCs,CPs,_}=Seps, Tkn) when is_integer(CP) -> + case lists:member(CP, CPs) of + true -> + [GC|Cs2] = unicode_util:gc(Cs0), + case lists:member(GC, GCs) of + true -> {rev(Tkn), Cs2}; + false -> lexeme_pick(Cs2, Seps, append(rev(GC),Tkn)) + end; + false -> lexeme_pick(Cs1, Seps, [CP|Tkn]) + end; +lexeme_pick([Bin|Cont0], Seps, Tkn) when is_binary(Bin) -> + case bin_search(Bin, Cont0, Seps) of + {nomatch,_} -> + lexeme_pick(Cont0, Seps, [Bin|Tkn]); + [Left|_Cont] = Cs -> + Bytes = byte_size(Bin) - byte_size(Left), + <> = Bin, + {btoken(Lexeme, Tkn), Cs} + end; +lexeme_pick(Cs0, {GCs, CPs, _} = Seps, Tkn) when is_list(Cs0) -> + case unicode_util:cp(Cs0) of + [CP|Cs] -> + case lists:member(CP, CPs) of + true -> + [GC|Cs2] = unicode_util:gc(Cs0), + case lists:member(GC, GCs) of + true -> {rev(Tkn), Cs0}; + false -> lexeme_pick(Cs2, Seps, append(rev(GC),Tkn)) + end; + false -> + lexeme_pick(Cs, Seps, append(CP,Tkn)) + end; + [] -> + {rev(Tkn), []} + end; +lexeme_pick(Bin, Seps, Tkn) when is_binary(Bin) -> + case bin_search(Bin, Seps) of + {nomatch,_} -> + {btoken(Bin,Tkn), []}; + [Left] -> + Bytes = byte_size(Bin) - byte_size(Left), + <> = Bin, + 
{btoken(Lexeme, Tkn), Left} + end. + +nth_lexeme_m([Bin|Cont0], Seps, N) when is_binary(Bin) -> + case bin_search_inv(Bin, Cont0, Seps) of + {nomatch,Cont} -> + nth_lexeme_m(Cont, Seps, N); + Cs when N > 1 -> + Rest = lexeme_skip(Cs, Seps), + nth_lexeme_m(Rest, Seps, N-1); + Cs -> + {Lexeme,_} = lexeme_pick(Cs, Seps, []), + Lexeme + end; +nth_lexeme_m(Cs0, {GCs, _, _}=Seps, N) when is_list(Cs0) -> + case unicode_util:gc(Cs0) of + [C|Cs] -> + case lists:member(C, GCs) of + true -> + nth_lexeme_m(Cs, Seps, N); + false when N > 1 -> + Cs1 = lexeme_skip(Cs, Seps), + nth_lexeme_m(Cs1, Seps, N-1); + false -> + {Lexeme,_} = lexeme_pick(Cs0, Seps, []), + Lexeme + end; + [] -> + [] + end; +nth_lexeme_m(Bin, Seps, N) when is_binary(Bin) -> + case bin_search_inv(Bin, [], Seps) of + [Cs] when N > 1 -> + Cs1 = lexeme_skip(Cs, Seps), + nth_lexeme_m(Cs1, Seps, N-1); + [Cs] -> + {Lexeme,_} = lexeme_pick(Cs, Seps, []), + Lexeme; + {nomatch,_} -> + <<>> + end. + +lexeme_skip([CP|Cs1]=Cs0, {GCs,CPs,_}=Seps) when is_integer(CP) -> + case lists:member(CP, CPs) of + true -> + [GC|Cs2] = unicode_util:gc(Cs0), + case lists:member(GC, GCs) of + true -> Cs0; + false -> lexeme_skip(Cs2, Seps) + end; + false -> + lexeme_skip(Cs1, Seps) + end; +lexeme_skip([Bin|Cont0], Seps) when is_binary(Bin) -> + case bin_search(Bin, Cont0, Seps) of + {nomatch,_} -> lexeme_skip(Cont0, Seps); + Cs -> Cs + end; +lexeme_skip(Cs0, {GCs, CPs, _} = Seps) when is_list(Cs0) -> + case unicode_util:cp(Cs0) of + [CP|Cs] -> + case lists:member(CP, CPs) of + true -> + [GC|Cs2] = unicode_util:gc(Cs0), + case lists:member(GC, GCs) of + true -> Cs0; + false -> lexeme_skip(Cs2, Seps) + end; + false -> + lexeme_skip(Cs, Seps) + end; + [] -> + [] + end; +lexeme_skip(Bin, Seps) when is_binary(Bin) -> + case bin_search(Bin, Seps) of + {nomatch,_} -> <<>>; + [Left] -> Left + end. 
+ +find_l([Bin|Cont0], Needle) when is_binary(Bin) -> + case bin_search_str(Bin, 0, Cont0, Needle) of + {nomatch, _, Cont} -> + find_l(Cont, Needle); + {_Before, Cs, _After} -> + Cs + end; +find_l(Cs0, [C|_]=Needle) when is_list(Cs0) -> + case unicode_util:cp(Cs0) of + [C|Cs] -> + case prefix_1(Cs0, Needle) of + nomatch -> find_l(Cs, Needle); + _ -> Cs0 + end; + [_C|Cs] -> + find_l(Cs, Needle); + [] -> nomatch + end; +find_l(Bin, Needle) -> + case bin_search_str(Bin, 0, [], Needle) of + {nomatch,_,_} -> nomatch; + {_Before, [Cs], _After} -> Cs + end. + +find_r([Bin|Cont0], Needle, Res) when is_binary(Bin) -> + case bin_search_str(Bin, 0, Cont0, Needle) of + {nomatch,_,Cont} -> + find_r(Cont, Needle, Res); + {_, Cs0, _} -> + [_|Cs] = unicode_util:gc(Cs0), + find_r(Cs, Needle, Cs0) + end; +find_r(Cs0, [C|_]=Needle, Res) when is_list(Cs0) -> + case unicode_util:cp(Cs0) of + [C|Cs] -> + case prefix_1(Cs0, Needle) of + nomatch -> find_r(Cs, Needle, Res); + _ -> find_r(Cs, Needle, Cs0) + end; + [_C|Cs] -> + find_r(Cs, Needle, Res); + [] -> Res + end; +find_r(Bin, Needle, Res) -> + case bin_search_str(Bin, 0, [], Needle) of + {nomatch,_,_} -> Res; + {_Before, [Cs0], _After} -> + <<_/utf8, Cs/binary>> = Cs0, + find_r(Cs, Needle, Cs0) + end. + +%% These are used to avoid creating lists around binaries +%% might be unnecessary, is there a better solution? +btoken(Token, []) -> Token; +btoken(BinPart, [C]) when is_integer(C) -> <>; +btoken(<<>>, Tkn) -> lists:reverse(Tkn); +btoken(BinPart, Cs) -> [lists:reverse(Cs),BinPart]. + +rev([B]) when is_binary(B) -> B; +rev(L) when is_list(L) -> lists:reverse(L); +rev(C) when is_integer(C) -> C. + +append(Char, <<>>) when is_integer(Char) -> [Char]; +append(Char, <<>>) when is_list(Char) -> Char; +append(Char, Bin) when is_binary(Bin) -> [Char,Bin]; +append(Char, Str) when is_integer(Char) -> [Char|Str]; +append(GC, Str) when is_list(GC) -> GC ++ Str. 
%% stack(Item, Stack): push Item on the continuation Stack, skipping
%% empty items and avoiding a wrapping list when the stack is empty.
stack(Bin, []) -> Bin;
stack(<<>>, St) -> St;
stack([], St) -> St;
stack(Bin, St) -> [Bin|St].

%% add_non_empty(Token, Acc): cons Token onto Acc unless it is the
%% empty binary.
add_non_empty(<<>>, L) -> L;
add_non_empty(Token, L) -> [Token|L].

%% cp_prefix(Orig, Cont): the codepoints of Orig that come before the
%% point where the remainder equals Cont. With an empty Cont, Orig is
%% returned untouched.
cp_prefix(Orig, Cont) ->
    case unicode_util:cp(Cont) of
        [] -> Orig;
        [Cp|Rest] -> cp_prefix_1(Orig, Cp, Rest)
    end.

%% Until is the first codepoint of the continuation and Cont what
%% follows it. There is no clause for an exhausted Orig, so this
%% crashes with case_clause if the continuation never shows up in Orig.
cp_prefix_1(Orig, Until, Cont) ->
    case unicode_util:cp(Orig) of
        [Until|Rest] ->
            %% Candidate position: only stop when the remainder really
            %% is the continuation.
            case equal(Rest, Cont) of
                true -> [];
                false-> [Until|cp_prefix_1(Rest, Until, Cont)]
            end;
        [CP|Rest] -> [CP|cp_prefix_1(Rest, Until, Cont)]
    end.


%% Binary special
%% bin_search(Bin, [Cont,] Pattern): look for a separator in Bin using
%% the {Seps, CPs, BinPatterns} triple built by search_pattern/1.
%% With no separators the answer is always {nomatch, Cont}.
bin_search(Bin, Seps) ->
    bin_search(Bin, [], Seps).

bin_search(_Bin, Cont, {[],_,_}) ->
    {nomatch, Cont};
bin_search(Bin, Cont, {Seps,_,BP}) ->
    bin_search_loop(Bin, 0, BP, Cont, Seps).

%% Need to work with [<<$a>>, <<778/utf8>>],
%% i.e. å in nfd form  $a "COMBINING RING ABOVE"
%% and PREPEND characters like "ARABIC NUMBER SIGN" 1536 <<216,128>>
%% combined with other characters are currently ignored.

%% search_pattern(Seps) -> {Seps, FirstCPs, BinPatterns}
%% FirstCPs holds the first codepoint of every separator and
%% BinPatterns the same codepoints UTF-8 encoded, ready for
%% binary:match/2.
search_pattern(Seps) ->
    CPs = search_cp(Seps),
    Bin = bin_pattern(CPs),
    {Seps, CPs, Bin}.

%% First codepoint of each separator; a separator is either a single
%% codepoint or chardata (e.g. a grapheme cluster).
search_cp([CP|Seps]) when is_integer(CP) ->
    [CP|search_cp(Seps)];
search_cp([Pattern|Seps]) ->
    [CP|_] = unicode_util:cp(Pattern),
    [CP|search_cp(Seps)];
search_cp([]) -> [].

%% UTF-8 encode every codepoint.
%% NOTE(review): the binary constructor was lost in the rendered patch
%% (it showed up as `<>`); restored here, confirm against upstream.
bin_pattern(CPs) ->
    [<<CP/utf8>> || CP <- CPs].
%% bin_search_loop(Bin, Offset, BinPatterns, Cont, Seps)
%% Scan Bin from byte Offset for the first byte sequence in BinPatterns
%% and check that the grapheme cluster found there is a member of Seps.
%% Returns the chardata starting at the separator (always a list), or
%% {nomatch, Continuation}.
bin_search_loop(Whole, Offset, _BinPatterns, Cont, _Seps)
  when byte_size(Whole) =< Offset; Offset < 0 ->
    {nomatch, Cont};
bin_search_loop(Whole, Offset, BinPatterns, Cont, Seps) ->
    <<_:Offset/binary, Tail/binary>> = Whole,
    case binary:match(Tail, BinPatterns) of
        {Pos, _Len} ->
            <<_:Pos/binary, Candidate/binary>> = Tail,
            Stacked = stack(Candidate, Cont),
            [GC|After] = unicode_util:gc(Stacked),
            case {lists:member(GC, Seps), After} of
                {true, _} when is_list(Stacked) ->
                    Stacked;
                {true, _} ->
                    [Stacked];
                %% Only the first codepoint matched, not the whole
                %% cluster. If we are still inside the same binary,
                %% retry just after the rejected cluster.
                {false, [Left|Cont]} when is_binary(Left) ->
                    Resume = byte_size(Whole) - byte_size(Left),
                    bin_search_loop(Whole, Resume, BinPatterns, Cont, Seps);
                {false, Left} when is_binary(Left), Cont =:= [] ->
                    Resume = byte_size(Whole) - byte_size(Left),
                    bin_search_loop(Whole, Resume, BinPatterns, Cont, Seps);
                %% The cluster ran into the continuation; give up on
                %% this binary and hand back what is left.
                {false, _} ->
                    {nomatch, After}
            end;
        nomatch ->
            {nomatch, Cont}
    end.

%% bin_search_inv(Bin, Cont, Pattern)
%% Inverted search: skip leading separators and return the chardata
%% starting at the first grapheme cluster that is not a separator, or
%% {nomatch, Rest} when the binary part is used up first.
bin_search_inv(Bin, Cont, {Seps, _, _}) ->
    Input = [Bin|Cont],
    case Seps of
        [] -> Input;
        [Single] -> bin_search_inv_1(Input, Single);
        _ -> bin_search_inv_n(Input, Seps)
    end.

%% Single-separator variant: the separator is matched directly in the
%% pattern instead of through lists:member/2.
bin_search_inv_1([<<>>|Rest], _) ->
    {nomatch, Rest};
bin_search_inv_1([Head|Cont]=Input, Sep) when is_binary(Head) ->
    case unicode_util:gc(Input) of
        [Sep|Left] when is_binary(Left), Cont =:= [] ->
            bin_search_inv_1([Left], Sep);
        [Sep|[Left|Cont]=Next] when is_binary(Left) ->
            bin_search_inv_1(Next, Sep);
        [Sep|Next] ->
            {nomatch, Next};
        _ ->
            Input
    end.

%% Multi-separator variant.
bin_search_inv_n([<<>>|Rest], _) ->
    {nomatch, Rest};
bin_search_inv_n([Head|Cont]=Input, Seps) when is_binary(Head) ->
    [GC|After] = unicode_util:gc(Input),
    case lists:member(GC, Seps) of
        false ->
            Input;
        true ->
            case After of
                Left when is_binary(Left), Cont =:= [] ->
                    bin_search_inv_n([Left], Seps);
                [Left|Cont] when is_binary(Left) ->
                    bin_search_inv_n(After, Seps);
                _ ->
                    {nomatch, After}
            end
    end.
+ +bin_search_str(Bin0, Start, Cont, [CP|_]=SearchCPs) -> + <<_:Start/binary, Bin/binary>> = Bin0, + case binary:match(Bin, <>) of + nomatch -> {nomatch, byte_size(Bin0), Cont}; + {Where0, _} -> + Where = Start+Where0, + <> = Bin0, + [GC|Cs]=unicode_util:gc(Cs0), + case prefix_1(stack(Cs0,Cont), SearchCPs) of + nomatch when is_binary(Cs) -> + KeepSz = byte_size(Bin0) - byte_size(Cs), + bin_search_str(Bin0, KeepSz, Cont, SearchCPs); + nomatch -> + {nomatch, Where, stack([GC|Cs],Cont)}; + [] -> + {Keep, [Cs0|Cont], <<>>}; + Rest -> + {Keep, [Cs0|Cont], Rest} + end + end. + + +%%--------------------------------------------------------------------------- +%% OLD lists API kept for backwards compability +%%--------------------------------------------------------------------------- + %% Robert's bit %% len(String) @@ -68,12 +1292,12 @@ len(S) -> length(S). %% equal(String1, String2) %% Test if 2 strings are equal. --spec equal(String1, String2) -> boolean() when - String1 :: string(), - String2 :: string(). +%% -spec equal(String1, String2) -> boolean() when +%% String1 :: string(), +%% String2 :: string(). -equal(S, S) -> true; -equal(_, _) -> false. +%% equal(S, S) -> true; +%% equal(_, _) -> false. %% concat(String1, String2) %% Concatenate 2 strings. @@ -127,7 +1351,7 @@ rchr([], _C, _I, L) -> L. str(S, Sub) when is_list(Sub) -> str(S, Sub, 1). str([C|S], [C|Sub], I) -> - case prefix(Sub, S) of + case l_prefix(Sub, S) of true -> I; false -> str(S, [C|Sub], I+1) end; @@ -142,16 +1366,16 @@ str([], _Sub, _I) -> 0. rstr(S, Sub) when is_list(Sub) -> rstr(S, Sub, 1, 0). rstr([C|S], [C|Sub], I, L) -> - case prefix(Sub, S) of + case l_prefix(Sub, S) of true -> rstr(S, [C|Sub], I+1, I); false -> rstr(S, [C|Sub], I+1, L) end; rstr([_|S], Sub, I, L) -> rstr(S, Sub, I+1, L); rstr([], _Sub, _I, L) -> L. -prefix([C|Pre], [C|String]) -> prefix(Pre, String); -prefix([], String) when is_list(String) -> true; -prefix(Pre, String) when is_list(Pre), is_list(String) -> false. 
+l_prefix([C|Pre], [C|String]) -> l_prefix(Pre, String); +l_prefix([], String) when is_list(String) -> true; +l_prefix(Pre, String) when is_list(Pre), is_list(String) -> false. %% span(String, Chars) -> Length. %% cspan(String, Chars) -> Length. @@ -229,9 +1453,9 @@ tokens(S, Seps) -> [_|_] -> [S] end; [C] -> - tokens_single_1(reverse(S), C, []); + tokens_single_1(lists:reverse(S), C, []); [_|_] -> - tokens_multiple_1(reverse(S), Seps, []) + tokens_multiple_1(lists:reverse(S), Seps, []) end. tokens_single_1([Sep|S], Sep, Toks) -> @@ -342,8 +1566,8 @@ sub_word(String, Index, Char) when is_integer(Index), is_integer(Char) -> s_word(strip(String, left, Char), Index, Char, 1, []) end. -s_word([], _, _, _,Res) -> reverse(Res); -s_word([Char|_],Index,Char,Index,Res) -> reverse(Res); +s_word([], _, _, _,Res) -> lists:reverse(Res); +s_word([Char|_],Index,Char,Index,Res) -> lists:reverse(Res); s_word([H|T],Index,Char,Index,Res) -> s_word(T,Index,Char,Index,[H|Res]); s_word([Char|T],Stop,Char,Index,Res) when Index < Stop -> s_word(strip(T,left,Char),Stop,Char,Index+1,Res); @@ -359,7 +1583,7 @@ strip(String) -> strip(String, both). -spec strip(String, Direction) -> Stripped when String :: string(), Stripped :: string(), - Direction :: left | right | both. + Direction :: 'left' | 'right' | 'both'. strip(String, left) -> strip_left(String, $\s); strip(String, right) -> strip_right(String, $\s); @@ -369,7 +1593,7 @@ strip(String, both) -> -spec strip(String, Direction, Character) -> Stripped when String :: string(), Stripped :: string(), - Direction :: left | right | both, + Direction :: 'left' | 'right' | 'both', Character :: char(). strip(String, right, Char) -> strip_right(String, Char); diff --git a/lib/stdlib/test/string_SUITE.erl b/lib/stdlib/test/string_SUITE.erl index 836f9e5142..a78ddf761b 100644 --- a/lib/stdlib/test/string_SUITE.erl +++ b/lib/stdlib/test/string_SUITE.erl @@ -29,25 +29,46 @@ -export([init_per_testcase/2, end_per_testcase/2]). 
%% Test cases must be exported. --export([len/1,equal/1,concat/1,chr_rchr/1,str_rstr/1]). --export([span_cspan/1,substr/1,tokens/1,chars/1]). +-export([is_empty/1, length/1, to_graphemes/1, + reverse/1, slice/1, + equal/1, + pad/1, trim/1, chomp/1, take/1, + uppercase/1, lowercase/1, titlecase/1, casefold/1, + prefix/1, split/1, replace/1, find/1, + lexemes/1, nth_lexeme/1, cd_gc/1, meas/1 + ]). + +-export([len/1,old_equal/1,old_concat/1,chr_rchr/1,str_rstr/1]). +-export([span_cspan/1,substr/1,old_tokens/1,chars/1]). -export([copies/1,words/1,strip/1,sub_word/1,left_right/1]). -export([sub_string/1,centre/1, join/1]). -export([to_integer/1,to_float/1]). -export([to_upper_to_lower/1]). +%% Run tests when debugging them +-export([debug/0]). + suite() -> [{ct_hooks,[ts_install_cth]}, {timetrap,{minutes,1}}]. -all() -> - [len, equal, concat, chr_rchr, str_rstr, span_cspan, - substr, tokens, chars, copies, words, strip, sub_word, - left_right, sub_string, centre, join, to_integer, - to_float, to_upper_to_lower]. +all() -> + [{group, chardata}, {group, list_string}]. -groups() -> - []. +groups() -> + [{chardata, + [is_empty, length, to_graphemes, + equal, reverse, slice, + pad, trim, chomp, take, + lexemes, nth_lexeme, + uppercase, lowercase, titlecase, casefold, + prefix, find, split, replace, cd_gc, + meas]}, + {list_string, + [len, old_equal, old_concat, chr_rchr, str_rstr, span_cspan, + substr, old_tokens, chars, copies, words, strip, sub_word, + left_right, sub_string, centre, join, to_integer, + to_float, to_upper_to_lower]}]. init_per_suite(Config) -> Config. @@ -68,8 +89,839 @@ init_per_testcase(_Case, Config) -> end_per_testcase(_Case, _Config) -> ok. +debug() -> + Config = [{data_dir, ?MODULE_STRING++"_data"}], + [io:format("~p:~p~n",[Test,?MODULE:Test(Config)]) || + {_,Tests} <- groups(), Test <- Tests]. + +-define(TEST(B,C,D), test(?LINE,?FUNCTION_NAME,B,C,D, true)). 
+-define(TEST_EQ(B,C,D), + test(?LINE,?FUNCTION_NAME,B,C,D, true), + test(?LINE,?FUNCTION_NAME,hd(C),[B|tl(C),D, true)). + +-define(TEST_NN(B,C,D), + test(?LINE,?FUNCTION_NAME,B,C,D, false), + test(?LINE,?FUNCTION_NAME,hd(C),[B|tl(C)],D, false)). + + +is_empty(_) -> + ?TEST("", [], true), + ?TEST([""|<<>>], [], true), + ?TEST("a", [], false), + ?TEST([""|<<$a>>], [], false), + ?TEST(["",[<<>>]], [], true), + ok. + +length(_) -> + %% invalid arg type + {'EXIT',_} = (catch string:length({})), + {'EXIT',_} = (catch string:length(foo)), + %% Valid signs + ?TEST("", [], 0), + ?TEST([""|<<>>], [], 0), + L = tuple_size(list_to_tuple(atom_to_list(?MODULE))), + ?TEST(atom_to_list(?MODULE), [], L), + ?TEST("Hello", [], 5), + ?TEST("UC Ω ßð", [], 7), + ?TEST(["abc"|<<"abc">>], [], 6), + ?TEST(["abc",["def"]], [], 6), + ?TEST([<<97/utf8, 778/utf8, 98/utf8>>, [776,111,776]], [], 3), %% åäö in nfd + ok. + +equal(_) -> + %% invalid arg type + {'EXIT',_} = (catch string:equal(1, 2)), + {'EXIT',_} = (catch string:equal(1, 2, foo)), + {'EXIT',_} = (catch string:equal(1, 2, true, foo)), + + ?TEST("", [<<"">>], true), + ?TEST("Hello", ["Hello"], true), + ?TEST("Hello", ["Hell"], false), + ?TEST("Hello", ["Hello!"], false), + ?TEST("Hello", [<<"Hello"/utf8>>], true), + ?TEST("Hello", [<<"Mello"/utf8>>], false), + ?TEST("Hello", [<<"Hello!"/utf8>>], false), + ?TEST(["Hello",[" deep"]], ["Hello deep"], true), + ?TEST(["Hello",[<<" deep"/utf8>>]], ["Hello deep"], true), + ?TEST("Hello deep", [["Hello", [" deep"]]], true), + ?TEST("Hello deep", [["Hello", [" d!eep"]]], false), + ?TEST("Hello deep", [["Hello", [<<" deep"/utf8>>]]], true), + false = string:equal("Åäö", [<<97/utf8, 778/utf8, 98/utf8>>, [776,111,776]]), %% nfc vs nfd + + %% case_insensitive_equal() + ?TEST("", ["", true], true), + ?TEST("a", ["b", true], false), + ?TEST("", [<<>>, true], true), + ?TEST("", [[<<>>,[]], true], true), + ?TEST("", [[<<>>,[$a]], true], false), + ?TEST("123", ["123", true], true), + ?TEST("abc", 
["abc", true], true), + ?TEST([[],<<>>,"ABC"|<<>>], [["abc",[]], true], true), + ?TEST("ABCa", ["abcå", true], false), + ?TEST("åäö", [{norm,"åäö"}, true], true), + ?TEST("ÅÄÖ", [{norm,"åäö"}, true], true), + ?TEST("MICHAŁ", ["michał", true], true), + ?TEST(["Mic",<<"HAŁ"/utf8>>], ["michał", true], true), + ?TEST("ß SHARP S", ["ss sharp s", true], true), + ?TEST("ẞ SHARP S", [[<<$ß/utf8, $\s>>,"SHARP S"], true], true), + ?TEST("ẞ SHARP ß", ["ss sharp s", true], false), + ?TEST(<<"İ I WITH DOT ABOVE"/utf8>>, ["i̇ i with dot above", true], true), + %% These should be equivalent with the above + true = string:equal(string:casefold(["Mic",<<"HAŁ"/utf8>>]), string:casefold("michał")), + true = string:equal(string:casefold("ẞ SHARP S"), string:casefold([<<$ß/utf8, $\s>>,"SHARP S"])), + false = string:equal(string:casefold("ẞ SHARP ß"), string:casefold("ss sharp s")), + + %% Normalization + ?TEST_NN("", ["", true, none], true), + ?TEST_NN("a", ["b", true, nfc], false), + ?TEST_NN("a", ["b", true, nfd], false), + ?TEST_NN("a", ["b", true, nfkc], false), + ?TEST_NN("a", ["b", true, nfkd], false), + + ?TEST_NN("a", ["A", false, nfc], false), + ?TEST_NN("a", ["A", false, nfd], false), + ?TEST_NN([<<>>,"a"|<<>>], ["A", true, nfkc], true), + ?TEST_NN(<<"a">>, ["A", true, nfkd], true), + + ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, none], false), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfc], true), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfd], true), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfkc], true), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfkd], true), + + ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, none], false), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", false, nfc], false), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfc], true), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfd], true), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfkc], true), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfkd], true), + + 
?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, none], false), + ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, nfc], false), + ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, nfd], false), + ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, nfkc], true), + ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, nfkd], true), + + ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, none], false), + ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, nfc], false), + ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, nfd], false), + ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, nfkc], true), + ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, nfkd], true), + + %% Coverage. + ?TEST("", [<<"">>, false, nfc], true), + ?TEST("", [<<"">>, true, nfc], true), + + ok. + +to_graphemes(_) -> + %% More tests are in unicode_util_SUITE.erl + {'EXIT', _} = (catch unicode:characters_to_nfd_binary(["asdåäö", an_atom])), + String = ["abc..åäö", $e, 788, <<"Ωµe`è"/utf8>>, "œŒþæÆħ§ß"], + NFD = unicode:characters_to_nfd_list(String), + [] = string:to_graphemes([]), + [] = string:to_graphemes(<<>>), + GCs = string:to_graphemes(String), + true = erlang:length(GCs) =:= string:length(String), + true = erlang:length(GCs) =:= erlang:length(string:to_graphemes(NFD)), + true = erlang:length(GCs) =:= + erlang:length(string:to_graphemes(unicode:characters_to_nfc_list(String))), + ok. + +reverse(_) -> + {'EXIT',_} = (catch string:reverse(2)), + Str1 = "Hello ", + Str2 = "Ω ßð", + Str3 = "åäö", + ?TEST("", [], ""), + ?TEST(Str1, [], lists:reverse(Str1)), + ?TEST(Str2, [], lists:reverse(Str2)), + ?TEST(Str3, [], lists:reverse(Str3)), + true = string:reverse(Str3) =:= lists:reverse(string:to_graphemes(Str3)), + ok. 
+ +slice(_) -> + {'EXIT',_} = (catch string:slice(2, 2, 2)), + {'EXIT',_} = (catch string:slice("asd", foo, 2)), + {'EXIT',_} = (catch string:slice("asd", 2, -1)), + ?TEST("", [3], ""), + ?TEST("aåä", [1, 0], ""), + ?TEST("aåä", [3], ""), + ?TEST("aåäöbcd", [3], "öbcd"), + ?TEST([<<"aå"/utf8>>,"äöbcd"], [3], "öbcd"), + ?TEST([<<"aåä"/utf8>>,"öbcd"], [3], "öbcd"), + ?TEST([<<"aåä"/utf8>>,"öbcd"], [3, infinity], "öbcd"), + + ?TEST("", [3, 2], ""), + ?TEST("aåä", [3, 2], ""), + ?TEST("aåäöbcd", [3,2], "öb"), + ?TEST([<<"aå"/utf8>>,"äöbcd"], [3,3], "öbc"), + ?TEST([<<"aåä"/utf8>>,"öbcd"], [3,10], "öbcd"), + + ok. + +pad(_) -> + Str = "Hallå", + ?TEST(Str, [7], "Hallå "), + ?TEST(Str, [7, leading], " Hallå"), + ?TEST(Str, [4, both, $.], "Hallå"), + ?TEST(Str, [10, both, $.], "..Hallå..."), + ?TEST(Str, [10, leading, $.], ".....Hallå"), + ?TEST(Str, [10, trailing, $.], "Hallå....."), + ?TEST(Str++["f"], [10, trailing, $.], "Hallåf...."), + ?TEST(Str++[" flåwer"], [10, trailing, $.], "Hallå flåwer"), + ok. + +trim(_) -> + Str = "\t\s..Ha\s.llå..\t\n\r", + ?TEST("", [], ""), + ?TEST(Str, [both, "x"], Str), + ?TEST(Str, [leading], "..Ha\s.llå..\t\n\r"), + ?TEST(Str, [trailing], "\t\s..Ha\s.llå.."), + ?TEST(Str, [], "..Ha .llå.."), + ?TEST(".. ", [both, ""], ".. "), + ?TEST([<<".. ">>], [both, ". "], ""), + ?TEST(".. h.ej ..", [leading, ". "], "h.ej .."), + ?TEST(".. h.ej ..", [trailing, ". "], ".. h.ej"), + ?TEST(".. h.ej ..", [both, ". "], "h.ej"), + ?TEST(["..", <<"h.ej">>, ".."], [both, ". "], "h.ej"), + ?TEST([[], "..", " h.ej ", <<"..">>], [both, ". "], "h.ej"), + ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [both, ". "], "h.ej"), + ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [trailing, ". "], ".. h.ej"), + ?TEST([<<".. h.ej .">>, <<"..">>], [both, ". "], "h.ej"), + ?TEST(["..h", ".e", <<"j..">>], [both, ". "], "h.ej"), + ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [both, ". "], "h.ejsan"), + %% Test that it behaves with graphemes (i.e. 
nfd tests are the hard part) + ?TEST("aaåaa", [both, "a"], "å"), + ?TEST(["aaa",778,"äöoo"], [both, "ao"], "åäö"), + ?TEST([<<"aaa">>,778,"äöoo"], [both, "ao"], "åäö"), + ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [both, [[$e,778]]], "åäö"), + ?TEST([[<<"!v">>|<<204,128,$v,204,129>>]],[trailing, [[$v,769]]], [$!,$v,768]), + ?TEST([[[<<"v">>|<<204,129,118,204,128,118>>],769,118,769]], [trailing, [[118,769]]], [$v,769,$v,768]), + ?TEST([<<"vv">>|<<204,128,118,204,128>>], [trailing, [[118,768]]], "v"), + ok. + +chomp(_) -> + Str = "åäö\na\r\nsd\n", + Res = "åäö\na\r\nsd", + ?TEST("", [], ""), + ?TEST("\n", [], ""), + ?TEST("str \t", [], "str \t"), + ?TEST("str \t\n\r", [], "str \t\n\r"), + ?TEST(Str, [], Res), + ?TEST([Str,$\n], [], Res), + ?TEST([Str|"\n"], [], Res), + ?TEST([Str|<<"\n">>], [], Res), + ?TEST([Str,$\r|<<"\n">>], [], Res), + ?TEST([Str, <<$\r>>|"\n"], [], Res), + ?TEST([<<$a,$\r>>,"\na\n"], [], "a\r\na"), + ok. + +take(_) -> + Str = "\t\s..Ha\s.llå..\t\n\r", + WS = "\t\s\n\r", + Chars = lists:seq($a,$z)++lists:seq($A,$Z), + %% complement=false, dir=leading + ?TEST("", ["abc"], {"",""}), + ?TEST(Str, ["x"], {[], Str}), + ?TEST(Str, [WS], {"\t\s","..Ha\s.llå..\t\n\r"}), + ?TEST(".. ", ["", false], {"", ".. "}), + ?TEST([<<".. ">>], [". ", false, leading], {".. ", ""}), + ?TEST(".. h.ej ..", [". ", false, leading], {".. ", "h.ej .."}), + ?TEST(["..", <<"h.ej">>, ".."], [". ", false, leading], {"..", "h.ej.."}), + ?TEST([[], "..", " h.ej ", <<"..">>], [". ", false, leading], {".. ","h.ej .."}), + ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [". ", false, leading], {".. ", "h.ej .."}), + ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [". ", false, leading], {"..", "h.ejsan.."}), + ?TEST([[<<101,204,138,33>>]], [[[$e,778]]], {[$e,778], "!"}), + %% Test that it behaves with graphemes (i.e. 
nfd tests are the hard part) + ?TEST("aaåaa", ["a", false, leading], {"aa", "åaa"}), + ?TEST(["aaa",778,"äöoo"], ["ao", false, leading], {"aa", "åäöoo"}), + ?TEST([<<"aaa">>,778,"äöoo"], ["ao",false,leading], {"aa", "åäöoo"}), + ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [[[$e,778]], false, leading], {[$e,778],"åäöe"++[778]}), + + %% complement=true, dir=leading + ?TEST("", ["abc", true], {"",""}), + ?TEST(Str, ["x", true], {Str, []}), + ?TEST(Str, [Chars, true], {"\t\s..","Ha\s.llå..\t\n\r"}), + ?TEST(".. ", ["",true], {".. ", ""}), + ?TEST([<<".. ">>], [Chars, true, leading], {".. ", ""}), + ?TEST(".. h.ej ..", [Chars, true, leading], {".. ", "h.ej .."}), + ?TEST(["..", <<"h.ej">>, ".."], [Chars, true, leading], {"..", "h.ej.."}), + ?TEST([[], "..", " h.ej ", <<"..">>], [Chars, true, leading], {".. ","h.ej .."}), + ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [Chars, true, leading], {".. ", "h.ej .."}), + ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [Chars, true, leading], {"..", "h.ejsan.."}), + %% Test that it behaves with graphemes (i.e. nfd tests are the hard part) + ?TEST(["aaee",778,"äöoo"], [[[$e,778]], true, leading], {"aae", [$e,778|"äöoo"]}), + ?TEST([<<"aae">>,778,"äöoo"], [[[$e,778]],true,leading], {"aa", [$e,778|"äöoo"]}), + ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [[[$e,778]], true, leading], {[], [$e,778]++"åäöe"++[778]}), + + %% complement=false, dir=trailing + ?TEST(Str, ["", false, trailing], {Str, []}), + ?TEST(Str, ["x", false, trailing], {Str, []}), + ?TEST(Str, [WS, false,trailing], {"\t\s..Ha\s.llå..", "\t\n\r"}), + ?TEST(".. h.ej ..", [". ", false, trailing], {".. h.ej", " .."}), + ?TEST(["..", <<"h.ej">>, ".."], [". ", false, trailing], {"..h.ej", ".."}), + ?TEST([[], "..", " h.ej ", <<"..">>], [". ", false, trailing], {".. h.ej", " .."}), + ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [". ", false, trailing], {".. h.ej", " .."}), + ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [". 
", false, trailing], {"..h.ejsan", ".."}), + ?TEST("aaåaa", ["a", false, trailing], {"aaå", "aa"}), + ?TEST([<<"KMШ"/utf8>>], [[1064], false, trailing], {"KMШ",[]}), + ?TEST([[<<"!\"">>|<<"\"">>]], ["\"", false, trailing], {"!", "\"\""}), + ?TEST([<<$v>>, 769], [[[$v,769]], false, trailing], {"", [$v,769]}), + ?TEST(["aaa",778,"äöoo"], ["ao", false, trailing], {"aaåäö", "oo"}), + ?TEST([<<"aaa">>,778,"äöoo"], ["ao", false, trailing], {"aaåäö", "oo"}), + ?TEST([<<"e">>,778,"åäöee", <<778/utf8>>], [[[$e,778]], false, trailing], {[$e,778|"åäöe"], [$e,778]}), + + %% complement=true, dir=trailing + ?TEST("", ["abc", true, trailing], {"",""}), + ?TEST(Str, ["x", true, trailing], {[], Str}), + %?TEST(Str, [{norm,Chars}, true, trailing], {"\t\s..Ha\s.ll","å..\t\n\r"}), + ?TEST(".. ", ["", true, trailing], {"", ".. "}), + ?TEST([<<".. ">>], [Chars, true, trailing], {"", ".. "}), + ?TEST(".. h.ej ..", [Chars, true, trailing], {".. h.ej", " .."}), + ?TEST(["..", <<"h.ej">>, ".."], [Chars, true, trailing], {"..h.ej", ".."}), + ?TEST([[], "..", " h.ej ", <<"..">>], [Chars, true, trailing], {".. h.ej"," .."}), + ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [Chars, true, trailing], {".. 
h.ej"," .."}), + ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [Chars, true, trailing], {"..h.ejsan", ".."}), + ?TEST([[<<101,204,138,33>>]], [[[$e,778]], true, trailing], {[$e,778], "!"}), + ?TEST([<<"Fa">>], [[$F], true, trailing], {"F", "a"}), + ?TEST([[<<101,101,204,138>>,1045,778]], ["e", true, trailing], {"e", [101,778,1045,778]}), + ?TEST([[<<101,101,204,138>>,<<1045/utf8,778/utf8>>]], ["e", true, trailing], {"e", [101,778,1045,778]}), + ?TEST([[[118,769,118],<<204,129,118,204,129,120,204,128,118>>,768,120,768]], + [[[118,769]], true, trailing], {[118,769,118,769,118,769],[120,768,118,768,120,768]}), + ?TEST([[<<118,204,128,118>>|<<204,128,118,204,128,118,204,128,206,132,204,129,206,132,204,129>>]], + [[[118,768]], true, trailing], {[118,768,118,768,118,768,118,768], [900,769,900,769]}), + %% Test that it behaves with graphemes (i.e. nfd tests are the hard part) + ?TEST(["aaee",778,"äöoo"], [[[$e,778]], true, trailing], {"aae"++[$e,778], "äöoo"}), + ?TEST([<<"aae">>,778,"äöoo"], [[[$e,778]],true,trailing], {"aa"++[$e,778], "äöoo"}), + ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [[[$e,778]], true, trailing], {[$e,778]++"åäöe"++[778], []}), + ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>, $e, 779], [[[$e,778]], true, trailing], + {[$e,778]++"åäöe"++[778], [$e,779]}), + + ok. + + +uppercase(_) -> + ?TEST("", [], ""), + ?TEST("123", [], "123"), + ?TEST("abc", [], "ABC"), + ?TEST("ABC", [], "ABC"), + ?TEST("abcdefghiljklmnopqrstvxyzåäö",[], "ABCDEFGHILJKLMNOPQRSTVXYZÅÄÖ"), + ?TEST("åäö", [], "ÅÄÖ"), + ?TEST("ÅÄÖ", [], "ÅÄÖ"), + ?TEST("Michał", [], "MICHAŁ"), + ?TEST(["Mic",<<"hał"/utf8>>], [], "MICHAŁ"), + ?TEST("ljLJ", [], "LJLJ"), + ?TEST("LJlj", [], "LJLJ"), + ?TEST("ß sharp s", [], "SS SHARP S"), + ok. 
+ +lowercase(_) -> + ?TEST("", [], ""), + ?TEST("123", [], "123"), + ?TEST("abc", [], "abc"), + ?TEST("ABC", [], "abc"), + ?TEST("åäö", [], "åäö"), + ?TEST("ÅÄÖ", [], "åäö"), + ?TEST("MICHAŁ", [], "michał"), + ?TEST(["Mic",<<"HAŁ"/utf8>>], [], "michał"), + ?TEST("ß SHARP S", [], "ß sharp s"), + ?TEST("İ I WITH DOT ABOVE", [], "i̇ i with dot above"), + ok. + +titlecase(_) -> + ?TEST("", [], ""), + ?TEST("123", [], "123"), + %% Titlecase is the same as uppercase for most chars + [?TEST([C,$x], [], string:uppercase([C])++[$x]) || + C <-"abcdefghiljklmnopqrstvxyzåäö"], + %% Example of a different mapping + ?TEST("ljusad", [],"Ljusad"), + ?TEST("ljLJ", [], "LjLJ"), + ?TEST("LJlj", [], "Ljlj"), + ?TEST("ß sharp s", [], "Ss sharp s"), + ok. + +casefold(_) -> + ?TEST("", [], ""), + ?TEST("123", [], "123"), + ?TEST("abc", [], "abc"), + ?TEST("ABC", [], "abc"), + ?TEST("åäö", [], "åäö"), + ?TEST("ÅÄÖ", [], "åäö"), + ?TEST("MICHAŁ", [], "michał"), + ?TEST(["Mic",<<"HAŁ"/utf8>>], [], "michał"), + ?TEST("ß SHARP S", [], "ss sharp s"), + ?TEST("ẞ SHARP S", [], "ss sharp s"), + ?TEST("İ I WITH DOT ABOVE", [], "i̇ i with dot above"), + ok. + +prefix(_) -> + ?TEST("", ["a"], nomatch), + ?TEST("a", [""], "a"), + ?TEST("b", ["a"], nomatch), + ?TEST("a", ["a"], ""), + ?TEST("å", ["a"], nomatch), + ?TEST(["a",<<778/utf8>>], ["a"], nomatch), + ?TEST([<<"a"/utf8>>,778], ["a"], nomatch), + ?TEST("hejsan", [""], "hejsan"), + ?TEST("hejsan", ["hej"], "san"), + ?TEST("hejsan", ["hes"], nomatch), + ?TEST(["h", "ejsan"], ["hej"], "san"), + ?TEST(["h", "e", "jsan"], ["hej"], "san"), + ?TEST(["h", "e", "san"], ["hej"], nomatch), + ?TEST(["h", <<"ejsan">>], ["hej"], "san"), + ?TEST(["h", <<"e">>, "jsan"], ["hej"], "san"), + ?TEST(["h", "e", <<"jsan">>], ["hej"], "san"), + ok. 
+ +split(_) -> + Mod = fun(Res) -> + [lists:flatten(unicode:characters_to_nfc_list(io_lib:format("~ts", [Str]))) + || Str <- Res] end, + ?TEST("..", ["", leading], {Mod, [".."]}), + ?TEST("..", ["..", leading], {Mod, [[],[]]}), + ?TEST("abcd", ["..", leading], {Mod, ["abcd"]}), + ?TEST("ab..bc", ["..", leading], {Mod, ["ab","bc"]}), + ?TEST("ab..bc..cd", ["..", leading], {Mod, ["ab","bc..cd"]}), + ?TEST("..ab", [".."], {Mod, [[],"ab"]}), + ?TEST("ab..", ["..", leading], {Mod, ["ab",[]]}), + ?TEST(["ab..bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}), + ?TEST(["ab","..bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}), + ?TEST(["ab",<<"..bc..cd">>], ["..", leading], {Mod, ["ab","bc..cd"]}), + ?TEST(["ab.",".bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}), + ?TEST(["ab.",<<".bc..cd">>], ["..", leading], {Mod, ["ab","bc..cd"]}), + ?TEST(["ab..","bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}), + ?TEST(["ab..",<<"bc..cd">>], ["..", leading], {Mod, ["ab","bc..cd"]}), + ?TEST(["ab.","bc..cd"], ["..", leading], {Mod, ["ab.bc","cd"]}), + ?TEST("ab...bc", ["..", leading], {Mod, ["ab",".bc"]}), + + ?TEST("..", ["", trailing], {Mod, [".."]}), + ?TEST("..", ["..", trailing], {Mod, [[],[]]}), + ?TEST("abcd", ["..", trailing], {Mod, ["abcd"]}), + ?TEST("ab..bc", ["..", trailing], {Mod, ["ab","bc"]}), + ?TEST("ab..bc..cd", ["..", trailing], {Mod, ["ab..bc","cd"]}), + ?TEST("..ab", ["..", trailing], {Mod, [[],"ab"]}), + ?TEST("ab..", ["..", trailing], {Mod, ["ab",[]]}), + ?TEST(["ab..bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}), + ?TEST(["ab","..bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}), + ?TEST(["ab"|<<"a">>], ["a", trailing], {Mod, ["ab",[]]}), + ?TEST(["ab",<<"..bc..cd">>], ["..", trailing], {Mod, ["ab..bc","cd"]}), + ?TEST([<<"ab.">>,".bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}), + ?TEST(["ab.",<<".bc..cd">>], ["..", trailing], {Mod, ["ab..bc","cd"]}), + ?TEST(["ab..","bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}), + 
?TEST(["ab..",<<"bc..cd">>], ["..", trailing], {Mod, ["ab..bc","cd"]}), + ?TEST(["ab.","bc..cd"], ["..", trailing], {Mod, ["ab.bc","cd"]}), + ?TEST("ab...bc", ["..", trailing], {Mod, ["ab.","bc"]}), + + ?TEST("..", ["..", all], {Mod, [[],[]]}), + ?TEST("abcd", ["..", all], {Mod, ["abcd"]}), + ?TEST("a..b", ["..", all], {Mod, ["a","b"]}), + ?TEST("a..b..c", ["..", all], {Mod, ["a","b","c"]}), + ?TEST("a..", ["..", all], {Mod, ["a",[]]}), + ?TEST(["a..b..c"], ["..", all], {Mod, ["a","b","c"]}), + ?TEST(["a","..b..c"], ["..", all], {Mod, ["a","b","c"]}), + ?TEST(["a",<<"..b..c">>], ["..", all], {Mod, ["a","b","c"]}), + ?TEST(["a.",".b..c"], ["..", all], {Mod, ["a","b","c"]}), + ?TEST(["a.",<<".b..c">>], ["..", all], {Mod, ["a","b","c"]}), + ?TEST(["a..","b..c"], ["..", all], {Mod, ["a","b","c"]}), + ?TEST(["a..",<<"b..c">>], ["..", all], {Mod, ["a","b","c"]}), + ?TEST(["a.","b..c"], ["..", all], {Mod, ["a.b","c"]}), + ?TEST("a...b", ["..", all], {Mod, ["a",".b"]}), + + %% Grapheme (split) tests + ?TEST("aΩΩb", ["Ω", all], {Mod, ["a","","b"]}), + ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], leading], {Mod, ["aa","äöoo"]}), + ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], trailing], {Mod, ["aa","äöoo"]}), + ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], all], {Mod, ["aa","äöoo"]}), + ?TEST([<<"aae">>,778,"öeeåäö"], ["e", leading], {Mod, [[$a, $a, $e,778,$ö],"eåäö"]}), + ?TEST([<<"aae">>,778,"öeeåäö"], ["e", trailing], {Mod, [[$a, $a, $e,778,$ö, $e],"åäö"]}), + ?TEST([<<"aae">>,778,"öeeåäö"], ["e", all], {Mod, [[$a, $a, $e,778,$ö],"", "åäö"]}), + + ok. + +replace(_) -> + ?TEST(["a..b.", [".c"]], ["xxx", "::"], "a..b..c"), + ?TEST(["a..b.", [".c"]], ["..", "::"], "a::b..c"), + ?TEST([<<"a..b.">>, [".c"]], ["..", "::", trailing], "a..b::c"), + ?TEST(["a..b.", [".c"]], ["..", "::", all], "a::b::c"), + ok. 
+ +cd_gc(_) -> + [] = string:next_codepoint(""), + [] = string:next_codepoint(<<>>), + [] = string:next_codepoint([<<>>]), + "abcd" = string:next_codepoint("abcd"), + [$e,778] = string:next_codepoint([$e,778]), + [$e|<<204,138>>] = string:next_codepoint(<<$e,778/utf8>>), + [778|_] = string:next_codepoint(tl(string:next_codepoint(<<$e,778/utf8>>))), + + [] = string:next_grapheme(""), + [] = string:next_grapheme(<<>>), + [] = string:next_grapheme([<<>>]), + "abcd" = string:next_grapheme("abcd"), + [[$e,778]] = string:next_grapheme([$e,778]), + [[$e,778]] = string:next_grapheme(<<$e,778/utf8>>), + + ok. + + +find(_) -> + ?TEST(["h", "ejsan"], [""], "hejsan"), + ?TEST(["h", "ejsan"], [<<>>], "hejsan"), + ?TEST([], [""], ""), + ?TEST([], ["hej"], nomatch), + ?TEST(["h", "ejsan"], ["hej"], "hejsan"), + ?TEST(["h", "e", "jsan"], ["hej"], "hejsan"), + ?TEST(["xh", "e", "san"], ["hej"], nomatch), + ?TEST([<<"xh">>, <<"ejsan">>], ["hej"], "hejsan"), + ?TEST(["xh", <<"ejsan">>], ["hej"], "hejsan"), + ?TEST(["xh", <<"e">>, "jsan"], ["hej"], "hejsan"), + ?TEST(["xh", "e", <<"jsan">>], ["hej"], "hejsan"), + ?TEST(["xh", "er", <<"ljsane">>, "rlang"], ["erl", leading], "erljsanerlang"), + ?TEST("aΩΩb", ["Ω", leading], "ΩΩb"), + ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], leading], [$e,778]++"äöoo"), + ?TEST([<<"aae">>,778,"öeeåäö"], ["e", leading], "eeåäö"), + + ?TEST(["h", "ejsan"], ["", trailing], "hejsan"), + ?TEST([], ["", trailing], ""), + ?TEST([], ["hej", trailing], nomatch), + ?TEST(["h", "ejsan"], ["hej", trailing], "hejsan"), + ?TEST(["h", "e", "jsan"], ["hej", trailing], "hejsan"), + ?TEST(["xh", "e", "san"], ["hej", trailing], nomatch), + ?TEST([<<"xh">>, <<"ejsan">>], ["hej", trailing], "hejsan"), + ?TEST(["xh", <<"ejsan">>], ["hej", trailing], "hejsan"), + ?TEST(["xh", <<"e">>, "jsan"], ["hej", trailing], "hejsan"), + ?TEST(["xh", "e", <<"jsan">>], ["hej", trailing], "hejsan"), + ?TEST(["xh", "er", <<"ljsane">>, "rlang"], ["erl", trailing], "erlang"), + ?TEST("aΩΩb", 
["Ω", trailing], "Ωb"), + ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], trailing], [$e,778]++"äöoo"), + ?TEST([<<"aeae">>,778,"äö"], ["e", trailing], "eae"++[778,$ä,$ö]), + + ok. + +lexemes(_) -> + Mod = fun(Res) -> + [unicode:characters_to_nfc_list(io_lib:format("~ts", [Str]))|| Str <- Res] + end, + Res = ["Hej", "san", "Hopp", "san"], + ?TEST("", [" ,."], {Mod, []}), + ?TEST("Hej san", [""], {Mod, ["Hej san"]}), + ?TEST(" ,., ", [" ,."], {Mod, []}), + ?TEST( "Hej san Hopp san", [" ,."], {Mod, Res}), + ?TEST(" Hej san Hopp san ", [" ,."], {Mod, Res}), + ?TEST(" Hej san, .Hopp san ", [" ,."], {Mod, Res}), + + ?TEST([" Hej san",", .Hopp san "], [" ,."], {Mod, Res}), + ?TEST([" Hej sa","n, .Hopp san "], [" ,."], {Mod, Res}), + ?TEST([" Hej san,"," .Hopp san "], [" ,."], {Mod, Res}), + + ?TEST([" Hej san",[", .Hopp san "]], [" ,."], {Mod, Res}), + ?TEST([" Hej sa",["n, .Hopp san "]], [" ,."], {Mod, Res}), + ?TEST([" Hej san,",[" .Hopp san "]], [" ,."], {Mod, Res}), + + ?TEST([" H",<<"ej san, .Hopp "/utf8>>, "san"], [" ,."], {Mod, Res}), + ?TEST([" Hej san",<<", .Hopp "/utf8>>, "san"], [" ,."], {Mod, Res}), + ?TEST([" Hej sa",<<"n, .Hopp"/utf8>>, " san"], [" ,."], {Mod, Res}), + ?TEST([" Hej san,",<<" .Hopp s"/utf8>>, "an"], [" ,."], {Mod, Res}), + ?TEST([" Hej san",[<<", .Hopp san "/utf8>>]], [" ,."], {Mod, Res}), + ?TEST([" Hej sa",[<<"n, .Hopp san "/utf8>>]], [" ,."], {Mod, Res}), + ?TEST([" Hej san,",[<<" .Hopp san "/utf8>>], <<" ">>], [" ,."], {Mod, Res}), + + ?TEST(" Hej\r\nsan\nnl", ["\r\n\s"], {Mod, ["Hej\r\nsan", "nl"]}), + + ?TEST(["b1ec1e",778,"äöo21"], ["eo"], {Mod, ["b1",[$c,$1,$e,778,$ä,$ö],"21"]}), + ?TEST([<<"b1ec1e">>,778,"äöo21"], ["eo"], {Mod, ["b1",[$c,$1,$e,778,$ä,$ö],"21"]}), + %% Grapheme (split) tests + Str10 = [[[<<"÷"/utf8>>,1101],<<"ë"/utf8>>|<<"\"">>]], + ?TEST(Str10, [[1076]], {Mod, [unicode:characters_to_nfc_list(Str10)]}), + ?TEST("a1Ωb1Ωc1", ["Ω"], {Mod, ["a1","b1","c1"]}), + ?TEST([<<"aae">>,778,"äöoo"], [[[$e,778]]], {Mod, ["aa","äöoo"]}), 
+ ?TEST([<<"aae">>,778,"äöo21"], [[[$e,778],$o]], {Mod, ["aa","äö","21"]}), + ?TEST([<<"aae">>,778,"öeeåäö"], ["e"], {Mod, [[$a, $a, $e,778,$ö],"åäö"]}), + ok. + +nth_lexeme(_) -> + {'EXIT', _} = (catch string:nth_lexeme("test test", 0, [])), + {'EXIT', _} = (catch string:nth_lexeme(<<"test test">>, 0, [])), + ?TEST( "", [1, " ,."], []), + ?TEST( "Hej san", [1, ""], "Hej san"), + ?TEST( " ,., ", [1, " ,."], []), + ?TEST( " ,., ", [3, " ,."], []), + ?TEST("Hej san Hopp san", [1, " ,."], "Hej"), + ?TEST("...Hej san Hopp san", [1, " ,."], "Hej"), + ?TEST("Hej san Hopp san", [3, " ,."], "Hopp"), + ?TEST(" Hej san Hopp san ", [3, " ,."], "Hopp"), + ?TEST(" Hej san, .Hopp san ", [3, " ,."], "Hopp"), + ?TEST("ab cd", [3, " "], ""), + + ?TEST([" Hej san",", .Hopp san "], [3, " ,."], "Hopp"), + ?TEST([" Hej sa","n, .Hopp san "], [3, " ,."], "Hopp"), + ?TEST([" Hej san,"," .Hopp san "], [3, " ,."], "Hopp"), + + ?TEST([" Hej san",[", .Hopp san "]], [3," ,."], "Hopp"), + ?TEST([" Hej sa",["n, .Hopp san "]], [3, " ,."], "Hopp"), + ?TEST([" Hej san,",[" .Hopp san "]], [3, " ,."], "Hopp"), + + ?TEST([" Hej san",<<", .Hopp "/utf8>>, "san"], [3, " ,."], "Hopp"), + ?TEST([" Hej sa",<<"n, .Hopp"/utf8>>, " san"], [3, " ,."], "Hopp"), + ?TEST([" Hej san,",<<" .Hopp s"/utf8>>, "an"], [3, " ,."], "Hopp"), + ?TEST([" Hej san,",<<" .Hopp s"/utf8>>, "an"], [4, " ,."], "san"), + ?TEST([" Hej san",[<<", .Hopp san "/utf8>>]], [3, " ,."], "Hopp"), + ?TEST([" Hej sa",[<<"n, .Hopp san "/utf8>>]], [3, " ,."], "Hopp"), + ?TEST([" Hej san,",[<<" .Hopp san "/utf8>>], <<" ">>], [3, " ,."], "Hopp"), + + ?TEST(["b1ec1e",778,"äöo21"], [3,"eo"], "21"), + ?TEST([<<"b1ec1e">>,778,"äöo21"], [3, "eo"], "21"), + %% Grapheme (split) tests + ?TEST("a1Ωb1Ωc1", [1, "Ω"], "a1"), + ?TEST([<<"aae">>,778,"äöoo"], [2,[[$e,778]]], "äöoo"), + ?TEST([<<"aae">>,778,"äöo21"], [2,[[$e,778],$o]], "äö"), + ?TEST([<<"aae">>,778,"öeeåäö"], [2,"e"], "åäö"), + ok. 
%% Benchmark test case comparing string:tokens/2 with string:lexemes/2.
%% When the timetrap scaling factor is above 1 the case is skipped and
%% reported as will_not_run_in_debug.
meas(Config) ->
    case ct:get_timetrap_info() of
        {_, {_, Scale}} when Scale > 1 ->
            {skip, {will_not_run_in_debug, Scale}};
        _NoScaling ->
            DataDir = proplists:get_value(data_dir, Config),
            %% The suite source is looked up in the directory that
            %% contains data_dir, so drop any trailing "/" first.
            Trimmed = string:trim(DataDir, trailing, "/"),
            do_measure(filename:dirname(Trimmed))
    end.

%% Reads this suite's own source file from TestDir and prints, for each
%% splitter/input-mode combination, the mean and standard deviation
%% reported by time_func/3.
do_measure(TestDir) ->
    SrcFile = filename:join(TestDir, ?MODULE_STRING ++ ".erl"),
    io:format("File ~s ", [SrcFile]),
    {ok, Contents} = file:read_file(SrcFile),
    io:format("~p~n", [byte_size(Contents)]),
    Report =
        fun(Name, Func, Mode) ->
                {N, Mean, Stddev, _} = time_func(Func, Mode, Contents),
                io:format("~10w ~6w ~6.2fms ±~4.2fms #~.2w gc included~n",
                          [Name, Mode, Mean/1000, Stddev/1000, N])
        end,
    io:format("----------------------~n"),
    %% string:tokens/2 is only measured on list input.
    Report(tokens, fun(Str) -> string:tokens(Str, [$\n,$\r]) end, list),
    SplitLexemes = fun(Str) -> string:lexemes(Str, [$\n,$\r]) end,
    lists:foreach(fun(Mode) -> Report(lexemes, SplitLexemes, Mode) end,
                  [list, binary]),
    ok.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% internal functions

%% Runs string:Func on Str in several representations and checks each
%% result against Res via test_1/5.
%%
%% Str is first used exactly as given, then converted to a flat list and
%% to a UTF-8 binary.  When Norm is true the NFC and NFD normalised list
%% and binary forms are tested as well; for those, arguments tagged
%% {norm, X} are normalised the same way by norm/2.
test(Line, Func, Str, Args, Res, Norm) ->
    test_1(Line, Func, Str, [Str | norm(none, Args)], Res),
    %% {Tag, Converter, Normalisation of Args, Enabled}; order is the
    %% order in which the variants are executed.
    Variants =
        [{list,  fun unicode:characters_to_list/1,       none, true},
         {clist, fun unicode:characters_to_nfc_list/1,   nfc,  Norm},
         {dlist, fun unicode:characters_to_nfd_list/1,   nfd,  Norm},
         {bin,   fun unicode:characters_to_binary/1,     none, true},
         {cbin,  fun unicode:characters_to_nfc_binary/1, nfc,  Norm},
         {dbin,  fun unicode:characters_to_nfd_binary/1, nfd,  Norm}],
    lists:foreach(
      fun({Tag, Convert, Form, Enabled}) ->
              Enabled andalso
                  test_1({Line, Tag}, Func, Str,
                         [Convert(Str) | norm(Form, Args)], Res)
      end, Variants),
    ok.
%% Applies string:Func to Args and compares the outcome with Exp.
%%
%% The result is first passed to check_types/4 (whose return value is not
%% inspected here) and then to res/2.  A mismatch is printed and the
%% process exits with {error, Func}.  An `error' exception whose reason
%% equals Exp counts as success, which lets a test state an expected
%% failure reason; any other `error' exception is printed together with
%% the stacktrace and also ends in exit({error, Func}).
test_1(Line, Func, Str, Args, Exp) ->
    try
        Got = apply(string, Func, Args),
        check_types(Line, Func, Args, Got),
        case res(Got, Exp) of
            true ->
                ok;
            {Actual, Wanted} ->
                case is_tuple(Wanted) of
                    true ->
                        %% Tuple expectations are printed with ~w only,
                        %% preceded by the full argument list.
                        io:format("~p~n",[Args]),
                        io:format("~p:~p: ~ts~w =>~n :~w:~w~n",
                                  [Func,Line, Str,Str,Actual,Wanted]);
                    false ->
                        io:format("~p:~p: ~ts~w =>~n :~ts~w:~ts~w~n",
                                  [Func,Line, Str,Str, Actual,Actual, Wanted,Wanted])
                end,
                exit({error, Func})
        end
    catch
        error:Exp ->
            ok;
        error:Reason ->
            io:format("~p:~p: Crash ~p ~p~n",
                      [?MODULE,Line, Reason, erlang:get_stacktrace()]),
            exit({error, Func})
    end.

%% Rewrites every {norm, Str} element of Args according to Type:
%% nfc and nfd normalise Str to a list in that form, none unwraps Str
%% unchanged.  All other elements are passed through untouched.
norm(Type, Args) ->
    Normalize =
        case Type of
            nfc  -> fun unicode:characters_to_nfc_list/1;
            nfd  -> fun unicode:characters_to_nfd_list/1;
            none -> fun(Str) -> Str end
        end,
    Rewrite =
        fun({norm, Str}) -> Normalize(Str);
           (Other) -> Other
        end,
    lists:map(Rewrite, Args).

%% Compares an actual result with an expectation.  Returns true on a
%% match, otherwise a {Actual, Expected} pair describing the mismatch.
%%
%% Identical terms match directly.  A list or binary result is converted
%% to an NFC list before it is compared with a list expectation.  An
%% expectation {Fun, Exp} compares Fun(Result) with Exp.  A pair of
%% results (as returned by take) is compared element by element.
res(Same, Same) ->
    true;
res(Str, Exp) when is_list(Str), is_list(Exp);
                   is_binary(Str), is_list(Exp) ->
    Nfc = unicode:characters_to_nfc_list(Str),
    case Nfc == Exp of
        true -> true;
        false -> {Nfc, Exp}
    end;
res(What, {Fun, Exp}) when is_function(Fun) ->
    case Fun(What) == Exp of
        true -> true;
        false -> {Fun(What), Exp}
    end;
res({S1,S2}=S, {Exp1,Exp2}=E) -> %% For take
    First = res(S1, Exp1),
    Second = res(S2, Exp2),
    case First =:= true andalso Second =:= true of
        true -> true;
        false -> {S, E}
    end;
res(Int, Exp) ->
    case Int == Exp of
        true -> true;
        false -> {Int, Exp}
    end.
%% Checks that the representation of a result (list, binary, mixed, ...)
%% is compatible with the representation of the input, as classified by
%% type/1.  Returns ok when compatible or when no check applies; prints a
%% diagnostic and returns `error' on a mismatch.
check_types(_Line, _Func, _Str, Res)
  when is_integer(Res); is_boolean(Res); Res =:= nomatch ->
    %% length or equal
    ok;
check_types(Line, concat = Func, [S1, S2], Res) ->
    %% The result is only compared with the first argument, and only
    %% when both arguments have compatible types to begin with.
    InType = type(S1),
    case check_types_1(InType, type(S2)) of
        ok ->
            case check_types_1(InType, type(Res)) of
                ok ->
                    ok;
                {T1, T2} ->
                    io:format("Failed: ~p ~p ~p ~p~n",[Line, Func, T1, T2]),
                    io:format(" ~p ~p => ~p~n", [S1, S2, Res]),
                    error
            end;
        _ ->
            ok
    end;
check_types(Line, Func, [Str | _], Res) ->
    %% lexemes, tokens and split return a list of strings, so the
    %% expected type is the input type wrapped in {list, _}.
    ReturnsList = lists:member(Func, [lexemes, tokens, split]),
    Expected =
        fun(mixed) -> mixed;
           ({list, {list, _}}) -> {list, deep};
           (Type) when ReturnsList -> {list, Type};
           (Type) -> Type
        end,
    try
        case needs_check(Func) of
            false ->
                ok;
            true ->
                %% A mismatch surfaces as badmatch and is reported below.
                ok = check_types_1(Expected(type(Str)), type(Res))
        end
    of
        ok -> ok
    catch
        _:{badmatch, {T1, T2}} ->
            io:format("Failed: ~p ~p: ~p ~p~n",[Line, Func, T1, T2]),
            io:format(" ~p => ~p~n", [Str, Res]),
            error;
        _:Reason ->
            io:format("Crash: ~p in~n ~p~n",[Reason, erlang:get_stacktrace()]),
            io:format("Failed: ~p ~p: ~p => ~p~n", [Line, Func, Str, Res]),
            exit({Reason, erlang:get_stacktrace()})
    end.

%% Decides whether result type T2 is acceptable for input type T1 (both
%% as produced by type/1).  Returns ok when acceptable, otherwise the
%% offending pair {T1, T2}.
check_types_1(T1, T2) ->
    case {T1, T2} of
        {T, T} -> ok;
        {Str, Res} when is_binary(Str), is_binary(Res) -> ok;
        %% Any list input may yield an empty list, a flat codepoint list
        %% or a list of codepoint lists (this also covers nested-list
        %% inputs of the form {list, {list, _}}).
        {{list, _}, {list, undefined}} -> ok;
        {{list, _}, {list, codepoints}} -> ok;
        {{list, _}, {list, {list, codepoints}}} -> ok;
        {mixed, _} -> ok;
        {{list, binary}, binary} -> ok;
        {{list, binary}, {other, _, _}} -> ok; %% take
        {{list, deep}, _} -> ok;
        {{list, {list, deep}}, _} -> ok;
        _ -> {T1, T2}
    end.
%% Classifies the representation of a string-like term:
%%   binary                       - a binary
%%   {list, undefined}            - the empty list
%%   {list, codepoints}           - a list of integers only
%%   {list, {list, codepoints}}   - a binary-free list whose list elements
%%                                  contain no lists or binaries themselves
%%   {list, deep}                 - a binary-free list with deeper nesting
%%   {list, binary}               - a list of binaries only
%%   mixed                        - a list mixing binaries and other terms
%% A 2-tuple is classified element-wise; equal element types collapse to
%% that type, an empty list paired with a codepoint list counts as
%% {list, codepoints}, anything else gives {other, T1, T2}.
%% Every other term gives {other, Term}.
type(Bin) when is_binary(Bin) ->
    binary;
type([]) ->
    {list, undefined};
type(List) when is_list(List) ->
    IsNested =
        fun(Elem) when is_list(Elem) ->
                lists:any(fun(C) -> is_list(C) orelse is_binary(C) end, Elem);
           (_) ->
                false
        end,
    case all(fun(C) -> not is_binary(C) end, List) of
        false ->
            case all(fun(C) -> is_binary(C) end, List) of
                true -> {list, binary};
                false -> mixed
            end;
        true ->
            case all(fun(C) -> is_integer(C) end, List) of
                true ->
                    {list, codepoints};
                false ->
                    %% filter/2 inspects every element, like the check
                    %% it replaces.
                    case lists:filter(IsNested, List) of
                        [] -> {list, {list, codepoints}};
                        [_ | _] -> {list, deep}
                    end
            end
    end;
type({R1, R2}) ->
    case {type(R1), type(R2)} of
        {Same, Same} -> Same;
        {{list, undefined}, {list, codepoints}} -> {list, codepoints};
        {{list, codepoints}, {list, undefined}} -> {list, codepoints};
        {T1, T2} -> {other, T1, T2}
    end;
type(Other) ->
    {other, Other}.

%% Like lists:all/2, but also accepts a list whose tail is a binary, in
%% which case Check is applied to that binary tail as well.
all(_Check, []) ->
    true;
all(Check, [Head | Tail]) ->
    case Check(Head) of
        true -> all(Check, Tail);
        false -> false
    end;
all(Check, Bin) when is_binary(Bin) ->
    Check(Bin).

%% Result types of reverse, pad and replace are not checked.
needs_check(Func) ->
    not lists:member(Func, [reverse, pad, replace]).

%%%% Timer stuff

%% Runs Fun repeatedly on Bin (converted according to Mode) in a freshly
%% spawned, linked process and returns {N, Mean, Stddev, LastResult},
%% with the times as reported by timer:tc.
time_func(Fun, Mode, Bin) ->
    timer:sleep(100), %% Let emulator catch up and clean things before test runs
    Parent = self(),
    Worker = spawn_link(
               fun() ->
                       Input = mode(Mode, Bin),
                       Parent ! {self(), time_func(0, 0, 0, Fun, Input, undefined)}
               end),
    receive
        {Worker, Stats} -> Stats
    end.

%% Accumulates the sum and sum of squares of 50 timed runs, then derives
%% the rounded mean and sample standard deviation from them.
time_func(N, Sum, SumSq, Fun, Str, _) when N < 50 ->
    {Time, Res} = timer:tc(Fun, [Str]),
    time_func(N + 1, Sum + Time, SumSq + Time * Time, Fun, Str, Res);
time_func(N, Sum, SumSq, _, _, Res) ->
    Mean = round(Sum / N),
    Variance = (SumSq - (Sum * Sum / N)) / (N - 1),
    Stdev = round(math:sqrt(Variance)),
    {N, Mean, Stdev, Res}.

%% Converts the benchmark input to the requested representation.
mode(binary, Bin) -> Bin;
mode(list, Bin) -> unicode:characters_to_list(Bin).

%%
%% Old string lists Test cases starts here.
%% len(Config) when is_list(Config) -> @@ -80,16 +932,14 @@ len(Config) when is_list(Config) -> {'EXIT',_} = (catch string:len({})), ok. -equal(Config) when is_list(Config) -> +old_equal(Config) when is_list(Config) -> true = string:equal("", ""), false = string:equal("", " "), true = string:equal("laban", "laban"), false = string:equal("skvimp", "skvump"), - %% invalid arg type - true = string:equal(2, 2), % not good, should crash ok. -concat(Config) when is_list(Config) -> +old_concat(Config) when is_list(Config) -> "erlang rules" = string:concat("erlang ", "rules"), "" = string:concat("", ""), "x" = string:concat("x", ""), @@ -130,6 +980,7 @@ str_rstr(Config) when is_list(Config) -> 3 = string:rstr("xxxx", "xx"), 3 = string:str("xy z yx", " z"), 3 = string:rstr("xy z yx", " z"), + 3 = string:str("aaab", "ab"), %% invalid arg type {'EXIT',_} = (catch string:str(hello, "he")), %% invalid arg type @@ -184,7 +1035,7 @@ substr(Config) when is_list(Config) -> {'EXIT',_} = (catch string:substr("1234", "1")), ok. -tokens(Config) when is_list(Config) -> +old_tokens(Config) when is_list(Config) -> [] = string:tokens("",""), [] = string:tokens("abc","abc"), ["abc"] = string:tokens("abc", ""), @@ -221,7 +1072,7 @@ replace_sep(C, Seps, New) -> chars(Config) when is_list(Config) -> [] = string:chars($., 0), [] = string:chars($., 0, []), - 10 = length(string:chars(32, 10, [])), + 10 = erlang:length(string:chars(32, 10, [])), "aaargh" = string:chars($a, 3, "rgh"), %% invalid arg type {'EXIT',_} = (catch string:chars($x, [])), @@ -231,7 +1082,7 @@ copies(Config) when is_list(Config) -> "" = string:copies("", 10), "" = string:copies(".", 0), "." = string:copies(".", 1), - 30 = length(string:copies("123", 10)), + 30 = erlang:length(string:copies("123", 10)), %% invalid arg type {'EXIT',_} = (catch string:copies("hej", -1)), {'EXIT',_} = (catch string:copies("hej", 2.0)), @@ -360,7 +1211,7 @@ to_integer(Config) when is_list(Config) -> ok. 
test_to_integer(Str) -> - io:format("Checking ~p~n", [Str]), + %% io:format("Checking ~p~n", [Str]), case string:to_integer(Str) of {error,_Reason} = Bad -> {'EXIT',_} = (catch list_to_integer(Str)), @@ -403,7 +1254,7 @@ to_float(Config) when is_list(Config) -> ok. test_to_float(Str) -> - io:format("Checking ~p~n", [Str]), + %% io:format("Checking ~p~n", [Str]), case string:to_float(Str) of {error,_Reason} = Bad -> {'EXIT',_} = (catch list_to_float(Str)), @@ -419,7 +1270,7 @@ to_upper_to_lower(Config) when is_list(Config) -> All = lists:seq(0, 255), UC = string:to_upper(All), - 256 = length(UC), + 256 = erlang:length(UC), all_upper_latin1(UC, 0), LC = string:to_lower(All), @@ -450,7 +1301,7 @@ all_lower_latin1([C|T], C) when 0 =< C, C < $A; all_lower_latin1([H|T], C) when $A =< C, C =< $Z; 16#C0 =< C, C =< 16#F6; 16#C8 =< C, C =< 16#DE -> - io:format("~p\n", [{H,C}]), + % io:format("~p\n", [{H,C}]), H = C + 32, all_lower_latin1(T, C+1); all_lower_latin1([], 256) -> ok. -- cgit v1.2.3