diff options
author | Dan Gudmundsson <[email protected]> | 2017-04-03 12:19:21 +0200 |
---|---|---|
committer | Dan Gudmundsson <[email protected]> | 2017-04-24 12:16:56 +0200 |
commit | 2c72e662bad11a41839780f86680d4bb05367c78 (patch) | |
tree | 01e9ae9b32fdb953392e571a0773fb2cd059c498 | |
parent | 75fc94b8b462d7b7f6dd4b706bbe32cff77ee575 (diff) | |
download | otp-2c72e662bad11a41839780f86680d4bb05367c78.tar.gz otp-2c72e662bad11a41839780f86680d4bb05367c78.tar.bz2 otp-2c72e662bad11a41839780f86680d4bb05367c78.zip |
New unicode aware string module that works with unicode:chardata()
Works with unicode:chardata() as input as was decided on OTP board
meeting as response to EEP-35 a long time ago.
Works on graphemes clusters as base, with a few exceptions, does not
handle classic (nor nfd'ified) Hangul nor the extended grapheme
clusters such as the prepend class. That would make handling binaries
as input/output very slow.
List input => list output, binary input => binary output and
mixed input => mixed output for all find/split functions.
So that results can be post-processed without the need to invoke
unicode:characters_to_list|binary for intermediate data.
pad functions return lists of unicode:chardata() for performance.
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | lib/stdlib/doc/src/string.xml | 741 | ||||
-rw-r--r-- | lib/stdlib/doc/src/unicode_usage.xml | 70 | ||||
-rw-r--r-- | lib/stdlib/src/string.erl | 1266 | ||||
-rw-r--r-- | lib/stdlib/test/string_SUITE.erl | 893 |
5 files changed, 2838 insertions, 133 deletions
diff --git a/.gitignore b/.gitignore index fdbf0d2d5f..c867b1a597 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ autom4te.cache # /bootstrap/lib/compiler/egen /bootstrap/lib/stdlib/egen +/lib/stdlib/src/unicode_util.erl # Compiler derivatives # diff --git a/lib/stdlib/doc/src/string.xml b/lib/stdlib/doc/src/string.xml index dddedf1132..dc83c40a9a 100644 --- a/lib/stdlib/doc/src/string.xml +++ b/lib/stdlib/doc/src/string.xml @@ -36,8 +36,613 @@ <modulesummary>String processing functions.</modulesummary> <description> <p>This module provides functions for string processing.</p> + <p>A string in this module is represented by <seealso marker="unicode#type-chardata"> + <c>unicode:chardata()</c></seealso>, that is, a list of codepoints, + binaries with UTF-8-encoded codepoints + (<em>UTF-8 binaries</em>), or a mix of the two.</p> + <code> +"abcd" is a valid string +<<"abcd">> is a valid string +["abcd"] is a valid string +<<"abc..åäö"/utf8>> is a valid string +<<"abc..åäö">> is NOT a valid string, + but a binary with Latin-1-encoded codepoints +[<<"abc">>, "..åäö"] is a valid string +[atom] is NOT a valid string</code> + <p> + This module operates on grapheme clusters. A <em>grapheme cluster</em> + is a user-perceived character, which can be represented by several + codepoints. + </p> + <code> +"å" [229] or [97, 778] +"e̊" [101, 778]</code> + <p> + The string length of "ß↑e̊" is 3, even though it is represented by the + codepoints <c>[223,8593,101,778]</c> or the UTF-8 binary + <c><<195,159,226,134,145,101,204,138>></c>. + </p> + <p> + Grapheme clusters for codepoints of class <c>prepend</c> + and non-modern (or decomposed) Hangul is not handled for performance + reasons in + <seealso marker="#find/3"><c>find/3</c></seealso>, + <seealso marker="#replace/3"><c>replace/3</c></seealso>, + <seealso marker="#split/2"><c>split/2</c></seealso>, + <seealso marker="#lexemes/2"><c>split/2</c></seealso> and + <seealso marker="#trim/3"><c>trim/3</c></seealso>. + </p> + <p> + Splitting and appending strings is to be done on grapheme clusters + borders. + There is no verification that the results of appending strings are + valid or normalized. + </p> + <p> + Most of the functions expect all input to be normalized to one form, + see for example <seealso marker="unicode#characters_to_nfc_list/1"> + <c>unicode:characters_to_nfc_list/1</c></seealso>. + </p> + <p> + Language or locale specific handling of input is not considered + in any function. + </p> + <p> + The functions can crash for non-valid input strings. For example, + the functions expect UTF-8 binaries but not all functions + verify that all binaries are encoded correctly. + </p> + <p> + Unless otherwise specified the return value type is the same as + the input type. That is, binary input returns binary output, + list input returns a list output, and mixed input can return a + mixed output.</p> + <code> +1> string:trim(" sarah "). +"sarah" +2> string:trim(<<" sarah ">>). +<<"sarah">> +3> string:lexemes("foo bar", " "). +["foo","bar"] +4> string:lexemes(<<"foo bar">>, " "). +[<<"foo">>,<<"bar">>]</code> + <p>This module has been reworked in Erlang/OTP 20 to + handle <seealso marker="unicode#type-chardata"> + <c>unicode:chardata()</c></seealso> and operate on grapheme + clusters. The <seealso marker="#oldapi"> <c>old + functions</c></seealso> that only work on Latin-1 lists as input + are still available but should not be + used. They will be deprecated in Erlang/OTP 21. + </p> </description> + <datatypes> + <datatype> + <name name="direction"/> + <name name="grapheme_cluster"/> + <desc> + <p>A user-perceived character, consisting of one or more + codepoints.</p> + </desc> + </datatype> + </datatypes> + + <funcs> + + <func> + <name name="casefold" arity="1"/> + <fsummary>Convert a string to a comparable string.</fsummary> + <desc> + <p> + Converts <c><anno>String</anno></c> to a case-agnostic + comparable string. Function <c>casefold/1</c> is preferred + over <c>lowercase/1</c> when two strings are to be compared + for equality. See also <seealso marker="#equal/4"><c>equal/4</c></seealso>. + </p> + <p><em>Example:</em></p> + <pre> +1> <input>string:casefold("Ω and ẞ SHARP S").</input> +"ω and ss sharp s"</pre> + </desc> + </func> + + <func> + <name name="chomp" arity="1"/> + <fsummary>Remove trailing end of line control characters.</fsummary> + <desc> + <p> + Returns a string where any trailing <c>\n</c> or + <c>\r\n</c> have been removed from <c><anno>String</anno></c>. + </p> + <p><em>Example:</em></p> + <pre> +182> <input>string:chomp(<<"\nHello\n\n">>).</input> +<<"\nHello">> +183> <input>string:chomp("\nHello\r\r\n").</input> +"\nHello\r"</pre> + </desc> + </func> + + <func> + <name name="equal" arity="2"/> + <name name="equal" arity="3"/> + <name name="equal" arity="4"/> + <fsummary>Test string equality.</fsummary> + <desc> + <p> + Returns <c>true</c> if <c><anno>A</anno></c> and + <c><anno>B</anno></c> are equal, otherwise <c>false</c>. + </p> + <p> + If <c><anno>IgnoreCase</anno></c> is <c>true</c> + the function does <seealso marker="#casefold/1"> + <c>casefold</c>ing</seealso> on the fly before the equality test. + </p> + <p>If <c><anno>Norm</anno></c> is not <c>none</c> + the function applies normalization on the fly before the equality test. + There are four available normalization forms: + <seealso marker="unicode#characters_to_nfc_list/1"> <c>nfc</c></seealso>, + <seealso marker="unicode#characters_to_nfd_list/1"> <c>nfd</c></seealso>, + <seealso marker="unicode#characters_to_nfkc_list/1"> <c>nfkc</c></seealso>, and + <seealso marker="unicode#characters_to_nfkd_list/1"> <c>nfkd</c></seealso>. + </p> + <p>By default, + <c><anno>IgnoreCase</anno></c> is <c>false</c> and + <c><anno>Norm</anno></c> is <c>none</c>.</p> + <p><em>Example:</em></p> + <pre> +1> <input>string:equal("åäö", <<"åäö"/utf8>>).</input> +true +2> <input>string:equal("åäö", unicode:characters_to_nfd_binary("åäö")).</input> +false +3> <input>string:equal("åäö", unicode:characters_to_nfd_binary("ÅÄÖ"), true, nfc).</input> +true</pre> + </desc> + </func> + + <func> + <name name="find" arity="2"/> + <name name="find" arity="3"/> + <fsummary>Find start of substring.</fsummary> + <desc> + <p> + Removes anything before <c><anno>SearchPattern</anno></c> in <c><anno>String</anno></c> + and returns the remainder of the string or <c>nomatch</c> if <c><anno>SearchPattern</anno></c> is not + found. + <c><anno>Dir</anno></c>, which can be <c>leading</c> or + <c>trailing</c>, indicates from which direction characters + are to be searched. + </p> + <p> + By default, <c><anno>Dir</anno></c> is <c>leading</c>. + </p> + <p><em>Example:</em></p> + <pre> +1> <input>string:find("ab..cd..ef", ".").</input> +"..cd..ef" +2> <input>string:find(<<"ab..cd..ef">>, "..", trailing).</input> +<<"..ef">> +3> <input>string:find(<<"ab..cd..ef">>, "x", leading).</input> +nomatch +4> <input>string:find("ab..cd..ef", "x", trailing).</input> +nomatch</pre> + </desc> + </func> + + <func> + <name name="is_empty" arity="1"/> + <fsummary>Check if the string is empty.</fsummary> + <desc> + <p>Returns <c>true</c> if <c><anno>String</anno></c> is the + empty string, otherwise <c>false</c>.</p> + <p><em>Example:</em></p> + <pre> +1> <input>string:is_empty("foo").</input> +false +2> <input>string:is_empty(["",<<>>]).</input> +true</pre> + </desc> + </func> + + <func> + <name name="length" arity="1"/> + <fsummary>Calculate length of the string.</fsummary> + <desc> + <p> + Returns the number of grapheme clusters in <c><anno>String</anno></c>. + </p> + <p><em>Example:</em></p> + <pre> +1> <input>string:length("ß↑e̊").</input> +3 +2> <input>string:length(<<195,159,226,134,145,101,204,138>>).</input> +3</pre> + </desc> + </func> + + <func> + <name name="lexemes" arity="2"/> + <fsummary>Split string into lexemes.</fsummary> + <desc> + <p> + Returns a list of lexemes in <c><anno>String</anno></c>, separated + by the grapheme clusters in <c><anno>SeparatorList</anno></c>. + </p> + <p> + Notice that, as shown in this example, two or more + adjacent separator graphemes clusters in <c><anno>String</anno></c> + are treated as one. That is, there are no empty + strings in the resulting list of lexemes. + See also <seealso marker="#split/3"><c>split/3</c></seealso> which returns + empty strings. + </p> + <p>Notice that <c>[$\r,$\n]</c> is one grapheme cluster.</p> + <p><em>Example:</em></p> + <pre> +1> <input>string:lexemes("abc de̊fxxghix jkl\r\nfoo", "x e" ++ [[$\r,$\n]]).</input> +["abc","de̊f","ghi","jkl","foo"] +2> <input>string:lexemes(<<"abc de̊fxxghix jkl\r\nfoo"/utf8>>, "x e" ++ [$\r,$\n]).</input> +[<<"abc">>,<<"de̊f"/utf8>>,<<"ghi">>,<<"jkl\r\nfoo">>]</pre> + </desc> + </func> + + <func> + <name name="lowercase" arity="1"/> + <fsummary>Convert a string to lowercase</fsummary> + <desc> + <p> + Converts <c><anno>String</anno></c> to lowercase. + </p> + <p> + Notice that function <seealso marker="#casefold/1"><c>casefold/1</c></seealso> + should be used when converting a string to + be tested for equality. + </p> + <p><em>Example:</em></p> + <pre> +2> <input>string:lowercase(string:uppercase("Michał")).</input> +"michał"</pre> + </desc> + </func> + + <func> + <name name="next_codepoint" arity="1"/> + <fsummary>Pick the first codepoint.</fsummary> + <desc> + <p> + Returns the first codepoint in <c><anno>String</anno></c> + and the rest of <c><anno>String</anno></c> in the tail. + </p> + <p><em>Example:</em></p> + <pre> +1> <input>string:next_codepoint(unicode:characters_to_binary("e̊fg")).</input> +[101|<<"̊fg"/utf8>>]</pre> + </desc> + </func> + + <func> + <name name="next_grapheme" arity="1"/> + <fsummary>Pick the first grapheme cluster.</fsummary> + <desc> + <p> + Returns the first grapheme cluster in <c><anno>String</anno></c> + and the rest of <c><anno>String</anno></c> in the tail. + </p> + <p><em>Example:</em></p> + <pre> +1> <input>string:next_grapheme(unicode:characters_to_binary("e̊fg")).</input> +["e̊"|<<"fg">>]</pre> + </desc> + </func> + + <func> + <name name="nth_lexeme" arity="3"/> + <fsummary>Pick the nth lexeme.</fsummary> + <desc> + <p>Returns lexeme number <c><anno>N</anno></c> in + <c><anno>String</anno></c>, where lexemes are separated by + the grapheme clusters in <c><anno>SeparatorList</anno></c>. + </p> + <p><em>Example:</em></p> + <pre> +1> <input>string:nth_lexeme("abc.de̊f.ghiejkl", 3, ".e").</input> +"ghi"</pre> + </desc> + </func> + + <func> + <name name="pad" arity="2"/> + <name name="pad" arity="3"/> + <name name="pad" arity="4"/> + <fsummary>Pad a string to given length.</fsummary> + <desc> + <p> + Pads <c><anno>String</anno></c> to <c><anno>Length</anno></c> with + grapheme cluster <c><anno>Char</anno></c>. + <c><anno>Dir</anno></c>, which can be <c>leading</c>, <c>trailing</c>, + or <c>both</c>, indicates where the padding should be added. + </p> + <p>By default, <c><anno>Char</anno></c> is <c>$\s</c> and + <c><anno>Dir</anno></c> is <c>trailing</c>. + </p> + <p><em>Example:</em></p> + <pre> +1> <input>string:pad(<<"He̊llö"/utf8>>, 8).</input> +[<<72,101,204,138,108,108,195,182>>,32,32,32] +2> <input>io:format("'~ts'~n",[string:pad("He̊llö", 8, leading)]).</input> +' He̊llö' +3> <input>io:format("'~ts'~n",[string:pad("He̊llö", 8, both)]).</input> +' He̊llö '</pre> + </desc> + </func> + + <func> + <name name="prefix" arity="2"/> + <fsummary>Remove prefix from string.</fsummary> + <desc> + <p> + If <c><anno>Prefix</anno></c> is the prefix of + <c><anno>String</anno></c>, removes it and returns the + remainder of <c><anno>String</anno></c>, otherwise returns + <c>nomatch</c>. + </p> + <p><em>Example:</em></p> + <pre> +1> <input>string:prefix(<<"prefix of string">>, "pre").</input> +<<"fix of string">> +2> <input>string:prefix("pre", "prefix").</input> +nomatch</pre> + </desc> + </func> + + <func> + <name name="replace" arity="3"/> + <name name="replace" arity="4"/> + <fsummary>Replace a pattern in string.</fsummary> + <desc> + <p> + Replaces <c><anno>SearchPattern</anno></c> in <c><anno>String</anno></c> + with <c><anno>Replacement</anno></c>. + <c><anno>Where</anno></c>, default <c>leading</c>, indicates whether + the <c>leading</c>, the <c>trailing</c> or <c>all</c> encounters of + <c><anno>SearchPattern</anno></c> are to be replaced. + </p> + <p>Can be implemented as:</p> + <pre>lists:join(Replacement, split(String, SearchPattern, Where)).</pre> + <p><em>Example:</em></p> + <pre> +1> <input>string:replace(<<"ab..cd..ef">>, "..", "*").</input> +[<<"ab">>,"*",<<"cd..ef">>] +2> <input>string:replace(<<"ab..cd..ef">>, "..", "*", all).</input> +[<<"ab">>,"*",<<"cd">>,"*",<<"ef">>]</pre> + </desc> + </func> + + <func> + <name name="reverse" arity="1"/> + <fsummary>Reverses a string</fsummary> + <desc> + <p> + Returns the reverse list of the grapheme clusters in <c><anno>String</anno></c>. + </p> + <p><em>Example:</em></p> + <pre> +1> Reverse = <input>string:reverse(unicode:characters_to_nfd_binary("ÅÄÖ")).</input> +[[79,776],[65,776],[65,778]] +2> <input>io:format("~ts~n",[Reverse]).</input> +ÖÄÅ</pre> + </desc> + </func> + + <func> + <name name="slice" arity="2"/> + <name name="slice" arity="3"/> + <fsummary>Extract a part of string</fsummary> + <desc> + <p>Returns a substring of <c><anno>String</anno></c> of + at most <c><anno>Length</anno></c> grapheme clusters, starting at position + <c><anno>Start</anno></c>.</p> + <p>By default, <c><anno>Length</anno></c> is <c>infinity</c>.</p> + <p><em>Example:</em></p> + <pre> +1> <input>string:slice(<<"He̊llö Wörld"/utf8>>, 4).</input> +<<"ö Wörld"/utf8>> +2> <input>string:slice(["He̊llö ", <<"Wörld"/utf8>>], 4,4).</input> +"ö Wö" +3> <input>string:slice(["He̊llö ", <<"Wörld"/utf8>>], 4,50).</input> +"ö Wörld"</pre> + </desc> + </func> + + <func> + <name name="split" arity="2"/> + <name name="split" arity="3"/> + <fsummary>Split a string into substrings.</fsummary> + <desc> + <p> + Splits <c><anno>String</anno></c> where <c><anno>SearchPattern</anno></c> + is encountered and return the remaining parts. + <c><anno>Where</anno></c>, default <c>leading</c>, indicates whether + the <c>leading</c>, the <c>trailing</c> or <c>all</c> encounters of + <c><anno>SearchPattern</anno></c> will split <c><anno>String</anno></c>. + </p> + <p><em>Example:</em></p> + <pre> +0> <input>string:split("ab..bc..cd", "..").</input> +["ab","bc..cd"] +1> <input>string:split(<<"ab..bc..cd">>, "..", trailing).</input> +[<<"ab..bc">>,<<"cd">>] +2> <input>string:split(<<"ab..bc....cd">>, "..", all).</input> +[<<"ab">>,<<"bc">>,<<>>,<<"cd">>]</pre> + </desc> + </func> + + <func> + <name name="take" arity="2"/> + <name name="take" arity="3"/> + <name name="take" arity="4"/> + <fsummary>Take leading or trailing parts.</fsummary> + <desc> + <p>Takes characters from <c><anno>String</anno></c> as long as + the characters are members of set <c><anno>Characters</anno></c> + or the complement of set <c><anno>Characters</anno></c>. + <c><anno>Dir</anno></c>, + which can be <c>leading</c> or <c>trailing</c>, indicates from + which direction characters are to be taken. + </p> + <p><em>Example:</em></p> + <pre> +5> <input>string:take("abc0z123", lists:seq($a,$z)).</input> +{"abc","0z123"} +6> <input>string:take(<<"abc0z123">>, lists:seq($0,$9), true, leading).</input> +{<<"abc">>,<<"0z123">>} +7> <input>string:take("abc0z123", lists:seq($0,$9), false, trailing).</input> +{"abc0z","123"} +8> <input>string:take(<<"abc0z123">>, lists:seq($a,$z), true, trailing).</input> +{<<"abc0z">>,<<"123">>}</pre> + </desc> + </func> + + <func> + <name name="titlecase" arity="1"/> + <fsummary>Convert a string to titlecase.</fsummary> + <desc> + <p> + Converts <c><anno>String</anno></c> to titlecase. + </p> + <p><em>Example:</em></p> + <pre> +1> <input>string:titlecase("ß is a SHARP s").</input> +"Ss is a SHARP s"</pre> + </desc> + </func> + + <func> + <name name="to_float" arity="1"/> + <fsummary>Return a float whose text representation is the integers + (ASCII values) of a string.</fsummary> + <desc> + <p>Argument <c><anno>String</anno></c> is expected to start with a + valid text represented float (the digits are ASCII values). + Remaining characters in the string after the float are returned in + <c><anno>Rest</anno></c>.</p> + <p><em>Example:</em></p> + <pre> +> <input>{F1,Fs} = string:to_float("1.0-1.0e-1"),</input> +> <input>{F2,[]} = string:to_float(Fs),</input> +> <input>F1+F2.</input> +0.9 +> <input>string:to_float("3/2=1.5").</input> +{error,no_float} +> <input>string:to_float("-1.5eX").</input> +{-1.5,"eX"}</pre> + </desc> + </func> + + <func> + <name name="to_integer" arity="1"/> + <fsummary>Return an integer whose text representation is the integers + (ASCII values) of a string.</fsummary> + <desc> + <p>Argument <c><anno>String</anno></c> is expected to start with a + valid text represented integer (the digits are ASCII values). + Remaining characters in the string after the integer are returned in + <c><anno>Rest</anno></c>.</p> + <p><em>Example:</em></p> + <pre> +> <input>{I1,Is} = string:to_integer("33+22"),</input> +> <input>{I2,[]} = string:to_integer(Is),</input> +> <input>I1-I2.</input> +11 +> <input>string:to_integer("0.5").</input> +{0,".5"} +> <input>string:to_integer("x=2").</input> +{error,no_integer}</pre> + </desc> + </func> + + <func> + <name name="to_graphemes" arity="1"/> + <fsummary>Convert a string to a list of grapheme clusters.</fsummary> + <desc> + <p> + Converts <c><anno>String</anno></c> to a list of grapheme clusters. + </p> + <p><em>Example:</em></p> + <pre> +1> <input>string:to_graphemes("ß↑e̊").</input> +[223,8593,[101,778]] +2> <input>string:to_graphemes(<<"ß↑e̊"/utf8>>).</input> +[223,8593,[101,778]]</pre> + </desc> + </func> + + <func> + <name name="trim" arity="1"/> + <name name="trim" arity="2"/> + <name name="trim" arity="3"/> + <fsummary>Trim leading or trailing, or both, characters.</fsummary> + <desc> + <p> + Returns a string, where leading or trailing, or both, + <c><anno>Characters</anno></c> have been removed. + <c><anno>Dir</anno></c> which can be <c>leading</c>, <c>trailing</c>, + or <c>both</c>, indicates from which direction characters + are to be removed. + </p> + <p> Default <c><anno>Characters</anno></c> are the set of + nonbreakable whitespace codepoints, defined as + Pattern_White_Space in + <url href="http://unicode.org/reports/tr31/">Unicode Standard Annex #31</url>. + <c>By default, <anno>Dir</anno></c> is <c>both</c>. + </p> + <p> + Notice that <c>[$\r,$\n]</c> is one grapheme cluster according + to the Unicode Standard. + </p> + <p><em>Example:</em></p> + <pre> +1> <input>string:trim("\t Hello \n").</input> +"Hello" +2> <input>string:trim(<<"\t Hello \n">>, leading).</input> +<<"Hello \n">> +3> <input>string:trim(<<".Hello.\n">>, trailing, "\n.").</input> +<<".Hello">></pre> + </desc> + </func> + + <func> + <name name="uppercase" arity="1"/> + <fsummary>Convert a string to uppercase.</fsummary> + <desc> + <p> + Converts <c><anno>String</anno></c> to uppercase. + </p> + <p>See also <seealso marker="#titlecase/1"><c>titlecase/1</c></seealso>.</p> + <p><em>Example:</em></p> + <pre> +1> <input>string:uppercase("Michał").</input> +"MICHAŁ"</pre> + </desc> + </func> + + </funcs> + + <section> + <marker id="oldapi"/> + <title>Obsolete API functions</title> + <p>Here follows the function of the old API. + These functions only work on a list of Latin-1 characters. + </p> + <note><p> + The functions are kept for backward compatibility, but are + not recommended. + They will be deprecated in Erlang/OTP 21. + </p> + <p>Any undocumented functions in <c>string</c> are not to be used.</p> + </note> + </section> + <funcs> <func> <name name="centre" arity="2"/> @@ -47,17 +652,24 @@ <p>Returns a string, where <c><anno>String</anno></c> is centered in the string and surrounded by blanks or <c><anno>Character</anno></c>. The resulting string has length <c><anno>Number</anno></c>.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="#pad/3"><c>pad/3</c></seealso>. + </p> </desc> </func> <func> <name name="chars" arity="2"/> <name name="chars" arity="3"/> - <fsummary>Returns a string consisting of numbers of characters.</fsummary> + <fsummary>Return a string consisting of numbers of characters.</fsummary> <desc> <p>Returns a string consisting of <c><anno>Number</anno></c> characters <c><anno>Character</anno></c>. Optionally, the string can end with string <c><anno>Tail</anno></c>.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="lists#duplicate/2"><c>lists:duplicate/2</c></seealso>.</p> </desc> </func> @@ -69,6 +681,9 @@ <p>Returns the index of the first occurrence of <c><anno>Character</anno></c> in <c><anno>String</anno></c>. Returns <c>0</c> if <c><anno>Character</anno></c> does not occur.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="#find/2"><c>find/2</c></seealso>.</p> </desc> </func> @@ -79,6 +694,16 @@ <p>Concatenates <c><anno>String1</anno></c> and <c><anno>String2</anno></c> to form a new string <c><anno>String3</anno></c>, which is returned.</p> + <p> + This function is <seealso marker="#oldapi">obsolete</seealso>. + Use <c>[<anno>String1</anno>, <anno>String2</anno>]</c> as + <c>Data</c> argument, and call + <seealso marker="unicode#characters_to_list/2"> + <c>unicode:characters_to_list/2</c></seealso> or + <seealso marker="unicode#characters_to_binary/2"> + <c>unicode:characters_to_binary/2</c></seealso> + to flatten the output. + </p> </desc> </func> @@ -88,6 +713,9 @@ <desc> <p>Returns a string containing <c><anno>String</anno></c> repeated <c><anno>Number</anno></c> times.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="lists#duplicate/2"><c>lists:duplicate/2</c></seealso>.</p> </desc> </func> @@ -98,6 +726,9 @@ <p>Returns the length of the maximum initial segment of <c><anno>String</anno></c>, which consists entirely of characters not from <c><anno>Chars</anno></c>.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="#take/3"><c>take/3</c></seealso>.</p> <p><em>Example:</em></p> <code type="none"> > string:cspan("\t abcdef", " \t"). @@ -106,20 +737,14 @@ </func> <func> - <name name="equal" arity="2"/> - <fsummary>Test string equality.</fsummary> - <desc> - <p>Returns <c>true</c> if <c><anno>String1</anno></c> and - <c><anno>String2</anno></c> are equal, otherwise <c>false</c>.</p> - </desc> - </func> - - <func> <name name="join" arity="2"/> <fsummary>Join a list of strings with separator.</fsummary> <desc> <p>Returns a string with the elements of <c><anno>StringList</anno></c> separated by the string in <c><anno>Separator</anno></c>.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="lists#join/2"><c>lists:join/2</c></seealso>.</p> <p><em>Example:</em></p> <code type="none"> > join(["one", "two", "three"], ", "). @@ -137,6 +762,10 @@ fixed. If <c>length(<anno>String</anno>)</c> < <c><anno>Number</anno></c>, then <c><anno>String</anno></c> is padded with blanks or <c><anno>Character</anno></c>s.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="#pad/2"><c>pad/2</c></seealso> or + <seealso marker="#pad/3"><c>pad/3</c></seealso>.</p> <p><em>Example:</em></p> <code type="none"> > string:left("Hello",10,$.). @@ -149,6 +778,9 @@ <fsummary>Return the length of a string.</fsummary> <desc> <p>Returns the number of characters in <c><anno>String</anno></c>.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="#length/1"><c>length/1</c></seealso>.</p> </desc> </func> @@ -160,6 +792,9 @@ <p>Returns the index of the last occurrence of <c><anno>Character</anno></c> in <c><anno>String</anno></c>. Returns <c>0</c> if <c><anno>Character</anno></c> does not occur.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="#find/3"><c>find/3</c></seealso>.</p> </desc> </func> @@ -173,6 +808,9 @@ fixed. If the length of <c>(<anno>String</anno>)</c> < <c><anno>Number</anno></c>, then <c><anno>String</anno></c> is padded with blanks or <c><anno>Character</anno></c>s.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="#pad/3"><c>pad/3</c></seealso>.</p> <p><em>Example:</em></p> <code type="none"> > string:right("Hello", 10, $.). @@ -188,6 +826,9 @@ <c><anno>SubString</anno></c> begins in <c><anno>String</anno></c>. Returns <c>0</c> if <c><anno>SubString</anno></c> does not exist in <c><anno>String</anno></c>.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="#find/3"><c>find/3</c></seealso>.</p> <p><em>Example:</em></p> <code type="none"> > string:rstr(" Hello Hello World World ", "Hello World"). @@ -202,6 +843,9 @@ <p>Returns the length of the maximum initial segment of <c><anno>String</anno></c>, which consists entirely of characters from <c><anno>Chars</anno></c>.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="#take/2"><c>take/2</c></seealso>.</p> <p><em>Example:</em></p> <code type="none"> > string:span("\t abcdef", " \t"). @@ -217,6 +861,9 @@ <c><anno>SubString</anno></c> begins in <c><anno>String</anno></c>. Returns <c>0</c> if <c><anno>SubString</anno></c> does not exist in <c><anno>String</anno></c>.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="#find/2"><c>find/2</c></seealso>.</p> <p><em>Example:</em></p> <code type="none"> > string:str(" Hello Hello World World ", "Hello World"). @@ -230,12 +877,15 @@ <name name="strip" arity="3"/> <fsummary>Strip leading or trailing characters.</fsummary> <desc> - <p>Returns a string, where leading and/or trailing blanks or a + <p>Returns a string, where leading or trailing, or both, blanks or a number of <c><anno>Character</anno></c> have been removed. <c><anno>Direction</anno></c>, which can be <c>left</c>, <c>right</c>, or <c>both</c>, indicates from which direction blanks are to be removed. <c>strip/1</c> is equivalent to <c>strip(String, both)</c>.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="#trim/3"><c>trim/3</c></seealso>.</p> <p><em>Example:</em></p> <code type="none"> > string:strip("...Hello.....", both, $.). @@ -251,6 +901,9 @@ <p>Returns a substring of <c><anno>String</anno></c>, starting at position <c><anno>Start</anno></c> to the end of the string, or to and including position <c><anno>Stop</anno></c>.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="#slice/3"><c>slice/3</c></seealso>.</p> <p><em>Example:</em></p> <code type="none"> sub_string("Hello World", 4, 8). @@ -266,6 +919,9 @@ sub_string("Hello World", 4, 8). <p>Returns a substring of <c><anno>String</anno></c>, starting at position <c><anno>Start</anno></c>, and ending at the end of the string or at length <c><anno>Length</anno></c>.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="#slice/3"><c>slice/3</c></seealso>.</p> <p><em>Example:</em></p> <code type="none"> > substr("Hello World", 4, 5). @@ -281,6 +937,9 @@ sub_string("Hello World", 4, 8). <p>Returns the word in position <c><anno>Number</anno></c> of <c><anno>String</anno></c>. Words are separated by blanks or <c><anno>Character</anno></c>s.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="#nth_lexeme/3"><c>nth_lexeme/3</c></seealso>.</p> <p><em>Example:</em></p> <code type="none"> > string:sub_word(" Hello old boy !",3,$o). @@ -289,50 +948,6 @@ sub_string("Hello World", 4, 8). </func> <func> - <name name="to_float" arity="1"/> - <fsummary>Returns a float whose text representation is the integers - (ASCII values) in a string.</fsummary> - <desc> - <p>Argument <c><anno>String</anno></c> is expected to start with a - valid text represented float (the digits are ASCII values). - Remaining characters in the string after the float are returned in - <c><anno>Rest</anno></c>.</p> - <p><em>Example:</em></p> - <code type="none"> -> {F1,Fs} = string:to_float("1.0-1.0e-1"), -> {F2,[]} = string:to_float(Fs), -> F1+F2. -0.9 -> string:to_float("3/2=1.5"). -{error,no_float} -> string:to_float("-1.5eX"). -{-1.5,"eX"}</code> - </desc> - </func> - - <func> - <name name="to_integer" arity="1"/> - <fsummary>Returns an integer whose text representation is the integers - (ASCII values) in a string.</fsummary> - <desc> - <p>Argument <c><anno>String</anno></c> is expected to start with a - valid text represented integer (the digits are ASCII values). - Remaining characters in the string after the integer are returned in - <c><anno>Rest</anno></c>.</p> - <p><em>Example:</em></p> - <code type="none"> -> {I1,Is} = string:to_integer("33+22"), -> {I2,[]} = string:to_integer(Is), -> I1-I2. -11 -> string:to_integer("0.5"). -{0,".5"} -> string:to_integer("x=2"). -{error,no_integer}</code> - </desc> - </func> - - <func> <name name="to_lower" arity="1" clause_i="1"/> <name name="to_lower" arity="1" clause_i="2"/> <name name="to_upper" arity="1" clause_i="1"/> @@ -346,6 +961,11 @@ sub_string("Hello World", 4, 8). <p>The specified string or character is case-converted. Notice that the supported character set is ISO/IEC 8859-1 (also called Latin 1); all values outside this set are unchanged</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso> use + <seealso marker="#lowercase/1"><c>lowercase/1</c></seealso>, + <seealso marker="#uppercase/1"><c>uppercase/1</c></seealso>, + <seealso marker="#titlecase/1"><c>titlecase/1</c></seealso> or + <seealso marker="#casefold/1"><c>casefold/1</c></seealso>.</p> </desc> </func> @@ -363,6 +983,9 @@ sub_string("Hello World", 4, 8). adjacent separator characters in <c><anno>String</anno></c> are treated as one. That is, there are no empty strings in the resulting list of tokens.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="#lexemes/2"><c>lexemes/2</c></seealso>.</p> </desc> </func> @@ -373,6 +996,9 @@ sub_string("Hello World", 4, 8). <desc> <p>Returns the number of words in <c><anno>String</anno></c>, separated by blanks or <c><anno>Character</anno></c>.</p> + <p>This function is <seealso marker="#oldapi">obsolete</seealso>. + Use + <seealso marker="#lexemes/2"><c>lexemes/2</c></seealso>.</p> <p><em>Example:</em></p> <code type="none"> > words(" Hello old boy!", $o). @@ -387,10 +1013,7 @@ sub_string("Hello World", 4, 8). other. The reason is that this string package is the combination of two earlier packages and all functions of both packages have been retained.</p> - - <note> - <p>Any undocumented functions in <c>string</c> are not to be used.</p> - </note> </section> + </erlref> diff --git a/lib/stdlib/doc/src/unicode_usage.xml b/lib/stdlib/doc/src/unicode_usage.xml index a8ef8ff5c5..11b84f552a 100644 --- a/lib/stdlib/doc/src/unicode_usage.xml +++ b/lib/stdlib/doc/src/unicode_usage.xml @@ -65,7 +65,10 @@ <item><p>In Erlang/OTP 20.0, atoms and function can contain Unicode characters. Module names are still restricted to - the ISO-Latin-1 range.</p></item> + the ISO-Latin-1 range.</p> + <p>Support was added for normalizations forms in + <c>unicode</c> and the <c>string</c> module now handles + utf8-encoded binaries.</p></item> </list> <p>This section outlines the current Unicode support and gives some @@ -110,23 +113,27 @@ </item> </list> - <p>So, a conversion function must know not only one character at a time, - but possibly the whole sentence, the natural language to translate to, - the differences in input and output string length, and so on. - Erlang/OTP has currently no Unicode <c>to_upper</c>/<c>to_lower</c> - functionality, but publicly available libraries address these issues.</p> - - <p>Another example is the accented characters, where the same glyph has two - different representations. The Swedish letter "ö" is one example. - The Unicode standard has a code point for it, but you can also write it - as "o" followed by "U+0308" (Combining Diaeresis, with the simplified - meaning that the last letter is to have "¨" above). They have the same - glyph. They are for most purposes the same, but have different - representations. For example, MacOS X converts all filenames to use - Combining Diaeresis, while most other programs (including Erlang) try to - hide that by doing the opposite when, for example, listing directories. - However it is done, it is usually important to normalize such - characters to avoid confusion.</p> + <p>So, a conversion function must know not only one character at a + time, but possibly the whole sentence, the natural language to + translate to, the differences in input and output string length, + and so on. Erlang/OTP has currently no Unicode + <c>uppercase</c>/<c>lowercase</c> functionality with language + specific handling, but publicly available libraries address these + issues.</p> + + <p>Another example is the accented characters, where the same + glyph has two different representations. The Swedish letter "ö" is + one example. The Unicode standard has a code point for it, but + you can also write it as "o" followed by "U+0308" (Combining + Diaeresis, with the simplified meaning that the last letter is to + have "¨" above). They have the same glyph, user perceived + character. They are for most purposes the same, but have different + representations. For example, MacOS X converts all filenames to + use Combining Diaeresis, while most other programs (including + Erlang) try to hide that by doing the opposite when, for example, + listing directories. However it is done, it is usually important + to normalize such characters to avoid confusion. + </p> <p>The list of examples can be made long. One need a kind of knowledge that was not needed when programs only considered one or two languages. The @@ -273,7 +280,7 @@ them. In some cases functionality has been added to already existing interfaces (as the <seealso marker="stdlib:string"><c>string</c></seealso> module now can - handle lists with any code points). In some cases new + handle strings with any code points). In some cases new functionality or options have been added (as in the <seealso marker="stdlib:io"><c>io</c></seealso> module, the file handling, the <seealso @@ -977,7 +984,7 @@ Eshell V5.10.1 (abort with ^G) <p>Fortunately, most textual data has been stored in lists and range checking has been sparse, so modules like <c>string</c> work well for - Unicode lists with little need for conversion or extension.</p> + Unicode strings with little need for conversion or extension.</p> <p>Some modules are, however, changed to be explicitly Unicode-aware. These modules include:</p> @@ -1028,18 +1035,17 @@ Eshell V5.10.1 (abort with ^G) has extensive support for Unicode text.</p></item> </taglist> - <p>The <seealso marker="stdlib:string"><c>string</c></seealso> module works - perfectly for Unicode strings and ISO Latin-1 strings, except the - language-dependent functions - <seealso marker="stdlib:string#to_upper/1"><c>string:to_upper/1</c></seealso> - and - <seealso marker="stdlib:string#to_lower/1"><c>string:to_lower/1</c></seealso>, - which are only correct for the ISO Latin-1 character set. These two - functions can never function correctly for Unicode characters in their - current form, as there are language and locale issues as well as - multi-character mappings to consider when converting text between cases. - Converting case in an international environment is a large subject not - yet addressed in OTP.</p> + <p>The <seealso marker="stdlib:string"><c>string</c></seealso> + module works perfectly for Unicode strings and ISO Latin-1 + strings, except the language-dependent functions <seealso + marker="stdlib:string#uppercase/1"><c>string:uppercase/1</c></seealso> + and <seealso + marker="stdlib:string#lowercase/1"><c>string:lowercase/1</c></seealso>. + These two functions can never function correctly for Unicode + characters in their current form, as there are language and locale + issues to consider when converting text between cases. Converting + case in an international environment is a large subject not yet + addressed in OTP.</p> </section> <section> diff --git a/lib/stdlib/src/string.erl b/lib/stdlib/src/string.erl index c659db78bd..4fdfe99b66 100644 --- a/lib/stdlib/src/string.erl +++ b/lib/stdlib/src/string.erl @@ -17,22 +17,72 @@ %% %% %CopyrightEnd% %% +%% A string library that works on grapheme clusters, with the exception +%% of codepoints of class 'prepend' and non modern (or decomposed) Hangul. +%% If these codepoints appear, functions like 'find/2' may return a string +%% which starts inside a grapheme cluster. +%% These exceptions are made because the codepoints classes are +%% seldom used and require that we are able look at previous codepoints in +%% the stream and is thus hard to implement effectively. +%% +%% GC (grapheme cluster) implies that the length of string 'ß↑e̊' is 3 though +%% it is represented by the codepoints [223,8593,101,778] or the +%% utf8 binary <<195,159,226,134,145,101,204,138>> +%% +%% And that searching for strings or graphemes finds the correct positions: +%% +%% find("eeeee̊eee", "e̊") -> "e̊ee".: +%% find("1£4e̊abcdef", "e") -> "ef" +%% +%% Most functions expect all input to be normalized to one form, +%% see unicode:characters_to_nfc and unicode:characters_to_nfd functions. +%% When appending strings no checking is done to verify that the +%% result is valid unicode strings. +%% +%% The functions may crash for invalid utf-8 input. +%% +%% Return value should be kept consistent when return type is +%% unicode:chardata() i.e. binary input => binary output, +%% list input => list output mixed input => mixed output +%% -module(string). --export([len/1,equal/2,concat/2,chr/2,rchr/2,str/2,rstr/2, - span/2,cspan/2,substr/2,substr/3,tokens/2,chars/2,chars/3]). +-export([is_empty/1, length/1, to_graphemes/1, + reverse/1, + equal/2, equal/3, equal/4, + slice/2, slice/3, + pad/2, pad/3, pad/4, trim/1, trim/2, trim/3, chomp/1, + take/2, take/3, take/4, + lexemes/2, nth_lexeme/3, + uppercase/1, lowercase/1, titlecase/1,casefold/1, + prefix/2, + split/2,split/3,replace/3,replace/4, + find/2,find/3, + next_codepoint/1, next_grapheme/1 + ]). + +%% Old (will be deprecated) lists/string API kept for backwards compability +-export([len/1, concat/2, % equal/2, (extended in the new api) + chr/2,rchr/2,str/2,rstr/2, + span/2,cspan/2,substr/2,substr/3, tokens/2, + chars/2,chars/3]). -export([copies/2,words/1,words/2,strip/1,strip/2,strip/3, sub_word/2,sub_word/3,left/2,left/3,right/2,right/3, sub_string/2,sub_string/3,centre/2,centre/3, join/2]). -export([to_upper/1, to_lower/1]). +%% +-import(lists,[member/2]). --import(lists,[reverse/1,member/2]). +-compile({no_auto_import,[length/1]}). -%%--------------------------------------------------------------------------- +-export_type([grapheme_cluster/0]). -%%% BIFs +-type grapheme_cluster() :: char() | [char()]. +-type direction() :: 'leading' | 'trailing'. +%%% BIFs -export([to_float/1, to_integer/1]). +-dialyzer({no_improper_lists, stack/2}). -spec to_float(String) -> {Float, Rest} | {error, Reason} when String :: string(), @@ -54,6 +104,1180 @@ to_integer(_) -> %%% End of BIFs +%% Check if string is the empty string +-spec is_empty(String::unicode:chardata()) -> boolean(). +is_empty([]) -> true; +is_empty(<<>>) -> true; +is_empty([L|R]) -> is_empty(L) andalso is_empty(R); +is_empty(_) -> false. + +%% Count the number of grapheme clusters in chardata +-spec length(String::unicode:chardata()) -> non_neg_integer(). +length(CD) -> + length_1(unicode_util:gc(CD), 0). + +%% Convert a string to a list of grapheme clusters +-spec to_graphemes(String::unicode:chardata()) -> [grapheme_cluster()]. +to_graphemes(CD0) -> + case unicode_util:gc(CD0) of + [GC|CD] -> [GC|to_graphemes(CD)]; + [] -> [] + end. + +%% Compare two strings return boolean, assumes that the input are +%% normalized to same form, see unicode:characters_to_nfX_xxx(..) +-spec equal(A, B) -> boolean() when + A::unicode:chardata(), + B::unicode:chardata(). +equal(A,B) when is_binary(A), is_binary(B) -> + A =:= B; +equal(A,B) -> + equal_1(A,B). + +%% Compare two strings return boolean, assumes that the input are +%% normalized to same form, see unicode:characters_to_nfX_xxx(..) +%% does casefold on the fly +-spec equal(A, B, IgnoreCase) -> boolean() when + A::unicode:chardata(), + B::unicode:chardata(), + IgnoreCase :: boolean(). +equal(A, B, false) -> + equal(A,B); +equal(A, B, true) -> + equal_nocase(A,B). + +%% Compare two strings return boolean +%% if specified does casefold and normalization on the fly +-spec equal(A, B, IgnoreCase, Norm) -> boolean() when + A :: unicode:chardata(), + B :: unicode:chardata(), + IgnoreCase :: boolean(), + Norm :: 'none' | 'nfc' | 'nfd' | 'nfkc' | 'nfkd'. +equal(A, B, Case, none) -> + equal(A,B,Case); +equal(A, B, false, Norm) -> + equal_norm(A, B, Norm); +equal(A, B, true, Norm) -> + equal_norm_nocase(A, B, Norm). + +%% Reverse grapheme clusters +-spec reverse(String::unicode:chardata()) -> [grapheme_cluster()]. +reverse(CD) -> + reverse_1(CD, []). + +%% Slice a string and return rest of string +%% Note: counts grapheme_clusters +-spec slice(String, Start) -> Slice when + String::unicode:chardata(), + Start :: non_neg_integer(), + Slice :: unicode:chardata(). +slice(CD, N) when is_integer(N), N >= 0 -> + slice_l(CD, N, is_binary(CD)). + +-spec slice(String, Start, Length) -> Slice when + String::unicode:chardata(), + Start :: non_neg_integer(), + Length :: 'infinity' | non_neg_integer(), + Slice :: unicode:chardata(). +slice(CD, N, Length) + when is_integer(N), N >= 0, is_integer(Length), Length > 0 -> + slice_trail(slice_l(CD, N, is_binary(CD)), Length); +slice(CD, N, infinity) -> + slice_l(CD, N, is_binary(CD)); +slice(CD, _, 0) -> + case is_binary(CD) of + true -> <<>>; + false -> [] + end. + +%% Pad a string to desired length +-spec pad(String, Length) -> unicode:charlist() when + String ::unicode:chardata(), + Length :: integer(). +pad(CD, Length) -> + pad(CD, Length, trailing, $\s). + +-spec pad(String, Length, Dir) -> unicode:charlist() when + String ::unicode:chardata(), + Length :: integer(), + Dir :: direction() | 'both'. +pad(CD, Length, Dir) -> + pad(CD, Length, Dir, $\s). + +-spec pad(String, Length, Dir, Char) -> unicode:charlist() when + String ::unicode:chardata(), + Length :: integer(), + Dir :: direction() | 'both', + Char :: grapheme_cluster(). +pad(CD, Length, leading, Char) when is_integer(Length) -> + Len = length(CD), + [lists:duplicate(max(0, Length-Len), Char), CD]; +pad(CD, Length, trailing, Char) when is_integer(Length) -> + Len = length(CD), + [CD|lists:duplicate(max(0, Length-Len), Char)]; +pad(CD, Length, both, Char) when is_integer(Length) -> + Len = length(CD), + Size = max(0, Length-Len), + Pre = lists:duplicate(Size div 2, Char), + Post = case Size rem 2 of + 1 -> [Char]; + _ -> [] + end, + [Pre, CD, Pre|Post]. + +%% Strip characters from whitespace or Separator in Direction +-spec trim(String) -> unicode:chardata() when + String :: unicode:chardata(). +trim(Str) -> + trim(Str, both, unicode_util:whitespace()). + +-spec trim(String, Dir) -> unicode:chardata() when + String :: unicode:chardata(), + Dir :: direction() | 'both'. +trim(Str, Dir) -> + trim(Str, Dir, unicode_util:whitespace()). + +-spec trim(String, Dir, Characters) -> unicode:chardata() when + String :: unicode:chardata(), + Dir :: direction() | 'both', + Characters :: [grapheme_cluster()]. +trim(Str, _, []) -> Str; +trim(Str, leading, Sep) when is_list(Sep) -> + trim_l(Str, search_pattern(Sep)); +trim(Str, trailing, Sep) when is_list(Sep) -> + trim_t(Str, 0, search_pattern(Sep)); +trim(Str, both, Sep0) when is_list(Sep0) -> + Sep = search_pattern(Sep0), + trim_t(trim_l(Str,Sep), 0, Sep). + +%% Delete trailing newlines or \r\n +-spec chomp(String::unicode:chardata()) -> unicode:chardata(). +chomp(Str) -> + trim_t(Str,0, {[[$\r,$\n],$\n], [$\r,$\n], [<<$\r>>,<<$\n>>]}). + +%% Split String into two parts where the leading part consists of Characters +-spec take(String, Characters) -> {Leading, Trailing} when + String::unicode:chardata(), + Characters::[grapheme_cluster()], + Leading::unicode:chardata(), + Trailing::unicode:chardata(). +take(Str, Sep) -> + take(Str, Sep, false, leading). +-spec take(String, Characters, Complement) -> {Leading, Trailing} when + String::unicode:chardata(), + Characters::[grapheme_cluster()], + Complement::boolean(), + Leading::unicode:chardata(), + Trailing::unicode:chardata(). +take(Str, Sep, Complement) -> + take(Str, Sep, Complement, leading). +-spec take(String, Characters, Complement, Dir) -> {Leading, Trailing} when + String::unicode:chardata(), + Characters::[grapheme_cluster()], + Complement::boolean(), + Dir::direction(), + Leading::unicode:chardata(), + Trailing::unicode:chardata(). +take(Str, [], Complement, Dir) -> + Empty = case is_binary(Str) of true -> <<>>; false -> [] end, + case {Complement,Dir} of + {false, leading} -> {Empty, Str}; + {false, trailing} -> {Str, Empty}; + {true, leading} -> {Str, Empty}; + {true, trailing} -> {Empty, Str} + end; +take(Str, Sep0, false, leading) -> + Sep = search_pattern(Sep0), + take_l(Str, Sep, []); +take(Str, Sep0, true, leading) -> + Sep = search_pattern(Sep0), + take_lc(Str, Sep, []); +take(Str, Sep0, false, trailing) -> + Sep = search_pattern(Sep0), + take_t(Str, 0, Sep); +take(Str, Sep0, true, trailing) -> + Sep = search_pattern(Sep0), + take_tc(Str, 0, Sep). + +%% Uppercase all chars in Str +-spec uppercase(String::unicode:chardata()) -> unicode:chardata(). +uppercase(CD) when is_list(CD) -> + uppercase_list(CD); +uppercase(CD) when is_binary(CD) -> + uppercase_bin(CD,<<>>). + +%% Lowercase all chars in Str +-spec lowercase(String::unicode:chardata()) -> unicode:chardata(). +lowercase(CD) when is_list(CD) -> + lowercase_list(CD); +lowercase(CD) when is_binary(CD) -> + lowercase_bin(CD,<<>>). + +%% Make a titlecase of the first char in Str +-spec titlecase(String::unicode:chardata()) -> unicode:chardata(). +titlecase(CD) when is_list(CD) -> + case unicode_util:titlecase(CD) of + [GC|Tail] -> append(GC,Tail); + Empty -> Empty + end; +titlecase(CD) when is_binary(CD) -> + case unicode_util:titlecase(CD) of + [CP|Chars] when is_integer(CP) -> <<CP/utf8,Chars/binary>>; + [CPs|Chars] -> + << << <<CP/utf8>> || CP <- CPs>>/binary, Chars/binary>>; + [] -> <<>> + end. + +%% Make a comparable string of the Str should be used for equality tests only +-spec casefold(String::unicode:chardata()) -> unicode:chardata(). +casefold(CD) when is_list(CD) -> + casefold_list(CD); +casefold(CD) when is_binary(CD) -> + casefold_bin(CD,<<>>). + +%% Return the remaining string with prefix removed or else nomatch +-spec prefix(String::unicode:chardata(), Prefix::unicode:chardata()) -> + 'nomatch' | unicode:chardata(). +prefix(Str, []) -> Str; +prefix(Str, Prefix0) -> + Prefix = unicode:characters_to_list(Prefix0), + case prefix_1(Str, Prefix) of + [] when is_binary(Str) -> <<>>; + Res -> Res + end. + +%% split String with the first occurrence of SearchPattern, return list of splits +-spec split(String, SearchPattern) -> [unicode:chardata()] when + String :: unicode:chardata(), + SearchPattern :: unicode:chardata(). +split(String, SearchPattern) -> + split(String, SearchPattern, leading). + +%% split String with SearchPattern, return list of splits +-spec split(String, SearchPattern, Where) -> [unicode:chardata()] when + String :: unicode:chardata(), + SearchPattern :: unicode:chardata(), + Where :: direction() | 'all'. +split(String, SearchPattern, Where) -> + case is_empty(SearchPattern) of + true -> [String]; + false -> + SearchPatternCPs = unicode:characters_to_list(SearchPattern), + case split_1(String, SearchPatternCPs, 0, Where, [], []) of + {_Curr, []} -> [String]; + {_Curr, Acc} when Where =:= trailing -> Acc; + {Curr, Acc} when Where =:= all -> lists:reverse([Curr|Acc]); + Acc when is_list(Acc) -> Acc + end + end. + +%% Replace the first SearchPattern in String with Replacement +-spec replace(String, SearchPattern, Replacement) -> + [unicode:chardata()] when + String :: unicode:chardata(), + SearchPattern :: unicode:chardata(), + Replacement :: unicode:chardata(). +replace(String, SearchPattern, Replacement) -> + lists:join(Replacement, split(String, SearchPattern)). + +%% Replace Where SearchPattern in String with Replacement +-spec replace(String, SearchPattern, Replacement, Where) -> + [unicode:chardata()] when + String :: unicode:chardata(), + SearchPattern :: unicode:chardata(), + Replacement :: unicode:chardata(), + Where :: direction() | 'all'. +replace(String, SearchPattern, Replacement, Where) -> + lists:join(Replacement, split(String, SearchPattern, Where)). + +%% Split Str into a list of chardata separated by one of the grapheme +%% clusters in Seps +-spec lexemes(String::unicode:chardata(), + SeparatorList::[grapheme_cluster()]) -> + [unicode:chardata()]. +lexemes([], _) -> []; +lexemes(Str, Seps0) when is_list(Seps0) -> + Seps = search_pattern(Seps0), + lexemes_m(Str, Seps, []). + +-spec nth_lexeme(String, N, SeparatorList) -> unicode:chardata() when + String::unicode:chardata(), + N::non_neg_integer(), + SeparatorList::[grapheme_cluster()]. + +nth_lexeme(Str, 1, []) -> Str; +nth_lexeme(Str, N, Seps0) when is_list(Seps0), is_integer(N), N > 0 -> + Seps = search_pattern(Seps0), + nth_lexeme_m(Str, Seps, N). + +%% find first SearchPattern in String return rest of string +-spec find(String, SearchPattern) -> unicode:chardata() | 'nomatch' when + String::unicode:chardata(), + SearchPattern::unicode:chardata(). +find(String, SearchPattern) -> + find(String, SearchPattern, leading). + +%% find SearchPattern in String (search in Dir direction) return rest of string +-spec find(String, SearchPattern, Dir) -> unicode:chardata() | 'nomatch' when + String::unicode:chardata(), + SearchPattern::unicode:chardata(), + Dir::direction(). +find(String, "", _) -> String; +find(String, <<>>, _) -> String; +find(String, SearchPattern, leading) -> + find_l(String, unicode:characters_to_list(SearchPattern)); +find(String, SearchPattern, trailing) -> + find_r(String, unicode:characters_to_list(SearchPattern), nomatch). + +%% Fetch first codepoint and return rest in tail +-spec next_grapheme(String::unicode:chardata()) -> + maybe_improper_list(grapheme_cluster(),unicode:chardata()). +next_grapheme(CD) -> unicode_util:gc(CD). + +%% Fetch first grapheme cluster and return rest in tail +-spec next_codepoint(String::unicode:chardata()) -> + maybe_improper_list(char(),unicode:chardata()). +next_codepoint(CD) -> unicode_util:cp(CD). + +%% Internals + +length_1([_|Rest], N) -> + length_1(unicode_util:gc(Rest), N+1); +length_1([], N) -> + N. + +equal_1([A|AR], [B|BR]) when is_integer(A), is_integer(B) -> + A =:= B andalso equal_1(AR, BR); +equal_1([], BR) -> is_empty(BR); +equal_1(A0,B0) -> + case {unicode_util:cp(A0), unicode_util:cp(B0)} of + {[CP|A],[CP|B]} -> equal_1(A,B); + {[], []} -> true; + _ -> false + end. + +equal_nocase(A, A) -> true; +equal_nocase(A0, B0) -> + case {unicode_util:cp(unicode_util:casefold(A0)), + unicode_util:cp(unicode_util:casefold(B0))} of + {[CP|A],[CP|B]} -> equal_nocase(A,B); + {[], []} -> true; + _ -> false + end. + +equal_norm(A, A, _Norm) -> true; +equal_norm(A0, B0, Norm) -> + case {unicode_util:cp(unicode_util:Norm(A0)), + unicode_util:cp(unicode_util:Norm(B0))} of + {[CP|A],[CP|B]} -> equal_norm(A,B, Norm); + {[], []} -> true; + _ -> false + end. + +equal_norm_nocase(A, A, _Norm) -> true; +equal_norm_nocase(A0, B0, Norm) -> + case {unicode_util:cp(unicode_util:casefold(unicode_util:Norm(A0))), + unicode_util:cp(unicode_util:casefold(unicode_util:Norm(B0)))} of + {[CP|A],[CP|B]} -> equal_norm_nocase(A,B, Norm); + {[], []} -> true; + _ -> false + end. + +reverse_1(CD, Acc) -> + case unicode_util:gc(CD) of + [GC|Rest] -> reverse_1(Rest, [GC|Acc]); + [] -> Acc + end. + +slice_l(CD, N, Binary) when N > 0 -> + case unicode_util:gc(CD) of + [_|Cont] -> slice_l(Cont, N-1, Binary); + [] when Binary -> <<>>; + [] -> [] + end; +slice_l(Cont, 0, Binary) -> + case is_empty(Cont) of + true when Binary -> <<>>; + _ -> Cont + end. + +slice_trail(CD, N) when is_list(CD) -> + slice_list(CD, N); +slice_trail(CD, N) when is_binary(CD) -> + slice_bin(CD, N, CD). + +slice_list(CD, N) when N > 0 -> + case unicode_util:gc(CD) of + [GC|Cont] -> append(GC, slice_list(Cont, N-1)); + [] -> [] + end; +slice_list(_, 0) -> + []. + +slice_bin(CD, N, Orig) when N > 0 -> + case unicode_util:gc(CD) of + [_|Cont] -> slice_bin(Cont, N-1, Orig); + [] -> Orig + end; +slice_bin([], 0, Orig) -> + Orig; +slice_bin(CD, 0, Orig) -> + Sz = byte_size(Orig) - byte_size(CD), + <<Keep:Sz/binary, _/binary>> = Orig, + Keep. + +uppercase_list(CPs0) -> + case unicode_util:uppercase(CPs0) of + [Char|CPs] -> append(Char,uppercase_list(CPs)); + [] -> [] + end. + +uppercase_bin(CPs0, Acc) -> + case unicode_util:uppercase(CPs0) of + [Char|CPs] when is_integer(Char) -> + uppercase_bin(CPs, <<Acc/binary, Char/utf8>>); + [Chars|CPs] -> + uppercase_bin(CPs, <<Acc/binary, + << <<CP/utf8>> || CP <- Chars>>/binary >>); + [] -> Acc + end. + +lowercase_list(CPs0) -> + case unicode_util:lowercase(CPs0) of + [Char|CPs] -> append(Char,lowercase_list(CPs)); + [] -> [] + end. + +lowercase_bin(CPs0, Acc) -> + case unicode_util:lowercase(CPs0) of + [Char|CPs] when is_integer(Char) -> + lowercase_bin(CPs, <<Acc/binary, Char/utf8>>); + [Chars|CPs] -> + lowercase_bin(CPs, <<Acc/binary, + << <<CP/utf8>> || CP <- Chars>>/binary >>); + [] -> Acc + end. + +casefold_list(CPs0) -> + case unicode_util:casefold(CPs0) of + [Char|CPs] -> append(Char, casefold_list(CPs)); + [] -> [] + end. + +casefold_bin(CPs0, Acc) -> + case unicode_util:casefold(CPs0) of + [Char|CPs] when is_integer(Char) -> + casefold_bin(CPs, <<Acc/binary, Char/utf8>>); + [Chars|CPs] -> + casefold_bin(CPs, <<Acc/binary, + << <<CP/utf8>> || CP <- Chars>>/binary >>); + [] -> Acc + end. + + +trim_l([Bin|Cont0], Sep) when is_binary(Bin) -> + case bin_search_inv(Bin, Cont0, Sep) of + {nomatch, Cont} -> trim_l(Cont, Sep); + Keep -> Keep + end; +trim_l(Str, {GCs, _, _}=Sep) when is_list(Str) -> + case unicode_util:gc(Str) of + [C|Cs] -> + case lists:member(C, GCs) of + true -> trim_l(Cs, Sep); + false -> Str + end; + [] -> [] + end; +trim_l(Bin, Sep) when is_binary(Bin) -> + case bin_search_inv(Bin, [], Sep) of + {nomatch,_} -> <<>>; + [Keep] -> Keep + end. + +trim_t([Bin|Cont0], N, Sep) when is_binary(Bin) -> + <<_:N/binary, Rest/binary>> = Bin, + case bin_search(Rest, Cont0, Sep) of + {nomatch,_} -> + stack(Bin, trim_t(Cont0, 0, Sep)); + [SepStart|Cont1] -> + case bin_search_inv(SepStart, Cont1, Sep) of + {nomatch, Cont} -> + Tail = trim_t(Cont, 0, Sep), + case is_empty(Tail) of + true -> + KeepSz = byte_size(Bin) - byte_size(SepStart), + <<Keep:KeepSz/binary, _/binary>> = Bin, + Keep; + false -> + Used = cp_prefix(Cont0, Cont), + stack(Bin, stack(Used, Tail)) + end; + [NonSep|Cont] when is_binary(NonSep) -> + KeepSz = byte_size(Bin) - byte_size(NonSep), + trim_t([Bin|Cont], KeepSz, Sep) + end + end; +trim_t(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) -> + case unicode_util:cp(Str) of + [CP|Cs] -> + case lists:member(CP, CPs) of + true -> + [GC|Cs1] = unicode_util:gc(Str), + case lists:member(GC, GCs) of + true -> + Tail = trim_t(Cs1, 0, Sep), + case is_empty(Tail) of + true -> []; + false -> append(GC,Tail) + end; + false -> + append(GC,trim_t(Cs1, 0, Sep)) + end; + false -> + append(CP,trim_t(Cs, 0, Sep)) + end; + [] -> [] + end; +trim_t(Bin, N, Sep) when is_binary(Bin) -> + <<_:N/binary, Rest/binary>> = Bin, + case bin_search(Rest, Sep) of + {nomatch,_} -> Bin; + [SepStart] -> + case bin_search_inv(SepStart, [], Sep) of + {nomatch,_} -> + KeepSz = byte_size(Bin) - byte_size(SepStart), + <<Keep:KeepSz/binary, _/binary>> = Bin, + Keep; + [NonSep] -> + KeepSz = byte_size(Bin) - byte_size(NonSep), + trim_t(Bin, KeepSz, Sep) + end + end. + +take_l([Bin|Cont0], Sep, Acc) when is_binary(Bin) -> + case bin_search_inv(Bin, Cont0, Sep) of + {nomatch, Cont} -> + Used = cp_prefix(Cont0, Cont), + take_l(Cont, Sep, [unicode:characters_to_binary([Bin|Used])|Acc]); + [Bin1|_]=After when is_binary(Bin1) -> + First = byte_size(Bin) - byte_size(Bin1), + <<Keep:First/binary, _/binary>> = Bin, + {btoken(Keep,Acc), After} + end; +take_l(Str, {GCs, _, _}=Sep, Acc) when is_list(Str) -> + case unicode_util:gc(Str) of + [C|Cs] -> + case lists:member(C, GCs) of + true -> take_l(Cs, Sep, append(rev(C),Acc)); + false -> {rev(Acc), Str} + end; + [] -> {rev(Acc), []} + end; +take_l(Bin, Sep, Acc) when is_binary(Bin) -> + case bin_search_inv(Bin, [], Sep) of + {nomatch,_} -> + {btoken(Bin, Acc), <<>>}; + [After] -> + First = byte_size(Bin) - byte_size(After), + <<Keep:First/binary, _/binary>> = Bin, + {btoken(Keep, Acc), After} + end. + +take_lc([Bin|Cont0], Sep, Acc) when is_binary(Bin) -> + case bin_search(Bin, Cont0, Sep) of + {nomatch, Cont} -> + Used = cp_prefix(Cont0, Cont), + take_lc(Cont, Sep, [unicode:characters_to_binary([Bin|Used])|Acc]); + [Bin1|_]=After when is_binary(Bin1) -> + First = byte_size(Bin) - byte_size(Bin1), + <<Keep:First/binary, _/binary>> = Bin, + {btoken(Keep,Acc), After} + end; +take_lc(Str, {GCs, _, _}=Sep, Acc) when is_list(Str) -> + case unicode_util:gc(Str) of + [C|Cs] -> + case lists:member(C, GCs) of + false -> take_lc(Cs, Sep, append(rev(C),Acc)); + true -> {rev(Acc), Str} + end; + [] -> {rev(Acc), []} + end; +take_lc(Bin, Sep, Acc) when is_binary(Bin) -> + case bin_search(Bin, [], Sep) of + {nomatch,_} -> + {btoken(Bin, Acc), <<>>}; + [After] -> + First = byte_size(Bin) - byte_size(After), + <<Keep:First/binary, _/binary>> = Bin, + {btoken(Keep, Acc), After} + end. + +take_t([Bin|Cont0], N, Sep) when is_binary(Bin) -> + <<_:N/binary, Rest/binary>> = Bin, + case bin_search(Rest, Cont0, Sep) of + {nomatch,Cont} -> + Used = cp_prefix(Cont0, Cont), + {Head, Tail} = take_t(Cont, 0, Sep), + {stack(unicode:characters_to_binary([Bin|Used]), Head), Tail}; + [SepStart|Cont1] -> + case bin_search_inv(SepStart, Cont1, Sep) of + {nomatch, Cont} -> + {Head, Tail} = take_t(Cont, 0, Sep), + Used = cp_prefix(Cont0, Cont), + case equal(Tail, Cont) of + true -> + KeepSz = byte_size(Bin) - byte_size(SepStart), + <<Keep:KeepSz/binary, End/binary>> = Bin, + {stack(Keep,Head), stack(stack(End,Used),Tail)}; + false -> + {stack(unicode:characters_to_binary([Bin|Used]),Head), Tail} + end; + [NonSep|Cont] when is_binary(NonSep) -> + KeepSz = byte_size(Bin) - byte_size(NonSep), + take_t([Bin|Cont], KeepSz, Sep) + end + end; +take_t(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) -> + case unicode_util:cp(Str) of + [CP|Cs] -> + case lists:member(CP, CPs) of + true -> + [GC|Cs1] = unicode_util:gc(Str), + case lists:member(GC, GCs) of + true -> + {Head, Tail} = take_t(Cs1, 0, Sep), + case equal(Tail, Cs1) of + true -> {Head, append(GC,Tail)}; + false -> {append(GC,Head), Tail} + end; + false -> + {Head, Tail} = take_t(Cs, 0, Sep), + {append(CP,Head), Tail} + end; + false -> + {Head, Tail} = take_t(Cs, 0, Sep), + {append(CP,Head), Tail} + end; + [] -> {[],[]} + end; +take_t(Bin, N, Sep) when is_binary(Bin) -> + <<_:N/binary, Rest/binary>> = Bin, + case bin_search(Rest, Sep) of + {nomatch,_} -> {Bin, <<>>}; + [SepStart] -> + case bin_search_inv(SepStart, [], Sep) of + {nomatch,_} -> + KeepSz = byte_size(Bin) - byte_size(SepStart), + <<Before:KeepSz/binary, End/binary>> = Bin, + {Before, End}; + [NonSep] -> + KeepSz = byte_size(Bin) - byte_size(NonSep), + take_t(Bin, KeepSz, Sep) + end + end. + +take_tc([Bin|Cont0], N, Sep) when is_binary(Bin) -> + <<_:N/binary, Rest/binary>> = Bin, + case bin_search_inv(Rest, Cont0, Sep) of + {nomatch,Cont} -> + Used = cp_prefix(Cont0, Cont), + {Head, Tail} = take_tc(Cont, 0, Sep), + {stack(unicode:characters_to_binary([Bin|Used]), Head), Tail}; + [SepStart|Cont1] -> + case bin_search(SepStart, Cont1, Sep) of + {nomatch, Cont} -> + {Head, Tail} = take_tc(Cont, 0, Sep), + Used = cp_prefix(Cont0, Cont), + case equal(Tail, Cont) of + true -> + KeepSz = byte_size(Bin) - byte_size(SepStart), + <<Keep:KeepSz/binary, End/binary>> = Bin, + {stack(Keep,Head), stack(stack(End,Used),Tail)}; + false -> + {stack(unicode:characters_to_binary([Bin|Used]),Head), Tail} + end; + [NonSep|Cont] when is_binary(NonSep) -> + KeepSz = byte_size(Bin) - byte_size(NonSep), + take_tc([Bin|Cont], KeepSz, Sep) + end + end; +take_tc(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) -> + case unicode_util:cp(Str) of + [CP|Cs] -> + case lists:member(CP, CPs) of + true -> + [GC|Cs1] = unicode_util:gc(Str), + case lists:member(GC, GCs) of + false -> + {Head, Tail} = take_tc(Cs1, 0, Sep), + case equal(Tail, Cs1) of + true -> {Head, append(GC,Tail)}; + false -> {append(GC,Head), Tail} + end; + true -> + {Head, Tail} = take_tc(Cs1, 0, Sep), + {append(GC,Head), Tail} + end; + false -> + {Head, Tail} = take_tc(Cs, 0, Sep), + case equal(Tail, Cs) of + true -> {Head, append(CP,Tail)}; + false -> {append(CP,Head), Tail} + end + end; + [] -> {[],[]} + end; +take_tc(Bin, N, Sep) when is_binary(Bin) -> + <<_:N/binary, Rest/binary>> = Bin, + case bin_search_inv(Rest, [], Sep) of + {nomatch,_} -> {Bin, <<>>}; + [SepStart] -> + case bin_search(SepStart, [], Sep) of + {nomatch,_} -> + KeepSz = byte_size(Bin) - byte_size(SepStart), + <<Before:KeepSz/binary, End/binary>> = Bin, + {Before, End}; + [NonSep] -> + KeepSz = byte_size(Bin) - byte_size(NonSep), + take_tc(Bin, KeepSz, Sep) + end + end. + +prefix_1(Cs, []) -> Cs; +prefix_1(Cs, [_]=Pre) -> + prefix_2(unicode_util:gc(Cs), Pre); +prefix_1(Cs, Pre) -> + prefix_2(unicode_util:cp(Cs), Pre). + +prefix_2([C|Cs], [C|Pre]) -> + prefix_1(Cs, Pre); +prefix_2(_, _) -> + nomatch. + +split_1([Bin|Cont0], Needle, Start, Where, Curr0, Acc) + when is_binary(Bin) -> + case bin_search_str(Bin, Start, Cont0, Needle) of + {nomatch,Sz,Cont} -> + <<Keep:Sz/binary, _/binary>> = Bin, + split_1(Cont, Needle, 0, Where, [Keep|Curr0], Acc); + {Before, [Cs0|Cont], After} -> + Curr = add_non_empty(Before,Curr0), + case Where of + leading -> + [rev(Curr),After]; + trailing -> + <<_/utf8, Cs/binary>> = Cs0, + Next = byte_size(Bin) - byte_size(Cs), + split_1([Bin|Cont], Needle, Next, Where, + Curr0, [rev(Curr),After]); + all -> + split_1(After, Needle, 0, Where, [], [rev(Curr)|Acc]) + end + end; +split_1(Cs0, [C|_]=Needle, _, Where, Curr, Acc) when is_list(Cs0) -> + case unicode_util:cp(Cs0) of + [C|Cs] -> + case prefix_1(Cs0, Needle) of + nomatch -> split_1(Cs, Needle, 0, Where, append(C,Curr), Acc); + Rest when Where =:= leading -> + [rev(Curr), Rest]; + Rest when Where =:= trailing -> + split_1(Cs, Needle, 0, Where, [C|Curr], [rev(Curr), Rest]); + Rest when Where =:= all -> + split_1(Rest, Needle, 0, Where, [], [rev(Curr)|Acc]) + end; + [Other|Cs] -> + split_1(Cs, Needle, 0, Where, append(Other,Curr), Acc); + [] -> + {rev(Curr), Acc} + end; +split_1(Bin, [_C|_]=Needle, Start, Where, Curr0, Acc) -> + case bin_search_str(Bin, Start, [], Needle) of + {nomatch,_,_} -> + <<_:Start/binary, Keep/binary>> = Bin, + {rev([Keep|Curr0]), Acc}; + {Before, [Cs0], After} -> + case Where of + leading -> + [rev([Before|Curr0]),After]; + trailing -> + <<_/utf8, Cs/binary>> = Cs0, + Next = byte_size(Bin) - byte_size(Cs), + split_1(Bin, Needle, Next, Where, Curr0, + [btoken(Before,Curr0),After]); + all -> + Next = byte_size(Bin) - byte_size(After), + <<_:Start/binary, Keep/binary>> = Before, + Curr = [Keep|Curr0], + split_1(Bin, Needle, Next, Where, [], [rev(Curr)|Acc]) + end + end. + +lexemes_m([Bin|Cont0], Seps, Ts) when is_binary(Bin) -> + case bin_search_inv(Bin, Cont0, Seps) of + {nomatch,Cont} -> + lexemes_m(Cont, Seps, Ts); + Cs -> + {Lexeme,Rest} = lexeme_pick(Cs, Seps, []), + lexemes_m(Rest, Seps, [Lexeme|Ts]) + end; +lexemes_m(Cs0, {GCs, _, _}=Seps, Ts) when is_list(Cs0) -> + case unicode_util:gc(Cs0) of + [C|Cs] -> + case lists:member(C, GCs) of + true -> + lexemes_m(Cs, Seps, Ts); + false -> + {Lexeme,Rest} = lexeme_pick(Cs0, Seps, []), + lexemes_m(Rest, Seps, [Lexeme|Ts]) + end; + [] -> + lists:reverse(Ts) + end; +lexemes_m(Bin, Seps, Ts) when is_binary(Bin) -> + case bin_search_inv(Bin, [], Seps) of + {nomatch,_} -> + lists:reverse(Ts); + [Cs] -> + {Lexeme,Rest} = lexeme_pick(Cs, Seps, []), + lexemes_m(Rest, Seps, add_non_empty(Lexeme,Ts)) + end. + +lexeme_pick([CP|Cs1]=Cs0, {GCs,CPs,_}=Seps, Tkn) when is_integer(CP) -> + case lists:member(CP, CPs) of + true -> + [GC|Cs2] = unicode_util:gc(Cs0), + case lists:member(GC, GCs) of + true -> {rev(Tkn), Cs2}; + false -> lexeme_pick(Cs2, Seps, append(rev(GC),Tkn)) + end; + false -> lexeme_pick(Cs1, Seps, [CP|Tkn]) + end; +lexeme_pick([Bin|Cont0], Seps, Tkn) when is_binary(Bin) -> + case bin_search(Bin, Cont0, Seps) of + {nomatch,_} -> + lexeme_pick(Cont0, Seps, [Bin|Tkn]); + [Left|_Cont] = Cs -> + Bytes = byte_size(Bin) - byte_size(Left), + <<Lexeme:Bytes/binary, _/binary>> = Bin, + {btoken(Lexeme, Tkn), Cs} + end; +lexeme_pick(Cs0, {GCs, CPs, _} = Seps, Tkn) when is_list(Cs0) -> + case unicode_util:cp(Cs0) of + [CP|Cs] -> + case lists:member(CP, CPs) of + true -> + [GC|Cs2] = unicode_util:gc(Cs0), + case lists:member(GC, GCs) of + true -> {rev(Tkn), Cs0}; + false -> lexeme_pick(Cs2, Seps, append(rev(GC),Tkn)) + end; + false -> + lexeme_pick(Cs, Seps, append(CP,Tkn)) + end; + [] -> + {rev(Tkn), []} + end; +lexeme_pick(Bin, Seps, Tkn) when is_binary(Bin) -> + case bin_search(Bin, Seps) of + {nomatch,_} -> + {btoken(Bin,Tkn), []}; + [Left] -> + Bytes = byte_size(Bin) - byte_size(Left), + <<Lexeme:Bytes/binary, _/binary>> = Bin, + {btoken(Lexeme, Tkn), Left} + end. + +nth_lexeme_m([Bin|Cont0], Seps, N) when is_binary(Bin) -> + case bin_search_inv(Bin, Cont0, Seps) of + {nomatch,Cont} -> + nth_lexeme_m(Cont, Seps, N); + Cs when N > 1 -> + Rest = lexeme_skip(Cs, Seps), + nth_lexeme_m(Rest, Seps, N-1); + Cs -> + {Lexeme,_} = lexeme_pick(Cs, Seps, []), + Lexeme + end; +nth_lexeme_m(Cs0, {GCs, _, _}=Seps, N) when is_list(Cs0) -> + case unicode_util:gc(Cs0) of + [C|Cs] -> + case lists:member(C, GCs) of + true -> + nth_lexeme_m(Cs, Seps, N); + false when N > 1 -> + Cs1 = lexeme_skip(Cs, Seps), + nth_lexeme_m(Cs1, Seps, N-1); + false -> + {Lexeme,_} = lexeme_pick(Cs0, Seps, []), + Lexeme + end; + [] -> + [] + end; +nth_lexeme_m(Bin, Seps, N) when is_binary(Bin) -> + case bin_search_inv(Bin, [], Seps) of + [Cs] when N > 1 -> + Cs1 = lexeme_skip(Cs, Seps), + nth_lexeme_m(Cs1, Seps, N-1); + [Cs] -> + {Lexeme,_} = lexeme_pick(Cs, Seps, []), + Lexeme; + {nomatch,_} -> + <<>> + end. + +lexeme_skip([CP|Cs1]=Cs0, {GCs,CPs,_}=Seps) when is_integer(CP) -> + case lists:member(CP, CPs) of + true -> + [GC|Cs2] = unicode_util:gc(Cs0), + case lists:member(GC, GCs) of + true -> Cs0; + false -> lexeme_skip(Cs2, Seps) + end; + false -> + lexeme_skip(Cs1, Seps) + end; +lexeme_skip([Bin|Cont0], Seps) when is_binary(Bin) -> + case bin_search(Bin, Cont0, Seps) of + {nomatch,_} -> lexeme_skip(Cont0, Seps); + Cs -> Cs + end; +lexeme_skip(Cs0, {GCs, CPs, _} = Seps) when is_list(Cs0) -> + case unicode_util:cp(Cs0) of + [CP|Cs] -> + case lists:member(CP, CPs) of + true -> + [GC|Cs2] = unicode_util:gc(Cs0), + case lists:member(GC, GCs) of + true -> Cs0; + false -> lexeme_skip(Cs2, Seps) + end; + false -> + lexeme_skip(Cs, Seps) + end; + [] -> + [] + end; +lexeme_skip(Bin, Seps) when is_binary(Bin) -> + case bin_search(Bin, Seps) of + {nomatch,_} -> <<>>; + [Left] -> Left + end. + +find_l([Bin|Cont0], Needle) when is_binary(Bin) -> + case bin_search_str(Bin, 0, Cont0, Needle) of + {nomatch, _, Cont} -> + find_l(Cont, Needle); + {_Before, Cs, _After} -> + Cs + end; +find_l(Cs0, [C|_]=Needle) when is_list(Cs0) -> + case unicode_util:cp(Cs0) of + [C|Cs] -> + case prefix_1(Cs0, Needle) of + nomatch -> find_l(Cs, Needle); + _ -> Cs0 + end; + [_C|Cs] -> + find_l(Cs, Needle); + [] -> nomatch + end; +find_l(Bin, Needle) -> + case bin_search_str(Bin, 0, [], Needle) of + {nomatch,_,_} -> nomatch; + {_Before, [Cs], _After} -> Cs + end. + +find_r([Bin|Cont0], Needle, Res) when is_binary(Bin) -> + case bin_search_str(Bin, 0, Cont0, Needle) of + {nomatch,_,Cont} -> + find_r(Cont, Needle, Res); + {_, Cs0, _} -> + [_|Cs] = unicode_util:gc(Cs0), + find_r(Cs, Needle, Cs0) + end; +find_r(Cs0, [C|_]=Needle, Res) when is_list(Cs0) -> + case unicode_util:cp(Cs0) of + [C|Cs] -> + case prefix_1(Cs0, Needle) of + nomatch -> find_r(Cs, Needle, Res); + _ -> find_r(Cs, Needle, Cs0) + end; + [_C|Cs] -> + find_r(Cs, Needle, Res); + [] -> Res + end; +find_r(Bin, Needle, Res) -> + case bin_search_str(Bin, 0, [], Needle) of + {nomatch,_,_} -> Res; + {_Before, [Cs0], _After} -> + <<_/utf8, Cs/binary>> = Cs0, + find_r(Cs, Needle, Cs0) + end. + +%% These are used to avoid creating lists around binaries +%% might be unnecessary, is there a better solution? +btoken(Token, []) -> Token; +btoken(BinPart, [C]) when is_integer(C) -> <<C/utf8, BinPart/binary>>; +btoken(<<>>, Tkn) -> lists:reverse(Tkn); +btoken(BinPart, Cs) -> [lists:reverse(Cs),BinPart]. + +rev([B]) when is_binary(B) -> B; +rev(L) when is_list(L) -> lists:reverse(L); +rev(C) when is_integer(C) -> C. + +append(Char, <<>>) when is_integer(Char) -> [Char]; +append(Char, <<>>) when is_list(Char) -> Char; +append(Char, Bin) when is_binary(Bin) -> [Char,Bin]; +append(Char, Str) when is_integer(Char) -> [Char|Str]; +append(GC, Str) when is_list(GC) -> GC ++ Str. + +stack(Bin, []) -> Bin; +stack(<<>>, St) -> St; +stack([], St) -> St; +stack(Bin, St) -> [Bin|St]. + +add_non_empty(<<>>, L) -> L; +add_non_empty(Token, L) -> [Token|L]. + +cp_prefix(Orig, Cont) -> + case unicode_util:cp(Cont) of + [] -> Orig; + [Cp|Rest] -> cp_prefix_1(Orig, Cp, Rest) + end. + +cp_prefix_1(Orig, Until, Cont) -> + case unicode_util:cp(Orig) of + [Until|Rest] -> + case equal(Rest, Cont) of + true -> []; + false-> [Until|cp_prefix_1(Rest, Until, Cont)] + end; + [CP|Rest] -> [CP|cp_prefix_1(Rest, Until, Cont)] + end. + + +%% Binary special +bin_search(Bin, Seps) -> + bin_search(Bin, [], Seps). + +bin_search(_Bin, Cont, {[],_,_}) -> + {nomatch, Cont}; +bin_search(Bin, Cont, {Seps,_,BP}) -> + bin_search_loop(Bin, 0, BP, Cont, Seps). + +%% Need to work with [<<$a>>, <<778/utf8>>], +%% i.e. å in nfd form $a "COMBINING RING ABOVE" +%% and PREPEND characters like "ARABIC NUMBER SIGN" 1536 <<216,128>> +%% combined with other characters are currently ignored. +search_pattern(Seps) -> + CPs = search_cp(Seps), + Bin = bin_pattern(CPs), + {Seps, CPs, Bin}. + +search_cp([CP|Seps]) when is_integer(CP) -> + [CP|search_cp(Seps)]; +search_cp([Pattern|Seps]) -> + [CP|_] = unicode_util:cp(Pattern), + [CP|search_cp(Seps)]; +search_cp([]) -> []. + +bin_pattern([CP|Seps]) -> + [<<CP/utf8>>|bin_pattern(Seps)]; +bin_pattern([]) -> []. + +bin_search_loop(Bin0, Start, _, Cont, _Seps) + when byte_size(Bin0) =< Start; Start < 0 -> + {nomatch, Cont}; +bin_search_loop(Bin0, Start, BinSeps, Cont, Seps) -> + <<_:Start/binary, Bin/binary>> = Bin0, + case binary:match(Bin, BinSeps) of + nomatch -> + {nomatch,Cont}; + {Where, _CL} -> + <<_:Where/binary, Cont0/binary>> = Bin, + Cont1 = stack(Cont0, Cont), + [GC|Cont2] = unicode_util:gc(Cont1), + case lists:member(GC, Seps) of + false -> + case Cont2 of + [BinR|Cont] when is_binary(BinR) -> + Next = byte_size(Bin0) - byte_size(BinR), + bin_search_loop(Bin0, Next, BinSeps, Cont, Seps); + BinR when is_binary(BinR), Cont =:= [] -> + Next = byte_size(Bin0) - byte_size(BinR), + bin_search_loop(Bin0, Next, BinSeps, Cont, Seps); + _ -> + {nomatch, Cont2} + end; + true when is_list(Cont1) -> + Cont1; + true -> + [Cont1] + end + end. + +bin_search_inv(Bin, Cont, {[], _, _}) -> + [Bin|Cont]; +bin_search_inv(Bin, Cont, {[Sep], _, _}) -> + bin_search_inv_1([Bin|Cont], Sep); +bin_search_inv(Bin, Cont, {Seps, _, _}) -> + bin_search_inv_n([Bin|Cont], Seps). + +bin_search_inv_1([<<>>|CPs], _) -> + {nomatch, CPs}; +bin_search_inv_1(CPs = [Bin0|Cont], Sep) when is_binary(Bin0) -> + case unicode_util:gc(CPs) of + [Sep|Bin] when is_binary(Bin), Cont =:= [] -> + bin_search_inv_1([Bin], Sep); + [Sep|[Bin|Cont]=Cs] when is_binary(Bin) -> + bin_search_inv_1(Cs, Sep); + [Sep|Cs] -> + {nomatch, Cs}; + _ -> CPs + end. + +bin_search_inv_n([<<>>|CPs], _) -> + {nomatch, CPs}; +bin_search_inv_n([Bin0|Cont]=CPs, Seps) when is_binary(Bin0) -> + [C|Cs0] = unicode_util:gc(CPs), + case {lists:member(C, Seps), Cs0} of + {true, Cs} when is_binary(Cs), Cont =:= [] -> + bin_search_inv_n([Cs], Seps); + {true, [Bin|Cont]=Cs} when is_binary(Bin) -> + bin_search_inv_n(Cs, Seps); + {true, Cs} -> {nomatch, Cs}; + {false, _} -> CPs + end. + +bin_search_str(Bin0, Start, Cont, [CP|_]=SearchCPs) -> + <<_:Start/binary, Bin/binary>> = Bin0, + case binary:match(Bin, <<CP/utf8>>) of + nomatch -> {nomatch, byte_size(Bin0), Cont}; + {Where0, _} -> + Where = Start+Where0, + <<Keep:Where/binary, Cs0/binary>> = Bin0, + [GC|Cs]=unicode_util:gc(Cs0), + case prefix_1(stack(Cs0,Cont), SearchCPs) of + nomatch when is_binary(Cs) -> + KeepSz = byte_size(Bin0) - byte_size(Cs), + bin_search_str(Bin0, KeepSz, Cont, SearchCPs); + nomatch -> + {nomatch, Where, stack([GC|Cs],Cont)}; + [] -> + {Keep, [Cs0|Cont], <<>>}; + Rest -> + {Keep, [Cs0|Cont], Rest} + end + end. + + +%%--------------------------------------------------------------------------- +%% OLD lists API kept for backwards compability +%%--------------------------------------------------------------------------- + %% Robert's bit %% len(String) @@ -68,12 +1292,12 @@ len(S) -> length(S). %% equal(String1, String2) %% Test if 2 strings are equal. --spec equal(String1, String2) -> boolean() when - String1 :: string(), - String2 :: string(). +%% -spec equal(String1, String2) -> boolean() when +%% String1 :: string(), +%% String2 :: string(). -equal(S, S) -> true; -equal(_, _) -> false. +%% equal(S, S) -> true; +%% equal(_, _) -> false. %% concat(String1, String2) %% Concatenate 2 strings. @@ -127,7 +1351,7 @@ rchr([], _C, _I, L) -> L. str(S, Sub) when is_list(Sub) -> str(S, Sub, 1). str([C|S], [C|Sub], I) -> - case prefix(Sub, S) of + case l_prefix(Sub, S) of true -> I; false -> str(S, [C|Sub], I+1) end; @@ -142,16 +1366,16 @@ str([], _Sub, _I) -> 0. rstr(S, Sub) when is_list(Sub) -> rstr(S, Sub, 1, 0). rstr([C|S], [C|Sub], I, L) -> - case prefix(Sub, S) of + case l_prefix(Sub, S) of true -> rstr(S, [C|Sub], I+1, I); false -> rstr(S, [C|Sub], I+1, L) end; rstr([_|S], Sub, I, L) -> rstr(S, Sub, I+1, L); rstr([], _Sub, _I, L) -> L. -prefix([C|Pre], [C|String]) -> prefix(Pre, String); -prefix([], String) when is_list(String) -> true; -prefix(Pre, String) when is_list(Pre), is_list(String) -> false. +l_prefix([C|Pre], [C|String]) -> l_prefix(Pre, String); +l_prefix([], String) when is_list(String) -> true; +l_prefix(Pre, String) when is_list(Pre), is_list(String) -> false. %% span(String, Chars) -> Length. %% cspan(String, Chars) -> Length. @@ -229,9 +1453,9 @@ tokens(S, Seps) -> [_|_] -> [S] end; [C] -> - tokens_single_1(reverse(S), C, []); + tokens_single_1(lists:reverse(S), C, []); [_|_] -> - tokens_multiple_1(reverse(S), Seps, []) + tokens_multiple_1(lists:reverse(S), Seps, []) end. tokens_single_1([Sep|S], Sep, Toks) -> @@ -342,8 +1566,8 @@ sub_word(String, Index, Char) when is_integer(Index), is_integer(Char) -> s_word(strip(String, left, Char), Index, Char, 1, []) end. -s_word([], _, _, _,Res) -> reverse(Res); -s_word([Char|_],Index,Char,Index,Res) -> reverse(Res); +s_word([], _, _, _,Res) -> lists:reverse(Res); +s_word([Char|_],Index,Char,Index,Res) -> lists:reverse(Res); s_word([H|T],Index,Char,Index,Res) -> s_word(T,Index,Char,Index,[H|Res]); s_word([Char|T],Stop,Char,Index,Res) when Index < Stop -> s_word(strip(T,left,Char),Stop,Char,Index+1,Res); @@ -359,7 +1583,7 @@ strip(String) -> strip(String, both). -spec strip(String, Direction) -> Stripped when String :: string(), Stripped :: string(), - Direction :: left | right | both. + Direction :: 'left' | 'right' | 'both'. strip(String, left) -> strip_left(String, $\s); strip(String, right) -> strip_right(String, $\s); @@ -369,7 +1593,7 @@ strip(String, both) -> -spec strip(String, Direction, Character) -> Stripped when String :: string(), Stripped :: string(), - Direction :: left | right | both, + Direction :: 'left' | 'right' | 'both', Character :: char(). strip(String, right, Char) -> strip_right(String, Char); diff --git a/lib/stdlib/test/string_SUITE.erl b/lib/stdlib/test/string_SUITE.erl index 836f9e5142..a78ddf761b 100644 --- a/lib/stdlib/test/string_SUITE.erl +++ b/lib/stdlib/test/string_SUITE.erl @@ -29,25 +29,46 @@ -export([init_per_testcase/2, end_per_testcase/2]). %% Test cases must be exported. --export([len/1,equal/1,concat/1,chr_rchr/1,str_rstr/1]). --export([span_cspan/1,substr/1,tokens/1,chars/1]). +-export([is_empty/1, length/1, to_graphemes/1, + reverse/1, slice/1, + equal/1, + pad/1, trim/1, chomp/1, take/1, + uppercase/1, lowercase/1, titlecase/1, casefold/1, + prefix/1, split/1, replace/1, find/1, + lexemes/1, nth_lexeme/1, cd_gc/1, meas/1 + ]). + +-export([len/1,old_equal/1,old_concat/1,chr_rchr/1,str_rstr/1]). +-export([span_cspan/1,substr/1,old_tokens/1,chars/1]). -export([copies/1,words/1,strip/1,sub_word/1,left_right/1]). -export([sub_string/1,centre/1, join/1]). -export([to_integer/1,to_float/1]). -export([to_upper_to_lower/1]). +%% Run tests when debugging them +-export([debug/0]). + suite() -> [{ct_hooks,[ts_install_cth]}, {timetrap,{minutes,1}}]. -all() -> - [len, equal, concat, chr_rchr, str_rstr, span_cspan, - substr, tokens, chars, copies, words, strip, sub_word, - left_right, sub_string, centre, join, to_integer, - to_float, to_upper_to_lower]. +all() -> + [{group, chardata}, {group, list_string}]. -groups() -> - []. +groups() -> + [{chardata, + [is_empty, length, to_graphemes, + equal, reverse, slice, + pad, trim, chomp, take, + lexemes, nth_lexeme, + uppercase, lowercase, titlecase, casefold, + prefix, find, split, replace, cd_gc, + meas]}, + {list_string, + [len, old_equal, old_concat, chr_rchr, str_rstr, span_cspan, + substr, old_tokens, chars, copies, words, strip, sub_word, + left_right, sub_string, centre, join, to_integer, + to_float, to_upper_to_lower]}]. init_per_suite(Config) -> Config. @@ -68,8 +89,839 @@ init_per_testcase(_Case, Config) -> end_per_testcase(_Case, _Config) -> ok. +debug() -> + Config = [{data_dir, ?MODULE_STRING++"_data"}], + [io:format("~p:~p~n",[Test,?MODULE:Test(Config)]) || + {_,Tests} <- groups(), Test <- Tests]. + +-define(TEST(B,C,D), test(?LINE,?FUNCTION_NAME,B,C,D, true)). +-define(TEST_EQ(B,C,D), + test(?LINE,?FUNCTION_NAME,B,C,D, true), + test(?LINE,?FUNCTION_NAME,hd(C),[B|tl(C),D, true)). + +-define(TEST_NN(B,C,D), + test(?LINE,?FUNCTION_NAME,B,C,D, false), + test(?LINE,?FUNCTION_NAME,hd(C),[B|tl(C)],D, false)). + + +is_empty(_) -> + ?TEST("", [], true), + ?TEST([""|<<>>], [], true), + ?TEST("a", [], false), + ?TEST([""|<<$a>>], [], false), + ?TEST(["",[<<>>]], [], true), + ok. + +length(_) -> + %% invalid arg type + {'EXIT',_} = (catch string:length({})), + {'EXIT',_} = (catch string:length(foo)), + %% Valid signs + ?TEST("", [], 0), + ?TEST([""|<<>>], [], 0), + L = tuple_size(list_to_tuple(atom_to_list(?MODULE))), + ?TEST(atom_to_list(?MODULE), [], L), + ?TEST("Hello", [], 5), + ?TEST("UC Ω ßð", [], 7), + ?TEST(["abc"|<<"abc">>], [], 6), + ?TEST(["abc",["def"]], [], 6), + ?TEST([<<97/utf8, 778/utf8, 98/utf8>>, [776,111,776]], [], 3), %% åäö in nfd + ok. + +equal(_) -> + %% invalid arg type + {'EXIT',_} = (catch string:equal(1, 2)), + {'EXIT',_} = (catch string:equal(1, 2, foo)), + {'EXIT',_} = (catch string:equal(1, 2, true, foo)), + + ?TEST("", [<<"">>], true), + ?TEST("Hello", ["Hello"], true), + ?TEST("Hello", ["Hell"], false), + ?TEST("Hello", ["Hello!"], false), + ?TEST("Hello", [<<"Hello"/utf8>>], true), + ?TEST("Hello", [<<"Mello"/utf8>>], false), + ?TEST("Hello", [<<"Hello!"/utf8>>], false), + ?TEST(["Hello",[" deep"]], ["Hello deep"], true), + ?TEST(["Hello",[<<" deep"/utf8>>]], ["Hello deep"], true), + ?TEST("Hello deep", [["Hello", [" deep"]]], true), + ?TEST("Hello deep", [["Hello", [" d!eep"]]], false), + ?TEST("Hello deep", [["Hello", [<<" deep"/utf8>>]]], true), + false = string:equal("Åäö", [<<97/utf8, 778/utf8, 98/utf8>>, [776,111,776]]), %% nfc vs nfd + + %% case_insensitive_equal() + ?TEST("", ["", true], true), + ?TEST("a", ["b", true], false), + ?TEST("", [<<>>, true], true), + ?TEST("", [[<<>>,[]], true], true), + ?TEST("", [[<<>>,[$a]], true], false), + ?TEST("123", ["123", true], true), + ?TEST("abc", ["abc", true], true), + ?TEST([[],<<>>,"ABC"|<<>>], [["abc",[]], true], true), + ?TEST("ABCa", ["abcå", true], false), + ?TEST("åäö", [{norm,"åäö"}, true], true), + ?TEST("ÅÄÖ", [{norm,"åäö"}, true], true), + ?TEST("MICHAŁ", ["michał", true], true), + ?TEST(["Mic",<<"HAŁ"/utf8>>], ["michał", true], true), + ?TEST("ß SHARP S", ["ss sharp s", true], true), + ?TEST("ẞ SHARP S", [[<<$ß/utf8, $\s>>,"SHARP S"], true], true), + ?TEST("ẞ SHARP ß", ["ss sharp s", true], false), + ?TEST(<<"İ I WITH DOT ABOVE"/utf8>>, ["i̇ i with dot above", true], true), + %% These should be equivalent with the above + true = string:equal(string:casefold(["Mic",<<"HAŁ"/utf8>>]), string:casefold("michał")), + true = string:equal(string:casefold("ẞ SHARP S"), string:casefold([<<$ß/utf8, $\s>>,"SHARP S"])), + false = string:equal(string:casefold("ẞ SHARP ß"), string:casefold("ss sharp s")), + + %% Normalization + ?TEST_NN("", ["", true, none], true), + ?TEST_NN("a", ["b", true, nfc], false), + ?TEST_NN("a", ["b", true, nfd], false), + ?TEST_NN("a", ["b", true, nfkc], false), + ?TEST_NN("a", ["b", true, nfkd], false), + + ?TEST_NN("a", ["A", false, nfc], false), + ?TEST_NN("a", ["A", false, nfd], false), + ?TEST_NN([<<>>,"a"|<<>>], ["A", true, nfkc], true), + ?TEST_NN(<<"a">>, ["A", true, nfkd], true), + + ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, none], false), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfc], true), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfd], true), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfkc], true), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfkd], true), + + ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, none], false), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", false, nfc], false), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfc], true), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfd], true), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfkc], true), + ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfkd], true), + + ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, none], false), + ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, nfc], false), + ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, nfd], false), + ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, nfkc], true), + ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, nfkd], true), + + ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, none], false), + ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, nfc], false), + ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, nfd], false), + ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, nfkc], true), + ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, nfkd], true), + + %% Coverage. + ?TEST("", [<<"">>, false, nfc], true), + ?TEST("", [<<"">>, true, nfc], true), + + ok. + +to_graphemes(_) -> + %% More tests are in unicode_util_SUITE.erl + {'EXIT', _} = (catch unicode:characters_to_nfd_binary(["asdåäö", an_atom])), + String = ["abc..åäö", $e, 788, <<"Ωµe`è"/utf8>>, "œŒþæÆħ§ß"], + NFD = unicode:characters_to_nfd_list(String), + [] = string:to_graphemes([]), + [] = string:to_graphemes(<<>>), + GCs = string:to_graphemes(String), + true = erlang:length(GCs) =:= string:length(String), + true = erlang:length(GCs) =:= erlang:length(string:to_graphemes(NFD)), + true = erlang:length(GCs) =:= + erlang:length(string:to_graphemes(unicode:characters_to_nfc_list(String))), + ok. + +reverse(_) -> + {'EXIT',_} = (catch string:reverse(2)), + Str1 = "Hello ", + Str2 = "Ω ßð", + Str3 = "åäö", + ?TEST("", [], ""), + ?TEST(Str1, [], lists:reverse(Str1)), + ?TEST(Str2, [], lists:reverse(Str2)), + ?TEST(Str3, [], lists:reverse(Str3)), + true = string:reverse(Str3) =:= lists:reverse(string:to_graphemes(Str3)), + ok. + +slice(_) -> + {'EXIT',_} = (catch string:slice(2, 2, 2)), + {'EXIT',_} = (catch string:slice("asd", foo, 2)), + {'EXIT',_} = (catch string:slice("asd", 2, -1)), + ?TEST("", [3], ""), + ?TEST("aåä", [1, 0], ""), + ?TEST("aåä", [3], ""), + ?TEST("aåäöbcd", [3], "öbcd"), + ?TEST([<<"aå"/utf8>>,"äöbcd"], [3], "öbcd"), + ?TEST([<<"aåä"/utf8>>,"öbcd"], [3], "öbcd"), + ?TEST([<<"aåä"/utf8>>,"öbcd"], [3, infinity], "öbcd"), + + ?TEST("", [3, 2], ""), + ?TEST("aåä", [3, 2], ""), + ?TEST("aåäöbcd", [3,2], "öb"), + ?TEST([<<"aå"/utf8>>,"äöbcd"], [3,3], "öbc"), + ?TEST([<<"aåä"/utf8>>,"öbcd"], [3,10], "öbcd"), + + ok. + +pad(_) -> + Str = "Hallå", + ?TEST(Str, [7], "Hallå "), + ?TEST(Str, [7, leading], " Hallå"), + ?TEST(Str, [4, both, $.], "Hallå"), + ?TEST(Str, [10, both, $.], "..Hallå..."), + ?TEST(Str, [10, leading, $.], ".....Hallå"), + ?TEST(Str, [10, trailing, $.], "Hallå....."), + ?TEST(Str++["f"], [10, trailing, $.], "Hallåf...."), + ?TEST(Str++[" flåwer"], [10, trailing, $.], "Hallå flåwer"), + ok. + +trim(_) -> + Str = "\t\s..Ha\s.llå..\t\n\r", + ?TEST("", [], ""), + ?TEST(Str, [both, "x"], Str), + ?TEST(Str, [leading], "..Ha\s.llå..\t\n\r"), + ?TEST(Str, [trailing], "\t\s..Ha\s.llå.."), + ?TEST(Str, [], "..Ha .llå.."), + ?TEST(".. ", [both, ""], ".. "), + ?TEST([<<".. ">>], [both, ". "], ""), + ?TEST(".. h.ej ..", [leading, ". "], "h.ej .."), + ?TEST(".. h.ej ..", [trailing, ". "], ".. h.ej"), + ?TEST(".. h.ej ..", [both, ". "], "h.ej"), + ?TEST(["..", <<"h.ej">>, ".."], [both, ". "], "h.ej"), + ?TEST([[], "..", " h.ej ", <<"..">>], [both, ". "], "h.ej"), + ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [both, ". "], "h.ej"), + ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [trailing, ". "], ".. h.ej"), + ?TEST([<<".. h.ej .">>, <<"..">>], [both, ". "], "h.ej"), + ?TEST(["..h", ".e", <<"j..">>], [both, ". "], "h.ej"), + ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [both, ". "], "h.ejsan"), + %% Test that it behaves with graphemes (i.e. nfd tests are the hard part) + ?TEST("aaåaa", [both, "a"], "å"), + ?TEST(["aaa",778,"äöoo"], [both, "ao"], "åäö"), + ?TEST([<<"aaa">>,778,"äöoo"], [both, "ao"], "åäö"), + ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [both, [[$e,778]]], "åäö"), + ?TEST([[<<"!v">>|<<204,128,$v,204,129>>]],[trailing, [[$v,769]]], [$!,$v,768]), + ?TEST([[[<<"v">>|<<204,129,118,204,128,118>>],769,118,769]], [trailing, [[118,769]]], [$v,769,$v,768]), + ?TEST([<<"vv">>|<<204,128,118,204,128>>], [trailing, [[118,768]]], "v"), + ok. + +chomp(_) -> + Str = "åäö\na\r\nsd\n", + Res = "åäö\na\r\nsd", + ?TEST("", [], ""), + ?TEST("\n", [], ""), + ?TEST("str \t", [], "str \t"), + ?TEST("str \t\n\r", [], "str \t\n\r"), + ?TEST(Str, [], Res), + ?TEST([Str,$\n], [], Res), + ?TEST([Str|"\n"], [], Res), + ?TEST([Str|<<"\n">>], [], Res), + ?TEST([Str,$\r|<<"\n">>], [], Res), + ?TEST([Str, <<$\r>>|"\n"], [], Res), + ?TEST([<<$a,$\r>>,"\na\n"], [], "a\r\na"), + ok. + +take(_) -> + Str = "\t\s..Ha\s.llå..\t\n\r", + WS = "\t\s\n\r", + Chars = lists:seq($a,$z)++lists:seq($A,$Z), + %% complement=false, dir=leading + ?TEST("", ["abc"], {"",""}), + ?TEST(Str, ["x"], {[], Str}), + ?TEST(Str, [WS], {"\t\s","..Ha\s.llå..\t\n\r"}), + ?TEST(".. ", ["", false], {"", ".. "}), + ?TEST([<<".. ">>], [". ", false, leading], {".. ", ""}), + ?TEST(".. h.ej ..", [". ", false, leading], {".. ", "h.ej .."}), + ?TEST(["..", <<"h.ej">>, ".."], [". ", false, leading], {"..", "h.ej.."}), + ?TEST([[], "..", " h.ej ", <<"..">>], [". ", false, leading], {".. ","h.ej .."}), + ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [". ", false, leading], {".. ", "h.ej .."}), + ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [". ", false, leading], {"..", "h.ejsan.."}), + ?TEST([[<<101,204,138,33>>]], [[[$e,778]]], {[$e,778], "!"}), + %% Test that it behaves with graphemes (i.e. nfd tests are the hard part) + ?TEST("aaåaa", ["a", false, leading], {"aa", "åaa"}), + ?TEST(["aaa",778,"äöoo"], ["ao", false, leading], {"aa", "åäöoo"}), + ?TEST([<<"aaa">>,778,"äöoo"], ["ao",false,leading], {"aa", "åäöoo"}), + ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [[[$e,778]], false, leading], {[$e,778],"åäöe"++[778]}), + + %% complement=true, dir=leading + ?TEST("", ["abc", true], {"",""}), + ?TEST(Str, ["x", true], {Str, []}), + ?TEST(Str, [Chars, true], {"\t\s..","Ha\s.llå..\t\n\r"}), + ?TEST(".. ", ["",true], {".. ", ""}), + ?TEST([<<".. ">>], [Chars, true, leading], {".. ", ""}), + ?TEST(".. h.ej ..", [Chars, true, leading], {".. ", "h.ej .."}), + ?TEST(["..", <<"h.ej">>, ".."], [Chars, true, leading], {"..", "h.ej.."}), + ?TEST([[], "..", " h.ej ", <<"..">>], [Chars, true, leading], {".. ","h.ej .."}), + ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [Chars, true, leading], {".. ", "h.ej .."}), + ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [Chars, true, leading], {"..", "h.ejsan.."}), + %% Test that it behaves with graphemes (i.e. nfd tests are the hard part) + ?TEST(["aaee",778,"äöoo"], [[[$e,778]], true, leading], {"aae", [$e,778|"äöoo"]}), + ?TEST([<<"aae">>,778,"äöoo"], [[[$e,778]],true,leading], {"aa", [$e,778|"äöoo"]}), + ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [[[$e,778]], true, leading], {[], [$e,778]++"åäöe"++[778]}), + + %% complement=false, dir=trailing + ?TEST(Str, ["", false, trailing], {Str, []}), + ?TEST(Str, ["x", false, trailing], {Str, []}), + ?TEST(Str, [WS, false,trailing], {"\t\s..Ha\s.llå..", "\t\n\r"}), + ?TEST(".. h.ej ..", [". ", false, trailing], {".. h.ej", " .."}), + ?TEST(["..", <<"h.ej">>, ".."], [". ", false, trailing], {"..h.ej", ".."}), + ?TEST([[], "..", " h.ej ", <<"..">>], [". ", false, trailing], {".. h.ej", " .."}), + ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [". ", false, trailing], {".. h.ej", " .."}), + ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [". ", false, trailing], {"..h.ejsan", ".."}), + ?TEST("aaåaa", ["a", false, trailing], {"aaå", "aa"}), + ?TEST([<<"KMШ"/utf8>>], [[1064], false, trailing], {"KMШ",[]}), + ?TEST([[<<"!\"">>|<<"\"">>]], ["\"", false, trailing], {"!", "\"\""}), + ?TEST([<<$v>>, 769], [[[$v,769]], false, trailing], {"", [$v,769]}), + ?TEST(["aaa",778,"äöoo"], ["ao", false, trailing], {"aaåäö", "oo"}), + ?TEST([<<"aaa">>,778,"äöoo"], ["ao", false, trailing], {"aaåäö", "oo"}), + ?TEST([<<"e">>,778,"åäöee", <<778/utf8>>], [[[$e,778]], false, trailing], {[$e,778|"åäöe"], [$e,778]}), + + %% complement=true, dir=trailing + ?TEST("", ["abc", true, trailing], {"",""}), + ?TEST(Str, ["x", true, trailing], {[], Str}), + %?TEST(Str, [{norm,Chars}, true, trailing], {"\t\s..Ha\s.ll","å..\t\n\r"}), + ?TEST(".. ", ["", true, trailing], {"", ".. "}), + ?TEST([<<".. ">>], [Chars, true, trailing], {"", ".. "}), + ?TEST(".. h.ej ..", [Chars, true, trailing], {".. h.ej", " .."}), + ?TEST(["..", <<"h.ej">>, ".."], [Chars, true, trailing], {"..h.ej", ".."}), + ?TEST([[], "..", " h.ej ", <<"..">>], [Chars, true, trailing], {".. h.ej"," .."}), + ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [Chars, true, trailing], {".. h.ej"," .."}), + ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [Chars, true, trailing], {"..h.ejsan", ".."}), + ?TEST([[<<101,204,138,33>>]], [[[$e,778]], true, trailing], {[$e,778], "!"}), + ?TEST([<<"Fa">>], [[$F], true, trailing], {"F", "a"}), + ?TEST([[<<101,101,204,138>>,1045,778]], ["e", true, trailing], {"e", [101,778,1045,778]}), + ?TEST([[<<101,101,204,138>>,<<1045/utf8,778/utf8>>]], ["e", true, trailing], {"e", [101,778,1045,778]}), + ?TEST([[[118,769,118],<<204,129,118,204,129,120,204,128,118>>,768,120,768]], + [[[118,769]], true, trailing], {[118,769,118,769,118,769],[120,768,118,768,120,768]}), + ?TEST([[<<118,204,128,118>>|<<204,128,118,204,128,118,204,128,206,132,204,129,206,132,204,129>>]], + [[[118,768]], true, trailing], {[118,768,118,768,118,768,118,768], [900,769,900,769]}), + %% Test that it behaves with graphemes (i.e. nfd tests are the hard part) + ?TEST(["aaee",778,"äöoo"], [[[$e,778]], true, trailing], {"aae"++[$e,778], "äöoo"}), + ?TEST([<<"aae">>,778,"äöoo"], [[[$e,778]],true,trailing], {"aa"++[$e,778], "äöoo"}), + ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [[[$e,778]], true, trailing], {[$e,778]++"åäöe"++[778], []}), + ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>, $e, 779], [[[$e,778]], true, trailing], + {[$e,778]++"åäöe"++[778], [$e,779]}), + + ok. + + +uppercase(_) -> + ?TEST("", [], ""), + ?TEST("123", [], "123"), + ?TEST("abc", [], "ABC"), + ?TEST("ABC", [], "ABC"), + ?TEST("abcdefghiljklmnopqrstvxyzåäö",[], "ABCDEFGHILJKLMNOPQRSTVXYZÅÄÖ"), + ?TEST("åäö", [], "ÅÄÖ"), + ?TEST("ÅÄÖ", [], "ÅÄÖ"), + ?TEST("Michał", [], "MICHAŁ"), + ?TEST(["Mic",<<"hał"/utf8>>], [], "MICHAŁ"), + ?TEST("ljLJ", [], "LJLJ"), + ?TEST("LJlj", [], "LJLJ"), + ?TEST("ß sharp s", [], "SS SHARP S"), + ok. + +lowercase(_) -> + ?TEST("", [], ""), + ?TEST("123", [], "123"), + ?TEST("abc", [], "abc"), + ?TEST("ABC", [], "abc"), + ?TEST("åäö", [], "åäö"), + ?TEST("ÅÄÖ", [], "åäö"), + ?TEST("MICHAŁ", [], "michał"), + ?TEST(["Mic",<<"HAŁ"/utf8>>], [], "michał"), + ?TEST("ß SHARP S", [], "ß sharp s"), + ?TEST("İ I WITH DOT ABOVE", [], "i̇ i with dot above"), + ok. + +titlecase(_) -> + ?TEST("", [], ""), + ?TEST("123", [], "123"), + %% Titlecase is the same as uppercase for most chars + [?TEST([C,$x], [], string:uppercase([C])++[$x]) || + C <-"abcdefghiljklmnopqrstvxyzåäö"], + %% Example of a different mapping + ?TEST("ljusad", [],"Ljusad"), + ?TEST("ljLJ", [], "LjLJ"), + ?TEST("LJlj", [], "Ljlj"), + ?TEST("ß sharp s", [], "Ss sharp s"), + ok. + +casefold(_) -> + ?TEST("", [], ""), + ?TEST("123", [], "123"), + ?TEST("abc", [], "abc"), + ?TEST("ABC", [], "abc"), + ?TEST("åäö", [], "åäö"), + ?TEST("ÅÄÖ", [], "åäö"), + ?TEST("MICHAŁ", [], "michał"), + ?TEST(["Mic",<<"HAŁ"/utf8>>], [], "michał"), + ?TEST("ß SHARP S", [], "ss sharp s"), + ?TEST("ẞ SHARP S", [], "ss sharp s"), + ?TEST("İ I WITH DOT ABOVE", [], "i̇ i with dot above"), + ok. + +prefix(_) -> + ?TEST("", ["a"], nomatch), + ?TEST("a", [""], "a"), + ?TEST("b", ["a"], nomatch), + ?TEST("a", ["a"], ""), + ?TEST("å", ["a"], nomatch), + ?TEST(["a",<<778/utf8>>], ["a"], nomatch), + ?TEST([<<"a"/utf8>>,778], ["a"], nomatch), + ?TEST("hejsan", [""], "hejsan"), + ?TEST("hejsan", ["hej"], "san"), + ?TEST("hejsan", ["hes"], nomatch), + ?TEST(["h", "ejsan"], ["hej"], "san"), + ?TEST(["h", "e", "jsan"], ["hej"], "san"), + ?TEST(["h", "e", "san"], ["hej"], nomatch), + ?TEST(["h", <<"ejsan">>], ["hej"], "san"), + ?TEST(["h", <<"e">>, "jsan"], ["hej"], "san"), + ?TEST(["h", "e", <<"jsan">>], ["hej"], "san"), + ok. + +split(_) -> + Mod = fun(Res) -> + [lists:flatten(unicode:characters_to_nfc_list(io_lib:format("~ts", [Str]))) + || Str <- Res] end, + ?TEST("..", ["", leading], {Mod, [".."]}), + ?TEST("..", ["..", leading], {Mod, [[],[]]}), + ?TEST("abcd", ["..", leading], {Mod, ["abcd"]}), + ?TEST("ab..bc", ["..", leading], {Mod, ["ab","bc"]}), + ?TEST("ab..bc..cd", ["..", leading], {Mod, ["ab","bc..cd"]}), + ?TEST("..ab", [".."], {Mod, [[],"ab"]}), + ?TEST("ab..", ["..", leading], {Mod, ["ab",[]]}), + ?TEST(["ab..bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}), + ?TEST(["ab","..bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}), + ?TEST(["ab",<<"..bc..cd">>], ["..", leading], {Mod, ["ab","bc..cd"]}), + ?TEST(["ab.",".bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}), + ?TEST(["ab.",<<".bc..cd">>], ["..", leading], {Mod, ["ab","bc..cd"]}), + ?TEST(["ab..","bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}), + ?TEST(["ab..",<<"bc..cd">>], ["..", leading], {Mod, ["ab","bc..cd"]}), + ?TEST(["ab.","bc..cd"], ["..", leading], {Mod, ["ab.bc","cd"]}), + ?TEST("ab...bc", ["..", leading], {Mod, ["ab",".bc"]}), + + ?TEST("..", ["", trailing], {Mod, [".."]}), + ?TEST("..", ["..", trailing], {Mod, [[],[]]}), + ?TEST("abcd", ["..", trailing], {Mod, ["abcd"]}), + ?TEST("ab..bc", ["..", trailing], {Mod, ["ab","bc"]}), + ?TEST("ab..bc..cd", ["..", trailing], {Mod, ["ab..bc","cd"]}), + ?TEST("..ab", ["..", trailing], {Mod, [[],"ab"]}), + ?TEST("ab..", ["..", trailing], {Mod, ["ab",[]]}), + ?TEST(["ab..bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}), + ?TEST(["ab","..bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}), + ?TEST(["ab"|<<"a">>], ["a", trailing], {Mod, ["ab",[]]}), + ?TEST(["ab",<<"..bc..cd">>], ["..", trailing], {Mod, ["ab..bc","cd"]}), + ?TEST([<<"ab.">>,".bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}), + ?TEST(["ab.",<<".bc..cd">>], ["..", trailing], {Mod, ["ab..bc","cd"]}), + ?TEST(["ab..","bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}), + ?TEST(["ab..",<<"bc..cd">>], ["..", trailing], {Mod, ["ab..bc","cd"]}), + ?TEST(["ab.","bc..cd"], ["..", trailing], {Mod, ["ab.bc","cd"]}), + ?TEST("ab...bc", ["..", trailing], {Mod, ["ab.","bc"]}), + + ?TEST("..", ["..", all], {Mod, [[],[]]}), + ?TEST("abcd", ["..", all], {Mod, ["abcd"]}), + ?TEST("a..b", ["..", all], {Mod, ["a","b"]}), + ?TEST("a..b..c", ["..", all], {Mod, ["a","b","c"]}), + ?TEST("a..", ["..", all], {Mod, ["a",[]]}), + ?TEST(["a..b..c"], ["..", all], {Mod, ["a","b","c"]}), + ?TEST(["a","..b..c"], ["..", all], {Mod, ["a","b","c"]}), + ?TEST(["a",<<"..b..c">>], ["..", all], {Mod, ["a","b","c"]}), + ?TEST(["a.",".b..c"], ["..", all], {Mod, ["a","b","c"]}), + ?TEST(["a.",<<".b..c">>], ["..", all], {Mod, ["a","b","c"]}), + ?TEST(["a..","b..c"], ["..", all], {Mod, ["a","b","c"]}), + ?TEST(["a..",<<"b..c">>], ["..", all], {Mod, ["a","b","c"]}), + ?TEST(["a.","b..c"], ["..", all], {Mod, ["a.b","c"]}), + ?TEST("a...b", ["..", all], {Mod, ["a",".b"]}), + + %% Grapheme (split) tests + ?TEST("aΩΩb", ["Ω", all], {Mod, ["a","","b"]}), + ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], leading], {Mod, ["aa","äöoo"]}), + ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], trailing], {Mod, ["aa","äöoo"]}), + ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], all], {Mod, ["aa","äöoo"]}), + ?TEST([<<"aae">>,778,"öeeåäö"], ["e", leading], {Mod, [[$a, $a, $e,778,$ö],"eåäö"]}), + ?TEST([<<"aae">>,778,"öeeåäö"], ["e", trailing], {Mod, [[$a, $a, $e,778,$ö, $e],"åäö"]}), + ?TEST([<<"aae">>,778,"öeeåäö"], ["e", all], {Mod, [[$a, $a, $e,778,$ö],"", "åäö"]}), + + ok. + +replace(_) -> + ?TEST(["a..b.", [".c"]], ["xxx", "::"], "a..b..c"), + ?TEST(["a..b.", [".c"]], ["..", "::"], "a::b..c"), + ?TEST([<<"a..b.">>, [".c"]], ["..", "::", trailing], "a..b::c"), + ?TEST(["a..b.", [".c"]], ["..", "::", all], "a::b::c"), + ok. + +cd_gc(_) -> + [] = string:next_codepoint(""), + [] = string:next_codepoint(<<>>), + [] = string:next_codepoint([<<>>]), + "abcd" = string:next_codepoint("abcd"), + [$e,778] = string:next_codepoint([$e,778]), + [$e|<<204,138>>] = string:next_codepoint(<<$e,778/utf8>>), + [778|_] = string:next_codepoint(tl(string:next_codepoint(<<$e,778/utf8>>))), + + [] = string:next_grapheme(""), + [] = string:next_grapheme(<<>>), + [] = string:next_grapheme([<<>>]), + "abcd" = string:next_grapheme("abcd"), + [[$e,778]] = string:next_grapheme([$e,778]), + [[$e,778]] = string:next_grapheme(<<$e,778/utf8>>), + + ok. + + +find(_) -> + ?TEST(["h", "ejsan"], [""], "hejsan"), + ?TEST(["h", "ejsan"], [<<>>], "hejsan"), + ?TEST([], [""], ""), + ?TEST([], ["hej"], nomatch), + ?TEST(["h", "ejsan"], ["hej"], "hejsan"), + ?TEST(["h", "e", "jsan"], ["hej"], "hejsan"), + ?TEST(["xh", "e", "san"], ["hej"], nomatch), + ?TEST([<<"xh">>, <<"ejsan">>], ["hej"], "hejsan"), + ?TEST(["xh", <<"ejsan">>], ["hej"], "hejsan"), + ?TEST(["xh", <<"e">>, "jsan"], ["hej"], "hejsan"), + ?TEST(["xh", "e", <<"jsan">>], ["hej"], "hejsan"), + ?TEST(["xh", "er", <<"ljsane">>, "rlang"], ["erl", leading], "erljsanerlang"), + ?TEST("aΩΩb", ["Ω", leading], "ΩΩb"), + ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], leading], [$e,778]++"äöoo"), + ?TEST([<<"aae">>,778,"öeeåäö"], ["e", leading], "eeåäö"), + + ?TEST(["h", "ejsan"], ["", trailing], "hejsan"), + ?TEST([], ["", trailing], ""), + ?TEST([], ["hej", trailing], nomatch), + ?TEST(["h", "ejsan"], ["hej", trailing], "hejsan"), + ?TEST(["h", "e", "jsan"], ["hej", trailing], "hejsan"), + ?TEST(["xh", "e", "san"], ["hej", trailing], nomatch), + ?TEST([<<"xh">>, <<"ejsan">>], ["hej", trailing], "hejsan"), + ?TEST(["xh", <<"ejsan">>], ["hej", trailing], "hejsan"), + ?TEST(["xh", <<"e">>, "jsan"], ["hej", trailing], "hejsan"), + ?TEST(["xh", "e", <<"jsan">>], ["hej", trailing], "hejsan"), + ?TEST(["xh", "er", <<"ljsane">>, "rlang"], ["erl", trailing], "erlang"), + ?TEST("aΩΩb", ["Ω", trailing], "Ωb"), + ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], trailing], [$e,778]++"äöoo"), + ?TEST([<<"aeae">>,778,"äö"], ["e", trailing], "eae"++[778,$ä,$ö]), + + ok. + +lexemes(_) -> + Mod = fun(Res) -> + [unicode:characters_to_nfc_list(io_lib:format("~ts", [Str]))|| Str <- Res] + end, + Res = ["Hej", "san", "Hopp", "san"], + ?TEST("", [" ,."], {Mod, []}), + ?TEST("Hej san", [""], {Mod, ["Hej san"]}), + ?TEST(" ,., ", [" ,."], {Mod, []}), + ?TEST( "Hej san Hopp san", [" ,."], {Mod, Res}), + ?TEST(" Hej san Hopp san ", [" ,."], {Mod, Res}), + ?TEST(" Hej san, .Hopp san ", [" ,."], {Mod, Res}), + + ?TEST([" Hej san",", .Hopp san "], [" ,."], {Mod, Res}), + ?TEST([" Hej sa","n, .Hopp san "], [" ,."], {Mod, Res}), + ?TEST([" Hej san,"," .Hopp san "], [" ,."], {Mod, Res}), + + ?TEST([" Hej san",[", .Hopp san "]], [" ,."], {Mod, Res}), + ?TEST([" Hej sa",["n, .Hopp san "]], [" ,."], {Mod, Res}), + ?TEST([" Hej san,",[" .Hopp san "]], [" ,."], {Mod, Res}), + + ?TEST([" H",<<"ej san, .Hopp "/utf8>>, "san"], [" ,."], {Mod, Res}), + ?TEST([" Hej san",<<", .Hopp "/utf8>>, "san"], [" ,."], {Mod, Res}), + ?TEST([" Hej sa",<<"n, .Hopp"/utf8>>, " san"], [" ,."], {Mod, Res}), + ?TEST([" Hej san,",<<" .Hopp s"/utf8>>, "an"], [" ,."], {Mod, Res}), + ?TEST([" Hej san",[<<", .Hopp san "/utf8>>]], [" ,."], {Mod, Res}), + ?TEST([" Hej sa",[<<"n, .Hopp san "/utf8>>]], [" ,."], {Mod, Res}), + ?TEST([" Hej san,",[<<" .Hopp san "/utf8>>], <<" ">>], [" ,."], {Mod, Res}), + + ?TEST(" Hej\r\nsan\nnl", ["\r\n\s"], {Mod, ["Hej\r\nsan", "nl"]}), + + ?TEST(["b1ec1e",778,"äöo21"], ["eo"], {Mod, ["b1",[$c,$1,$e,778,$ä,$ö],"21"]}), + ?TEST([<<"b1ec1e">>,778,"äöo21"], ["eo"], {Mod, ["b1",[$c,$1,$e,778,$ä,$ö],"21"]}), + %% Grapheme (split) tests + Str10 = [[[<<"÷"/utf8>>,1101],<<"ë"/utf8>>|<<"\"">>]], + ?TEST(Str10, [[1076]], {Mod, [unicode:characters_to_nfc_list(Str10)]}), + ?TEST("a1Ωb1Ωc1", ["Ω"], {Mod, ["a1","b1","c1"]}), + ?TEST([<<"aae">>,778,"äöoo"], [[[$e,778]]], {Mod, ["aa","äöoo"]}), + ?TEST([<<"aae">>,778,"äöo21"], [[[$e,778],$o]], {Mod, ["aa","äö","21"]}), + ?TEST([<<"aae">>,778,"öeeåäö"], ["e"], {Mod, [[$a, $a, $e,778,$ö],"åäö"]}), + ok. + +nth_lexeme(_) -> + {'EXIT', _} = (catch string:nth_lexeme("test test", 0, [])), + {'EXIT', _} = (catch string:nth_lexeme(<<"test test">>, 0, [])), + ?TEST( "", [1, " ,."], []), + ?TEST( "Hej san", [1, ""], "Hej san"), + ?TEST( " ,., ", [1, " ,."], []), + ?TEST( " ,., ", [3, " ,."], []), + ?TEST("Hej san Hopp san", [1, " ,."], "Hej"), + ?TEST("...Hej san Hopp san", [1, " ,."], "Hej"), + ?TEST("Hej san Hopp san", [3, " ,."], "Hopp"), + ?TEST(" Hej san Hopp san ", [3, " ,."], "Hopp"), + ?TEST(" Hej san, .Hopp san ", [3, " ,."], "Hopp"), + ?TEST("ab cd", [3, " "], ""), + + ?TEST([" Hej san",", .Hopp san "], [3, " ,."], "Hopp"), + ?TEST([" Hej sa","n, .Hopp san "], [3, " ,."], "Hopp"), + ?TEST([" Hej san,"," .Hopp san "], [3, " ,."], "Hopp"), + + ?TEST([" Hej san",[", .Hopp san "]], [3," ,."], "Hopp"), + ?TEST([" Hej sa",["n, .Hopp san "]], [3, " ,."], "Hopp"), + ?TEST([" Hej san,",[" .Hopp san "]], [3, " ,."], "Hopp"), + + ?TEST([" Hej san",<<", .Hopp "/utf8>>, "san"], [3, " ,."], "Hopp"), + ?TEST([" Hej sa",<<"n, .Hopp"/utf8>>, " san"], [3, " ,."], "Hopp"), + ?TEST([" Hej san,",<<" .Hopp s"/utf8>>, "an"], [3, " ,."], "Hopp"), + ?TEST([" Hej san,",<<" .Hopp s"/utf8>>, "an"], [4, " ,."], "san"), + ?TEST([" Hej san",[<<", .Hopp san "/utf8>>]], [3, " ,."], "Hopp"), + ?TEST([" Hej sa",[<<"n, .Hopp san "/utf8>>]], [3, " ,."], "Hopp"), + ?TEST([" Hej san,",[<<" .Hopp san "/utf8>>], <<" ">>], [3, " ,."], "Hopp"), + + ?TEST(["b1ec1e",778,"äöo21"], [3,"eo"], "21"), + ?TEST([<<"b1ec1e">>,778,"äöo21"], [3, "eo"], "21"), + %% Grapheme (split) tests + ?TEST("a1Ωb1Ωc1", [1, "Ω"], "a1"), + ?TEST([<<"aae">>,778,"äöoo"], [2,[[$e,778]]], "äöoo"), + ?TEST([<<"aae">>,778,"äöo21"], [2,[[$e,778],$o]], "äö"), + ?TEST([<<"aae">>,778,"öeeåäö"], [2,"e"], "åäö"), + ok. + + +meas(Config) -> + case ct:get_timetrap_info() of + {_,{_,Scale}} when Scale > 1 -> + {skip,{will_not_run_in_debug,Scale}}; + _ -> % No scaling + DataDir = proplists:get_value(data_dir, Config), + TestDir = filename:dirname(string:trim(DataDir, trailing, "/")), + do_measure(TestDir) + end. + +do_measure(TestDir) -> + File = filename:join(TestDir, ?MODULE_STRING ++ ".erl"), + io:format("File ~s ",[File]), + {ok, Bin} = file:read_file(File), + io:format("~p~n",[byte_size(Bin)]), + Do = fun(Name, Func, Mode) -> + {N, Mean, Stddev, _} = time_func(Func, Mode, Bin), + io:format("~10w ~6w ~6.2fms ±~4.2fms #~.2w gc included~n", + [Name, Mode, Mean/1000, Stddev/1000, N]) + end, + io:format("----------------------~n"), + Do(tokens, fun(Str) -> string:tokens(Str, [$\n,$\r]) end, list), + Tokens = {lexemes, fun(Str) -> string:lexemes(Str, [$\n,$\r]) end}, + [Do(Name,Fun,Mode) || {Name,Fun} <- [Tokens], Mode <- [list, binary]], + ok. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% internal functions + +test(Line, Func, Str, Args, Res, Norm) -> + %%io:format("~p: ~p ~w ~w~n",[Line, Func, Str, Args]), + test_1(Line, Func, Str, [Str|norm(none,Args)], Res), + %%io:format("~p: ~p bin ",[Line, Func]), + test_1({Line,list}, Func, Str, + [unicode:characters_to_list(Str)|norm(none,Args)], Res), + Norm andalso + test_1({Line,clist}, Func, Str, + [unicode:characters_to_nfc_list(Str)|norm(nfc,Args)], Res), + Norm andalso + test_1({Line,dlist}, Func, Str, + [unicode:characters_to_nfd_list(Str)|norm(nfd,Args)], Res), + test_1({Line,bin}, Func, Str, + [unicode:characters_to_binary(Str)|norm(none, Args)], Res), + Norm andalso + test_1({Line,cbin}, Func, Str, + [unicode:characters_to_nfc_binary(Str)|norm(nfc,Args)], Res), + Norm andalso + test_1({Line,dbin}, Func, Str, + [unicode:characters_to_nfd_binary(Str)|norm(nfd,Args)], Res), + %%io:format("~n",[]), + ok. + +test_1(Line, Func, Str, Args, Exp) -> + try + Res = apply(string, Func, Args), + check_types(Line, Func, Args, Res), + case res(Res, Exp) of + true -> ok; + {Res1,Exp1} when is_tuple(Exp1) -> + io:format("~p~n",[Args]), + io:format("~p:~p: ~ts~w =>~n :~w:~w~n", + [Func,Line, Str,Str,Res1,Exp1]), + exit({error, Func}); + {Res1,Exp1} -> + io:format("~p:~p: ~ts~w =>~n :~ts~w:~ts~w~n", + [Func,Line, Str,Str, Res1,Res1, Exp1,Exp1]), + exit({error, Func}) + end + catch + error:Exp -> + ok; + error:Reason -> + io:format("~p:~p: Crash ~p ~p~n", + [?MODULE,Line, Reason, erlang:get_stacktrace()]), + exit({error, Func}) + end. + +norm(Type, Args) -> + Norm = case Type of + nfc -> fun unicode:characters_to_nfc_list/1; + nfd -> fun unicode:characters_to_nfd_list/1; + none -> fun(Str) -> Str end + end, + lists:map(fun({norm,Str}) -> Norm(Str); + (Other) -> Other + end, Args). + +res(Str, Str) -> true; +res(Str, Exp) when is_list(Str), is_list(Exp) -> + A = unicode:characters_to_nfc_list(Str), + A==Exp orelse {A,Exp}; +res(Str, Exp) when is_binary(Str), is_list(Exp) -> + A = unicode:characters_to_nfc_list(Str), + A==Exp orelse {A,Exp}; +res(What, {Fun, Exp}) when is_function(Fun) -> + Fun(What) == Exp orelse {Fun(What), Exp}; +res({S1,S2}=S, {Exp1,Exp2}=E) -> %% For take + case {res(S1,Exp1), res(S2,Exp2)} of + {true, true} -> true; + _ -> {S, E} + end; +res(Int, Exp) -> + Int == Exp orelse {Int, Exp}. + + +check_types(_Line, _Func, _Str, Res) + when is_integer(Res); is_boolean(Res); Res =:= nomatch -> + %% length or equal + ok; +check_types(Line, Func, [S1,S2], Res) + when Func =:= concat -> + case check_types_1(type(S1),type(S2)) of + ok -> + case check_types_1(type(S1),type(Res)) of + ok -> ok; + {T1,T2} -> + io:format("Failed: ~p ~p ~p ~p~n",[Line, Func, T1, T2]), + io:format(" ~p ~p => ~p~n", [S1, S2, Res]), + error + end; + _ -> ok + end; +check_types(Line, Func, [Str|_], Res) -> + AddList = fun(mixed) -> mixed; + ({list,{list,_}}) -> {list, deep}; + (R) -> + case lists:member(Func, [lexemes, tokens, split]) of + true -> {list, R}; + false -> R + end + end, + try needs_check(Func) andalso (ok = check_types_1(AddList(type(Str)), type(Res))) of + ok -> ok; + false -> ok + catch _:{badmatch, {T1,T2}} -> + io:format("Failed: ~p ~p: ~p ~p~n",[Line, Func, T1, T2]), + io:format(" ~p => ~p~n", [Str, Res]), + error; + _:Reason -> + io:format("Crash: ~p in~n ~p~n",[Reason, erlang:get_stacktrace()]), + io:format("Failed: ~p ~p: ~p => ~p~n", [Line, Func, Str, Res]), + exit({Reason, erlang:get_stacktrace()}) + end. + +check_types_1(T, T) -> + ok; +check_types_1(Str, Res) + when is_binary(Str), is_binary(Res) -> + ok; +check_types_1({list, _},{list, undefined}) -> + ok; +check_types_1({list, _},{list, codepoints}) -> + ok; +check_types_1({list, _},{list, {list, codepoints}}) -> + ok; +check_types_1({list, {list, _}},{list, {list, codepoints}}) -> + ok; +check_types_1(mixed,_) -> + ok; +check_types_1({list, binary}, binary) -> + ok; +check_types_1({list, binary}, {other, _, _}) -> %% take + ok; +check_types_1({list, deep}, _) -> + ok; +check_types_1({list, {list, deep}}, _) -> + ok; +check_types_1(T1,T2) -> + {T1,T2}. + +type(Bin) when is_binary(Bin) -> + binary; +type([]) -> + {list, undefined}; +type(List) when is_list(List) -> + Deep = fun(L) when is_list(L) -> + lists:any(fun(C) -> is_list(C) orelse is_binary(C) end, L); + (_) -> false + end, + case all(fun(C) -> not is_binary(C) end, List) of + true -> + case all(fun(C) -> is_integer(C) end, List) of + true -> {list, codepoints}; + false -> + case [deep || L <- List, Deep(L)] of + [] -> {list, {list, codepoints}}; + _ -> {list, deep} + end + end; + false -> + case all(fun(C) -> is_binary(C) end, List) of + true -> {list, binary}; + false -> mixed + end + end; +type({R1,R2}) -> + case {type(R1),type(R2)} of + {T,T} -> T; + {{list,undefined}, {list,codepoints}} -> {list,codepoints}; + {{list,codepoints}, {list,undefined}} -> {list,codepoints}; + {T1,T2} -> {other, T1,T2} + end; +type(Other) -> + {other, Other}. + +all(_Check, []) -> + true; +all(Check, [H|T]) -> + Check(H) andalso all(Check,T); +all(Check, Bin) when is_binary(Bin) -> + Check(Bin). + +needs_check(reverse) -> false; +needs_check(pad) -> false; +needs_check(replace) -> false; +needs_check(_) -> true. + +%%%% Timer stuff + +time_func(Fun, Mode, Bin) -> + timer:sleep(100), %% Let emulator catch up and clean things before test runs + Self = self(), + Pid = spawn_link(fun() -> + Str = mode(Mode, Bin), + Self ! {self(),time_func(0,0,0, Fun, Str, undefined)} + end), + receive {Pid,Msg} -> Msg end. + +time_func(N,Sum,SumSq, Fun, Str, _) when N < 50 -> + {Time, Res} = timer:tc(fun() -> Fun(Str) end), + time_func(N+1,Sum+Time,SumSq+Time*Time, Fun, Str, Res); +time_func(N,Sum,SumSq, _, _, Res) -> + Mean = round(Sum / N), + Stdev = round(math:sqrt((SumSq - (Sum*Sum/N))/(N - 1))), + {N, Mean, Stdev, Res}. + +mode(binary, Bin) -> Bin; +mode(list, Bin) -> unicode:characters_to_list(Bin). + %% -%% Test cases starts here. +%% Old string lists Test cases starts here. %% len(Config) when is_list(Config) -> @@ -80,16 +932,14 @@ len(Config) when is_list(Config) -> {'EXIT',_} = (catch string:len({})), ok. -equal(Config) when is_list(Config) -> +old_equal(Config) when is_list(Config) -> true = string:equal("", ""), false = string:equal("", " "), true = string:equal("laban", "laban"), false = string:equal("skvimp", "skvump"), - %% invalid arg type - true = string:equal(2, 2), % not good, should crash ok. -concat(Config) when is_list(Config) -> +old_concat(Config) when is_list(Config) -> "erlang rules" = string:concat("erlang ", "rules"), "" = string:concat("", ""), "x" = string:concat("x", ""), @@ -130,6 +980,7 @@ str_rstr(Config) when is_list(Config) -> 3 = string:rstr("xxxx", "xx"), 3 = string:str("xy z yx", " z"), 3 = string:rstr("xy z yx", " z"), + 3 = string:str("aaab", "ab"), %% invalid arg type {'EXIT',_} = (catch string:str(hello, "he")), %% invalid arg type @@ -184,7 +1035,7 @@ substr(Config) when is_list(Config) -> {'EXIT',_} = (catch string:substr("1234", "1")), ok. -tokens(Config) when is_list(Config) -> +old_tokens(Config) when is_list(Config) -> [] = string:tokens("",""), [] = string:tokens("abc","abc"), ["abc"] = string:tokens("abc", ""), @@ -221,7 +1072,7 @@ replace_sep(C, Seps, New) -> chars(Config) when is_list(Config) -> [] = string:chars($., 0), [] = string:chars($., 0, []), - 10 = length(string:chars(32, 10, [])), + 10 = erlang:length(string:chars(32, 10, [])), "aaargh" = string:chars($a, 3, "rgh"), %% invalid arg type {'EXIT',_} = (catch string:chars($x, [])), @@ -231,7 +1082,7 @@ copies(Config) when is_list(Config) -> "" = string:copies("", 10), "" = string:copies(".", 0), "." = string:copies(".", 1), - 30 = length(string:copies("123", 10)), + 30 = erlang:length(string:copies("123", 10)), %% invalid arg type {'EXIT',_} = (catch string:copies("hej", -1)), {'EXIT',_} = (catch string:copies("hej", 2.0)), @@ -360,7 +1211,7 @@ to_integer(Config) when is_list(Config) -> ok. test_to_integer(Str) -> - io:format("Checking ~p~n", [Str]), + %% io:format("Checking ~p~n", [Str]), case string:to_integer(Str) of {error,_Reason} = Bad -> {'EXIT',_} = (catch list_to_integer(Str)), @@ -403,7 +1254,7 @@ to_float(Config) when is_list(Config) -> ok. test_to_float(Str) -> - io:format("Checking ~p~n", [Str]), + %% io:format("Checking ~p~n", [Str]), case string:to_float(Str) of {error,_Reason} = Bad -> {'EXIT',_} = (catch list_to_float(Str)), @@ -419,7 +1270,7 @@ to_upper_to_lower(Config) when is_list(Config) -> All = lists:seq(0, 255), UC = string:to_upper(All), - 256 = length(UC), + 256 = erlang:length(UC), all_upper_latin1(UC, 0), LC = string:to_lower(All), @@ -450,7 +1301,7 @@ all_lower_latin1([C|T], C) when 0 =< C, C < $A; all_lower_latin1([H|T], C) when $A =< C, C =< $Z; 16#C0 =< C, C =< 16#F6; 16#C8 =< C, C =< 16#DE -> - io:format("~p\n", [{H,C}]), + % io:format("~p\n", [{H,C}]), H = C + 32, all_lower_latin1(T, C+1); all_lower_latin1([], 256) -> ok. |