aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDan Gudmundsson <[email protected]>2017-04-03 12:19:21 +0200
committerDan Gudmundsson <[email protected]>2017-04-24 12:16:56 +0200
commit2c72e662bad11a41839780f86680d4bb05367c78 (patch)
tree01e9ae9b32fdb953392e571a0773fb2cd059c498
parent75fc94b8b462d7b7f6dd4b706bbe32cff77ee575 (diff)
downloadotp-2c72e662bad11a41839780f86680d4bb05367c78.tar.gz
otp-2c72e662bad11a41839780f86680d4bb05367c78.tar.bz2
otp-2c72e662bad11a41839780f86680d4bb05367c78.zip
New unicode aware string module that works with unicode:chardata()
Works with unicode:chardata() as input as was decided on OTP board meeting as response to EEP-35 a long time ago. Works on graphemes clusters as base, with a few exceptions, does not handle classic (nor nfd'ified) Hangul nor the extended grapheme clusters such as the prepend class. That would make handling binaries as input/output very slow. List input => list output, binary input => binary output and mixed input => mixed output for all find/split functions. So that results can be post-processed without the need to invoke unicode:characters_to_list|binary for intermediate data. pad functions return lists of unicode:chardata() for performance.
-rw-r--r--.gitignore1
-rw-r--r--lib/stdlib/doc/src/string.xml741
-rw-r--r--lib/stdlib/doc/src/unicode_usage.xml70
-rw-r--r--lib/stdlib/src/string.erl1266
-rw-r--r--lib/stdlib/test/string_SUITE.erl893
5 files changed, 2838 insertions, 133 deletions
diff --git a/.gitignore b/.gitignore
index fdbf0d2d5f..c867b1a597 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,6 +22,7 @@ autom4te.cache
#
/bootstrap/lib/compiler/egen
/bootstrap/lib/stdlib/egen
+/lib/stdlib/src/unicode_util.erl
# Compiler derivatives
#
diff --git a/lib/stdlib/doc/src/string.xml b/lib/stdlib/doc/src/string.xml
index dddedf1132..dc83c40a9a 100644
--- a/lib/stdlib/doc/src/string.xml
+++ b/lib/stdlib/doc/src/string.xml
@@ -36,8 +36,613 @@
<modulesummary>String processing functions.</modulesummary>
<description>
<p>This module provides functions for string processing.</p>
+ <p>A string in this module is represented by <seealso marker="unicode#type-chardata">
+ <c>unicode:chardata()</c></seealso>, that is, a list of codepoints,
+ binaries with UTF-8-encoded codepoints
+ (<em>UTF-8 binaries</em>), or a mix of the two.</p>
+ <code>
+"abcd" is a valid string
+&lt;&lt;"abcd">> is a valid string
+["abcd"] is a valid string
+&lt;&lt;"abc..åäö"/utf8>> is a valid string
+&lt;&lt;"abc..åäö">> is NOT a valid string,
+ but a binary with Latin-1-encoded codepoints
+[&lt;&lt;"abc">>, "..åäö"] is a valid string
+[atom] is NOT a valid string</code>
+ <p>
+ This module operates on grapheme clusters. A <em>grapheme cluster</em>
+ is a user-perceived character, which can be represented by several
+ codepoints.
+ </p>
+ <code>
+"å" [229] or [97, 778]
+"e̊" [101, 778]</code>
+ <p>
+ The string length of "ß↑e̊" is 3, even though it is represented by the
+ codepoints <c>[223,8593,101,778]</c> or the UTF-8 binary
+ <c>&lt;&lt;195,159,226,134,145,101,204,138>></c>.
+ </p>
+ <p>
+ Grapheme clusters for codepoints of class <c>prepend</c>
+ and non-modern (or decomposed) Hangul are not handled for performance
+ reasons in
+ <seealso marker="#find/3"><c>find/3</c></seealso>,
+ <seealso marker="#replace/3"><c>replace/3</c></seealso>,
+ <seealso marker="#split/2"><c>split/2</c></seealso>,
+ <seealso marker="#lexemes/2"><c>lexemes/2</c></seealso> and
+ <seealso marker="#trim/3"><c>trim/3</c></seealso>.
+ </p>
+ <p>
+ Splitting and appending strings is to be done on grapheme cluster
+ borders.
+ There is no verification that the results of appending strings are
+ valid or normalized.
+ </p>
+ <p>
+ Most of the functions expect all input to be normalized to one form,
+ see for example <seealso marker="unicode#characters_to_nfc_list/1">
+ <c>unicode:characters_to_nfc_list/1</c></seealso>.
+ </p>
+ <p>
+ Language or locale specific handling of input is not considered
+ in any function.
+ </p>
+ <p>
+ The functions can crash for non-valid input strings. For example,
+ the functions expect UTF-8 binaries but not all functions
+ verify that all binaries are encoded correctly.
+ </p>
+ <p>
+ Unless otherwise specified the return value type is the same as
+ the input type. That is, binary input returns binary output,
+ list input returns a list output, and mixed input can return a
+ mixed output.</p>
+ <code>
+1> string:trim(" sarah ").
+"sarah"
+2> string:trim(&lt;&lt;" sarah ">>).
+&lt;&lt;"sarah">>
+3> string:lexemes("foo bar", " ").
+["foo","bar"]
+4> string:lexemes(&lt;&lt;"foo bar">>, " ").
+[&lt;&lt;"foo">>,&lt;&lt;"bar">>]</code>
+ <p>This module has been reworked in Erlang/OTP 20 to
+ handle <seealso marker="unicode#type-chardata">
+ <c>unicode:chardata()</c></seealso> and operate on grapheme
+ clusters. The <seealso marker="#oldapi"> <c>old
+ functions</c></seealso> that only work on Latin-1 lists as input
+ are still available but should not be
+ used. They will be deprecated in Erlang/OTP 21.
+ </p>
</description>
+ <datatypes>
+ <datatype>
+ <name name="direction"/>
+ <name name="grapheme_cluster"/>
+ <desc>
+ <p>A user-perceived character, consisting of one or more
+ codepoints.</p>
+ </desc>
+ </datatype>
+ </datatypes>
+
+ <funcs>
+
+ <func>
+ <name name="casefold" arity="1"/>
+ <fsummary>Convert a string to a comparable string.</fsummary>
+ <desc>
+ <p>
+ Converts <c><anno>String</anno></c> to a case-agnostic
+ comparable string. Function <c>casefold/1</c> is preferred
+ over <c>lowercase/1</c> when two strings are to be compared
+ for equality. See also <seealso marker="#equal/4"><c>equal/4</c></seealso>.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:casefold("Ω and ẞ SHARP S").</input>
+"ω and ss sharp s"</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="chomp" arity="1"/>
+ <fsummary>Remove trailing end of line control characters.</fsummary>
+ <desc>
+ <p>
+ Returns a string where any trailing <c>\n</c> or
+ <c>\r\n</c> have been removed from <c><anno>String</anno></c>.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+182> <input>string:chomp(&lt;&lt;"\nHello\n\n">>).</input>
+&lt;&lt;"\nHello">>
+183> <input>string:chomp("\nHello\r\r\n").</input>
+"\nHello\r"</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="equal" arity="2"/>
+ <name name="equal" arity="3"/>
+ <name name="equal" arity="4"/>
+ <fsummary>Test string equality.</fsummary>
+ <desc>
+ <p>
+ Returns <c>true</c> if <c><anno>A</anno></c> and
+ <c><anno>B</anno></c> are equal, otherwise <c>false</c>.
+ </p>
+ <p>
+ If <c><anno>IgnoreCase</anno></c> is <c>true</c>
+ the function does <seealso marker="#casefold/1">
+ <c>casefold</c>ing</seealso> on the fly before the equality test.
+ </p>
+ <p>If <c><anno>Norm</anno></c> is not <c>none</c>
+ the function applies normalization on the fly before the equality test.
+ There are four available normalization forms:
+ <seealso marker="unicode#characters_to_nfc_list/1"> <c>nfc</c></seealso>,
+ <seealso marker="unicode#characters_to_nfd_list/1"> <c>nfd</c></seealso>,
+ <seealso marker="unicode#characters_to_nfkc_list/1"> <c>nfkc</c></seealso>, and
+ <seealso marker="unicode#characters_to_nfkd_list/1"> <c>nfkd</c></seealso>.
+ </p>
+ <p>By default,
+ <c><anno>IgnoreCase</anno></c> is <c>false</c> and
+ <c><anno>Norm</anno></c> is <c>none</c>.</p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:equal("åäö", &lt;&lt;"åäö"/utf8>>).</input>
+true
+2> <input>string:equal("åäö", unicode:characters_to_nfd_binary("åäö")).</input>
+false
+3> <input>string:equal("åäö", unicode:characters_to_nfd_binary("ÅÄÖ"), true, nfc).</input>
+true</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="find" arity="2"/>
+ <name name="find" arity="3"/>
+ <fsummary>Find start of substring.</fsummary>
+ <desc>
+ <p>
+ Removes anything before <c><anno>SearchPattern</anno></c> in <c><anno>String</anno></c>
+ and returns the remainder of the string or <c>nomatch</c> if <c><anno>SearchPattern</anno></c> is not
+ found.
+ <c><anno>Dir</anno></c>, which can be <c>leading</c> or
+ <c>trailing</c>, indicates from which direction characters
+ are to be searched.
+ </p>
+ <p>
+ By default, <c><anno>Dir</anno></c> is <c>leading</c>.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:find("ab..cd..ef", ".").</input>
+"..cd..ef"
+2> <input>string:find(&lt;&lt;"ab..cd..ef">>, "..", trailing).</input>
+&lt;&lt;"..ef">>
+3> <input>string:find(&lt;&lt;"ab..cd..ef">>, "x", leading).</input>
+nomatch
+4> <input>string:find("ab..cd..ef", "x", trailing).</input>
+nomatch</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="is_empty" arity="1"/>
+ <fsummary>Check if the string is empty.</fsummary>
+ <desc>
+ <p>Returns <c>true</c> if <c><anno>String</anno></c> is the
+ empty string, otherwise <c>false</c>.</p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:is_empty("foo").</input>
+false
+2> <input>string:is_empty(["",&lt;&lt;>>]).</input>
+true</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="length" arity="1"/>
+ <fsummary>Calculate length of the string.</fsummary>
+ <desc>
+ <p>
+ Returns the number of grapheme clusters in <c><anno>String</anno></c>.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:length("ß↑e̊").</input>
+3
+2> <input>string:length(&lt;&lt;195,159,226,134,145,101,204,138>>).</input>
+3</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="lexemes" arity="2"/>
+ <fsummary>Split string into lexemes.</fsummary>
+ <desc>
+ <p>
+ Returns a list of lexemes in <c><anno>String</anno></c>, separated
+ by the grapheme clusters in <c><anno>SeparatorList</anno></c>.
+ </p>
+ <p>
+ Notice that, as shown in this example, two or more
+ adjacent separator grapheme clusters in <c><anno>String</anno></c>
+ are treated as one. That is, there are no empty
+ strings in the resulting list of lexemes.
+ See also <seealso marker="#split/3"><c>split/3</c></seealso> which returns
+ empty strings.
+ </p>
+ <p>Notice that <c>[$\r,$\n]</c> is one grapheme cluster.</p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:lexemes("abc de̊fxxghix jkl\r\nfoo", "x e" ++ [[$\r,$\n]]).</input>
+["abc","de̊f","ghi","jkl","foo"]
+2> <input>string:lexemes(&lt;&lt;"abc de̊fxxghix jkl\r\nfoo"/utf8>>, "x e" ++ [$\r,$\n]).</input>
+[&lt;&lt;"abc">>,&lt;&lt;"de̊f"/utf8>>,&lt;&lt;"ghi">>,&lt;&lt;"jkl\r\nfoo">>]</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="lowercase" arity="1"/>
+ <fsummary>Convert a string to lowercase</fsummary>
+ <desc>
+ <p>
+ Converts <c><anno>String</anno></c> to lowercase.
+ </p>
+ <p>
+ Notice that function <seealso marker="#casefold/1"><c>casefold/1</c></seealso>
+ should be used when converting a string to
+ be tested for equality.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+2> <input>string:lowercase(string:uppercase("Michał")).</input>
+"michał"</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="next_codepoint" arity="1"/>
+ <fsummary>Pick the first codepoint.</fsummary>
+ <desc>
+ <p>
+ Returns the first codepoint in <c><anno>String</anno></c>
+ and the rest of <c><anno>String</anno></c> in the tail.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:next_codepoint(unicode:characters_to_binary("e̊fg")).</input>
+[101|&lt;&lt;"̊fg"/utf8>>]</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="next_grapheme" arity="1"/>
+ <fsummary>Pick the first grapheme cluster.</fsummary>
+ <desc>
+ <p>
+ Returns the first grapheme cluster in <c><anno>String</anno></c>
+ and the rest of <c><anno>String</anno></c> in the tail.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:next_grapheme(unicode:characters_to_binary("e̊fg")).</input>
+["e̊"|&lt;&lt;"fg">>]</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="nth_lexeme" arity="3"/>
+ <fsummary>Pick the nth lexeme.</fsummary>
+ <desc>
+ <p>Returns lexeme number <c><anno>N</anno></c> in
+ <c><anno>String</anno></c>, where lexemes are separated by
+ the grapheme clusters in <c><anno>SeparatorList</anno></c>.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:nth_lexeme("abc.de̊f.ghiejkl", 3, ".e").</input>
+"ghi"</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="pad" arity="2"/>
+ <name name="pad" arity="3"/>
+ <name name="pad" arity="4"/>
+ <fsummary>Pad a string to given length.</fsummary>
+ <desc>
+ <p>
+ Pads <c><anno>String</anno></c> to <c><anno>Length</anno></c> with
+ grapheme cluster <c><anno>Char</anno></c>.
+ <c><anno>Dir</anno></c>, which can be <c>leading</c>, <c>trailing</c>,
+ or <c>both</c>, indicates where the padding should be added.
+ </p>
+ <p>By default, <c><anno>Char</anno></c> is <c>$\s</c> and
+ <c><anno>Dir</anno></c> is <c>trailing</c>.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:pad(&lt;&lt;"He̊llö"/utf8>>, 8).</input>
+[&lt;&lt;72,101,204,138,108,108,195,182>>,32,32,32]
+2> <input>io:format("'~ts'~n",[string:pad("He̊llö", 8, leading)]).</input>
'   He̊llö'
+3> <input>io:format("'~ts'~n",[string:pad("He̊llö", 8, both)]).</input>
' He̊llö  '</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="prefix" arity="2"/>
+ <fsummary>Remove prefix from string.</fsummary>
+ <desc>
+ <p>
+ If <c><anno>Prefix</anno></c> is the prefix of
+ <c><anno>String</anno></c>, removes it and returns the
+ remainder of <c><anno>String</anno></c>, otherwise returns
+ <c>nomatch</c>.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:prefix(&lt;&lt;"prefix of string">>, "pre").</input>
+&lt;&lt;"fix of string">>
+2> <input>string:prefix("pre", "prefix").</input>
+nomatch</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="replace" arity="3"/>
+ <name name="replace" arity="4"/>
+ <fsummary>Replace a pattern in string.</fsummary>
+ <desc>
+ <p>
+ Replaces <c><anno>SearchPattern</anno></c> in <c><anno>String</anno></c>
+ with <c><anno>Replacement</anno></c>.
+ <c><anno>Where</anno></c>, default <c>leading</c>, indicates whether
+ the <c>leading</c>, the <c>trailing</c> or <c>all</c> encounters of
+ <c><anno>SearchPattern</anno></c> are to be replaced.
+ </p>
+ <p>Can be implemented as:</p>
+ <pre>lists:join(Replacement, split(String, SearchPattern, Where)).</pre>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:replace(&lt;&lt;"ab..cd..ef">>, "..", "*").</input>
+[&lt;&lt;"ab">>,"*",&lt;&lt;"cd..ef">>]
+2> <input>string:replace(&lt;&lt;"ab..cd..ef">>, "..", "*", all).</input>
+[&lt;&lt;"ab">>,"*",&lt;&lt;"cd">>,"*",&lt;&lt;"ef">>]</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="reverse" arity="1"/>
+ <fsummary>Reverses a string</fsummary>
+ <desc>
+ <p>
+ Returns the reverse list of the grapheme clusters in <c><anno>String</anno></c>.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+1> Reverse = <input>string:reverse(unicode:characters_to_nfd_binary("ÅÄÖ")).</input>
+[[79,776],[65,776],[65,778]]
+2> <input>io:format("~ts~n",[Reverse]).</input>
+ÖÄÅ</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="slice" arity="2"/>
+ <name name="slice" arity="3"/>
+ <fsummary>Extract a part of string</fsummary>
+ <desc>
+ <p>Returns a substring of <c><anno>String</anno></c> of
+ at most <c><anno>Length</anno></c> grapheme clusters, starting at position
+ <c><anno>Start</anno></c>.</p>
+ <p>By default, <c><anno>Length</anno></c> is <c>infinity</c>.</p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:slice(&lt;&lt;"He̊llö Wörld"/utf8>>, 4).</input>
+&lt;&lt;"ö Wörld"/utf8>>
+2> <input>string:slice(["He̊llö ", &lt;&lt;"Wörld"/utf8>>], 4,4).</input>
+"ö Wö"
+3> <input>string:slice(["He̊llö ", &lt;&lt;"Wörld"/utf8>>], 4,50).</input>
+"ö Wörld"</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="split" arity="2"/>
+ <name name="split" arity="3"/>
+ <fsummary>Split a string into substrings.</fsummary>
+ <desc>
+ <p>
+ Splits <c><anno>String</anno></c> where <c><anno>SearchPattern</anno></c>
+ is encountered and returns the remaining parts.
+ <c><anno>Where</anno></c>, default <c>leading</c>, indicates whether
+ the <c>leading</c>, the <c>trailing</c> or <c>all</c> encounters of
+ <c><anno>SearchPattern</anno></c> will split <c><anno>String</anno></c>.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+0> <input>string:split("ab..bc..cd", "..").</input>
+["ab","bc..cd"]
+1> <input>string:split(&lt;&lt;"ab..bc..cd">>, "..", trailing).</input>
+[&lt;&lt;"ab..bc">>,&lt;&lt;"cd">>]
+2> <input>string:split(&lt;&lt;"ab..bc....cd">>, "..", all).</input>
+[&lt;&lt;"ab">>,&lt;&lt;"bc">>,&lt;&lt;>>,&lt;&lt;"cd">>]</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="take" arity="2"/>
+ <name name="take" arity="3"/>
+ <name name="take" arity="4"/>
+ <fsummary>Take leading or trailing parts.</fsummary>
+ <desc>
+ <p>Takes characters from <c><anno>String</anno></c> as long as
+ the characters are members of set <c><anno>Characters</anno></c>
+ or the complement of set <c><anno>Characters</anno></c>.
+ <c><anno>Dir</anno></c>,
+ which can be <c>leading</c> or <c>trailing</c>, indicates from
+ which direction characters are to be taken.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+5> <input>string:take("abc0z123", lists:seq($a,$z)).</input>
+{"abc","0z123"}
+6> <input>string:take(&lt;&lt;"abc0z123">>, lists:seq($0,$9), true, leading).</input>
+{&lt;&lt;"abc">>,&lt;&lt;"0z123">>}
+7> <input>string:take("abc0z123", lists:seq($0,$9), false, trailing).</input>
+{"abc0z","123"}
+8> <input>string:take(&lt;&lt;"abc0z123">>, lists:seq($a,$z), true, trailing).</input>
+{&lt;&lt;"abc0z">>,&lt;&lt;"123">>}</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="titlecase" arity="1"/>
+ <fsummary>Convert a string to titlecase.</fsummary>
+ <desc>
+ <p>
+ Converts <c><anno>String</anno></c> to titlecase.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:titlecase("ß is a SHARP s").</input>
+"Ss is a SHARP s"</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="to_float" arity="1"/>
+ <fsummary>Return a float whose text representation is the integers
+ (ASCII values) of a string.</fsummary>
+ <desc>
+ <p>Argument <c><anno>String</anno></c> is expected to start with a
+ valid text represented float (the digits are ASCII values).
+ Remaining characters in the string after the float are returned in
+ <c><anno>Rest</anno></c>.</p>
+ <p><em>Example:</em></p>
+ <pre>
+> <input>{F1,Fs} = string:to_float("1.0-1.0e-1"),</input>
+> <input>{F2,[]} = string:to_float(Fs),</input>
+> <input>F1+F2.</input>
+0.9
+> <input>string:to_float("3/2=1.5").</input>
+{error,no_float}
+> <input>string:to_float("-1.5eX").</input>
+{-1.5,"eX"}</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="to_integer" arity="1"/>
+ <fsummary>Return an integer whose text representation is the integers
+ (ASCII values) of a string.</fsummary>
+ <desc>
+ <p>Argument <c><anno>String</anno></c> is expected to start with a
+ valid text represented integer (the digits are ASCII values).
+ Remaining characters in the string after the integer are returned in
+ <c><anno>Rest</anno></c>.</p>
+ <p><em>Example:</em></p>
+ <pre>
+> <input>{I1,Is} = string:to_integer("33+22"),</input>
+> <input>{I2,[]} = string:to_integer(Is),</input>
+> <input>I1-I2.</input>
+11
+> <input>string:to_integer("0.5").</input>
+{0,".5"}
+> <input>string:to_integer("x=2").</input>
+{error,no_integer}</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="to_graphemes" arity="1"/>
+ <fsummary>Convert a string to a list of grapheme clusters.</fsummary>
+ <desc>
+ <p>
+ Converts <c><anno>String</anno></c> to a list of grapheme clusters.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:to_graphemes("ß↑e̊").</input>
+[223,8593,[101,778]]
+2> <input>string:to_graphemes(&lt;&lt;"ß↑e̊"/utf8>>).</input>
+[223,8593,[101,778]]</pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="trim" arity="1"/>
+ <name name="trim" arity="2"/>
+ <name name="trim" arity="3"/>
+ <fsummary>Trim leading or trailing, or both, characters.</fsummary>
+ <desc>
+ <p>
+ Returns a string, where leading or trailing, or both,
+ <c><anno>Characters</anno></c> have been removed.
+ <c><anno>Dir</anno></c> which can be <c>leading</c>, <c>trailing</c>,
+ or <c>both</c>, indicates from which direction characters
+ are to be removed.
+ </p>
+ <p> Default <c><anno>Characters</anno></c> are the set of
+ nonbreakable whitespace codepoints, defined as
+ Pattern_White_Space in
+ <url href="http://unicode.org/reports/tr31/">Unicode Standard Annex #31</url>.
+ By default, <c><anno>Dir</anno></c> is <c>both</c>.
+ </p>
+ <p>
+ Notice that <c>[$\r,$\n]</c> is one grapheme cluster according
+ to the Unicode Standard.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:trim("\t Hello \n").</input>
+"Hello"
+2> <input>string:trim(&lt;&lt;"\t Hello \n">>, leading).</input>
+&lt;&lt;"Hello \n">>
+3> <input>string:trim(&lt;&lt;".Hello.\n">>, trailing, "\n.").</input>
+&lt;&lt;".Hello">></pre>
+ </desc>
+ </func>
+
+ <func>
+ <name name="uppercase" arity="1"/>
+ <fsummary>Convert a string to uppercase.</fsummary>
+ <desc>
+ <p>
+ Converts <c><anno>String</anno></c> to uppercase.
+ </p>
+ <p>See also <seealso marker="#titlecase/1"><c>titlecase/1</c></seealso>.</p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:uppercase("Michał").</input>
+"MICHAŁ"</pre>
+ </desc>
+ </func>
+
+ </funcs>
+
+ <section>
+ <marker id="oldapi"/>
+ <title>Obsolete API functions</title>
+ <p>Here follow the functions of the old API.
+ These functions only work on a list of Latin-1 characters.
+ </p>
+ <note><p>
+ The functions are kept for backward compatibility, but are
+ not recommended.
+ They will be deprecated in Erlang/OTP 21.
+ </p>
+ <p>Any undocumented functions in <c>string</c> are not to be used.</p>
+ </note>
+ </section>
+
<funcs>
<func>
<name name="centre" arity="2"/>
@@ -47,17 +652,24 @@
<p>Returns a string, where <c><anno>String</anno></c> is centered in the
string and surrounded by blanks or <c><anno>Character</anno></c>.
The resulting string has length <c><anno>Number</anno></c>.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="#pad/3"><c>pad/3</c></seealso>.
+ </p>
</desc>
</func>
<func>
<name name="chars" arity="2"/>
<name name="chars" arity="3"/>
- <fsummary>Returns a string consisting of numbers of characters.</fsummary>
+ <fsummary>Return a string consisting of numbers of characters.</fsummary>
<desc>
<p>Returns a string consisting of <c><anno>Number</anno></c> characters
<c><anno>Character</anno></c>. Optionally, the string can end with
string <c><anno>Tail</anno></c>.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="lists#duplicate/2"><c>lists:duplicate/2</c></seealso>.</p>
</desc>
</func>
@@ -69,6 +681,9 @@
<p>Returns the index of the first occurrence of
<c><anno>Character</anno></c> in <c><anno>String</anno></c>. Returns
<c>0</c> if <c><anno>Character</anno></c> does not occur.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="#find/2"><c>find/2</c></seealso>.</p>
</desc>
</func>
@@ -79,6 +694,16 @@
<p>Concatenates <c><anno>String1</anno></c> and
<c><anno>String2</anno></c> to form a new string
<c><anno>String3</anno></c>, which is returned.</p>
+ <p>
+ This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use <c>[<anno>String1</anno>, <anno>String2</anno>]</c> as
+ <c>Data</c> argument, and call
+ <seealso marker="unicode#characters_to_list/2">
+ <c>unicode:characters_to_list/2</c></seealso> or
+ <seealso marker="unicode#characters_to_binary/2">
+ <c>unicode:characters_to_binary/2</c></seealso>
+ to flatten the output.
+ </p>
</desc>
</func>
@@ -88,6 +713,9 @@
<desc>
<p>Returns a string containing <c><anno>String</anno></c> repeated
<c><anno>Number</anno></c> times.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="lists#duplicate/2"><c>lists:duplicate/2</c></seealso>.</p>
</desc>
</func>
@@ -98,6 +726,9 @@
<p>Returns the length of the maximum initial segment of
<c><anno>String</anno></c>, which consists entirely of characters
not from <c><anno>Chars</anno></c>.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="#take/3"><c>take/3</c></seealso>.</p>
<p><em>Example:</em></p>
<code type="none">
> string:cspan("\t abcdef", " \t").
@@ -106,20 +737,14 @@
</func>
<func>
- <name name="equal" arity="2"/>
- <fsummary>Test string equality.</fsummary>
- <desc>
- <p>Returns <c>true</c> if <c><anno>String1</anno></c> and
- <c><anno>String2</anno></c> are equal, otherwise <c>false</c>.</p>
- </desc>
- </func>
-
- <func>
<name name="join" arity="2"/>
<fsummary>Join a list of strings with separator.</fsummary>
<desc>
<p>Returns a string with the elements of <c><anno>StringList</anno></c>
separated by the string in <c><anno>Separator</anno></c>.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="lists#join/2"><c>lists:join/2</c></seealso>.</p>
<p><em>Example:</em></p>
<code type="none">
> join(["one", "two", "three"], ", ").
@@ -137,6 +762,10 @@
fixed. If <c>length(<anno>String</anno>)</c> &lt;
<c><anno>Number</anno></c>, then <c><anno>String</anno></c> is padded
with blanks or <c><anno>Character</anno></c>s.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="#pad/2"><c>pad/2</c></seealso> or
+ <seealso marker="#pad/3"><c>pad/3</c></seealso>.</p>
<p><em>Example:</em></p>
<code type="none">
> string:left("Hello",10,$.).
@@ -149,6 +778,9 @@
<fsummary>Return the length of a string.</fsummary>
<desc>
<p>Returns the number of characters in <c><anno>String</anno></c>.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="#length/1"><c>length/1</c></seealso>.</p>
</desc>
</func>
@@ -160,6 +792,9 @@
<p>Returns the index of the last occurrence of
<c><anno>Character</anno></c> in <c><anno>String</anno></c>. Returns
<c>0</c> if <c><anno>Character</anno></c> does not occur.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="#find/3"><c>find/3</c></seealso>.</p>
</desc>
</func>
@@ -173,6 +808,9 @@
fixed. If the length of <c>(<anno>String</anno>)</c> &lt;
<c><anno>Number</anno></c>, then <c><anno>String</anno></c> is padded
with blanks or <c><anno>Character</anno></c>s.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="#pad/3"><c>pad/3</c></seealso>.</p>
<p><em>Example:</em></p>
<code type="none">
> string:right("Hello", 10, $.).
@@ -188,6 +826,9 @@
<c><anno>SubString</anno></c> begins in <c><anno>String</anno></c>.
Returns <c>0</c> if <c><anno>SubString</anno></c>
does not exist in <c><anno>String</anno></c>.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="#find/3"><c>find/3</c></seealso>.</p>
<p><em>Example:</em></p>
<code type="none">
> string:rstr(" Hello Hello World World ", "Hello World").
@@ -202,6 +843,9 @@
<p>Returns the length of the maximum initial segment of
<c><anno>String</anno></c>, which consists entirely of characters
from <c><anno>Chars</anno></c>.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="#take/2"><c>take/2</c></seealso>.</p>
<p><em>Example:</em></p>
<code type="none">
> string:span("\t abcdef", " \t").
@@ -217,6 +861,9 @@
<c><anno>SubString</anno></c> begins in <c><anno>String</anno></c>.
Returns <c>0</c> if <c><anno>SubString</anno></c>
does not exist in <c><anno>String</anno></c>.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="#find/2"><c>find/2</c></seealso>.</p>
<p><em>Example:</em></p>
<code type="none">
> string:str(" Hello Hello World World ", "Hello World").
@@ -230,12 +877,15 @@
<name name="strip" arity="3"/>
<fsummary>Strip leading or trailing characters.</fsummary>
<desc>
- <p>Returns a string, where leading and/or trailing blanks or a
+ <p>Returns a string, where leading or trailing, or both, blanks or a
number of <c><anno>Character</anno></c> have been removed.
<c><anno>Direction</anno></c>, which can be <c>left</c>, <c>right</c>,
or <c>both</c>, indicates from which direction blanks are to be
removed. <c>strip/1</c> is equivalent to
<c>strip(String, both)</c>.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="#trim/3"><c>trim/3</c></seealso>.</p>
<p><em>Example:</em></p>
<code type="none">
> string:strip("...Hello.....", both, $.).
@@ -251,6 +901,9 @@
<p>Returns a substring of <c><anno>String</anno></c>, starting at
position <c><anno>Start</anno></c> to the end of the string, or to
and including position <c><anno>Stop</anno></c>.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="#slice/3"><c>slice/3</c></seealso>.</p>
<p><em>Example:</em></p>
<code type="none">
sub_string("Hello World", 4, 8).
@@ -266,6 +919,9 @@ sub_string("Hello World", 4, 8).
<p>Returns a substring of <c><anno>String</anno></c>, starting at
position <c><anno>Start</anno></c>, and ending at the end of the
string or at length <c><anno>Length</anno></c>.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="#slice/3"><c>slice/3</c></seealso>.</p>
<p><em>Example:</em></p>
<code type="none">
> substr("Hello World", 4, 5).
@@ -281,6 +937,9 @@ sub_string("Hello World", 4, 8).
<p>Returns the word in position <c><anno>Number</anno></c> of
<c><anno>String</anno></c>. Words are separated by blanks or
<c><anno>Character</anno></c>s.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="#nth_lexeme/3"><c>nth_lexeme/3</c></seealso>.</p>
<p><em>Example:</em></p>
<code type="none">
> string:sub_word(" Hello old boy !",3,$o).
@@ -289,50 +948,6 @@ sub_string("Hello World", 4, 8).
</func>
<func>
- <name name="to_float" arity="1"/>
- <fsummary>Returns a float whose text representation is the integers
- (ASCII values) in a string.</fsummary>
- <desc>
- <p>Argument <c><anno>String</anno></c> is expected to start with a
- valid text represented float (the digits are ASCII values).
- Remaining characters in the string after the float are returned in
- <c><anno>Rest</anno></c>.</p>
- <p><em>Example:</em></p>
- <code type="none">
-> {F1,Fs} = string:to_float("1.0-1.0e-1"),
-> {F2,[]} = string:to_float(Fs),
-> F1+F2.
-0.9
-> string:to_float("3/2=1.5").
-{error,no_float}
-> string:to_float("-1.5eX").
-{-1.5,"eX"}</code>
- </desc>
- </func>
-
- <func>
- <name name="to_integer" arity="1"/>
- <fsummary>Returns an integer whose text representation is the integers
- (ASCII values) in a string.</fsummary>
- <desc>
- <p>Argument <c><anno>String</anno></c> is expected to start with a
- valid text represented integer (the digits are ASCII values).
- Remaining characters in the string after the integer are returned in
- <c><anno>Rest</anno></c>.</p>
- <p><em>Example:</em></p>
- <code type="none">
-> {I1,Is} = string:to_integer("33+22"),
-> {I2,[]} = string:to_integer(Is),
-> I1-I2.
-11
-> string:to_integer("0.5").
-{0,".5"}
-> string:to_integer("x=2").
-{error,no_integer}</code>
- </desc>
- </func>
-
- <func>
<name name="to_lower" arity="1" clause_i="1"/>
<name name="to_lower" arity="1" clause_i="2"/>
<name name="to_upper" arity="1" clause_i="1"/>
@@ -346,6 +961,11 @@ sub_string("Hello World", 4, 8).
<p>The specified string or character is case-converted. Notice that
the supported character set is ISO/IEC 8859-1 (also called Latin 1);
all values outside this set are unchanged</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>. Use
+ <seealso marker="#lowercase/1"><c>lowercase/1</c></seealso>,
+ <seealso marker="#uppercase/1"><c>uppercase/1</c></seealso>,
+ <seealso marker="#titlecase/1"><c>titlecase/1</c></seealso> or
+ <seealso marker="#casefold/1"><c>casefold/1</c></seealso>.</p>
</desc>
</func>
@@ -363,6 +983,9 @@ sub_string("Hello World", 4, 8).
adjacent separator characters in <c><anno>String</anno></c>
are treated as one. That is, there are no empty
strings in the resulting list of tokens.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="#lexemes/2"><c>lexemes/2</c></seealso>.</p>
</desc>
</func>
@@ -373,6 +996,9 @@ sub_string("Hello World", 4, 8).
<desc>
<p>Returns the number of words in <c><anno>String</anno></c>, separated
by blanks or <c><anno>Character</anno></c>.</p>
+ <p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+ Use
+ <seealso marker="#lexemes/2"><c>lexemes/2</c></seealso>.</p>
<p><em>Example:</em></p>
<code type="none">
> words(" Hello old boy!", $o).
@@ -387,10 +1013,7 @@ sub_string("Hello World", 4, 8).
other. The reason is that this string package is the
combination of two earlier packages and all functions of
both packages have been retained.</p>
-
- <note>
- <p>Any undocumented functions in <c>string</c> are not to be used.</p>
- </note>
</section>
+
</erlref>
diff --git a/lib/stdlib/doc/src/unicode_usage.xml b/lib/stdlib/doc/src/unicode_usage.xml
index a8ef8ff5c5..11b84f552a 100644
--- a/lib/stdlib/doc/src/unicode_usage.xml
+++ b/lib/stdlib/doc/src/unicode_usage.xml
@@ -65,7 +65,10 @@
<item><p>In Erlang/OTP 20.0, atoms and function can contain
Unicode characters. Module names are still restricted to
- the ISO-Latin-1 range.</p></item>
+ the ISO-Latin-1 range.</p>
+ <p>Support was added for normalization forms in
+ <c>unicode</c>, and the <c>string</c> module now handles
+ UTF-8 encoded binaries.</p></item>
</list>
<p>This section outlines the current Unicode support and gives some
@@ -110,23 +113,27 @@
</item>
</list>
- <p>So, a conversion function must know not only one character at a time,
- but possibly the whole sentence, the natural language to translate to,
- the differences in input and output string length, and so on.
- Erlang/OTP has currently no Unicode <c>to_upper</c>/<c>to_lower</c>
- functionality, but publicly available libraries address these issues.</p>
-
- <p>Another example is the accented characters, where the same glyph has two
- different representations. The Swedish letter "ö" is one example.
- The Unicode standard has a code point for it, but you can also write it
- as "o" followed by "U+0308" (Combining Diaeresis, with the simplified
- meaning that the last letter is to have "¨" above). They have the same
- glyph. They are for most purposes the same, but have different
- representations. For example, MacOS X converts all filenames to use
- Combining Diaeresis, while most other programs (including Erlang) try to
- hide that by doing the opposite when, for example, listing directories.
- However it is done, it is usually important to normalize such
- characters to avoid confusion.</p>
+ <p>So, a conversion function must know not only one character at a
+ time, but possibly the whole sentence, the natural language to
+ translate to, the differences in input and output string length,
+ and so on. Erlang/OTP currently has no Unicode
+ <c>uppercase</c>/<c>lowercase</c> functionality with
+ language-specific handling, but publicly available libraries address these
+ issues.</p>
+
+ <p>Another example is the accented characters, where the same
+ glyph has two different representations. The Swedish letter "ö" is
+ one example. The Unicode standard has a code point for it, but
+ you can also write it as "o" followed by "U+0308" (Combining
+ Diaeresis, with the simplified meaning that the last letter is to
+ have "¨" above). They have the same glyph, that is, the same user-perceived
+ character. They are for most purposes the same, but have different
+ representations. For example, MacOS X converts all filenames to
+ use Combining Diaeresis, while most other programs (including
+ Erlang) try to hide that by doing the opposite when, for example,
+ listing directories. However it is done, it is usually important
+ to normalize such characters to avoid confusion.
+ </p>
<p>The list of examples can be made long. One need a kind of knowledge that
was not needed when programs only considered one or two languages. The
@@ -273,7 +280,7 @@
them. In some cases functionality has been added to already
existing interfaces (as the <seealso
marker="stdlib:string"><c>string</c></seealso> module now can
- handle lists with any code points). In some cases new
+ handle strings with any code points). In some cases new
functionality or options have been added (as in the <seealso
marker="stdlib:io"><c>io</c></seealso> module, the file
handling, the <seealso
@@ -977,7 +984,7 @@ Eshell V5.10.1 (abort with ^G)
<p>Fortunately, most textual data has been stored in lists and range
checking has been sparse, so modules like <c>string</c> work well for
- Unicode lists with little need for conversion or extension.</p>
+ Unicode strings with little need for conversion or extension.</p>
<p>Some modules are, however, changed to be explicitly Unicode-aware. These
modules include:</p>
@@ -1028,18 +1035,17 @@ Eshell V5.10.1 (abort with ^G)
has extensive support for Unicode text.</p></item>
</taglist>
- <p>The <seealso marker="stdlib:string"><c>string</c></seealso> module works
- perfectly for Unicode strings and ISO Latin-1 strings, except the
- language-dependent functions
- <seealso marker="stdlib:string#to_upper/1"><c>string:to_upper/1</c></seealso>
- and
- <seealso marker="stdlib:string#to_lower/1"><c>string:to_lower/1</c></seealso>,
- which are only correct for the ISO Latin-1 character set. These two
- functions can never function correctly for Unicode characters in their
- current form, as there are language and locale issues as well as
- multi-character mappings to consider when converting text between cases.
- Converting case in an international environment is a large subject not
- yet addressed in OTP.</p>
+ <p>The <seealso marker="stdlib:string"><c>string</c></seealso>
+ module works perfectly for Unicode strings and ISO Latin-1
+ strings, except the language-dependent functions <seealso
+ marker="stdlib:string#uppercase/1"><c>string:uppercase/1</c></seealso>
+ and <seealso
+ marker="stdlib:string#lowercase/1"><c>string:lowercase/1</c></seealso>.
+ These two functions can never function correctly for Unicode
+ characters in their current form, as there are language and locale
+ issues to consider when converting text between cases. Converting
+ case in an international environment is a large subject not yet
+ addressed in OTP.</p>
</section>
<section>
diff --git a/lib/stdlib/src/string.erl b/lib/stdlib/src/string.erl
index c659db78bd..4fdfe99b66 100644
--- a/lib/stdlib/src/string.erl
+++ b/lib/stdlib/src/string.erl
@@ -17,22 +17,72 @@
%%
%% %CopyrightEnd%
%%
+%% A string library that works on grapheme clusters, with the exception
+%% of codepoints of class 'prepend' and non modern (or decomposed) Hangul.
+%% If these codepoints appear, functions like 'find/2' may return a string
+%% which starts inside a grapheme cluster.
+%% These exceptions are made because these codepoint classes are
+%% seldom used and require that we are able to look at previous codepoints
+%% in the stream, and are thus hard to implement efficiently.
+%%
+%% GC (grapheme cluster) implies that the length of string 'ß↑e̊' is 3 though
+%% it is represented by the codepoints [223,8593,101,778] or the
+%% utf8 binary <<195,159,226,134,145,101,204,138>>
+%%
+%% And that searching for strings or graphemes finds the correct positions:
+%%
+%% find("eeeee̊eee", "e̊") -> "e̊ee"
+%% find("1£4e̊abcdef", "e") -> "ef"
+%%
+%% Most functions expect all input to be normalized to one form,
+%% see unicode:characters_to_nfc and unicode:characters_to_nfd functions.
+%% When appending strings, no checking is done to verify that the
+%% result is a valid unicode string.
+%%
+%% The functions may crash for invalid utf-8 input.
+%%
+%% Return values should be kept consistent when the return type is
+%% unicode:chardata(), i.e. binary input => binary output,
+%% list input => list output, mixed input => mixed output.
+%%
-module(string).
--export([len/1,equal/2,concat/2,chr/2,rchr/2,str/2,rstr/2,
- span/2,cspan/2,substr/2,substr/3,tokens/2,chars/2,chars/3]).
+-export([is_empty/1, length/1, to_graphemes/1,
+ reverse/1,
+ equal/2, equal/3, equal/4,
+ slice/2, slice/3,
+ pad/2, pad/3, pad/4, trim/1, trim/2, trim/3, chomp/1,
+ take/2, take/3, take/4,
+ lexemes/2, nth_lexeme/3,
+ uppercase/1, lowercase/1, titlecase/1,casefold/1,
+ prefix/2,
+ split/2,split/3,replace/3,replace/4,
+ find/2,find/3,
+ next_codepoint/1, next_grapheme/1
+ ]).
+
+%% Old (will be deprecated) lists/string API kept for backwards compatibility
+-export([len/1, concat/2, % equal/2, (extended in the new api)
+ chr/2,rchr/2,str/2,rstr/2,
+ span/2,cspan/2,substr/2,substr/3, tokens/2,
+ chars/2,chars/3]).
-export([copies/2,words/1,words/2,strip/1,strip/2,strip/3,
sub_word/2,sub_word/3,left/2,left/3,right/2,right/3,
sub_string/2,sub_string/3,centre/2,centre/3, join/2]).
-export([to_upper/1, to_lower/1]).
+%%
+-import(lists,[member/2]).
--import(lists,[reverse/1,member/2]).
+-compile({no_auto_import,[length/1]}).
-%%---------------------------------------------------------------------------
+-export_type([grapheme_cluster/0]).
-%%% BIFs
+-type grapheme_cluster() :: char() | [char()].
+-type direction() :: 'leading' | 'trailing'.
+%%% BIFs
-export([to_float/1, to_integer/1]).
+-dialyzer({no_improper_lists, stack/2}).
-spec to_float(String) -> {Float, Rest} | {error, Reason} when
String :: string(),
@@ -54,6 +104,1180 @@ to_integer(_) ->
%%% End of BIFs
+%% Check if string is the empty string
+-spec is_empty(String::unicode:chardata()) -> boolean().
+is_empty([]) -> true;
+is_empty(<<>>) -> true;
+is_empty([L|R]) -> is_empty(L) andalso is_empty(R);
+is_empty(_) -> false.
+
+%% Count the number of grapheme clusters in chardata
+-spec length(String::unicode:chardata()) -> non_neg_integer().
+length(CD) ->
+ length_1(unicode_util:gc(CD), 0).
+
+%% Convert a string to a list of grapheme clusters
+-spec to_graphemes(String::unicode:chardata()) -> [grapheme_cluster()].
+to_graphemes(CD0) ->
+ case unicode_util:gc(CD0) of
+ [GC|CD] -> [GC|to_graphemes(CD)];
+ [] -> []
+ end.
+
+%% Compare two strings and return a boolean; assumes that the inputs are
+%% normalized to the same form, see unicode:characters_to_nfX_xxx(..)
+-spec equal(A, B) -> boolean() when
+ A::unicode:chardata(),
+ B::unicode:chardata().
+equal(A,B) when is_binary(A), is_binary(B) ->
+ A =:= B;
+equal(A,B) ->
+ equal_1(A,B).
+
+%% Compare two strings and return a boolean; assumes that the inputs are
+%% normalized to the same form, see unicode:characters_to_nfX_xxx(..);
+%% does casefold on the fly
+-spec equal(A, B, IgnoreCase) -> boolean() when
+ A::unicode:chardata(),
+ B::unicode:chardata(),
+ IgnoreCase :: boolean().
+equal(A, B, false) ->
+ equal(A,B);
+equal(A, B, true) ->
+ equal_nocase(A,B).
+
+%% Compare two strings and return a boolean;
+%% if specified, does casefold and normalization on the fly
+-spec equal(A, B, IgnoreCase, Norm) -> boolean() when
+ A :: unicode:chardata(),
+ B :: unicode:chardata(),
+ IgnoreCase :: boolean(),
+ Norm :: 'none' | 'nfc' | 'nfd' | 'nfkc' | 'nfkd'.
+equal(A, B, Case, none) ->
+ equal(A,B,Case);
+equal(A, B, false, Norm) ->
+ equal_norm(A, B, Norm);
+equal(A, B, true, Norm) ->
+ equal_norm_nocase(A, B, Norm).
+
+%% Reverse grapheme clusters
+-spec reverse(String::unicode:chardata()) -> [grapheme_cluster()].
+reverse(CD) ->
+ reverse_1(CD, []).
+
+%% Slice a string and return rest of string
+%% Note: counts grapheme_clusters
+-spec slice(String, Start) -> Slice when
+ String::unicode:chardata(),
+ Start :: non_neg_integer(),
+ Slice :: unicode:chardata().
+slice(CD, N) when is_integer(N), N >= 0 ->
+ slice_l(CD, N, is_binary(CD)).
+
+-spec slice(String, Start, Length) -> Slice when
+ String::unicode:chardata(),
+ Start :: non_neg_integer(),
+ Length :: 'infinity' | non_neg_integer(),
+ Slice :: unicode:chardata().
+slice(CD, N, Length)
+ when is_integer(N), N >= 0, is_integer(Length), Length > 0 ->
+ slice_trail(slice_l(CD, N, is_binary(CD)), Length);
+slice(CD, N, infinity) ->
+ slice_l(CD, N, is_binary(CD));
+slice(CD, _, 0) ->
+ case is_binary(CD) of
+ true -> <<>>;
+ false -> []
+ end.
+
+%% Pad a string to desired length
+-spec pad(String, Length) -> unicode:charlist() when
+ String ::unicode:chardata(),
+ Length :: integer().
+pad(CD, Length) ->
+ pad(CD, Length, trailing, $\s).
+
+-spec pad(String, Length, Dir) -> unicode:charlist() when
+ String ::unicode:chardata(),
+ Length :: integer(),
+ Dir :: direction() | 'both'.
+pad(CD, Length, Dir) ->
+ pad(CD, Length, Dir, $\s).
+
+-spec pad(String, Length, Dir, Char) -> unicode:charlist() when
+ String ::unicode:chardata(),
+ Length :: integer(),
+ Dir :: direction() | 'both',
+ Char :: grapheme_cluster().
+pad(CD, Length, leading, Char) when is_integer(Length) ->
+ Len = length(CD),
+ [lists:duplicate(max(0, Length-Len), Char), CD];
+pad(CD, Length, trailing, Char) when is_integer(Length) ->
+ Len = length(CD),
+ [CD|lists:duplicate(max(0, Length-Len), Char)];
+pad(CD, Length, both, Char) when is_integer(Length) ->
+ Len = length(CD),
+ Size = max(0, Length-Len),
+ Pre = lists:duplicate(Size div 2, Char),
+ Post = case Size rem 2 of
+ 1 -> [Char];
+ _ -> []
+ end,
+ [Pre, CD, Pre|Post].
+
+%% Strip whitespace, or the given Characters, from Str in direction Dir
+-spec trim(String) -> unicode:chardata() when
+ String :: unicode:chardata().
+trim(Str) ->
+ trim(Str, both, unicode_util:whitespace()).
+
+-spec trim(String, Dir) -> unicode:chardata() when
+ String :: unicode:chardata(),
+ Dir :: direction() | 'both'.
+trim(Str, Dir) ->
+ trim(Str, Dir, unicode_util:whitespace()).
+
+-spec trim(String, Dir, Characters) -> unicode:chardata() when
+ String :: unicode:chardata(),
+ Dir :: direction() | 'both',
+ Characters :: [grapheme_cluster()].
+trim(Str, _, []) -> Str;
+trim(Str, leading, Sep) when is_list(Sep) ->
+ trim_l(Str, search_pattern(Sep));
+trim(Str, trailing, Sep) when is_list(Sep) ->
+ trim_t(Str, 0, search_pattern(Sep));
+trim(Str, both, Sep0) when is_list(Sep0) ->
+ Sep = search_pattern(Sep0),
+ trim_t(trim_l(Str,Sep), 0, Sep).
+
+%% Delete trailing newlines or \r\n
+-spec chomp(String::unicode:chardata()) -> unicode:chardata().
+chomp(Str) ->
+ trim_t(Str,0, {[[$\r,$\n],$\n], [$\r,$\n], [<<$\r>>,<<$\n>>]}).
+
+%% Split String into two parts where the leading part consists of Characters
+-spec take(String, Characters) -> {Leading, Trailing} when
+ String::unicode:chardata(),
+ Characters::[grapheme_cluster()],
+ Leading::unicode:chardata(),
+ Trailing::unicode:chardata().
+take(Str, Sep) ->
+ take(Str, Sep, false, leading).
+-spec take(String, Characters, Complement) -> {Leading, Trailing} when
+ String::unicode:chardata(),
+ Characters::[grapheme_cluster()],
+ Complement::boolean(),
+ Leading::unicode:chardata(),
+ Trailing::unicode:chardata().
+take(Str, Sep, Complement) ->
+ take(Str, Sep, Complement, leading).
+-spec take(String, Characters, Complement, Dir) -> {Leading, Trailing} when
+ String::unicode:chardata(),
+ Characters::[grapheme_cluster()],
+ Complement::boolean(),
+ Dir::direction(),
+ Leading::unicode:chardata(),
+ Trailing::unicode:chardata().
+take(Str, [], Complement, Dir) ->
+ Empty = case is_binary(Str) of true -> <<>>; false -> [] end,
+ case {Complement,Dir} of
+ {false, leading} -> {Empty, Str};
+ {false, trailing} -> {Str, Empty};
+ {true, leading} -> {Str, Empty};
+ {true, trailing} -> {Empty, Str}
+ end;
+take(Str, Sep0, false, leading) ->
+ Sep = search_pattern(Sep0),
+ take_l(Str, Sep, []);
+take(Str, Sep0, true, leading) ->
+ Sep = search_pattern(Sep0),
+ take_lc(Str, Sep, []);
+take(Str, Sep0, false, trailing) ->
+ Sep = search_pattern(Sep0),
+ take_t(Str, 0, Sep);
+take(Str, Sep0, true, trailing) ->
+ Sep = search_pattern(Sep0),
+ take_tc(Str, 0, Sep).
+
+%% Uppercase all chars in Str
+-spec uppercase(String::unicode:chardata()) -> unicode:chardata().
+uppercase(CD) when is_list(CD) ->
+ uppercase_list(CD);
+uppercase(CD) when is_binary(CD) ->
+ uppercase_bin(CD,<<>>).
+
+%% Lowercase all chars in Str
+-spec lowercase(String::unicode:chardata()) -> unicode:chardata().
+lowercase(CD) when is_list(CD) ->
+ lowercase_list(CD);
+lowercase(CD) when is_binary(CD) ->
+ lowercase_bin(CD,<<>>).
+
+%% Make a titlecase of the first char in Str
+-spec titlecase(String::unicode:chardata()) -> unicode:chardata().
+titlecase(CD) when is_list(CD) ->
+ case unicode_util:titlecase(CD) of
+ [GC|Tail] -> append(GC,Tail);
+ Empty -> Empty
+ end;
+titlecase(CD) when is_binary(CD) ->
+ case unicode_util:titlecase(CD) of
+ [CP|Chars] when is_integer(CP) -> <<CP/utf8,Chars/binary>>;
+ [CPs|Chars] ->
+ << << <<CP/utf8>> || CP <- CPs>>/binary, Chars/binary>>;
+ [] -> <<>>
+ end.
+
+%% Make a comparable string of Str; should be used for equality tests only
+-spec casefold(String::unicode:chardata()) -> unicode:chardata().
+casefold(CD) when is_list(CD) ->
+ casefold_list(CD);
+casefold(CD) when is_binary(CD) ->
+ casefold_bin(CD,<<>>).
+
+%% Return the remaining string with prefix removed or else nomatch
+-spec prefix(String::unicode:chardata(), Prefix::unicode:chardata()) ->
+ 'nomatch' | unicode:chardata().
+prefix(Str, []) -> Str;
+prefix(Str, Prefix0) ->
+ Prefix = unicode:characters_to_list(Prefix0),
+ case prefix_1(Str, Prefix) of
+ [] when is_binary(Str) -> <<>>;
+ Res -> Res
+ end.
+
+%% split String with the first occurrence of SearchPattern, return list of splits
+-spec split(String, SearchPattern) -> [unicode:chardata()] when
+ String :: unicode:chardata(),
+ SearchPattern :: unicode:chardata().
+split(String, SearchPattern) ->
+ split(String, SearchPattern, leading).
+
+%% split String with SearchPattern, return list of splits
+-spec split(String, SearchPattern, Where) -> [unicode:chardata()] when
+ String :: unicode:chardata(),
+ SearchPattern :: unicode:chardata(),
+ Where :: direction() | 'all'.
+split(String, SearchPattern, Where) ->
+ case is_empty(SearchPattern) of
+ true -> [String];
+ false ->
+ SearchPatternCPs = unicode:characters_to_list(SearchPattern),
+ case split_1(String, SearchPatternCPs, 0, Where, [], []) of
+ {_Curr, []} -> [String];
+ {_Curr, Acc} when Where =:= trailing -> Acc;
+ {Curr, Acc} when Where =:= all -> lists:reverse([Curr|Acc]);
+ Acc when is_list(Acc) -> Acc
+ end
+ end.
+
+%% Replace the first SearchPattern in String with Replacement
+-spec replace(String, SearchPattern, Replacement) ->
+ [unicode:chardata()] when
+ String :: unicode:chardata(),
+ SearchPattern :: unicode:chardata(),
+ Replacement :: unicode:chardata().
+replace(String, SearchPattern, Replacement) ->
+ lists:join(Replacement, split(String, SearchPattern)).
+
+%% Replace the occurrence(s) of SearchPattern selected by Where in String with Replacement
+-spec replace(String, SearchPattern, Replacement, Where) ->
+ [unicode:chardata()] when
+ String :: unicode:chardata(),
+ SearchPattern :: unicode:chardata(),
+ Replacement :: unicode:chardata(),
+ Where :: direction() | 'all'.
+replace(String, SearchPattern, Replacement, Where) ->
+ lists:join(Replacement, split(String, SearchPattern, Where)).
+
+%% Split Str into a list of chardata separated by one of the grapheme
+%% clusters in Seps
+-spec lexemes(String::unicode:chardata(),
+ SeparatorList::[grapheme_cluster()]) ->
+ [unicode:chardata()].
+lexemes([], _) -> [];
+lexemes(Str, Seps0) when is_list(Seps0) ->
+ Seps = search_pattern(Seps0),
+ lexemes_m(Str, Seps, []).
+
+-spec nth_lexeme(String, N, SeparatorList) -> unicode:chardata() when
+ String::unicode:chardata(),
+ N::non_neg_integer(),
+ SeparatorList::[grapheme_cluster()].
+
+nth_lexeme(Str, 1, []) -> Str;
+nth_lexeme(Str, N, Seps0) when is_list(Seps0), is_integer(N), N > 0 ->
+ Seps = search_pattern(Seps0),
+ nth_lexeme_m(Str, Seps, N).
+
+%% find first SearchPattern in String, return rest of string
+-spec find(String, SearchPattern) -> unicode:chardata() | 'nomatch' when
+ String::unicode:chardata(),
+ SearchPattern::unicode:chardata().
+find(String, SearchPattern) ->
+ find(String, SearchPattern, leading).
+
+%% find SearchPattern in String (search in Dir direction), return rest of string
+-spec find(String, SearchPattern, Dir) -> unicode:chardata() | 'nomatch' when
+ String::unicode:chardata(),
+ SearchPattern::unicode:chardata(),
+ Dir::direction().
+find(String, "", _) -> String;
+find(String, <<>>, _) -> String;
+find(String, SearchPattern, leading) ->
+ find_l(String, unicode:characters_to_list(SearchPattern));
+find(String, SearchPattern, trailing) ->
+ find_r(String, unicode:characters_to_list(SearchPattern), nomatch).
+
+%% Fetch first grapheme cluster and return rest in tail
+-spec next_grapheme(String::unicode:chardata()) ->
+ maybe_improper_list(grapheme_cluster(),unicode:chardata()).
+next_grapheme(CD) -> unicode_util:gc(CD).
+
+%% Fetch first codepoint and return rest in tail
+-spec next_codepoint(String::unicode:chardata()) ->
+ maybe_improper_list(char(),unicode:chardata()).
+next_codepoint(CD) -> unicode_util:cp(CD).
+
+%% Internals
+
+length_1([_|Rest], N) ->
+ length_1(unicode_util:gc(Rest), N+1);
+length_1([], N) ->
+ N.
+
+equal_1([A|AR], [B|BR]) when is_integer(A), is_integer(B) ->
+ A =:= B andalso equal_1(AR, BR);
+equal_1([], BR) -> is_empty(BR);
+equal_1(A0,B0) ->
+ case {unicode_util:cp(A0), unicode_util:cp(B0)} of
+ {[CP|A],[CP|B]} -> equal_1(A,B);
+ {[], []} -> true;
+ _ -> false
+ end.
+
+equal_nocase(A, A) -> true;
+equal_nocase(A0, B0) ->
+ case {unicode_util:cp(unicode_util:casefold(A0)),
+ unicode_util:cp(unicode_util:casefold(B0))} of
+ {[CP|A],[CP|B]} -> equal_nocase(A,B);
+ {[], []} -> true;
+ _ -> false
+ end.
+
+equal_norm(A, A, _Norm) -> true;
+equal_norm(A0, B0, Norm) ->
+ case {unicode_util:cp(unicode_util:Norm(A0)),
+ unicode_util:cp(unicode_util:Norm(B0))} of
+ {[CP|A],[CP|B]} -> equal_norm(A,B, Norm);
+ {[], []} -> true;
+ _ -> false
+ end.
+
+equal_norm_nocase(A, A, _Norm) -> true;
+equal_norm_nocase(A0, B0, Norm) ->
+ case {unicode_util:cp(unicode_util:casefold(unicode_util:Norm(A0))),
+ unicode_util:cp(unicode_util:casefold(unicode_util:Norm(B0)))} of
+ {[CP|A],[CP|B]} -> equal_norm_nocase(A,B, Norm);
+ {[], []} -> true;
+ _ -> false
+ end.
+
+reverse_1(CD, Acc) ->
+ case unicode_util:gc(CD) of
+ [GC|Rest] -> reverse_1(Rest, [GC|Acc]);
+ [] -> Acc
+ end.
+
+slice_l(CD, N, Binary) when N > 0 ->
+ case unicode_util:gc(CD) of
+ [_|Cont] -> slice_l(Cont, N-1, Binary);
+ [] when Binary -> <<>>;
+ [] -> []
+ end;
+slice_l(Cont, 0, Binary) ->
+ case is_empty(Cont) of
+ true when Binary -> <<>>;
+ _ -> Cont
+ end.
+
+slice_trail(CD, N) when is_list(CD) ->
+ slice_list(CD, N);
+slice_trail(CD, N) when is_binary(CD) ->
+ slice_bin(CD, N, CD).
+
+slice_list(CD, N) when N > 0 ->
+ case unicode_util:gc(CD) of
+ [GC|Cont] -> append(GC, slice_list(Cont, N-1));
+ [] -> []
+ end;
+slice_list(_, 0) ->
+ [].
+
+slice_bin(CD, N, Orig) when N > 0 ->
+ case unicode_util:gc(CD) of
+ [_|Cont] -> slice_bin(Cont, N-1, Orig);
+ [] -> Orig
+ end;
+slice_bin([], 0, Orig) ->
+ Orig;
+slice_bin(CD, 0, Orig) ->
+ Sz = byte_size(Orig) - byte_size(CD),
+ <<Keep:Sz/binary, _/binary>> = Orig,
+ Keep.
+
+uppercase_list(CPs0) ->
+ case unicode_util:uppercase(CPs0) of
+ [Char|CPs] -> append(Char,uppercase_list(CPs));
+ [] -> []
+ end.
+
+uppercase_bin(CPs0, Acc) ->
+ case unicode_util:uppercase(CPs0) of
+ [Char|CPs] when is_integer(Char) ->
+ uppercase_bin(CPs, <<Acc/binary, Char/utf8>>);
+ [Chars|CPs] ->
+ uppercase_bin(CPs, <<Acc/binary,
+ << <<CP/utf8>> || CP <- Chars>>/binary >>);
+ [] -> Acc
+ end.
+
+lowercase_list(CPs0) ->
+ case unicode_util:lowercase(CPs0) of
+ [Char|CPs] -> append(Char,lowercase_list(CPs));
+ [] -> []
+ end.
+
+lowercase_bin(CPs0, Acc) ->
+ case unicode_util:lowercase(CPs0) of
+ [Char|CPs] when is_integer(Char) ->
+ lowercase_bin(CPs, <<Acc/binary, Char/utf8>>);
+ [Chars|CPs] ->
+ lowercase_bin(CPs, <<Acc/binary,
+ << <<CP/utf8>> || CP <- Chars>>/binary >>);
+ [] -> Acc
+ end.
+
+casefold_list(CPs0) ->
+ case unicode_util:casefold(CPs0) of
+ [Char|CPs] -> append(Char, casefold_list(CPs));
+ [] -> []
+ end.
+
+casefold_bin(CPs0, Acc) ->
+ case unicode_util:casefold(CPs0) of
+ [Char|CPs] when is_integer(Char) ->
+ casefold_bin(CPs, <<Acc/binary, Char/utf8>>);
+ [Chars|CPs] ->
+ casefold_bin(CPs, <<Acc/binary,
+ << <<CP/utf8>> || CP <- Chars>>/binary >>);
+ [] -> Acc
+ end.
+
+
+trim_l([Bin|Cont0], Sep) when is_binary(Bin) ->
+ case bin_search_inv(Bin, Cont0, Sep) of
+ {nomatch, Cont} -> trim_l(Cont, Sep);
+ Keep -> Keep
+ end;
+trim_l(Str, {GCs, _, _}=Sep) when is_list(Str) ->
+ case unicode_util:gc(Str) of
+ [C|Cs] ->
+ case lists:member(C, GCs) of
+ true -> trim_l(Cs, Sep);
+ false -> Str
+ end;
+ [] -> []
+ end;
+trim_l(Bin, Sep) when is_binary(Bin) ->
+ case bin_search_inv(Bin, [], Sep) of
+ {nomatch,_} -> <<>>;
+ [Keep] -> Keep
+ end.
+
+trim_t([Bin|Cont0], N, Sep) when is_binary(Bin) ->
+ <<_:N/binary, Rest/binary>> = Bin,
+ case bin_search(Rest, Cont0, Sep) of
+ {nomatch,_} ->
+ stack(Bin, trim_t(Cont0, 0, Sep));
+ [SepStart|Cont1] ->
+ case bin_search_inv(SepStart, Cont1, Sep) of
+ {nomatch, Cont} ->
+ Tail = trim_t(Cont, 0, Sep),
+ case is_empty(Tail) of
+ true ->
+ KeepSz = byte_size(Bin) - byte_size(SepStart),
+ <<Keep:KeepSz/binary, _/binary>> = Bin,
+ Keep;
+ false ->
+ Used = cp_prefix(Cont0, Cont),
+ stack(Bin, stack(Used, Tail))
+ end;
+ [NonSep|Cont] when is_binary(NonSep) ->
+ KeepSz = byte_size(Bin) - byte_size(NonSep),
+ trim_t([Bin|Cont], KeepSz, Sep)
+ end
+ end;
+trim_t(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) ->
+ case unicode_util:cp(Str) of
+ [CP|Cs] ->
+ case lists:member(CP, CPs) of
+ true ->
+ [GC|Cs1] = unicode_util:gc(Str),
+ case lists:member(GC, GCs) of
+ true ->
+ Tail = trim_t(Cs1, 0, Sep),
+ case is_empty(Tail) of
+ true -> [];
+ false -> append(GC,Tail)
+ end;
+ false ->
+ append(GC,trim_t(Cs1, 0, Sep))
+ end;
+ false ->
+ append(CP,trim_t(Cs, 0, Sep))
+ end;
+ [] -> []
+ end;
+trim_t(Bin, N, Sep) when is_binary(Bin) ->
+ <<_:N/binary, Rest/binary>> = Bin,
+ case bin_search(Rest, Sep) of
+ {nomatch,_} -> Bin;
+ [SepStart] ->
+ case bin_search_inv(SepStart, [], Sep) of
+ {nomatch,_} ->
+ KeepSz = byte_size(Bin) - byte_size(SepStart),
+ <<Keep:KeepSz/binary, _/binary>> = Bin,
+ Keep;
+ [NonSep] ->
+ KeepSz = byte_size(Bin) - byte_size(NonSep),
+ trim_t(Bin, KeepSz, Sep)
+ end
+ end.
+
+take_l([Bin|Cont0], Sep, Acc) when is_binary(Bin) ->
+ case bin_search_inv(Bin, Cont0, Sep) of
+ {nomatch, Cont} ->
+ Used = cp_prefix(Cont0, Cont),
+ take_l(Cont, Sep, [unicode:characters_to_binary([Bin|Used])|Acc]);
+ [Bin1|_]=After when is_binary(Bin1) ->
+ First = byte_size(Bin) - byte_size(Bin1),
+ <<Keep:First/binary, _/binary>> = Bin,
+ {btoken(Keep,Acc), After}
+ end;
+take_l(Str, {GCs, _, _}=Sep, Acc) when is_list(Str) ->
+ case unicode_util:gc(Str) of
+ [C|Cs] ->
+ case lists:member(C, GCs) of
+ true -> take_l(Cs, Sep, append(rev(C),Acc));
+ false -> {rev(Acc), Str}
+ end;
+ [] -> {rev(Acc), []}
+ end;
+take_l(Bin, Sep, Acc) when is_binary(Bin) ->
+ case bin_search_inv(Bin, [], Sep) of
+ {nomatch,_} ->
+ {btoken(Bin, Acc), <<>>};
+ [After] ->
+ First = byte_size(Bin) - byte_size(After),
+ <<Keep:First/binary, _/binary>> = Bin,
+ {btoken(Keep, Acc), After}
+ end.
+
+take_lc([Bin|Cont0], Sep, Acc) when is_binary(Bin) ->
+ case bin_search(Bin, Cont0, Sep) of
+ {nomatch, Cont} ->
+ Used = cp_prefix(Cont0, Cont),
+ take_lc(Cont, Sep, [unicode:characters_to_binary([Bin|Used])|Acc]);
+ [Bin1|_]=After when is_binary(Bin1) ->
+ First = byte_size(Bin) - byte_size(Bin1),
+ <<Keep:First/binary, _/binary>> = Bin,
+ {btoken(Keep,Acc), After}
+ end;
+take_lc(Str, {GCs, _, _}=Sep, Acc) when is_list(Str) ->
+ case unicode_util:gc(Str) of
+ [C|Cs] ->
+ case lists:member(C, GCs) of
+ false -> take_lc(Cs, Sep, append(rev(C),Acc));
+ true -> {rev(Acc), Str}
+ end;
+ [] -> {rev(Acc), []}
+ end;
+take_lc(Bin, Sep, Acc) when is_binary(Bin) ->
+ case bin_search(Bin, [], Sep) of
+ {nomatch,_} ->
+ {btoken(Bin, Acc), <<>>};
+ [After] ->
+ First = byte_size(Bin) - byte_size(After),
+ <<Keep:First/binary, _/binary>> = Bin,
+ {btoken(Keep, Acc), After}
+ end.
+
+take_t([Bin|Cont0], N, Sep) when is_binary(Bin) ->
+ <<_:N/binary, Rest/binary>> = Bin,
+ case bin_search(Rest, Cont0, Sep) of
+ {nomatch,Cont} ->
+ Used = cp_prefix(Cont0, Cont),
+ {Head, Tail} = take_t(Cont, 0, Sep),
+ {stack(unicode:characters_to_binary([Bin|Used]), Head), Tail};
+ [SepStart|Cont1] ->
+ case bin_search_inv(SepStart, Cont1, Sep) of
+ {nomatch, Cont} ->
+ {Head, Tail} = take_t(Cont, 0, Sep),
+ Used = cp_prefix(Cont0, Cont),
+ case equal(Tail, Cont) of
+ true ->
+ KeepSz = byte_size(Bin) - byte_size(SepStart),
+ <<Keep:KeepSz/binary, End/binary>> = Bin,
+ {stack(Keep,Head), stack(stack(End,Used),Tail)};
+ false ->
+ {stack(unicode:characters_to_binary([Bin|Used]),Head), Tail}
+ end;
+ [NonSep|Cont] when is_binary(NonSep) ->
+ KeepSz = byte_size(Bin) - byte_size(NonSep),
+ take_t([Bin|Cont], KeepSz, Sep)
+ end
+ end;
+take_t(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) ->
+ case unicode_util:cp(Str) of
+ [CP|Cs] ->
+ case lists:member(CP, CPs) of
+ true ->
+ [GC|Cs1] = unicode_util:gc(Str),
+ case lists:member(GC, GCs) of
+ true ->
+ {Head, Tail} = take_t(Cs1, 0, Sep),
+ case equal(Tail, Cs1) of
+ true -> {Head, append(GC,Tail)};
+ false -> {append(GC,Head), Tail}
+ end;
+ false ->
+ {Head, Tail} = take_t(Cs, 0, Sep),
+ {append(CP,Head), Tail}
+ end;
+ false ->
+ {Head, Tail} = take_t(Cs, 0, Sep),
+ {append(CP,Head), Tail}
+ end;
+ [] -> {[],[]}
+ end;
+take_t(Bin, N, Sep) when is_binary(Bin) ->
+ <<_:N/binary, Rest/binary>> = Bin,
+ case bin_search(Rest, Sep) of
+ {nomatch,_} -> {Bin, <<>>};
+ [SepStart] ->
+ case bin_search_inv(SepStart, [], Sep) of
+ {nomatch,_} ->
+ KeepSz = byte_size(Bin) - byte_size(SepStart),
+ <<Before:KeepSz/binary, End/binary>> = Bin,
+ {Before, End};
+ [NonSep] ->
+ KeepSz = byte_size(Bin) - byte_size(NonSep),
+ take_t(Bin, KeepSz, Sep)
+ end
+ end.
+
+take_tc([Bin|Cont0], N, Sep) when is_binary(Bin) ->
+ <<_:N/binary, Rest/binary>> = Bin,
+ case bin_search_inv(Rest, Cont0, Sep) of
+ {nomatch,Cont} ->
+ Used = cp_prefix(Cont0, Cont),
+ {Head, Tail} = take_tc(Cont, 0, Sep),
+ {stack(unicode:characters_to_binary([Bin|Used]), Head), Tail};
+ [SepStart|Cont1] ->
+ case bin_search(SepStart, Cont1, Sep) of
+ {nomatch, Cont} ->
+ {Head, Tail} = take_tc(Cont, 0, Sep),
+ Used = cp_prefix(Cont0, Cont),
+ case equal(Tail, Cont) of
+ true ->
+ KeepSz = byte_size(Bin) - byte_size(SepStart),
+ <<Keep:KeepSz/binary, End/binary>> = Bin,
+ {stack(Keep,Head), stack(stack(End,Used),Tail)};
+ false ->
+ {stack(unicode:characters_to_binary([Bin|Used]),Head), Tail}
+ end;
+ [NonSep|Cont] when is_binary(NonSep) ->
+ KeepSz = byte_size(Bin) - byte_size(NonSep),
+ take_tc([Bin|Cont], KeepSz, Sep)
+ end
+ end;
+take_tc(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) ->
+ case unicode_util:cp(Str) of
+ [CP|Cs] ->
+ case lists:member(CP, CPs) of
+ true ->
+ [GC|Cs1] = unicode_util:gc(Str),
+ case lists:member(GC, GCs) of
+ false ->
+ {Head, Tail} = take_tc(Cs1, 0, Sep),
+ case equal(Tail, Cs1) of
+ true -> {Head, append(GC,Tail)};
+ false -> {append(GC,Head), Tail}
+ end;
+ true ->
+ {Head, Tail} = take_tc(Cs1, 0, Sep),
+ {append(GC,Head), Tail}
+ end;
+ false ->
+ {Head, Tail} = take_tc(Cs, 0, Sep),
+ case equal(Tail, Cs) of
+ true -> {Head, append(CP,Tail)};
+ false -> {append(CP,Head), Tail}
+ end
+ end;
+ [] -> {[],[]}
+ end;
+take_tc(Bin, N, Sep) when is_binary(Bin) ->
+ <<_:N/binary, Rest/binary>> = Bin,
+ case bin_search_inv(Rest, [], Sep) of
+ {nomatch,_} -> {Bin, <<>>};
+ [SepStart] ->
+ case bin_search(SepStart, [], Sep) of
+ {nomatch,_} ->
+ KeepSz = byte_size(Bin) - byte_size(SepStart),
+ <<Before:KeepSz/binary, End/binary>> = Bin,
+ {Before, End};
+ [NonSep] ->
+ KeepSz = byte_size(Bin) - byte_size(NonSep),
+ take_tc(Bin, KeepSz, Sep)
+ end
+ end.
+
+prefix_1(Cs, []) -> Cs;
+prefix_1(Cs, [_]=Pre) ->
+ prefix_2(unicode_util:gc(Cs), Pre);
+prefix_1(Cs, Pre) ->
+ prefix_2(unicode_util:cp(Cs), Pre).
+
+prefix_2([C|Cs], [C|Pre]) ->
+ prefix_1(Cs, Pre);
+prefix_2(_, _) ->
+ nomatch.
+
+split_1([Bin|Cont0], Needle, Start, Where, Curr0, Acc)  % list headed by a binary; Start is the byte offset in Bin to search from
+ when is_binary(Bin) ->
+ case bin_search_str(Bin, Start, Cont0, Needle) of
+ {nomatch,Sz,Cont} ->  % no match in Bin: keep its first Sz bytes in the current piece, go on with Cont
+ <<Keep:Sz/binary, _/binary>> = Bin,
+ split_1(Cont, Needle, 0, Where, [Keep|Curr0], Acc);
+ {Before, [Cs0|Cont], After} ->  % match: Before precedes it, Cs0 is Bin from the match on, After follows the needle
+ Curr = add_non_empty(Before,Curr0),
+ case Where of
+ leading ->  % first match is enough: return the two parts right away
+ [rev(Curr),After];
+ trailing ->  % remember this match in Acc, then search again one code point further into Bin
+ <<_/utf8, Cs/binary>> = Cs0,
+ Next = byte_size(Bin) - byte_size(Cs),
+ split_1([Bin|Cont], Needle, Next, Where,
+ Curr0, [rev(Curr),After]);
+ all ->  % finish the current piece, push it on Acc and continue after the needle
+ split_1(After, Needle, 0, Where, [], [rev(Curr)|Acc])
+ end
+ end;
+split_1(Cs0, [C|_]=Needle, _, Where, Curr, Acc) when is_list(Cs0) ->  % plain list input: walk it one code point at a time
+ case unicode_util:cp(Cs0) of
+ [C|Cs] ->  % next code point equals the first needle element: try a full prefix match
+ case prefix_1(Cs0, Needle) of
+ nomatch -> split_1(Cs, Needle, 0, Where, append(C,Curr), Acc);
+ Rest when Where =:= leading ->
+ [rev(Curr), Rest];
+ Rest when Where =:= trailing ->  % record this match and keep scanning for a later one
+ split_1(Cs, Needle, 0, Where, [C|Curr], [rev(Curr), Rest]);
+ Rest when Where =:= all ->
+ split_1(Rest, Needle, 0, Where, [], [rev(Curr)|Acc])
+ end;
+ [Other|Cs] ->  % any other element: it belongs to the current piece
+ split_1(Cs, Needle, 0, Where, append(Other,Curr), Acc);
+ [] ->  % input exhausted: hand back the current piece together with Acc as a tuple
+ {rev(Curr), Acc}
+ end;
+split_1(Bin, [_C|_]=Needle, Start, Where, Curr0, Acc) ->  % remaining case: a bare binary (no continuation)
+ case bin_search_str(Bin, Start, [], Needle) of
+ {nomatch,_,_} ->  % no further match: the bytes from Start on end the current piece
+ <<_:Start/binary, Keep/binary>> = Bin,
+ {rev([Keep|Curr0]), Acc};
+ {Before, [Cs0], After} ->
+ case Where of
+ leading ->
+ [rev([Before|Curr0]),After];
+ trailing ->  % remember this match, then search again one code point past its start
+ <<_/utf8, Cs/binary>> = Cs0,
+ Next = byte_size(Bin) - byte_size(Cs),
+ split_1(Bin, Needle, Next, Where, Curr0,
+ [btoken(Before,Curr0),After]);
+ all ->  % Before counts from byte 0 of Bin, so drop the first Start bytes that were already emitted
+ Next = byte_size(Bin) - byte_size(After),
+ <<_:Start/binary, Keep/binary>> = Before,
+ Curr = [Keep|Curr0],
+ split_1(Bin, Needle, Next, Where, [], [rev(Curr)|Acc])
+ end
+ end.
+
+lexemes_m([Bin|Cont0], Seps, Ts) when is_binary(Bin) ->  % list headed by a binary; Ts collects lexemes in reverse order
+ case bin_search_inv(Bin, Cont0, Seps) of
+ {nomatch,Cont} ->  % Bin held only separators: continue with what follows
+ lexemes_m(Cont, Seps, Ts);
+ Cs ->  % Cs starts at the first non-separator: pick one lexeme from there
+ {Lexeme,Rest} = lexeme_pick(Cs, Seps, []),
+ lexemes_m(Rest, Seps, [Lexeme|Ts])
+ end;
+lexemes_m(Cs0, {GCs, _, _}=Seps, Ts) when is_list(Cs0) ->  % plain list input; GCs is the first element of the Seps tuple
+ case unicode_util:gc(Cs0) of
+ [C|Cs] ->
+ case lists:member(C, GCs) of
+ true ->  % leading separator: skip it
+ lexemes_m(Cs, Seps, Ts);
+ false ->
+ {Lexeme,Rest} = lexeme_pick(Cs0, Seps, []),
+ lexemes_m(Rest, Seps, [Lexeme|Ts])
+ end;
+ [] ->  % input exhausted: restore the original lexeme order
+ lists:reverse(Ts)
+ end;
+lexemes_m(Bin, Seps, Ts) when is_binary(Bin) ->  % bare binary input
+ case bin_search_inv(Bin, [], Seps) of
+ {nomatch,_} ->
+ lists:reverse(Ts);
+ [Cs] ->
+ {Lexeme,Rest} = lexeme_pick(Cs, Seps, []),
+ lexemes_m(Rest, Seps, add_non_empty(Lexeme,Ts))  % an empty binary lexeme is not added
+ end.
+
+lexeme_pick([CP|Cs1]=Cs0, {GCs,CPs,_}=Seps, Tkn) when is_integer(CP) ->  % returns {Lexeme, Rest}; Tkn holds the lexeme so far, reversed
+ case lists:member(CP, CPs) of
+ true ->  % CP is in CPs, so a separator may start here: check the whole grapheme cluster
+ [GC|Cs2] = unicode_util:gc(Cs0),
+ case lists:member(GC, GCs) of
+ true -> {rev(Tkn), Cs2};  % separator found: lexeme ends, rest continues after the separator
+ false -> lexeme_pick(Cs2, Seps, append(rev(GC),Tkn))
+ end;
+ false -> lexeme_pick(Cs1, Seps, [CP|Tkn])
+ end;
+lexeme_pick([Bin|Cont0], Seps, Tkn) when is_binary(Bin) ->  % list headed by a binary: search the binary for a separator
+ case bin_search(Bin, Cont0, Seps) of
+ {nomatch,_} ->  % no separator in Bin: all of it belongs to the lexeme
+ lexeme_pick(Cont0, Seps, [Bin|Tkn]);
+ [Left|_Cont] = Cs ->  % Left is the tail of Bin from the separator on; the bytes before it finish the lexeme
+ Bytes = byte_size(Bin) - byte_size(Left),
+ <<Lexeme:Bytes/binary, _/binary>> = Bin,
+ {btoken(Lexeme, Tkn), Cs}
+ end;
+lexeme_pick(Cs0, {GCs, CPs, _} = Seps, Tkn) when is_list(Cs0) ->  % other lists: let unicode_util:cp/1 produce the next code point
+ case unicode_util:cp(Cs0) of
+ [CP|Cs] ->
+ case lists:member(CP, CPs) of
+ true ->
+ [GC|Cs2] = unicode_util:gc(Cs0),
+ case lists:member(GC, GCs) of
+ true -> {rev(Tkn), Cs0};  % note: here the rest still starts at the separator
+ false -> lexeme_pick(Cs2, Seps, append(rev(GC),Tkn))
+ end;
+ false ->
+ lexeme_pick(Cs, Seps, append(CP,Tkn))
+ end;
+ [] ->  % input exhausted: the lexeme runs to the end
+ {rev(Tkn), []}
+ end;
+lexeme_pick(Bin, Seps, Tkn) when is_binary(Bin) ->  % bare binary input
+ case bin_search(Bin, Seps) of
+ {nomatch,_} ->
+ {btoken(Bin,Tkn), []};
+ [Left] ->  % Left starts at the separator; the bytes before it form the lexeme
+ Bytes = byte_size(Bin) - byte_size(Left),
+ <<Lexeme:Bytes/binary, _/binary>> = Bin,
+ {btoken(Lexeme, Tkn), Left}
+ end.
+
+nth_lexeme_m([Bin|Cont0], Seps, N) when is_binary(Bin) ->  % returns the N:th lexeme; list headed by a binary
+ case bin_search_inv(Bin, Cont0, Seps) of
+ {nomatch,Cont} ->  % only separators in Bin: continue with what follows
+ nth_lexeme_m(Cont, Seps, N);
+ Cs when N > 1 ->  % not the wanted lexeme yet: skip it and count down
+ Rest = lexeme_skip(Cs, Seps),
+ nth_lexeme_m(Rest, Seps, N-1);
+ Cs ->  % this is the wanted lexeme: pick it and discard the rest
+ {Lexeme,_} = lexeme_pick(Cs, Seps, []),
+ Lexeme
+ end;
+nth_lexeme_m(Cs0, {GCs, _, _}=Seps, N) when is_list(Cs0) ->  % plain list input
+ case unicode_util:gc(Cs0) of
+ [C|Cs] ->
+ case lists:member(C, GCs) of
+ true ->  % leading separator: skip it
+ nth_lexeme_m(Cs, Seps, N);
+ false when N > 1 ->
+ Cs1 = lexeme_skip(Cs, Seps),
+ nth_lexeme_m(Cs1, Seps, N-1);
+ false ->
+ {Lexeme,_} = lexeme_pick(Cs0, Seps, []),
+ Lexeme
+ end;
+ [] ->  % ran out of input before reaching lexeme N
+ []
+ end;
+nth_lexeme_m(Bin, Seps, N) when is_binary(Bin) ->  % bare binary input
+ case bin_search_inv(Bin, [], Seps) of
+ [Cs] when N > 1 ->
+ Cs1 = lexeme_skip(Cs, Seps),
+ nth_lexeme_m(Cs1, Seps, N-1);
+ [Cs] ->
+ {Lexeme,_} = lexeme_pick(Cs, Seps, []),
+ Lexeme;
+ {nomatch,_} ->  % ran out of input before reaching lexeme N
+ <<>>
+ end.
+
+lexeme_skip([CP|Cs1]=Cs0, {GCs,CPs,_}=Seps) when is_integer(CP) ->  % skip one lexeme; returns the input from the next separator on
+ case lists:member(CP, CPs) of
+ true ->  % CP is in CPs: check whether the whole grapheme cluster is a separator
+ [GC|Cs2] = unicode_util:gc(Cs0),
+ case lists:member(GC, GCs) of
+ true -> Cs0;  % separator reached: result still includes it
+ false -> lexeme_skip(Cs2, Seps)
+ end;
+ false ->
+ lexeme_skip(Cs1, Seps)
+ end;
+lexeme_skip([Bin|Cont0], Seps) when is_binary(Bin) ->  % list headed by a binary
+ case bin_search(Bin, Cont0, Seps) of
+ {nomatch,_} -> lexeme_skip(Cont0, Seps);  % no separator in Bin: drop it and keep looking
+ Cs -> Cs
+ end;
+lexeme_skip(Cs0, {GCs, CPs, _} = Seps) when is_list(Cs0) ->  % other lists: let unicode_util:cp/1 produce the next code point
+ case unicode_util:cp(Cs0) of
+ [CP|Cs] ->
+ case lists:member(CP, CPs) of
+ true ->
+ [GC|Cs2] = unicode_util:gc(Cs0),
+ case lists:member(GC, GCs) of
+ true -> Cs0;
+ false -> lexeme_skip(Cs2, Seps)
+ end;
+ false ->
+ lexeme_skip(Cs, Seps)
+ end;
+ [] ->  % no separator before the end of input
+ []
+ end;
+lexeme_skip(Bin, Seps) when is_binary(Bin) ->  % bare binary input
+ case bin_search(Bin, Seps) of
+ {nomatch,_} -> <<>>;
+ [Left] -> Left
+ end.
+
+find_l([Bin|Cont0], Needle) when is_binary(Bin) ->  % leftmost search: returns the input from the first match on, or nomatch
+ case bin_search_str(Bin, 0, Cont0, Needle) of
+ {nomatch, _, Cont} ->  % nothing in this binary: continue with what follows
+ find_l(Cont, Needle);
+ {_Before, Cs, _After} ->  % Cs begins at the match
+ Cs
+ end;
+find_l(Cs0, [C|_]=Needle) when is_list(Cs0) ->  % plain list input: walk it one code point at a time
+ case unicode_util:cp(Cs0) of
+ [C|Cs] ->  % first needle element seen: verify the complete needle
+ case prefix_1(Cs0, Needle) of
+ nomatch -> find_l(Cs, Needle);
+ _ -> Cs0
+ end;
+ [_C|Cs] ->
+ find_l(Cs, Needle);
+ [] -> nomatch
+ end;
+find_l(Bin, Needle) ->  % remaining case: a bare binary
+ case bin_search_str(Bin, 0, [], Needle) of
+ {nomatch,_,_} -> nomatch;
+ {_Before, [Cs], _After} -> Cs
+ end.
+
+find_r([Bin|Cont0], Needle, Res) when is_binary(Bin) ->  % rightmost search: Res is the best result so far and is returned at end of input
+ case bin_search_str(Bin, 0, Cont0, Needle) of
+ {nomatch,_,Cont} ->
+ find_r(Cont, Needle, Res);
+ {_, Cs0, _} ->  % match at Cs0: remember it, then search on past its first grapheme cluster
+ [_|Cs] = unicode_util:gc(Cs0),
+ find_r(Cs, Needle, Cs0)
+ end;
+find_r(Cs0, [C|_]=Needle, Res) when is_list(Cs0) ->  % plain list input: walk it one code point at a time
+ case unicode_util:cp(Cs0) of
+ [C|Cs] ->  % first needle element seen: verify the complete needle
+ case prefix_1(Cs0, Needle) of
+ nomatch -> find_r(Cs, Needle, Res);
+ _ -> find_r(Cs, Needle, Cs0)  % later match replaces the earlier one
+ end;
+ [_C|Cs] ->
+ find_r(Cs, Needle, Res);
+ [] -> Res
+ end;
+find_r(Bin, Needle, Res) ->  % remaining case: a bare binary
+ case bin_search_str(Bin, 0, [], Needle) of
+ {nomatch,_,_} -> Res;
+ {_Before, [Cs0], _After} ->  % remember the match, then search on one code point past its start
+ <<_/utf8, Cs/binary>> = Cs0,
+ find_r(Cs, Needle, Cs0)
+ end.
+
+%% These are used to avoid creating lists around binaries.
+%% They might be unnecessary; is there a better solution?
+%% Combine the binary part of a token with the reversed accumulator of
+%% elements collected before it, avoiding a wrapping list when possible.
+btoken(BinPart, Tkn) ->
+    case {BinPart, Tkn} of
+        {_, []} ->
+            %% Nothing collected before the binary: return it untouched.
+            BinPart;
+        {_, [C]} when is_integer(C) ->
+            %% A single code point: prepend it to the binary as UTF-8.
+            <<C/utf8, BinPart/binary>>;
+        {<<>>, _} ->
+            %% Empty binary part: only the collected elements remain.
+            lists:reverse(Tkn);
+        _ ->
+            [lists:reverse(Tkn), BinPart]
+    end.
+
+rev([B]) when is_binary(B) -> B;  % a lone binary is unwrapped rather than returned as a one-element list
+rev(L) when is_list(L) -> lists:reverse(L);  % accumulators are built in reverse order
+rev(C) when is_integer(C) -> C.  % a single code point is returned unchanged
+
+append(Char, <<>>) when is_integer(Char) -> [Char];  % empty binary tail: result is just the code point in a list
+append(Char, <<>>) when is_list(Char) -> Char;  % empty binary tail: result is the list itself
+append(Char, Bin) when is_binary(Bin) -> [Char,Bin];  % non-empty binary tail: keep the binary as a list element
+append(Char, Str) when is_integer(Char) -> [Char|Str];  % code point in front of a non-binary tail
+append(GC, Str) when is_list(GC) -> GC ++ Str.  % list (e.g. a multi code point cluster) in front of a non-binary tail
+
+%% Put Top in front of the continuation Rest without creating needless
+%% nesting: an empty continuation yields Top itself, and an empty Top
+%% (<<>> or []) yields the continuation unchanged.
+stack(Top, Rest) ->
+    if
+        Rest =:= [] -> Top;
+        Top =:= <<>>; Top =:= [] -> Rest;
+        true -> [Top|Rest]
+    end.
+
+%% Push Token onto Acc unless it is the empty binary.
+add_non_empty(Token, Acc) ->
+    case Token of
+        <<>> -> Acc;
+        _ -> [Token|Acc]
+    end.
+
+cp_prefix(Orig, Cont) ->  % code points of Orig that come before Cont (Cont is presumably a tail of Orig)
+ case unicode_util:cp(Cont) of
+ [] -> Orig;  % empty continuation: Orig is returned as is
+ [Cp|Rest] -> cp_prefix_1(Orig, Cp, Rest)  % otherwise collect Orig up to where Cp followed by Rest begins
+ end.
+
+cp_prefix_1(Orig, Until, Cont) ->  % collect code points of Orig until Until is found with a rest equal to Cont
+ case unicode_util:cp(Orig) of
+ [Until|Rest] ->  % candidate stop point: the same code point as the continuation's first one
+ case equal(Rest, Cont) of
+ true -> [];  % rest matches the continuation: the prefix ends here
+ false-> [Until|cp_prefix_1(Rest, Until, Cont)]
+ end;
+ [CP|Rest] -> [CP|cp_prefix_1(Rest, Until, Cont)]  % no clause for []: fails if Orig runs out first
+ end.
+
+
+%% Binary special
+bin_search(Bin, Seps) ->  % convenience arity: search a bare binary, i.e. with an empty continuation
+ bin_search(Bin, [], Seps).
+
+bin_search(_Bin, Cont, {[],_,_}) ->  % no separators given: nothing can match
+ {nomatch, Cont};
+bin_search(Bin, Cont, {Seps,_,BP}) ->  % search from byte offset 0; BP is the third element of the pattern tuple
+ bin_search_loop(Bin, 0, BP, Cont, Seps).
+
+%% Need to work with [<<$a>>, <<778/utf8>>],
+%% i.e. å in nfd form $a "COMBINING RING ABOVE"
+%% and PREPEND characters like "ARABIC NUMBER SIGN" 1536 <<216,128>>
+%% combined with other characters are currently ignored.
+%% Build the separator pattern tuple {Seps, CPs, BinPatterns}:
+%% CPs holds the first code point of every separator and BinPatterns
+%% the UTF-8 binaries of those code points.
+search_pattern(Separators) ->
+    FirstCPs = search_cp(Separators),
+    {Separators, FirstCPs, bin_pattern(FirstCPs)}.
+
+search_cp([CP|Seps]) when is_integer(CP) ->  % a separator that is a single code point is used as is
+ [CP|search_cp(Seps)];
+search_cp([Pattern|Seps]) ->  % any other separator: keep only its first code point
+ [CP|_] = unicode_util:cp(Pattern),
+ [CP|search_cp(Seps)];
+search_cp([]) -> [].
+
+bin_pattern([CP|Seps]) ->  % encode each code point as a UTF-8 binary
+ [<<CP/utf8>>|bin_pattern(Seps)];
+bin_pattern([]) -> [].
+
+bin_search_loop(Bin0, Start, _, Cont, _Seps)  % offset at or past the end of Bin0, or negative: nothing left to search
+ when byte_size(Bin0) =< Start; Start < 0 ->
+ {nomatch, Cont};
+bin_search_loop(Bin0, Start, BinSeps, Cont, Seps) ->  % find the first separator in Bin0 at or after byte offset Start
+ <<_:Start/binary, Bin/binary>> = Bin0,
+ case binary:match(Bin, BinSeps) of  % byte-level search for a candidate position
+ nomatch ->
+ {nomatch,Cont};
+ {Where, _CL} ->
+ <<_:Where/binary, Cont0/binary>> = Bin,
+ Cont1 = stack(Cont0, Cont),
+ [GC|Cont2] = unicode_util:gc(Cont1),  % the candidate must also be a separator as a whole grapheme cluster
+ case lists:member(GC, Seps) of
+ false ->  % false hit: resume the search after this cluster if the rest is still inside Bin0
+ case Cont2 of
+ [BinR|Cont] when is_binary(BinR) ->
+ Next = byte_size(Bin0) - byte_size(BinR),
+ bin_search_loop(Bin0, Next, BinSeps, Cont, Seps);
+ BinR when is_binary(BinR), Cont =:= [] ->
+ Next = byte_size(Bin0) - byte_size(BinR),
+ bin_search_loop(Bin0, Next, BinSeps, Cont, Seps);
+ _ ->  % the cluster ran into the continuation: report nomatch with what is left
+ {nomatch, Cont2}
+ end;
+ true when is_list(Cont1) ->  % found: the result always is a list starting at the separator
+ Cont1;
+ true ->
+ [Cont1]
+ end
+ end.
+
+bin_search_inv(Bin, Cont, {[], _, _}) ->  % inverse search with no separators: the input is returned unchanged
+ [Bin|Cont];
+bin_search_inv(Bin, Cont, {[Sep], _, _}) ->  % exactly one separator: use the single-separator variant
+ bin_search_inv_1([Bin|Cont], Sep);
+bin_search_inv(Bin, Cont, {Seps, _, _}) ->  % several separators: use the membership-test variant
+ bin_search_inv_n([Bin|Cont], Seps).
+
+bin_search_inv_1([<<>>|CPs], _) ->  % binary exhausted without finding a non-separator
+ {nomatch, CPs};
+bin_search_inv_1(CPs = [Bin0|Cont], Sep) when is_binary(Bin0) ->  % skip leading clusters equal to Sep
+ case unicode_util:gc(CPs) of
+ [Sep|Bin] when is_binary(Bin), Cont =:= [] ->  % separator skipped, rest is a bare binary: wrap it and go on
+ bin_search_inv_1([Bin], Sep);
+ [Sep|[Bin|Cont]=Cs] when is_binary(Bin) ->  % Cont is already bound, so the tail must be the same continuation
+ bin_search_inv_1(Cs, Sep);
+ [Sep|Cs] ->  % separator skipped but the rest is no longer headed by this binary
+ {nomatch, Cs};
+ _ -> CPs  % first cluster is not the separator: the input starts at a non-separator
+ end.
+
+bin_search_inv_n([<<>>|CPs], _) ->  % binary exhausted without finding a non-separator
+ {nomatch, CPs};
+bin_search_inv_n([Bin0|Cont]=CPs, Seps) when is_binary(Bin0) ->  % skip leading clusters that are members of Seps
+ [C|Cs0] = unicode_util:gc(CPs),
+ case {lists:member(C, Seps), Cs0} of
+ {true, Cs} when is_binary(Cs), Cont =:= [] ->  % separator skipped, rest is a bare binary: wrap it and go on
+ bin_search_inv_n([Cs], Seps);
+ {true, [Bin|Cont]=Cs} when is_binary(Bin) ->  % Cont is already bound, so the tail must be the same continuation
+ bin_search_inv_n(Cs, Seps);
+ {true, Cs} -> {nomatch, Cs};  % separator skipped but the rest is no longer headed by this binary
+ {false, _} -> CPs  % first cluster is not a separator: the input starts at a non-separator
+ end.
+
+bin_search_str(Bin0, Start, Cont, [CP|_]=SearchCPs) ->  % search Bin0 (followed by Cont) for SearchCPs from byte offset Start
+ <<_:Start/binary, Bin/binary>> = Bin0,
+ case binary:match(Bin, <<CP/utf8>>) of  % byte-level search for the first code point of the needle
+ nomatch -> {nomatch, byte_size(Bin0), Cont};  % the size tells the caller that all of Bin0 was scanned
+ {Where0, _} ->
+ Where = Start+Where0,  % offset relative to Bin0 rather than to the searched tail
+ <<Keep:Where/binary, Cs0/binary>> = Bin0,
+ [GC|Cs]=unicode_util:gc(Cs0),
+ case prefix_1(stack(Cs0,Cont), SearchCPs) of  % verify the whole needle, possibly reaching into Cont
+ nomatch when is_binary(Cs) ->  % false hit with a binary rest: retry after this grapheme cluster
+ KeepSz = byte_size(Bin0) - byte_size(Cs),
+ bin_search_str(Bin0, KeepSz, Cont, SearchCPs);
+ nomatch ->  % false hit and the rest of Cs0 is not a binary: give up on this binary
+ {nomatch, Where, stack([GC|Cs],Cont)};
+ [] ->  % needle matched up to the very end: the remainder is reported as an empty binary
+ {Keep, [Cs0|Cont], <<>>};
+ Rest ->  % match: bytes before it, the input from the match on, and what follows the needle
+ {Keep, [Cs0|Cont], Rest}
+ end
+ end.
+
+
+%%---------------------------------------------------------------------------
+%% OLD lists API kept for backwards compatibility
+%%---------------------------------------------------------------------------
+
%% Robert's bit
%% len(String)
@@ -68,12 +1292,12 @@ len(S) -> length(S).
%% equal(String1, String2)
%% Test if 2 strings are equal.
--spec equal(String1, String2) -> boolean() when
- String1 :: string(),
- String2 :: string().
+%% -spec equal(String1, String2) -> boolean() when
+%% String1 :: string(),
+%% String2 :: string().
-equal(S, S) -> true;
-equal(_, _) -> false.
+%% equal(S, S) -> true;
+%% equal(_, _) -> false.
%% concat(String1, String2)
%% Concatenate 2 strings.
@@ -127,7 +1351,7 @@ rchr([], _C, _I, L) -> L.
str(S, Sub) when is_list(Sub) -> str(S, Sub, 1).
str([C|S], [C|Sub], I) ->
- case prefix(Sub, S) of
+ case l_prefix(Sub, S) of
true -> I;
false -> str(S, [C|Sub], I+1)
end;
@@ -142,16 +1366,16 @@ str([], _Sub, _I) -> 0.
rstr(S, Sub) when is_list(Sub) -> rstr(S, Sub, 1, 0).
rstr([C|S], [C|Sub], I, L) ->
- case prefix(Sub, S) of
+ case l_prefix(Sub, S) of
true -> rstr(S, [C|Sub], I+1, I);
false -> rstr(S, [C|Sub], I+1, L)
end;
rstr([_|S], Sub, I, L) -> rstr(S, Sub, I+1, L);
rstr([], _Sub, _I, L) -> L.
-prefix([C|Pre], [C|String]) -> prefix(Pre, String);
-prefix([], String) when is_list(String) -> true;
-prefix(Pre, String) when is_list(Pre), is_list(String) -> false.
+l_prefix([C|Pre], [C|String]) -> l_prefix(Pre, String);
+l_prefix([], String) when is_list(String) -> true;
+l_prefix(Pre, String) when is_list(Pre), is_list(String) -> false.
%% span(String, Chars) -> Length.
%% cspan(String, Chars) -> Length.
@@ -229,9 +1453,9 @@ tokens(S, Seps) ->
[_|_] -> [S]
end;
[C] ->
- tokens_single_1(reverse(S), C, []);
+ tokens_single_1(lists:reverse(S), C, []);
[_|_] ->
- tokens_multiple_1(reverse(S), Seps, [])
+ tokens_multiple_1(lists:reverse(S), Seps, [])
end.
tokens_single_1([Sep|S], Sep, Toks) ->
@@ -342,8 +1566,8 @@ sub_word(String, Index, Char) when is_integer(Index), is_integer(Char) ->
s_word(strip(String, left, Char), Index, Char, 1, [])
end.
-s_word([], _, _, _,Res) -> reverse(Res);
-s_word([Char|_],Index,Char,Index,Res) -> reverse(Res);
+s_word([], _, _, _,Res) -> lists:reverse(Res);
+s_word([Char|_],Index,Char,Index,Res) -> lists:reverse(Res);
s_word([H|T],Index,Char,Index,Res) -> s_word(T,Index,Char,Index,[H|Res]);
s_word([Char|T],Stop,Char,Index,Res) when Index < Stop ->
s_word(strip(T,left,Char),Stop,Char,Index+1,Res);
@@ -359,7 +1583,7 @@ strip(String) -> strip(String, both).
-spec strip(String, Direction) -> Stripped when
String :: string(),
Stripped :: string(),
- Direction :: left | right | both.
+ Direction :: 'left' | 'right' | 'both'.
strip(String, left) -> strip_left(String, $\s);
strip(String, right) -> strip_right(String, $\s);
@@ -369,7 +1593,7 @@ strip(String, both) ->
-spec strip(String, Direction, Character) -> Stripped when
String :: string(),
Stripped :: string(),
- Direction :: left | right | both,
+ Direction :: 'left' | 'right' | 'both',
Character :: char().
strip(String, right, Char) -> strip_right(String, Char);
diff --git a/lib/stdlib/test/string_SUITE.erl b/lib/stdlib/test/string_SUITE.erl
index 836f9e5142..a78ddf761b 100644
--- a/lib/stdlib/test/string_SUITE.erl
+++ b/lib/stdlib/test/string_SUITE.erl
@@ -29,25 +29,46 @@
-export([init_per_testcase/2, end_per_testcase/2]).
%% Test cases must be exported.
--export([len/1,equal/1,concat/1,chr_rchr/1,str_rstr/1]).
--export([span_cspan/1,substr/1,tokens/1,chars/1]).
+-export([is_empty/1, length/1, to_graphemes/1,
+ reverse/1, slice/1,
+ equal/1,
+ pad/1, trim/1, chomp/1, take/1,
+ uppercase/1, lowercase/1, titlecase/1, casefold/1,
+ prefix/1, split/1, replace/1, find/1,
+ lexemes/1, nth_lexeme/1, cd_gc/1, meas/1
+ ]).
+
+-export([len/1,old_equal/1,old_concat/1,chr_rchr/1,str_rstr/1]).
+-export([span_cspan/1,substr/1,old_tokens/1,chars/1]).
-export([copies/1,words/1,strip/1,sub_word/1,left_right/1]).
-export([sub_string/1,centre/1, join/1]).
-export([to_integer/1,to_float/1]).
-export([to_upper_to_lower/1]).
+%% Run tests when debugging them
+-export([debug/0]).
+
suite() ->
[{ct_hooks,[ts_install_cth]},
{timetrap,{minutes,1}}].
-all() ->
- [len, equal, concat, chr_rchr, str_rstr, span_cspan,
- substr, tokens, chars, copies, words, strip, sub_word,
- left_right, sub_string, centre, join, to_integer,
- to_float, to_upper_to_lower].
+all() ->  % the suite consists of the two groups defined in groups/0
+ [{group, chardata}, {group, list_string}].
-groups() ->
- [].
+groups() ->  % test case names per group
+ [{chardata,  % the test cases added together with this group
+ [is_empty, length, to_graphemes,
+ equal, reverse, slice,
+ pad, trim, chomp, take,
+ lexemes, nth_lexeme,
+ uppercase, lowercase, titlecase, casefold,
+ prefix, find, split, replace, cd_gc,
+ meas]},
+ {list_string,  % the previously existing test cases; equal, concat and tokens now carry an old_ prefix
+ [len, old_equal, old_concat, chr_rchr, str_rstr, span_cspan,
+ substr, old_tokens, chars, copies, words, strip, sub_word,
+ left_right, sub_string, centre, join, to_integer,
+ to_float, to_upper_to_lower]}].
init_per_suite(Config) ->
Config.
@@ -68,8 +89,839 @@ init_per_testcase(_Case, Config) ->
end_per_testcase(_Case, _Config) ->
ok.
+debug() ->  % run every test case of every group directly, printing each name and result
+ Config = [{data_dir, ?MODULE_STRING++"_data"}],  % minimal hand-built config containing only data_dir
+ [io:format("~p:~p~n",[Test,?MODULE:Test(Config)]) ||
+ {_,Tests} <- groups(), Test <- Tests].
+
+-define(TEST(B,C,D), test(?LINE,?FUNCTION_NAME,B,C,D, true)).  % forwards to test/6 with the call site line and the enclosing function's name
+%% Like ?TEST, but runs test/6 a second time with B and the first element
+%% of the argument list C swapped (same shape as ?TEST_NN below).
+%% Fixed: the swapped argument list was missing its closing ']', so any
+%% use of this macro expanded to text that does not parse.
+-define(TEST_EQ(B,C,D),
+        test(?LINE,?FUNCTION_NAME,B,C,D, true),
+        test(?LINE,?FUNCTION_NAME,hd(C),[B|tl(C)],D, true)).
+
+-define(TEST_NN(B,C,D),  % two test/6 calls with 'false' as last argument: as given, and with B swapped with the first element of C
+ test(?LINE,?FUNCTION_NAME,B,C,D, false),
+ test(?LINE,?FUNCTION_NAME,hd(C),[B|tl(C)],D, false)).
+
+
+is_empty(_) ->  % each ?TEST(Str, Args, Expected) goes to test/6 tagged with this function's name (test/6 is defined elsewhere)
+ ?TEST("", [], true),
+ ?TEST([""|<<>>], [], true),  % improper list ending in an empty binary
+ ?TEST("a", [], false),
+ ?TEST([""|<<$a>>], [], false),
+ ?TEST(["",[<<>>]], [], true),  % nested empty parts
+ ok.
+
+length(_) ->  % test case for string:length/1; note it shares its name with erlang:length/1
+ %% invalid arg type
+ {'EXIT',_} = (catch string:length({})),
+ {'EXIT',_} = (catch string:length(foo)),
+ %% Valid signs
+ ?TEST("", [], 0),
+ ?TEST([""|<<>>], [], 0),
+ L = tuple_size(list_to_tuple(atom_to_list(?MODULE))),  % number of characters in the module name, computed without calling a length function
+ ?TEST(atom_to_list(?MODULE), [], L),
+ ?TEST("Hello", [], 5),
+ ?TEST("UC Ω ßð", [], 7),
+ ?TEST(["abc"|<<"abc">>], [], 6),
+ ?TEST(["abc",["def"]], [], 6),
+ ?TEST([<<97/utf8, 778/utf8, 98/utf8>>, [776,111,776]], [], 3), %% åäö in nfd
+ ok.
+
+equal(_) ->  % test case for string:equal with 2, 3 and 4 arguments
+ %% invalid arg type
+ {'EXIT',_} = (catch string:equal(1, 2)),
+ {'EXIT',_} = (catch string:equal(1, 2, foo)),
+ {'EXIT',_} = (catch string:equal(1, 2, true, foo)),
+
+ ?TEST("", [<<"">>], true),
+ ?TEST("Hello", ["Hello"], true),
+ ?TEST("Hello", ["Hell"], false),
+ ?TEST("Hello", ["Hello!"], false),
+ ?TEST("Hello", [<<"Hello"/utf8>>], true),
+ ?TEST("Hello", [<<"Mello"/utf8>>], false),
+ ?TEST("Hello", [<<"Hello!"/utf8>>], false),
+ ?TEST(["Hello",[" deep"]], ["Hello deep"], true),
+ ?TEST(["Hello",[<<" deep"/utf8>>]], ["Hello deep"], true),
+ ?TEST("Hello deep", [["Hello", [" deep"]]], true),
+ ?TEST("Hello deep", [["Hello", [" d!eep"]]], false),
+ ?TEST("Hello deep", [["Hello", [<<" deep"/utf8>>]]], true),
+ false = string:equal("Åäö", [<<97/utf8, 778/utf8, 98/utf8>>, [776,111,776]]), %% nfc vs nfd
+
+ %% case_insensitive_equal()
+ ?TEST("", ["", true], true),
+ ?TEST("a", ["b", true], false),
+ ?TEST("", [<<>>, true], true),
+ ?TEST("", [[<<>>,[]], true], true),
+ ?TEST("", [[<<>>,[$a]], true], false),
+ ?TEST("123", ["123", true], true),
+ ?TEST("abc", ["abc", true], true),
+ ?TEST([[],<<>>,"ABC"|<<>>], [["abc",[]], true], true),
+ ?TEST("ABCa", ["abcå", true], false),
+ ?TEST("åäö", [{norm,"åäö"}, true], true),
+ ?TEST("ÅÄÖ", [{norm,"åäö"}, true], true),
+ ?TEST("MICHAŁ", ["michał", true], true),
+ ?TEST(["Mic",<<"HAŁ"/utf8>>], ["michał", true], true),
+ ?TEST("ß SHARP S", ["ss sharp s", true], true),
+ ?TEST("ẞ SHARP S", [[<<$ß/utf8, $\s>>,"SHARP S"], true], true),
+ ?TEST("ẞ SHARP ß", ["ss sharp s", true], false),
+ ?TEST(<<"İ I WITH DOT ABOVE"/utf8>>, ["i̇ i with dot above", true], true),
+ %% These should be equivalent with the above
+ true = string:equal(string:casefold(["Mic",<<"HAŁ"/utf8>>]), string:casefold("michał")),
+ true = string:equal(string:casefold("ẞ SHARP S"), string:casefold([<<$ß/utf8, $\s>>,"SHARP S"])),
+ false = string:equal(string:casefold("ẞ SHARP ß"), string:casefold("ss sharp s")),
+
+ %% Normalization
+ ?TEST_NN("", ["", true, none], true),  % ?TEST_NN also runs each case with the two strings swapped
+ ?TEST_NN("a", ["b", true, nfc], false),
+ ?TEST_NN("a", ["b", true, nfd], false),
+ ?TEST_NN("a", ["b", true, nfkc], false),
+ ?TEST_NN("a", ["b", true, nfkd], false),
+
+ ?TEST_NN("a", ["A", false, nfc], false),
+ ?TEST_NN("a", ["A", false, nfd], false),
+ ?TEST_NN([<<>>,"a"|<<>>], ["A", true, nfkc], true),
+ ?TEST_NN(<<"a">>, ["A", true, nfkd], true),
+
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, none], false),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfc], true),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfd], true),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfkc], true),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfkd], true),
+
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, none], false),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", false, nfc], false),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfc], true),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfd], true),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfkc], true),
+ ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfkd], true),
+
+ ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, none], false),
+ ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, nfc], false),
+ ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, nfd], false),
+ ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, nfkc], true),
+ ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abホンダ", true, nfkd], true),
+
+ ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, none], false),
+ ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, nfc], false),
+ ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, nfd], false),
+ ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, nfkc], true),
+ ?TEST_NN([$a, <<$b>>, "32"], ["ab32", true, nfkd], true),
+
+ %% Coverage.
+ ?TEST("", [<<"">>, false, nfc], true),
+ ?TEST("", [<<"">>, true, nfc], true),
+
+ ok.
+
+to_graphemes(_) ->  % test case for string:to_graphemes/1
+ %% More tests are in unicode_util_SUITE.erl
+ {'EXIT', _} = (catch unicode:characters_to_nfd_binary(["asdåäö", an_atom])),
+ String = ["abc..åäö", $e, 788, <<"Ωµe`è"/utf8>>, "œŒþæÆħ§ß"],
+ NFD = unicode:characters_to_nfd_list(String),
+ [] = string:to_graphemes([]),
+ [] = string:to_graphemes(<<>>),
+ GCs = string:to_graphemes(String),
+ true = erlang:length(GCs) =:= string:length(String),  % erlang:length/1 is qualified because this module defines its own length/1
+ true = erlang:length(GCs) =:= erlang:length(string:to_graphemes(NFD)),  % cluster count must not depend on the normalization form
+ true = erlang:length(GCs) =:=
+ erlang:length(string:to_graphemes(unicode:characters_to_nfc_list(String))),
+ ok.
+
+reverse(_) ->  % test case for string:reverse/1
+ {'EXIT',_} = (catch string:reverse(2)),  % a non-string argument must fail
+ Str1 = "Hello ",
+ Str2 = "Ω ßð",
+ Str3 = "åäö",
+ ?TEST("", [], ""),
+ ?TEST(Str1, [], lists:reverse(Str1)),
+ ?TEST(Str2, [], lists:reverse(Str2)),
+ ?TEST(Str3, [], lists:reverse(Str3)),
+ true = string:reverse(Str3) =:= lists:reverse(string:to_graphemes(Str3)),  % reversing must agree with reversing the grapheme list
+ ok.
+
+slice(_) ->  % test case for string:slice with start only and with start plus length
+ {'EXIT',_} = (catch string:slice(2, 2, 2)),
+ {'EXIT',_} = (catch string:slice("asd", foo, 2)),
+ {'EXIT',_} = (catch string:slice("asd", 2, -1)),
+ ?TEST("", [3], ""),
+ ?TEST("aåä", [1, 0], ""),
+ ?TEST("aåä", [3], ""),
+ ?TEST("aåäöbcd", [3], "öbcd"),
+ ?TEST([<<"aå"/utf8>>,"äöbcd"], [3], "öbcd"),
+ ?TEST([<<"aåä"/utf8>>,"öbcd"], [3], "öbcd"),
+ ?TEST([<<"aåä"/utf8>>,"öbcd"], [3, infinity], "öbcd"),
+
+ ?TEST("", [3, 2], ""),
+ ?TEST("aåä", [3, 2], ""),
+ ?TEST("aåäöbcd", [3,2], "öb"),
+ ?TEST([<<"aå"/utf8>>,"äöbcd"], [3,3], "öbc"),
+ ?TEST([<<"aåä"/utf8>>,"öbcd"], [3,10], "öbcd"),  % a length beyond the end gives the rest of the string
+
+ ok.
+
+pad(_) ->  % test case for string:pad with length, direction and pad character arguments
+ Str = "Hallå",
+ ?TEST(Str, [7], "Hallå "),
+ ?TEST(Str, [7, leading], " Hallå"),
+ ?TEST(Str, [4, both, $.], "Hallå"),  % requested length shorter than the string: returned unpadded and uncut
+ ?TEST(Str, [10, both, $.], "..Hallå..."),
+ ?TEST(Str, [10, leading, $.], ".....Hallå"),
+ ?TEST(Str, [10, trailing, $.], "Hallå....."),
+ ?TEST(Str++["f"], [10, trailing, $.], "Hallåf...."),
+ ?TEST(Str++[" flåwer"], [10, trailing, $.], "Hallå flåwer"),
+ ok.
+
+trim(_) ->  % test case for string:trim with direction and separator arguments
+ Str = "\t\s..Ha\s.llå..\t\n\r",
+ ?TEST("", [], ""),
+ ?TEST(Str, [both, "x"], Str),
+ ?TEST(Str, [leading], "..Ha\s.llå..\t\n\r"),
+ ?TEST(Str, [trailing], "\t\s..Ha\s.llå.."),
+ ?TEST(Str, [], "..Ha .llå.."),
+ ?TEST(".. ", [both, ""], ".. "),
+ ?TEST([<<".. ">>], [both, ". "], ""),
+ ?TEST(".. h.ej ..", [leading, ". "], "h.ej .."),
+ ?TEST(".. h.ej ..", [trailing, ". "], ".. h.ej"),
+ ?TEST(".. h.ej ..", [both, ". "], "h.ej"),
+ ?TEST(["..", <<"h.ej">>, ".."], [both, ". "], "h.ej"),
+ ?TEST([[], "..", " h.ej ", <<"..">>], [both, ". "], "h.ej"),
+ ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [both, ". "], "h.ej"),
+ ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [trailing, ". "], ".. h.ej"),
+ ?TEST([<<".. h.ej .">>, <<"..">>], [both, ". "], "h.ej"),
+ ?TEST(["..h", ".e", <<"j..">>], [both, ". "], "h.ej"),
+ ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [both, ". "], "h.ejsan"),
+ %% Test that it behaves with graphemes (i.e. nfd tests are the hard part)
+ ?TEST("aaåaa", [both, "a"], "å"),
+ ?TEST(["aaa",778,"äöoo"], [both, "ao"], "åäö"),
+ ?TEST([<<"aaa">>,778,"äöoo"], [both, "ao"], "åäö"),
+ ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [both, [[$e,778]]], "åäö"),  % the separator is itself a two code point cluster
+ ?TEST([[<<"!v">>|<<204,128,$v,204,129>>]],[trailing, [[$v,769]]], [$!,$v,768]),
+ ?TEST([[[<<"v">>|<<204,129,118,204,128,118>>],769,118,769]], [trailing, [[118,769]]], [$v,769,$v,768]),
+ ?TEST([<<"vv">>|<<204,128,118,204,128>>], [trailing, [[118,768]]], "v"),
+ ok.
+
+chomp(_) ->  % test case for string:chomp/1
+ Str = "åäö\na\r\nsd\n",
+ Res = "åäö\na\r\nsd",
+ ?TEST("", [], ""),
+ ?TEST("\n", [], ""),
+ ?TEST("str \t", [], "str \t"),
+ ?TEST("str \t\n\r", [], "str \t\n\r"),  % a trailing "\n\r" is expected to be left in place
+ ?TEST(Str, [], Res),
+ ?TEST([Str,$\n], [], Res),
+ ?TEST([Str|"\n"], [], Res),
+ ?TEST([Str|<<"\n">>], [], Res),
+ ?TEST([Str,$\r|<<"\n">>], [], Res),  % the newline sequence is split across list elements
+ ?TEST([Str, <<$\r>>|"\n"], [], Res),
+ ?TEST([<<$a,$\r>>,"\na\n"], [], "a\r\na"),
+ ok.
+
+take(_) ->  % test case for string:take; expected values are {Taken, Rest} style pairs
+ Str = "\t\s..Ha\s.llå..\t\n\r",
+ WS = "\t\s\n\r",
+ Chars = lists:seq($a,$z)++lists:seq($A,$Z),  % the ASCII letters, used as the character set in the complement=true cases
+ %% complement=false, dir=leading
+ ?TEST("", ["abc"], {"",""}),
+ ?TEST(Str, ["x"], {[], Str}),
+ ?TEST(Str, [WS], {"\t\s","..Ha\s.llå..\t\n\r"}),
+ ?TEST(".. ", ["", false], {"", ".. "}),
+ ?TEST([<<".. ">>], [". ", false, leading], {".. ", ""}),
+ ?TEST(".. h.ej ..", [". ", false, leading], {".. ", "h.ej .."}),
+ ?TEST(["..", <<"h.ej">>, ".."], [". ", false, leading], {"..", "h.ej.."}),
+ ?TEST([[], "..", " h.ej ", <<"..">>], [". ", false, leading], {".. ","h.ej .."}),
+ ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [". ", false, leading], {".. ", "h.ej .."}),
+ ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [". ", false, leading], {"..", "h.ejsan.."}),
+ ?TEST([[<<101,204,138,33>>]], [[[$e,778]]], {[$e,778], "!"}),
+ %% Test that it behaves with graphemes (i.e. nfd tests are the hard part)
+ ?TEST("aaåaa", ["a", false, leading], {"aa", "åaa"}),
+ ?TEST(["aaa",778,"äöoo"], ["ao", false, leading], {"aa", "åäöoo"}),
+ ?TEST([<<"aaa">>,778,"äöoo"], ["ao",false,leading], {"aa", "åäöoo"}),
+ ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [[[$e,778]], false, leading], {[$e,778],"åäöe"++[778]}),
+
+ %% complement=true, dir=leading
+ ?TEST("", ["abc", true], {"",""}),
+ ?TEST(Str, ["x", true], {Str, []}),
+ ?TEST(Str, [Chars, true], {"\t\s..","Ha\s.llå..\t\n\r"}),
+ ?TEST(".. ", ["",true], {".. ", ""}),
+ ?TEST([<<".. ">>], [Chars, true, leading], {".. ", ""}),
+ ?TEST(".. h.ej ..", [Chars, true, leading], {".. ", "h.ej .."}),
+ ?TEST(["..", <<"h.ej">>, ".."], [Chars, true, leading], {"..", "h.ej.."}),
+ ?TEST([[], "..", " h.ej ", <<"..">>], [Chars, true, leading], {".. ","h.ej .."}),
+ ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [Chars, true, leading], {".. ", "h.ej .."}),
+ ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [Chars, true, leading], {"..", "h.ejsan.."}),
+ %% Test that it behaves with graphemes (i.e. nfd tests are the hard part)
+ ?TEST(["aaee",778,"äöoo"], [[[$e,778]], true, leading], {"aae", [$e,778|"äöoo"]}),
+ ?TEST([<<"aae">>,778,"äöoo"], [[[$e,778]],true,leading], {"aa", [$e,778|"äöoo"]}),
+ ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [[[$e,778]], true, leading], {[], [$e,778]++"åäöe"++[778]}),
+
+ %% complement=false, dir=trailing
+ ?TEST(Str, ["", false, trailing], {Str, []}),
+ ?TEST(Str, ["x", false, trailing], {Str, []}),
+ ?TEST(Str, [WS, false,trailing], {"\t\s..Ha\s.llå..", "\t\n\r"}),
+ ?TEST(".. h.ej ..", [". ", false, trailing], {".. h.ej", " .."}),
+ ?TEST(["..", <<"h.ej">>, ".."], [". ", false, trailing], {"..h.ej", ".."}),
+ ?TEST([[], "..", " h.ej ", <<"..">>], [". ", false, trailing], {".. h.ej", " .."}),
+ ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [". ", false, trailing], {".. h.ej", " .."}),
+ ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [". ", false, trailing], {"..h.ejsan", ".."}),
+ ?TEST("aaåaa", ["a", false, trailing], {"aaå", "aa"}),
+ ?TEST([<<"KMШ"/utf8>>], [[1064], false, trailing], {"KMШ",[]}),
+ ?TEST([[<<"!\"">>|<<"\"">>]], ["\"", false, trailing], {"!", "\"\""}),
+ ?TEST([<<$v>>, 769], [[[$v,769]], false, trailing], {"", [$v,769]}),
+ ?TEST(["aaa",778,"äöoo"], ["ao", false, trailing], {"aaåäö", "oo"}),
+ ?TEST([<<"aaa">>,778,"äöoo"], ["ao", false, trailing], {"aaåäö", "oo"}),
+ ?TEST([<<"e">>,778,"åäöee", <<778/utf8>>], [[[$e,778]], false, trailing], {[$e,778|"åäöe"], [$e,778]}),
+
+ %% complement=true, dir=trailing
+ ?TEST("", ["abc", true, trailing], {"",""}),
+ ?TEST(Str, ["x", true, trailing], {[], Str}),
+ %?TEST(Str, [{norm,Chars}, true, trailing], {"\t\s..Ha\s.ll","å..\t\n\r"}),
+ ?TEST(".. ", ["", true, trailing], {"", ".. "}),
+ ?TEST([<<".. ">>], [Chars, true, trailing], {"", ".. "}),
+ ?TEST(".. h.ej ..", [Chars, true, trailing], {".. h.ej", " .."}),
+ ?TEST(["..", <<"h.ej">>, ".."], [Chars, true, trailing], {"..h.ej", ".."}),
+ ?TEST([[], "..", " h.ej ", <<"..">>], [Chars, true, trailing], {".. h.ej"," .."}),
+ ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [Chars, true, trailing], {".. h.ej"," .."}),
+ ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [Chars, true, trailing], {"..h.ejsan", ".."}),
+ ?TEST([[<<101,204,138,33>>]], [[[$e,778]], true, trailing], {[$e,778], "!"}),
+ ?TEST([<<"Fa">>], [[$F], true, trailing], {"F", "a"}),
+ ?TEST([[<<101,101,204,138>>,1045,778]], ["e", true, trailing], {"e", [101,778,1045,778]}),
+ ?TEST([[<<101,101,204,138>>,<<1045/utf8,778/utf8>>]], ["e", true, trailing], {"e", [101,778,1045,778]}),
+ ?TEST([[[118,769,118],<<204,129,118,204,129,120,204,128,118>>,768,120,768]],
+ [[[118,769]], true, trailing], {[118,769,118,769,118,769],[120,768,118,768,120,768]}),
+ ?TEST([[<<118,204,128,118>>|<<204,128,118,204,128,118,204,128,206,132,204,129,206,132,204,129>>]],
+ [[[118,768]], true, trailing], {[118,768,118,768,118,768,118,768], [900,769,900,769]}),
+ %% Test that it behaves with graphemes (i.e. nfd tests are the hard part)
+ ?TEST(["aaee",778,"äöoo"], [[[$e,778]], true, trailing], {"aae"++[$e,778], "äöoo"}),
+ ?TEST([<<"aae">>,778,"äöoo"], [[[$e,778]],true,trailing], {"aa"++[$e,778], "äöoo"}),
+ ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [[[$e,778]], true, trailing], {[$e,778]++"åäöe"++[778], []}),
+ ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>, $e, 779], [[[$e,778]], true, trailing],
+ {[$e,778]++"åäöe"++[778], [$e,779]}),
+
+ ok.
+
+
+uppercase(_) ->  % test case for string:uppercase/1
+ ?TEST("", [], ""),
+ ?TEST("123", [], "123"),
+ ?TEST("abc", [], "ABC"),
+ ?TEST("ABC", [], "ABC"),
+ ?TEST("abcdefghiljklmnopqrstvxyzåäö",[], "ABCDEFGHILJKLMNOPQRSTVXYZÅÄÖ"),
+ ?TEST("åäö", [], "ÅÄÖ"),
+ ?TEST("ÅÄÖ", [], "ÅÄÖ"),
+ ?TEST("Michał", [], "MICHAŁ"),
+ ?TEST(["Mic",<<"hał"/utf8>>], [], "MICHAŁ"),
+ ?TEST("ljLJ", [], "LJLJ"),
+ ?TEST("LJlj", [], "LJLJ"),
+ ?TEST("ß sharp s", [], "SS SHARP S"),  % one input character maps to two output characters
+ ok.
+
+lowercase(_) ->  % test case for string:lowercase/1
+ ?TEST("", [], ""),
+ ?TEST("123", [], "123"),
+ ?TEST("abc", [], "abc"),
+ ?TEST("ABC", [], "abc"),
+ ?TEST("åäö", [], "åäö"),
+ ?TEST("ÅÄÖ", [], "åäö"),
+ ?TEST("MICHAŁ", [], "michał"),
+ ?TEST(["Mic",<<"HAŁ"/utf8>>], [], "michał"),
+ ?TEST("ß SHARP S", [], "ß sharp s"),  % unlike casefold, ß is expected to stay ß
+ ?TEST("İ I WITH DOT ABOVE", [], "i̇ i with dot above"),
+ ok.
+
+titlecase(_) ->  % test case for string:titlecase/1
+ ?TEST("", [], ""),
+ ?TEST("123", [], "123"),
+ %% Titlecase is the same as uppercase for most chars
+ [?TEST([C,$x], [], string:uppercase([C])++[$x]) ||  % only the first character changes; the trailing $x must stay lowercase
+ C <-"abcdefghiljklmnopqrstvxyzåäö"],
+ %% Example of a different mapping
+ ?TEST("ljusad", [],"Ljusad"),
+ ?TEST("ljLJ", [], "LjLJ"),
+ ?TEST("LJlj", [], "Ljlj"),
+ ?TEST("ß sharp s", [], "Ss sharp s"),
+ ok.
+
+casefold(_) ->  % test case for string:casefold/1
+ ?TEST("", [], ""),
+ ?TEST("123", [], "123"),
+ ?TEST("abc", [], "abc"),
+ ?TEST("ABC", [], "abc"),
+ ?TEST("åäö", [], "åäö"),
+ ?TEST("ÅÄÖ", [], "åäö"),
+ ?TEST("MICHAŁ", [], "michał"),
+ ?TEST(["Mic",<<"HAŁ"/utf8>>], [], "michał"),
+ ?TEST("ß SHARP S", [], "ss sharp s"),  % unlike lowercase, ß is expected to fold to "ss"
+ ?TEST("ẞ SHARP S", [], "ss sharp s"),
+ ?TEST("İ I WITH DOT ABOVE", [], "i̇ i with dot above"),
+ ok.
+
+prefix(_) ->  % test case for string:prefix/2: expects the remainder after the prefix, or nomatch
+ ?TEST("", ["a"], nomatch),
+ ?TEST("a", [""], "a"),
+ ?TEST("b", ["a"], nomatch),
+ ?TEST("a", ["a"], ""),
+ ?TEST("å", ["a"], nomatch),
+ ?TEST(["a",<<778/utf8>>], ["a"], nomatch),  % $a followed by a combining mark (778) must not match a plain "a"
+ ?TEST([<<"a"/utf8>>,778], ["a"], nomatch),
+ ?TEST("hejsan", [""], "hejsan"),
+ ?TEST("hejsan", ["hej"], "san"),
+ ?TEST("hejsan", ["hes"], nomatch),
+ ?TEST(["h", "ejsan"], ["hej"], "san"),
+ ?TEST(["h", "e", "jsan"], ["hej"], "san"),
+ ?TEST(["h", "e", "san"], ["hej"], nomatch),
+ ?TEST(["h", <<"ejsan">>], ["hej"], "san"),
+ ?TEST(["h", <<"e">>, "jsan"], ["hej"], "san"),
+ ?TEST(["h", "e", <<"jsan">>], ["hej"], "san"),
+ ok.
+
+%% Table of split cases: input string, [Separator, Direction], expected
+%% list of parts. The expectation is written as {Mod, Parts}; res/2
+%% applies Mod to the actual result before comparing it with Parts.
+%% ?TEST is defined outside this section; presumably it runs the string
+%% function named after this test case - confirm there.
+split(_) ->
+ %% Mod turns every returned part into a flat NFC-normalized list so
+ %% that list, binary and mixed results compare equal.
+ Mod = fun(Res) ->
+ [lists:flatten(unicode:characters_to_nfc_list(io_lib:format("~ts", [Str])))
+ || Str <- Res] end,
+ %% Direction leading (an empty separator leaves the input unsplit)
+ ?TEST("..", ["", leading], {Mod, [".."]}),
+ ?TEST("..", ["..", leading], {Mod, [[],[]]}),
+ ?TEST("abcd", ["..", leading], {Mod, ["abcd"]}),
+ ?TEST("ab..bc", ["..", leading], {Mod, ["ab","bc"]}),
+ ?TEST("ab..bc..cd", ["..", leading], {Mod, ["ab","bc..cd"]}),
+ %% No direction argument given in this row
+ ?TEST("..ab", [".."], {Mod, [[],"ab"]}),
+ ?TEST("ab..", ["..", leading], {Mod, ["ab",[]]}),
+ %% Separator falling on or next to a chunk boundary of the input
+ ?TEST(["ab..bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}),
+ ?TEST(["ab","..bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}),
+ ?TEST(["ab",<<"..bc..cd">>], ["..", leading], {Mod, ["ab","bc..cd"]}),
+ ?TEST(["ab.",".bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}),
+ ?TEST(["ab.",<<".bc..cd">>], ["..", leading], {Mod, ["ab","bc..cd"]}),
+ ?TEST(["ab..","bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}),
+ ?TEST(["ab..",<<"bc..cd">>], ["..", leading], {Mod, ["ab","bc..cd"]}),
+ ?TEST(["ab.","bc..cd"], ["..", leading], {Mod, ["ab.bc","cd"]}),
+ ?TEST("ab...bc", ["..", leading], {Mod, ["ab",".bc"]}),
+
+ %% Direction trailing
+ ?TEST("..", ["", trailing], {Mod, [".."]}),
+ ?TEST("..", ["..", trailing], {Mod, [[],[]]}),
+ ?TEST("abcd", ["..", trailing], {Mod, ["abcd"]}),
+ ?TEST("ab..bc", ["..", trailing], {Mod, ["ab","bc"]}),
+ ?TEST("ab..bc..cd", ["..", trailing], {Mod, ["ab..bc","cd"]}),
+ ?TEST("..ab", ["..", trailing], {Mod, [[],"ab"]}),
+ ?TEST("ab..", ["..", trailing], {Mod, ["ab",[]]}),
+ ?TEST(["ab..bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+ ?TEST(["ab","..bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+ %% Improper list whose tail is a binary
+ ?TEST(["ab"|<<"a">>], ["a", trailing], {Mod, ["ab",[]]}),
+ ?TEST(["ab",<<"..bc..cd">>], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+ ?TEST([<<"ab.">>,".bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+ ?TEST(["ab.",<<".bc..cd">>], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+ ?TEST(["ab..","bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+ ?TEST(["ab..",<<"bc..cd">>], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+ ?TEST(["ab.","bc..cd"], ["..", trailing], {Mod, ["ab.bc","cd"]}),
+ ?TEST("ab...bc", ["..", trailing], {Mod, ["ab.","bc"]}),
+
+ %% Direction all
+ ?TEST("..", ["..", all], {Mod, [[],[]]}),
+ ?TEST("abcd", ["..", all], {Mod, ["abcd"]}),
+ ?TEST("a..b", ["..", all], {Mod, ["a","b"]}),
+ ?TEST("a..b..c", ["..", all], {Mod, ["a","b","c"]}),
+ ?TEST("a..", ["..", all], {Mod, ["a",[]]}),
+ ?TEST(["a..b..c"], ["..", all], {Mod, ["a","b","c"]}),
+ ?TEST(["a","..b..c"], ["..", all], {Mod, ["a","b","c"]}),
+ ?TEST(["a",<<"..b..c">>], ["..", all], {Mod, ["a","b","c"]}),
+ ?TEST(["a.",".b..c"], ["..", all], {Mod, ["a","b","c"]}),
+ ?TEST(["a.",<<".b..c">>], ["..", all], {Mod, ["a","b","c"]}),
+ ?TEST(["a..","b..c"], ["..", all], {Mod, ["a","b","c"]}),
+ ?TEST(["a..",<<"b..c">>], ["..", all], {Mod, ["a","b","c"]}),
+ ?TEST(["a.","b..c"], ["..", all], {Mod, ["a.b","c"]}),
+ ?TEST("a...b", ["..", all], {Mod, ["a",".b"]}),
+
+ %% Grapheme (split) tests
+ %% ($e followed by combining mark 778 is used as one separator; a
+ %% bare "e" separator is expected not to match inside that pair)
+ ?TEST("aΩΩb", ["Ω", all], {Mod, ["a","","b"]}),
+ ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], leading], {Mod, ["aa","äöoo"]}),
+ ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], trailing], {Mod, ["aa","äöoo"]}),
+ ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], all], {Mod, ["aa","äöoo"]}),
+ ?TEST([<<"aae">>,778,"öeeåäö"], ["e", leading], {Mod, [[$a, $a, $e,778,$ö],"eåäö"]}),
+ ?TEST([<<"aae">>,778,"öeeåäö"], ["e", trailing], {Mod, [[$a, $a, $e,778,$ö, $e],"åäö"]}),
+ ?TEST([<<"aae">>,778,"öeeåäö"], ["e", all], {Mod, [[$a, $a, $e,778,$ö],"", "åäö"]}),
+
+ ok.
+
+%% Table of replace cases: input string, [Pattern, Replacement] with an
+%% optional direction, expected result. The input is always "a..b..c"
+%% split so that the second ".." spans two chunks.
+%% ?TEST is defined outside this section; presumably it runs the string
+%% function named after this test case - confirm there.
+replace(_) ->
+ %% Pattern that does not occur: input expected back unchanged
+ ?TEST(["a..b.", [".c"]], ["xxx", "::"], "a..b..c"),
+ %% No direction given: only the first ".." is expected to change
+ ?TEST(["a..b.", [".c"]], ["..", "::"], "a::b..c"),
+ ?TEST([<<"a..b.">>, [".c"]], ["..", "::", trailing], "a..b::c"),
+ ?TEST(["a..b.", [".c"]], ["..", "::", all], "a::b::c"),
+ ok.
+
+%% Direct assertions on string:next_codepoint/1 and
+%% string:next_grapheme/1 for empty, list and binary input.
+cd_gc(_) ->
+ %% Every form of empty input is expected to give []
+ [] = string:next_codepoint(""),
+ [] = string:next_codepoint(<<>>),
+ [] = string:next_codepoint([<<>>]),
+ "abcd" = string:next_codepoint("abcd"),
+ [$e,778] = string:next_codepoint([$e,778]),
+ %% Binary input: first codepoint, then the rest still as a binary
+ %% (204,138 is the UTF-8 encoding of 778)
+ [$e|<<204,138>>] = string:next_codepoint(<<$e,778/utf8>>),
+ [778|_] = string:next_codepoint(tl(string:next_codepoint(<<$e,778/utf8>>))),
+
+ [] = string:next_grapheme(""),
+ [] = string:next_grapheme(<<>>),
+ [] = string:next_grapheme([<<>>]),
+ "abcd" = string:next_grapheme("abcd"),
+ %% $e plus combining mark 778 is expected back as one nested element
+ [[$e,778]] = string:next_grapheme([$e,778]),
+ [[$e,778]] = string:next_grapheme(<<$e,778/utf8>>),
+
+ ok.
+
+
+%% Table of find cases: input string, [Pattern] with an optional
+%% direction, expected result. The expected value is the input from the
+%% match onwards, or nomatch.
+%% ?TEST is defined outside this section; presumably it runs the string
+%% function named after this test case - confirm there.
+find(_) ->
+ %% No direction given, or leading
+ ?TEST(["h", "ejsan"], [""], "hejsan"),
+ ?TEST(["h", "ejsan"], [<<>>], "hejsan"),
+ ?TEST([], [""], ""),
+ ?TEST([], ["hej"], nomatch),
+ ?TEST(["h", "ejsan"], ["hej"], "hejsan"),
+ ?TEST(["h", "e", "jsan"], ["hej"], "hejsan"),
+ ?TEST(["xh", "e", "san"], ["hej"], nomatch),
+ ?TEST([<<"xh">>, <<"ejsan">>], ["hej"], "hejsan"),
+ ?TEST(["xh", <<"ejsan">>], ["hej"], "hejsan"),
+ ?TEST(["xh", <<"e">>, "jsan"], ["hej"], "hejsan"),
+ ?TEST(["xh", "e", <<"jsan">>], ["hej"], "hejsan"),
+ ?TEST(["xh", "er", <<"ljsane">>, "rlang"], ["erl", leading], "erljsanerlang"),
+ ?TEST("aΩΩb", ["Ω", leading], "ΩΩb"),
+ ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], leading], [$e,778]++"äöoo"),
+ %% A bare "e" is expected not to match the e that carries mark 778
+ ?TEST([<<"aae">>,778,"öeeåäö"], ["e", leading], "eeåäö"),
+
+ %% Direction trailing
+ ?TEST(["h", "ejsan"], ["", trailing], "hejsan"),
+ ?TEST([], ["", trailing], ""),
+ ?TEST([], ["hej", trailing], nomatch),
+ ?TEST(["h", "ejsan"], ["hej", trailing], "hejsan"),
+ ?TEST(["h", "e", "jsan"], ["hej", trailing], "hejsan"),
+ ?TEST(["xh", "e", "san"], ["hej", trailing], nomatch),
+ ?TEST([<<"xh">>, <<"ejsan">>], ["hej", trailing], "hejsan"),
+ ?TEST(["xh", <<"ejsan">>], ["hej", trailing], "hejsan"),
+ ?TEST(["xh", <<"e">>, "jsan"], ["hej", trailing], "hejsan"),
+ ?TEST(["xh", "e", <<"jsan">>], ["hej", trailing], "hejsan"),
+ ?TEST(["xh", "er", <<"ljsane">>, "rlang"], ["erl", trailing], "erlang"),
+ ?TEST("aΩΩb", ["Ω", trailing], "Ωb"),
+ ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], trailing], [$e,778]++"äöoo"),
+ ?TEST([<<"aeae">>,778,"äö"], ["e", trailing], "eae"++[778,$ä,$ö]),
+
+ ok.
+
+%% Table of lexemes cases: input string, [Separators], expected list of
+%% lexemes. The expectation is written as {Mod, Lexemes}; res/2 applies
+%% Mod to the actual result before comparing it with Lexemes.
+%% ?TEST is defined outside this section; presumably it runs the string
+%% function named after this test case - confirm there.
+lexemes(_) ->
+ %% Mod turns every returned lexeme into an NFC-normalized list
+ Mod = fun(Res) ->
+ [unicode:characters_to_nfc_list(io_lib:format("~ts", [Str]))|| Str <- Res]
+ end,
+ %% Expected lexemes shared by most rows below
+ Res = ["Hej", "san", "Hopp", "san"],
+ ?TEST("", [" ,."], {Mod, []}),
+ ?TEST("Hej san", [""], {Mod, ["Hej san"]}),
+ ?TEST(" ,., ", [" ,."], {Mod, []}),
+ ?TEST( "Hej san Hopp san", [" ,."], {Mod, Res}),
+ ?TEST(" Hej san Hopp san ", [" ,."], {Mod, Res}),
+ ?TEST(" Hej san, .Hopp san ", [" ,."], {Mod, Res}),
+
+ %% Same text split into two list chunks at different places
+ ?TEST([" Hej san",", .Hopp san "], [" ,."], {Mod, Res}),
+ ?TEST([" Hej sa","n, .Hopp san "], [" ,."], {Mod, Res}),
+ ?TEST([" Hej san,"," .Hopp san "], [" ,."], {Mod, Res}),
+
+ %% Second chunk nested one level deeper
+ ?TEST([" Hej san",[", .Hopp san "]], [" ,."], {Mod, Res}),
+ ?TEST([" Hej sa",["n, .Hopp san "]], [" ,."], {Mod, Res}),
+ ?TEST([" Hej san,",[" .Hopp san "]], [" ,."], {Mod, Res}),
+
+ %% Lists mixed with UTF-8 binaries
+ ?TEST([" H",<<"ej san, .Hopp "/utf8>>, "san"], [" ,."], {Mod, Res}),
+ ?TEST([" Hej san",<<", .Hopp "/utf8>>, "san"], [" ,."], {Mod, Res}),
+ ?TEST([" Hej sa",<<"n, .Hopp"/utf8>>, " san"], [" ,."], {Mod, Res}),
+ ?TEST([" Hej san,",<<" .Hopp s"/utf8>>, "an"], [" ,."], {Mod, Res}),
+ ?TEST([" Hej san",[<<", .Hopp san "/utf8>>]], [" ,."], {Mod, Res}),
+ ?TEST([" Hej sa",[<<"n, .Hopp san "/utf8>>]], [" ,."], {Mod, Res}),
+ ?TEST([" Hej san,",[<<" .Hopp san "/utf8>>], <<" ">>], [" ,."], {Mod, Res}),
+
+ %% The "\r\n" inside the input is expected to stay within a lexeme
+ %% although \r and \n are both listed as separators
+ ?TEST(" Hej\r\nsan\nnl", ["\r\n\s"], {Mod, ["Hej\r\nsan", "nl"]}),
+
+ ?TEST(["b1ec1e",778,"äöo21"], ["eo"], {Mod, ["b1",[$c,$1,$e,778,$ä,$ö],"21"]}),
+ ?TEST([<<"b1ec1e">>,778,"äöo21"], ["eo"], {Mod, ["b1",[$c,$1,$e,778,$ä,$ö],"21"]}),
+ %% Grapheme (split) tests
+ %% Str10 is deeply nested chardata ending in an improper binary tail
+ Str10 = [[[<<"÷"/utf8>>,1101],<<"ë"/utf8>>|<<"\"">>]],
+ ?TEST(Str10, [[1076]], {Mod, [unicode:characters_to_nfc_list(Str10)]}),
+ ?TEST("a1Ωb1Ωc1", ["Ω"], {Mod, ["a1","b1","c1"]}),
+ %% Separators given as nested lists: [$e,778] counts as one separator
+ ?TEST([<<"aae">>,778,"äöoo"], [[[$e,778]]], {Mod, ["aa","äöoo"]}),
+ ?TEST([<<"aae">>,778,"äöo21"], [[[$e,778],$o]], {Mod, ["aa","äö","21"]}),
+ ?TEST([<<"aae">>,778,"öeeåäö"], ["e"], {Mod, [[$a, $a, $e,778,$ö],"åäö"]}),
+ ok.
+
+%% Table of nth_lexeme cases: input string, [N, Separators], expected
+%% lexeme. An empty list is expected when there is no N:th lexeme.
+%% ?TEST is defined outside this section; presumably it runs the string
+%% function named after this test case - confirm there.
+nth_lexeme(_) ->
+ %% Index 0 must raise for both list and binary input
+ {'EXIT', _} = (catch string:nth_lexeme("test test", 0, [])),
+ {'EXIT', _} = (catch string:nth_lexeme(<<"test test">>, 0, [])),
+ ?TEST( "", [1, " ,."], []),
+ ?TEST( "Hej san", [1, ""], "Hej san"),
+ ?TEST( " ,., ", [1, " ,."], []),
+ ?TEST( " ,., ", [3, " ,."], []),
+ ?TEST("Hej san Hopp san", [1, " ,."], "Hej"),
+ ?TEST("...Hej san Hopp san", [1, " ,."], "Hej"),
+ ?TEST("Hej san Hopp san", [3, " ,."], "Hopp"),
+ ?TEST(" Hej san Hopp san ", [3, " ,."], "Hopp"),
+ ?TEST(" Hej san, .Hopp san ", [3, " ,."], "Hopp"),
+ %% Index beyond the number of lexemes
+ ?TEST("ab cd", [3, " "], ""),
+
+ %% Same text split into two list chunks at different places
+ ?TEST([" Hej san",", .Hopp san "], [3, " ,."], "Hopp"),
+ ?TEST([" Hej sa","n, .Hopp san "], [3, " ,."], "Hopp"),
+ ?TEST([" Hej san,"," .Hopp san "], [3, " ,."], "Hopp"),
+
+ %% Second chunk nested one level deeper
+ ?TEST([" Hej san",[", .Hopp san "]], [3," ,."], "Hopp"),
+ ?TEST([" Hej sa",["n, .Hopp san "]], [3, " ,."], "Hopp"),
+ ?TEST([" Hej san,",[" .Hopp san "]], [3, " ,."], "Hopp"),
+
+ %% Lists mixed with UTF-8 binaries
+ ?TEST([" Hej san",<<", .Hopp "/utf8>>, "san"], [3, " ,."], "Hopp"),
+ ?TEST([" Hej sa",<<"n, .Hopp"/utf8>>, " san"], [3, " ,."], "Hopp"),
+ ?TEST([" Hej san,",<<" .Hopp s"/utf8>>, "an"], [3, " ,."], "Hopp"),
+ ?TEST([" Hej san,",<<" .Hopp s"/utf8>>, "an"], [4, " ,."], "san"),
+ ?TEST([" Hej san",[<<", .Hopp san "/utf8>>]], [3, " ,."], "Hopp"),
+ ?TEST([" Hej sa",[<<"n, .Hopp san "/utf8>>]], [3, " ,."], "Hopp"),
+ ?TEST([" Hej san,",[<<" .Hopp san "/utf8>>], <<" ">>], [3, " ,."], "Hopp"),
+
+ ?TEST(["b1ec1e",778,"äöo21"], [3,"eo"], "21"),
+ ?TEST([<<"b1ec1e">>,778,"äöo21"], [3, "eo"], "21"),
+ %% Grapheme (split) tests
+ ?TEST("a1Ωb1Ωc1", [1, "Ω"], "a1"),
+ %% Separators given as nested lists: [$e,778] counts as one separator
+ ?TEST([<<"aae">>,778,"äöoo"], [2,[[$e,778]]], "äöoo"),
+ ?TEST([<<"aae">>,778,"äöo21"], [2,[[$e,778],$o]], "äö"),
+ ?TEST([<<"aae">>,778,"öeeåäö"], [2,"e"], "åäö"),
+ ok.
+
+
+%% Benchmark test case. Returns a skip tuple when ct reports a timetrap
+%% scaling factor above 1; otherwise runs do_measure/1 on the directory
+%% that contains the suite's data directory.
+meas(Config) ->
+    meas_scaled(ct:get_timetrap_info(), Config).
+
+%% Dispatches on the timetrap info: scaled runs are skipped.
+meas_scaled({_, {_, Scale}}, _Config) when Scale > 1 ->
+    {skip,{will_not_run_in_debug,Scale}};
+meas_scaled(_NoScaling, Config) ->
+    DataDir = proplists:get_value(data_dir, Config),
+    %% Drop trailing slashes so that dirname/1 steps up one level
+    Trimmed = string:trim(DataDir, trailing, "/"),
+    do_measure(filename:dirname(Trimmed)).
+
+%% Reads this suite's own source file from TestDir and prints timing
+%% statistics for splitting it on line-break characters with
+%% string:tokens/2 (list input) and string:lexemes/2 (list and binary
+%% input). Always returns ok; crashes if the file cannot be read.
+do_measure(TestDir) ->
+    File = filename:join(TestDir, ?MODULE_STRING ++ ".erl"),
+    %% ~ts rather than ~s: the path may hold characters above 255, for
+    %% which ~s fails with badarg.
+    io:format("File ~ts ",[File]),
+    {ok, Bin} = file:read_file(File),
+    io:format("~p~n",[byte_size(Bin)]),
+    Do = fun(Name, Func, Mode) ->
+                 {N, Mean, Stddev, _} = time_func(Func, Mode, Bin),
+                 %% time_func/6 measures with timer:tc/1; dividing by
+                 %% 1000 prints the values as milliseconds.
+                 io:format("~10w ~6w ~6.2fms ±~4.2fms #~.2w gc included~n",
+                           [Name, Mode, Mean/1000, Stddev/1000, N])
+         end,
+    io:format("----------------------~n"),
+    Do(tokens, fun(Str) -> string:tokens(Str, [$\n,$\r]) end, list),
+    Lexemes = fun(Str) -> string:lexemes(Str, [$\n,$\r]) end,
+    %% Run purely for the printed output, hence foreach
+    lists:foreach(fun(Mode) -> Do(lexemes, Lexemes, Mode) end, [list, binary]),
+    ok.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% internal functions
+
+%% Runs one test row against several representations of Str: as given,
+%% as a flat list and as a binary, and - when Norm is true - also as
+%% NFC/NFD normalized lists and binaries. Arguments wrapped as
+%% {norm, S} are normalized the same way through norm/2.
+%% Each variant is tagged (e.g. {Line,list}) so that test_1/5 can report
+%% which representation failed. Returns ok.
+test(Line, Func, Str, Args, Res, Norm) ->
+    AsGiven = fun(S) -> S end,
+    %% {Tag, Conversion of Str, normalization of Args, Enabled}
+    Variants =
+        [{Line,          AsGiven,                                none, true},
+         {{Line,list},   fun unicode:characters_to_list/1,       none, true},
+         {{Line,clist},  fun unicode:characters_to_nfc_list/1,   nfc,  Norm},
+         {{Line,dlist},  fun unicode:characters_to_nfd_list/1,   nfd,  Norm},
+         {{Line,bin},    fun unicode:characters_to_binary/1,     none, true},
+         {{Line,cbin},   fun unicode:characters_to_nfc_binary/1, nfc,  Norm},
+         {{Line,dbin},   fun unicode:characters_to_nfd_binary/1, nfd,  Norm}],
+    %% The conversion is applied inside the loop so that each variant is
+    %% only built right before it is tested.
+    lists:foreach(
+      fun({Tag, Convert, Form, Enabled}) ->
+              Enabled andalso
+                  test_1(Tag, Func, Str, [Convert(Str)|norm(Form, Args)], Res)
+      end, Variants).
+
+%% Calls string:Func with Args and compares the outcome with Exp using
+%% res/2. Returns ok on success; on a mismatch or an unexpected error
+%% it prints a report and exits with {error, Func}. An error exception
+%% whose reason equals Exp counts as success.
+%% NOTE(review): the value returned by check_types/4 is not inspected
+%% here, so a type mismatch is only printed - confirm this is intended.
+test_1(Line, Func, Str, Args, Exp) ->
+    try
+        Got = apply(string, Func, Args),
+        check_types(Line, Func, Args, Got),
+        test_1_verdict(res(Got, Exp), Line, Func, Str, Args)
+    catch
+        error:Exp ->
+            %% The expected value was the error reason itself
+            ok;
+        error:Reason ->
+            io:format("~p:~p: Crash ~p ~p~n",
+                      [?MODULE,Line, Reason, erlang:get_stacktrace()]),
+            exit({error, Func})
+    end.
+
+%% Turns the outcome of res/2 into ok or a printed failure plus exit.
+test_1_verdict(true, _Line, _Func, _Str, _Args) ->
+    ok;
+test_1_verdict({Actual, Expected}, Line, Func, Str, Args)
+  when is_tuple(Expected) ->
+    %% Tuple expectations are printed with ~w only
+    io:format("~p~n",[Args]),
+    io:format("~p:~p: ~ts~w =>~n :~w:~w~n",
+              [Func,Line, Str,Str,Actual,Expected]),
+    exit({error, Func});
+test_1_verdict({Actual, Expected}, Line, Func, Str, _Args) ->
+    io:format("~p:~p: ~ts~w =>~n :~ts~w:~ts~w~n",
+              [Func,Line, Str,Str, Actual,Actual, Expected,Expected]),
+    exit({error, Func}).
+
+%% Rewrites an argument list: every element of the form {norm, Str} is
+%% replaced by Str normalized according to Type (nfc, nfd, or none for
+%% no normalization); all other elements are passed through untouched.
+norm(Type, Args) ->
+    Normalize =
+        case Type of
+            nfc  -> fun unicode:characters_to_nfc_list/1;
+            nfd  -> fun unicode:characters_to_nfd_list/1;
+            none -> fun(Str) -> Str end
+        end,
+    Rewrite = fun({norm, Str}) -> Normalize(Str);
+                 (Arg)         -> Arg
+              end,
+    lists:map(Rewrite, Args).
+
+%% Compares an actual result with the expected value.
+%% Returns true when they agree, otherwise a {Actual, Expected} tuple
+%% for the failure report.
+%%  - identical terms match directly;
+%%  - a list or binary result is NFC-normalized to a list before it is
+%%    compared with an expected list;
+%%  - an expectation {Fun, Exp} post-processes the result with Fun;
+%%  - pairs (used for take) are compared element by element.
+res(Str, Str) -> true;
+res(Str, Exp) when is_list(Exp), (is_list(Str) orelse is_binary(Str)) ->
+    A = unicode:characters_to_nfc_list(Str),
+    A==Exp orelse {A,Exp};
+res(What, {Fun, Exp}) when is_function(Fun) ->
+    %% Apply the post-processing fun once and reuse its value in the
+    %% failure tuple instead of running it a second time.
+    Got = Fun(What),
+    Got == Exp orelse {Got, Exp};
+res({S1,S2}=S, {Exp1,Exp2}=E) -> %% For take
+    case {res(S1,Exp1), res(S2,Exp2)} of
+        {true, true} -> true;
+        _ -> {S, E}
+    end;
+res(Int, Exp) ->
+    Int == Exp orelse {Int, Exp}.
+
+
+%% Checks that the kind of result (list, binary, mixed, ...) is
+%% consistent with the kind of the input string, as classified by
+%% type/1 and compared by check_types_1/2.
+%% Line and Func are only used in the printed report; the third argument
+%% is the argument list that was passed to the string function.
+%% Returns ok when the types agree or no check applies, and the atom
+%% error after printing a report when they do not. Exits if the check
+%% itself crashes.
+%% NOTE(review): test_1/5 does not inspect the returned value.
+check_types(_Line, _Func, _Str, Res)
+  when is_integer(Res); is_boolean(Res); Res =:= nomatch ->
+    %% length or equal
+    ok;
+check_types(Line, concat = Func, [S1,S2], Res) ->
+    %% The result is only checked when both inputs are of compatible kinds
+    case check_types_1(type(S1),type(S2)) of
+        ok ->
+            case check_types_1(type(S1),type(Res)) of
+                ok -> ok;
+                {T1,T2} ->
+                    io:format("Failed: ~p ~p ~p ~p~n",[Line, Func, T1, T2]),
+                    io:format(" ~p ~p => ~p~n", [S1, S2, Res]),
+                    error
+            end;
+        _ -> ok
+    end;
+check_types(Line, Func, [Str|_], Res) ->
+    %% Functions returning a list of strings wrap the input type in an
+    %% extra list level before the comparison.
+    AddList = fun(mixed) -> mixed;
+                 ({list,{list,_}}) -> {list, deep};
+                 (R) ->
+                      case lists:member(Func, [lexemes, tokens, split]) of
+                          true -> {list, R};
+                          false -> R
+                      end
+              end,
+    try needs_check(Func) andalso (ok = check_types_1(AddList(type(Str)), type(Res))) of
+        ok -> ok;
+        false -> ok
+    catch _:{badmatch, {T1,T2}} ->
+            io:format("Failed: ~p ~p: ~p ~p~n",[Line, Func, T1, T2]),
+            io:format(" ~p => ~p~n", [Str, Res]),
+            error;
+          _:Reason ->
+            %% Fetch the stacktrace first and only once: it belongs to
+            %% the latest exception of the process, so it must be read
+            %% before any other call gets a chance to replace it.
+            Stacktrace = erlang:get_stacktrace(),
+            io:format("Crash: ~p in~n ~p~n",[Reason, Stacktrace]),
+            io:format("Failed: ~p ~p: ~p => ~p~n", [Line, Func, Str, Res]),
+            exit({Reason, Stacktrace})
+    end.
+
+%% Decides whether a result type is acceptable for a given input type.
+%% Both arguments are type descriptors as produced by type/1.
+%% Returns ok for an accepted combination, otherwise the pair
+%% {InputType, ResultType} so the caller can report it.
+%% The alternatives are tried from top to bottom.
+check_types_1(InType, OutType) ->
+    case {InType, OutType} of
+        {Same, Same} -> ok;
+        {B1, B2} when is_binary(B1), is_binary(B2) -> ok;
+        %% List input may give an empty, flat or nested list result
+        {{list, _}, {list, undefined}} -> ok;
+        {{list, _}, {list, codepoints}} -> ok;
+        {{list, _}, {list, {list, codepoints}}} -> ok;
+        {{list, {list, _}}, {list, {list, codepoints}}} -> ok;
+        %% Mixed or deep input accepts any result type
+        {mixed, _} -> ok;
+        {{list, binary}, binary} -> ok;
+        {{list, binary}, {other, _, _}} -> ok; %% take
+        {{list, deep}, _} -> ok;
+        {{list, {list, deep}}, _} -> ok;
+        _ -> {InType, OutType}
+    end.
+
+%% Classifies a string argument or result:
+%%   binary                  - a binary
+%%   {list, undefined}       - the empty list
+%%   {list, codepoints}      - a list of integers only
+%%   {list, {list, codepoints}} - no binaries, and no element is a list
+%%                             that itself contains a list or binary
+%%   {list, deep}            - no binaries at the top level, but some
+%%                             element is a list holding a list/binary
+%%   {list, binary}          - every element is a binary
+%%   mixed                   - binaries together with other elements
+%% A pair is classified by its two members; any other term gives
+%% {other, Term}.
+type(Bin) when is_binary(Bin) ->
+    binary;
+type([]) ->
+    {list, undefined};
+type([_|_] = List) ->
+    case all(fun(E) -> not is_binary(E) end, List) of
+        true -> type_no_binaries(List);
+        false -> type_with_binaries(List)
+    end;
+type({R1,R2}) ->
+    case {type(R1),type(R2)} of
+        {Same,Same} -> Same;
+        %% An empty part next to a codepoint list counts as codepoints
+        {{list,undefined}, {list,codepoints}} -> {list,codepoints};
+        {{list,codepoints}, {list,undefined}} -> {list,codepoints};
+        {T1,T2} -> {other, T1,T2}
+    end;
+type(Other) ->
+    {other, Other}.
+
+%% Classification of a non-empty list without top-level binaries.
+type_no_binaries(List) ->
+    case all(fun erlang:is_integer/1, List) of
+        true ->
+            {list, codepoints};
+        false ->
+            IsChunk = fun(E) -> is_list(E) orelse is_binary(E) end,
+            IsDeep = fun(Sub) when is_list(Sub) -> lists:any(IsChunk, Sub);
+                        (_) -> false
+                     end,
+            case lists:filter(IsDeep, List) of
+                [] -> {list, {list, codepoints}};
+                [_|_] -> {list, deep}
+            end
+    end.
+
+%% Classification of a list holding at least one top-level binary.
+type_with_binaries(List) ->
+    case all(fun erlang:is_binary/1, List) of
+        true -> {list, binary};
+        false -> mixed
+    end.
+
+%% Like lists:all/2, but also accepts an improper list whose tail is a
+%% binary: the predicate is then applied to that tail as well.
+%% Stops at the first element for which Check does not hold.
+all(Check, Tail) when is_binary(Tail) ->
+    Check(Tail);
+all(_Check, []) ->
+    true;
+all(Check, [Elem|Rest]) ->
+    Check(Elem) andalso all(Check, Rest).
+
+%% Tells whether the result type of the given string function should be
+%% type-checked: false for reverse, pad and replace, true for
+%% everything else.
+needs_check(Func) ->
+    not lists:member(Func, [reverse, pad, replace]).
+
+%%%% Timer stuff
+
+%% Times Fun on the file contents Bin, converted according to Mode
+%% (see mode/2). The measurement runs in a separate linked process and
+%% its {N, Mean, Stdev, LastResult} tuple is returned to the caller.
+time_func(Fun, Mode, Bin) ->
+    timer:sleep(100), %% Let emulator catch up and clean things before test runs
+    Parent = self(),
+    Worker = spawn_link(
+               fun() ->
+                       Input = mode(Mode, Bin),
+                       Parent ! {self(), time_func(0,0,0, Fun, Input, undefined)}
+               end),
+    %% Only accept the message coming from the process started above
+    receive
+        {Worker, Stats} -> Stats
+    end.
+
+%% Measurement loop: runs Fun(Str) until 50 samples are collected,
+%% accumulating the sum and the sum of squares of the times reported by
+%% timer:tc/1. Returns {N, Mean, Stdev, Res} where Mean and Stdev are
+%% rounded to integers and Res is the value of the last Fun call.
+time_func(N,Sum,SumSq, Fun, Str, _) when N < 50 ->
+    {Time, Res} = timer:tc(fun() -> Fun(Str) end),
+    time_func(N+1,Sum+Time,SumSq+Time*Time, Fun, Str, Res);
+time_func(N,Sum,SumSq, _, _, Res) ->
+    Mean = round(Sum / N),
+    %% Sample variance from the running sums. Floating-point
+    %% cancellation can leave a tiny negative value when the samples are
+    %% (almost) equal, which would make math:sqrt/1 raise badarith, so
+    %% the value is clamped at zero.
+    Variance = (SumSq - (Sum*Sum/N))/(N - 1),
+    Stdev = round(math:sqrt(erlang:max(0.0, Variance))),
+    {N, Mean, Stdev, Res}.
+
+%% Converts the benchmark input to the requested representation:
+%% list decodes the binary into a list of characters, binary returns it
+%% unchanged.
+mode(list, Data) ->
+    unicode:characters_to_list(Data);
+mode(binary, Data) ->
+    Data.
+
%%
-%% Test cases starts here.
+%% Test cases for the old list-based string functions start here.
%%
len(Config) when is_list(Config) ->
@@ -80,16 +932,14 @@ len(Config) when is_list(Config) ->
{'EXIT',_} = (catch string:len({})),
ok.
-equal(Config) when is_list(Config) ->
+old_equal(Config) when is_list(Config) ->
true = string:equal("", ""),
false = string:equal("", " "),
true = string:equal("laban", "laban"),
false = string:equal("skvimp", "skvump"),
- %% invalid arg type
- true = string:equal(2, 2), % not good, should crash
ok.
-concat(Config) when is_list(Config) ->
+old_concat(Config) when is_list(Config) ->
"erlang rules" = string:concat("erlang ", "rules"),
"" = string:concat("", ""),
"x" = string:concat("x", ""),
@@ -130,6 +980,7 @@ str_rstr(Config) when is_list(Config) ->
3 = string:rstr("xxxx", "xx"),
3 = string:str("xy z yx", " z"),
3 = string:rstr("xy z yx", " z"),
+ 3 = string:str("aaab", "ab"),
%% invalid arg type
{'EXIT',_} = (catch string:str(hello, "he")),
%% invalid arg type
@@ -184,7 +1035,7 @@ substr(Config) when is_list(Config) ->
{'EXIT',_} = (catch string:substr("1234", "1")),
ok.
-tokens(Config) when is_list(Config) ->
+old_tokens(Config) when is_list(Config) ->
[] = string:tokens("",""),
[] = string:tokens("abc","abc"),
["abc"] = string:tokens("abc", ""),
@@ -221,7 +1072,7 @@ replace_sep(C, Seps, New) ->
chars(Config) when is_list(Config) ->
[] = string:chars($., 0),
[] = string:chars($., 0, []),
- 10 = length(string:chars(32, 10, [])),
+ 10 = erlang:length(string:chars(32, 10, [])),
"aaargh" = string:chars($a, 3, "rgh"),
%% invalid arg type
{'EXIT',_} = (catch string:chars($x, [])),
@@ -231,7 +1082,7 @@ copies(Config) when is_list(Config) ->
"" = string:copies("", 10),
"" = string:copies(".", 0),
"." = string:copies(".", 1),
- 30 = length(string:copies("123", 10)),
+ 30 = erlang:length(string:copies("123", 10)),
%% invalid arg type
{'EXIT',_} = (catch string:copies("hej", -1)),
{'EXIT',_} = (catch string:copies("hej", 2.0)),
@@ -360,7 +1211,7 @@ to_integer(Config) when is_list(Config) ->
ok.
test_to_integer(Str) ->
- io:format("Checking ~p~n", [Str]),
+ %% io:format("Checking ~p~n", [Str]),
case string:to_integer(Str) of
{error,_Reason} = Bad ->
{'EXIT',_} = (catch list_to_integer(Str)),
@@ -403,7 +1254,7 @@ to_float(Config) when is_list(Config) ->
ok.
test_to_float(Str) ->
- io:format("Checking ~p~n", [Str]),
+ %% io:format("Checking ~p~n", [Str]),
case string:to_float(Str) of
{error,_Reason} = Bad ->
{'EXIT',_} = (catch list_to_float(Str)),
@@ -419,7 +1270,7 @@ to_upper_to_lower(Config) when is_list(Config) ->
All = lists:seq(0, 255),
UC = string:to_upper(All),
- 256 = length(UC),
+ 256 = erlang:length(UC),
all_upper_latin1(UC, 0),
LC = string:to_lower(All),
@@ -450,7 +1301,7 @@ all_lower_latin1([C|T], C) when 0 =< C, C < $A;
all_lower_latin1([H|T], C) when $A =< C, C =< $Z;
16#C0 =< C, C =< 16#F6;
16#C8 =< C, C =< 16#DE ->
- io:format("~p\n", [{H,C}]),
+ % io:format("~p\n", [{H,C}]),
H = C + 32,
all_lower_latin1(T, C+1);
all_lower_latin1([], 256) -> ok.