New unicode aware string module that works with unicode:chardata()

Works with unicode:chardata() as input, as was decided at an OTP board meeting in response to EEP-35 a long time ago. Works on grapheme clusters as the base unit, with a few exceptions: it does not handle classic (nor nfd'ified) Hangul nor the extended grapheme clusters such as the prepend class, since that would make handling binaries as input/output very slow. List input => list output, binary input => binary output and mixed input => mixed output for all find/split functions, so that results can be post-processed without the need to invoke unicode:characters_to_list|binary for intermediate data. pad functions return lists of unicode:chardata() for performance.
author: Dan Gudmundsson <[email protected]> 2017-04-03 12:19:21 +0200
committer: Dan Gudmundsson <[email protected]> 2017-04-24 12:16:56 +0200
commit: 2c72e662bad11a41839780f86680d4bb05367c78 (patch)
tree: 01e9ae9b32fdb953392e571a0773fb2cd059c498 /lib
parent: 75fc94b8b462d7b7f6dd4b706bbe32cff77ee575 (diff)
download: otp-2c72e662bad11a41839780f86680d4bb05367c78.tar.gz
otp-2c72e662bad11a41839780f86680d4bb05367c78.tar.bz2
otp-2c72e662bad11a41839780f86680d4bb05367c78.zip
4 files changed, 2837 insertions, 133 deletions
diff --git a/lib/stdlib/doc/src/string.xml b/lib/stdlib/doc/src/string.xml
index dddedf1132..dc83c40a9a 100644
--- a/lib/stdlib/doc/src/string.xml
+++ b/lib/stdlib/doc/src/string.xml
@@ -36,8 +36,613 @@
   <modulesummary>String processing functions.</modulesummary>
   <description>
     <p>This module provides functions for string processing.</p>
+    <p>A string in this module is represented by <seealso marker="unicode#type-chardata">
+    <c>unicode:chardata()</c></seealso>, that is, a list of codepoints,
+    binaries with UTF-8-encoded codepoints
+    (<em>UTF-8 binaries</em>), or a mix of the two.</p>
+    <code>
+"abcd"               is a valid string
+&lt;&lt;"abcd">>           is a valid string
+["abcd"]             is a valid string
+&lt;&lt;"abc..åäö"/utf8>>  is a valid string
+&lt;&lt;"abc..åäö">>       is NOT a valid string,
+                     but a binary with Latin-1-encoded codepoints
+[&lt;&lt;"abc">>, "..åäö"] is a valid string
+[atom]               is NOT a valid string</code>
+    <p>
+      This module operates on grapheme clusters. A <em>grapheme cluster</em>
+      is a user-perceived character, which can be represented by several
+      codepoints.
+    </p>
+    <code>
+"å"  [229] or [97, 778]
+"e̊"  [101, 778]</code>
+    <p>
+      The string length of "ß↑e̊" is 3, even though it is represented by the
+      codepoints <c>[223,8593,101,778]</c> or the UTF-8 binary
+      <c>&lt;&lt;195,159,226,134,145,101,204,138>></c>.
+    </p>
+    <p>
+      Grapheme clusters for codepoints of class <c>prepend</c>
+      and non-modern (or decomposed) Hangul are not handled for performance
+      reasons in
+      <seealso marker="#find/3"><c>find/3</c></seealso>,
+      <seealso marker="#replace/3"><c>replace/3</c></seealso>,
+      <seealso marker="#split/2"><c>split/2</c></seealso>,
+      <seealso marker="#lexemes/2"><c>lexemes/2</c></seealso> and
+      <seealso marker="#trim/3"><c>trim/3</c></seealso>.
+    </p>
+    <p>
+      Splitting and appending strings is to be done on grapheme cluster
+      borders.
+      There is no verification that the results of appending strings are
+      valid or normalized.
+    </p>
+    <p>
+      Most of the functions expect all input to be normalized to one form,
+      see for example <seealso marker="unicode#characters_to_nfc_list/1">
+      <c>unicode:characters_to_nfc_list/1</c></seealso>.
+    </p>
+    <p>
+      Language or locale specific handling of input is not considered
+      in any function.
+    </p>
+    <p>
+      The functions can crash for non-valid input strings. For example,
+      the functions expect UTF-8 binaries but not all functions
+      verify that all binaries are encoded correctly.
+    </p>
+    <p>
+      Unless otherwise specified the return value type is the same as
+      the input type. That is, binary input returns binary output,
+      list input returns a list output, and mixed input can return a
+      mixed output.</p>
+      <code>
+1> string:trim("  sarah  ").
+"sarah"
+2> string:trim(&lt;&lt;"  sarah  ">>).
+&lt;&lt;"sarah">>
+3> string:lexemes("foo bar", " ").
+["foo","bar"]
+4> string:lexemes(&lt;&lt;"foo bar">>, " ").
+[&lt;&lt;"foo">>,&lt;&lt;"bar">>]</code>
+    <p>This module has been reworked in Erlang/OTP 20 to
+    handle <seealso marker="unicode#type-chardata">
+    <c>unicode:chardata()</c></seealso> and operate on grapheme
+    clusters. The <seealso marker="#oldapi"> <c>old
+    functions</c></seealso> that only work on Latin-1 lists as input
+    are still available but should not be
+    used. They will be deprecated in Erlang/OTP 21.
+    </p>
   </description>
 
+  <datatypes>
+    <datatype>
+      <name name="direction"/>
+      <name name="grapheme_cluster"/>
+      <desc>
+        <p>A user-perceived character, consisting of one or more
+        codepoints.</p>
+      </desc>
+    </datatype>
+  </datatypes>
+
+  <funcs>
+
+    <func>
+      <name name="casefold" arity="1"/>
+      <fsummary>Convert a string to a comparable string.</fsummary>
+      <desc>
+        <p>
+	  Converts <c><anno>String</anno></c> to a case-agnostic
+	  comparable string. Function <c>casefold/1</c> is preferred
+	  over <c>lowercase/1</c> when two strings are to be compared
+	  for equality. See also <seealso marker="#equal/4"><c>equal/4</c></seealso>.
+	</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:casefold("Ω and ẞ SHARP S").</input>
+"ω and ss sharp s"</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="chomp" arity="1"/>
+      <fsummary>Remove trailing end of line control characters.</fsummary>
+      <desc>
+        <p>
+	  Returns a string where any trailing <c>\n</c> or
+	  <c>\r\n</c> have been removed from <c><anno>String</anno></c>.
+	</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:chomp(&lt;&lt;"\nHello\n\n">>).</input>
+&lt;&lt;"\nHello">>
+2> <input>string:chomp("\nHello\r\r\n").</input>
+"\nHello\r"</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="equal" arity="2"/>
+      <name name="equal" arity="3"/>
+      <name name="equal" arity="4"/>
+      <fsummary>Test string equality.</fsummary>
+      <desc>
+        <p>
+	  Returns <c>true</c> if <c><anno>A</anno></c> and
+          <c><anno>B</anno></c> are equal, otherwise <c>false</c>.
+	</p>
+	<p>
+	  If <c><anno>IgnoreCase</anno></c> is <c>true</c>
+	  the function does <seealso marker="#casefold/1">
+	  <c>casefold</c>ing</seealso> on the fly before the equality test.
+	</p>
+	<p>If <c><anno>Norm</anno></c> is not <c>none</c>
+	the function applies normalization on the fly before the equality test.
+	There are four available normalization forms:
+	<seealso marker="unicode#characters_to_nfc_list/1"> <c>nfc</c></seealso>,
+	<seealso marker="unicode#characters_to_nfd_list/1"> <c>nfd</c></seealso>,
+	<seealso marker="unicode#characters_to_nfkc_list/1"> <c>nfkc</c></seealso>, and
+	<seealso marker="unicode#characters_to_nfkd_list/1"> <c>nfkd</c></seealso>.
+	</p>
+	<p>By default,
+	<c><anno>IgnoreCase</anno></c> is <c>false</c> and
+	<c><anno>Norm</anno></c> is <c>none</c>.</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:equal("åäö", &lt;&lt;"åäö"/utf8>>).</input>
+true
+2> <input>string:equal("åäö", unicode:characters_to_nfd_binary("åäö")).</input>
+false
+3> <input>string:equal("åäö", unicode:characters_to_nfd_binary("ÅÄÖ"), true, nfc).</input>
+true</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="find" arity="2"/>
+      <name name="find" arity="3"/>
+      <fsummary>Find start of substring.</fsummary>
+      <desc>
+        <p>
+	  Removes anything before <c><anno>SearchPattern</anno></c> in <c><anno>String</anno></c>
+	  and returns the remainder of the string or <c>nomatch</c> if <c><anno>SearchPattern</anno></c> is not
+	  found.
+          <c><anno>Dir</anno></c>, which can be <c>leading</c> or
+	  <c>trailing</c>, indicates from which direction characters
+	  are to be searched.
+        </p>
+	<p>
+          By default, <c><anno>Dir</anno></c> is <c>leading</c>.
+	</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:find("ab..cd..ef", ".").</input>
+"..cd..ef"
+2> <input>string:find(&lt;&lt;"ab..cd..ef">>, "..", trailing).</input>
+&lt;&lt;"..ef">>
+3> <input>string:find(&lt;&lt;"ab..cd..ef">>, "x", leading).</input>
+nomatch
+4> <input>string:find("ab..cd..ef", "x", trailing).</input>
+nomatch</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="is_empty" arity="1"/>
+      <fsummary>Check if the string is empty.</fsummary>
+      <desc>
+        <p>Returns <c>true</c> if <c><anno>String</anno></c> is the
+        empty string, otherwise <c>false</c>.</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:is_empty("foo").</input>
+false
+2> <input>string:is_empty(["",&lt;&lt;>>]).</input>
+true</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="length" arity="1"/>
+      <fsummary>Calculate length of the string.</fsummary>
+      <desc>
+        <p>
+	  Returns the number of grapheme clusters in <c><anno>String</anno></c>.
+	</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:length("ß↑e̊").</input>
+3
+2> <input>string:length(&lt;&lt;195,159,226,134,145,101,204,138>>).</input>
+3</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="lexemes" arity="2"/>
+      <fsummary>Split string into lexemes.</fsummary>
+      <desc>
+	<p>
+	  Returns a list of lexemes in <c><anno>String</anno></c>, separated
+          by the grapheme clusters in <c><anno>SeparatorList</anno></c>.
+	</p>
+	<p>
+	  Notice that, as shown in this example, two or more
+          adjacent separator grapheme clusters in <c><anno>String</anno></c>
+          are treated as one. That is, there are no empty
+          strings in the resulting list of lexemes.
+	  See also <seealso marker="#split/3"><c>split/3</c></seealso> which returns
+	  empty strings.
+	</p>
+	<p>Notice that <c>[$\r,$\n]</c> is one grapheme cluster.</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:lexemes("abc de̊fxxghix jkl\r\nfoo", "x e" ++ [[$\r,$\n]]).</input>
+["abc","de̊f","ghi","jkl","foo"]
+2> <input>string:lexemes(&lt;&lt;"abc de̊fxxghix jkl\r\nfoo"/utf8>>, "x e" ++ [$\r,$\n]).</input>
+[&lt;&lt;"abc">>,&lt;&lt;"de̊f"/utf8>>,&lt;&lt;"ghi">>,&lt;&lt;"jkl\r\nfoo">>]</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="lowercase" arity="1"/>
+      <fsummary>Convert a string to lowercase.</fsummary>
+      <desc>
+        <p>
+	  Converts <c><anno>String</anno></c> to lowercase.
+	</p>
+	<p>
+	  Notice that function <seealso marker="#casefold/1"><c>casefold/1</c></seealso>
+	  should be used when converting a string to
+	  be tested for equality.
+	</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:lowercase(string:uppercase("Michał")).</input>
+"michał"</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="next_codepoint" arity="1"/>
+      <fsummary>Pick the first codepoint.</fsummary>
+      <desc>
+        <p>
+	  Returns the first codepoint in <c><anno>String</anno></c>
+	  and the rest of <c><anno>String</anno></c> in the tail.
+	</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:next_codepoint(unicode:characters_to_binary("e̊fg")).</input>
+[101|&lt;&lt;"̊fg"/utf8>>]</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="next_grapheme" arity="1"/>
+      <fsummary>Pick the first grapheme cluster.</fsummary>
+      <desc>
+        <p>
+	  Returns the first grapheme cluster in <c><anno>String</anno></c>
+	  and the rest of <c><anno>String</anno></c> in the tail.
+	</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:next_grapheme(unicode:characters_to_binary("e̊fg")).</input>
+["e̊"|&lt;&lt;"fg">>]</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="nth_lexeme" arity="3"/>
+      <fsummary>Pick the nth lexeme.</fsummary>
+      <desc>
+	<p>Returns lexeme number <c><anno>N</anno></c> in
+	<c><anno>String</anno></c>, where lexemes are separated by
+	the grapheme clusters in <c><anno>SeparatorList</anno></c>.
+	</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:nth_lexeme("abc.de̊f.ghiejkl", 3, ".e").</input>
+"ghi"</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="pad" arity="2"/>
+      <name name="pad" arity="3"/>
+      <name name="pad" arity="4"/>
+      <fsummary>Pad a string to given length.</fsummary>
+      <desc>
+        <p>
+	  Pads <c><anno>String</anno></c> to <c><anno>Length</anno></c> with
+	  grapheme cluster <c><anno>Char</anno></c>.
+	  <c><anno>Dir</anno></c>, which can be <c>leading</c>, <c>trailing</c>,
+	  or <c>both</c>, indicates where the padding should be added.
+	</p>
+	<p>By default, <c><anno>Char</anno></c> is <c>$\s</c> and
+	<c><anno>Dir</anno></c> is <c>trailing</c>.
+	</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:pad(&lt;&lt;"He̊llö"/utf8>>, 8).</input>
+[&lt;&lt;72,101,204,138,108,108,195,182>>,32,32,32]
+2> <input>io:format("'~ts'~n",[string:pad("He̊llö", 8, leading)]).</input>
+'   He̊llö'
+3> <input>io:format("'~ts'~n",[string:pad("He̊llö", 8, both)]).</input>
+' He̊llö  '</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="prefix" arity="2"/>
+      <fsummary>Remove prefix from string.</fsummary>
+      <desc>
+        <p>
+	  If <c><anno>Prefix</anno></c> is the prefix of
+	  <c><anno>String</anno></c>, removes it and returns the
+	  remainder of <c><anno>String</anno></c>, otherwise returns
+	  <c>nomatch</c>.
+	</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:prefix(&lt;&lt;"prefix of string">>, "pre").</input>
+&lt;&lt;"fix of string">>
+2> <input>string:prefix("pre", "prefix").</input>
+nomatch</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="replace" arity="3"/>
+      <name name="replace" arity="4"/>
+      <fsummary>Replace a pattern in string.</fsummary>
+      <desc>
+        <p>
+	  Replaces <c><anno>SearchPattern</anno></c> in <c><anno>String</anno></c>
+	  with <c><anno>Replacement</anno></c>.
+	  <c><anno>Where</anno></c>, default <c>leading</c>, indicates whether
+	  the <c>leading</c>, the <c>trailing</c> or <c>all</c> encounters of
+	  <c><anno>SearchPattern</anno></c> are to be replaced.
+	</p>
+	<p>Can be implemented as:</p>
+	<pre>lists:join(Replacement, split(String, SearchPattern, Where)).</pre>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:replace(&lt;&lt;"ab..cd..ef">>, "..", "*").</input>
+[&lt;&lt;"ab">>,"*",&lt;&lt;"cd..ef">>]
+2> <input>string:replace(&lt;&lt;"ab..cd..ef">>, "..", "*", all).</input>
+[&lt;&lt;"ab">>,"*",&lt;&lt;"cd">>,"*",&lt;&lt;"ef">>]</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="reverse" arity="1"/>
+      <fsummary>Reverse a string.</fsummary>
+      <desc>
+        <p>
+	  Returns the reverse list of the grapheme clusters in <c><anno>String</anno></c>.
+	</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>Reverse = string:reverse(unicode:characters_to_nfd_binary("ÅÄÖ")).</input>
+[[79,776],[65,776],[65,778]]
+2> <input>io:format("~ts~n",[Reverse]).</input>
+ÖÄÅ</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="slice" arity="2"/>
+      <name name="slice" arity="3"/>
+      <fsummary>Extract a part of a string.</fsummary>
+      <desc>
+	<p>Returns a substring of <c><anno>String</anno></c> of
+	at most <c><anno>Length</anno></c> grapheme clusters, starting at position
+	<c><anno>Start</anno></c>.</p>
+	<p>By default, <c><anno>Length</anno></c> is <c>infinity</c>.</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:slice(&lt;&lt;"He̊llö Wörld"/utf8>>, 4).</input>
+&lt;&lt;"ö Wörld"/utf8>>
+2> <input>string:slice(["He̊llö ", &lt;&lt;"Wörld"/utf8>>], 4,4).</input>
+"ö Wö"
+3> <input>string:slice(["He̊llö ", &lt;&lt;"Wörld"/utf8>>], 4,50).</input>
+"ö Wörld"</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="split" arity="2"/>
+      <name name="split" arity="3"/>
+      <fsummary>Split a string into substrings.</fsummary>
+      <desc>
+        <p>
+	  Splits <c><anno>String</anno></c> where <c><anno>SearchPattern</anno></c>
+	  is encountered and returns the remaining parts.
+	  <c><anno>Where</anno></c>, default <c>leading</c>, indicates whether
+	  the <c>leading</c>, the <c>trailing</c> or <c>all</c> encounters of
+	  <c><anno>SearchPattern</anno></c> will split <c><anno>String</anno></c>.
+	</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:split("ab..bc..cd", "..").</input>
+["ab","bc..cd"]
+2> <input>string:split(&lt;&lt;"ab..bc..cd">>, "..", trailing).</input>
+[&lt;&lt;"ab..bc">>,&lt;&lt;"cd">>]
+3> <input>string:split(&lt;&lt;"ab..bc....cd">>, "..", all).</input>
+[&lt;&lt;"ab">>,&lt;&lt;"bc">>,&lt;&lt;>>,&lt;&lt;"cd">>]</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="take" arity="2"/>
+      <name name="take" arity="3"/>
+      <name name="take" arity="4"/>
+      <fsummary>Take leading or trailing parts.</fsummary>
+      <desc>
+        <p>Takes characters from <c><anno>String</anno></c> as long as
+        the characters are members of set <c><anno>Characters</anno></c>
+	or the complement of set <c><anno>Characters</anno></c>.
+        <c><anno>Dir</anno></c>,
+        which can be <c>leading</c> or <c>trailing</c>, indicates from
+        which direction characters are to be taken.
+	</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:take("abc0z123", lists:seq($a,$z)).</input>
+{"abc","0z123"}
+2> <input>string:take(&lt;&lt;"abc0z123">>, lists:seq($0,$9), true, leading).</input>
+{&lt;&lt;"abc">>,&lt;&lt;"0z123">>}
+3> <input>string:take("abc0z123", lists:seq($0,$9), false, trailing).</input>
+{"abc0z","123"}
+4> <input>string:take(&lt;&lt;"abc0z123">>, lists:seq($a,$z), true, trailing).</input>
+{&lt;&lt;"abc0z">>,&lt;&lt;"123">>}</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="titlecase" arity="1"/>
+      <fsummary>Convert a string to titlecase.</fsummary>
+      <desc>
+        <p>
+	  Converts <c><anno>String</anno></c> to titlecase.
+	</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:titlecase("ß is a SHARP s").</input>
+"Ss is a SHARP s"</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="to_float" arity="1"/>
+      <fsummary>Return a float whose text representation is the integers
+        (ASCII values) of a string.</fsummary>
+      <desc>
+        <p>Argument <c><anno>String</anno></c> is expected to start with a
+          valid text represented float (the digits are ASCII values).
+          Remaining characters in the string after the float are returned in
+          <c><anno>Rest</anno></c>.</p>
+        <p><em>Example:</em></p>
+        <pre>
+> <input>{F1,Fs} = string:to_float("1.0-1.0e-1"),</input>
+> <input>{F2,[]} = string:to_float(Fs),</input>
+> <input>F1+F2.</input>
+0.9
+> <input>string:to_float("3/2=1.5").</input>
+{error,no_float}
+> <input>string:to_float("-1.5eX").</input>
+{-1.5,"eX"}</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="to_integer" arity="1"/>
+      <fsummary>Return an integer whose text representation is the integers
+        (ASCII values) of a string.</fsummary>
+      <desc>
+        <p>Argument <c><anno>String</anno></c> is expected to start with a
+          valid text represented integer (the digits are ASCII values).
+          Remaining characters in the string after the integer are returned in
+          <c><anno>Rest</anno></c>.</p>
+        <p><em>Example:</em></p>
+        <pre>
+> <input>{I1,Is} = string:to_integer("33+22"),</input>
+> <input>{I2,[]} = string:to_integer(Is),</input>
+> <input>I1-I2.</input>
+11
+> <input>string:to_integer("0.5").</input>
+{0,".5"}
+> <input>string:to_integer("x=2").</input>
+{error,no_integer}</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="to_graphemes" arity="1"/>
+      <fsummary>Convert a string to a list of grapheme clusters.</fsummary>
+      <desc>
+        <p>
+	  Converts <c><anno>String</anno></c> to a list of grapheme clusters.
+	</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:to_graphemes("ß↑e̊").</input>
+[223,8593,[101,778]]
+2> <input>string:to_graphemes(&lt;&lt;"ß↑e̊"/utf8>>).</input>
+[223,8593,[101,778]]</pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="trim" arity="1"/>
+      <name name="trim" arity="2"/>
+      <name name="trim" arity="3"/>
+      <fsummary>Trim leading or trailing, or both, characters.</fsummary>
+      <desc>
+	<p>
+	  Returns a string, where leading or trailing, or both,
+	  <c><anno>Characters</anno></c> have been removed.
+	  <c><anno>Dir</anno></c> which can be <c>leading</c>, <c>trailing</c>,
+	  or <c>both</c>, indicates from which direction characters
+	  are to be removed.
+	</p>
+	<p> Default <c><anno>Characters</anno></c> are the set of
+	nonbreakable whitespace codepoints, defined as
+	Pattern_White_Space in
+	<url href="http://unicode.org/reports/tr31/">Unicode Standard Annex #31</url>.
+	By default, <c><anno>Dir</anno></c> is <c>both</c>.
+	</p>
+	<p>
+	  Notice that <c>[$\r,$\n]</c> is one grapheme cluster according
+	  to the Unicode Standard.
+	</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:trim("\t  Hello  \n").</input>
+"Hello"
+2> <input>string:trim(&lt;&lt;"\t  Hello  \n">>, leading).</input>
+&lt;&lt;"Hello  \n">>
+3> <input>string:trim(&lt;&lt;".Hello.\n">>, trailing, "\n.").</input>
+&lt;&lt;".Hello">></pre>
+      </desc>
+    </func>
+
+    <func>
+      <name name="uppercase" arity="1"/>
+      <fsummary>Convert a string to uppercase.</fsummary>
+      <desc>
+        <p>
+	  Converts <c><anno>String</anno></c> to uppercase.
+	</p>
+	<p>See also <seealso marker="#titlecase/1"><c>titlecase/1</c></seealso>.</p>
+	<p><em>Example:</em></p>
+	<pre>
+1> <input>string:uppercase("Michał").</input>
+"MICHAŁ"</pre>
+      </desc>
+    </func>
+
+  </funcs>
+
+  <section>
+    <marker id="oldapi"/>
+    <title>Obsolete API functions</title>
+    <p>Here follow the functions of the old API.
+    These functions only work on a list of Latin-1 characters.
+    </p>
+    <note><p>
+      The functions are kept for backward compatibility, but are
+      not recommended.
+      They will be deprecated in Erlang/OTP 21.
+    </p>
+    <p>Any undocumented functions in <c>string</c> are not to be used.</p>
+    </note>
+  </section>
+
   <funcs>
     <func>
       <name name="centre" arity="2"/>
@@ -47,17 +652,24 @@
         <p>Returns a string, where <c><anno>String</anno></c> is centered in the
           string and surrounded by blanks or <c><anno>Character</anno></c>.
 	  The resulting string has length <c><anno>Number</anno></c>.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="#pad/3"><c>pad/3</c></seealso>.
+	</p>
       </desc>
     </func>
 
     <func>
       <name name="chars" arity="2"/>
       <name name="chars" arity="3"/>
-      <fsummary>Returns a string consisting of numbers of characters.</fsummary>
+      <fsummary>Return a string consisting of numbers of characters.</fsummary>
       <desc>
         <p>Returns a string consisting of <c><anno>Number</anno></c> characters
           <c><anno>Character</anno></c>. Optionally, the string can end with
           string <c><anno>Tail</anno></c>.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="lists#duplicate/2"><c>lists:duplicate/2</c></seealso>.</p>
       </desc>
     </func>
 
@@ -69,6 +681,9 @@
         <p>Returns the index of the first occurrence of
           <c><anno>Character</anno></c> in <c><anno>String</anno></c>. Returns
           <c>0</c> if <c><anno>Character</anno></c> does not occur.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="#find/2"><c>find/2</c></seealso>.</p>
       </desc>
     </func>
 
@@ -79,6 +694,16 @@
         <p>Concatenates <c><anno>String1</anno></c> and
           <c><anno>String2</anno></c> to form a new string
           <c><anno>String3</anno></c>, which is returned.</p>
+	<p>
+	  This function is <seealso marker="#oldapi">obsolete</seealso>.
+	  Use <c>[<anno>String1</anno>, <anno>String2</anno>]</c> as
+	  <c>Data</c> argument, and call
+	  <seealso marker="unicode#characters_to_list/2">
+	  <c>unicode:characters_to_list/2</c></seealso> or
+	  <seealso marker="unicode#characters_to_binary/2">
+	  <c>unicode:characters_to_binary/2</c></seealso>
+	  to flatten the output.
+	</p>
       </desc>
     </func>
 
@@ -88,6 +713,9 @@
       <desc>
         <p>Returns a string containing <c><anno>String</anno></c> repeated
           <c><anno>Number</anno></c> times.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="lists#duplicate/2"><c>lists:duplicate/2</c></seealso>.</p>
       </desc>
     </func>
 
@@ -98,6 +726,9 @@
         <p>Returns the length of the maximum initial segment of
           <c><anno>String</anno></c>, which consists entirely of characters
           not from <c><anno>Chars</anno></c>.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="#take/3"><c>take/3</c></seealso>.</p>
         <p><em>Example:</em></p>
         <code type="none">
 > string:cspan("\t    abcdef", " \t").
@@ -106,20 +737,14 @@
     </func>
 
     <func>
-      <name name="equal" arity="2"/>
-      <fsummary>Test string equality.</fsummary>
-      <desc>
-        <p>Returns <c>true</c> if <c><anno>String1</anno></c> and
-          <c><anno>String2</anno></c> are equal, otherwise <c>false</c>.</p>
-      </desc>
-    </func>
-
-    <func>
       <name name="join" arity="2"/>
       <fsummary>Join a list of strings with separator.</fsummary>
       <desc>
         <p>Returns a string with the elements of <c><anno>StringList</anno></c>
           separated by the string in <c><anno>Separator</anno></c>.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="lists#join/2"><c>lists:join/2</c></seealso>.</p>
         <p><em>Example:</em></p>
         <code type="none">
 > join(["one", "two", "three"], ", ").
@@ -137,6 +762,10 @@
           fixed. If <c>length(<anno>String</anno>)</c> &lt;
           <c><anno>Number</anno></c>, then <c><anno>String</anno></c> is padded
           with blanks or <c><anno>Character</anno></c>s.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="#pad/2"><c>pad/2</c></seealso> or
+	<seealso marker="#pad/3"><c>pad/3</c></seealso>.</p>
         <p><em>Example:</em></p>
         <code type="none">
 > string:left("Hello",10,$.).
@@ -149,6 +778,9 @@
       <fsummary>Return the length of a string.</fsummary>
       <desc>
         <p>Returns the number of characters in <c><anno>String</anno></c>.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="#length/1"><c>length/1</c></seealso>.</p>
       </desc>
     </func>
 
@@ -160,6 +792,9 @@
         <p>Returns the index of the last occurrence of
           <c><anno>Character</anno></c> in <c><anno>String</anno></c>. Returns
           <c>0</c> if <c><anno>Character</anno></c> does not occur.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="#find/3"><c>find/3</c></seealso>.</p>
       </desc>
     </func>
 
@@ -173,6 +808,9 @@
           fixed. If the length of <c>(<anno>String</anno>)</c> &lt;
           <c><anno>Number</anno></c>, then <c><anno>String</anno></c> is padded
           with blanks or <c><anno>Character</anno></c>s.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="#pad/3"><c>pad/3</c></seealso>.</p>
         <p><em>Example:</em></p>
         <code type="none">
 > string:right("Hello", 10, $.).
@@ -188,6 +826,9 @@
           <c><anno>SubString</anno></c> begins in <c><anno>String</anno></c>.
           Returns <c>0</c> if <c><anno>SubString</anno></c>
           does not exist in <c><anno>String</anno></c>.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="#find/3"><c>find/3</c></seealso>.</p>
         <p><em>Example:</em></p>
         <code type="none">
 > string:rstr(" Hello Hello World World ", "Hello World").
@@ -202,6 +843,9 @@
         <p>Returns the length of the maximum initial segment of
           <c><anno>String</anno></c>, which consists entirely of characters
           from <c><anno>Chars</anno></c>.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="#take/2"><c>take/2</c></seealso>.</p>
         <p><em>Example:</em></p>
         <code type="none">
 > string:span("\t    abcdef", " \t").
@@ -217,6 +861,9 @@
           <c><anno>SubString</anno></c> begins in <c><anno>String</anno></c>.
           Returns <c>0</c> if <c><anno>SubString</anno></c>
           does not exist in <c><anno>String</anno></c>.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="#find/2"><c>find/2</c></seealso>.</p>
         <p><em>Example:</em></p>
         <code type="none">
 > string:str(" Hello Hello World World ", "Hello World").
@@ -230,12 +877,15 @@
       <name name="strip" arity="3"/>
       <fsummary>Strip leading or trailing characters.</fsummary>
       <desc>
-        <p>Returns a string, where leading and/or trailing blanks or a
+        <p>Returns a string, where leading or trailing, or both, blanks or a
           number of <c><anno>Character</anno></c> have been removed.
           <c><anno>Direction</anno></c>, which can be <c>left</c>, <c>right</c>,
           or <c>both</c>, indicates from which direction blanks are to be
           removed. <c>strip/1</c> is equivalent to
           <c>strip(String, both)</c>.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="#trim/3"><c>trim/3</c></seealso>.</p>
         <p><em>Example:</em></p>
         <code type="none">
 > string:strip("...Hello.....", both, $.).
@@ -251,6 +901,9 @@
         <p>Returns a substring of <c><anno>String</anno></c>, starting at
           position <c><anno>Start</anno></c> to the end of the string, or to
           and including position <c><anno>Stop</anno></c>.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="#slice/3"><c>slice/3</c></seealso>.</p>
         <p><em>Example:</em></p>
         <code type="none">
 sub_string("Hello World", 4, 8).
@@ -266,6 +919,9 @@ sub_string("Hello World", 4, 8).
         <p>Returns a substring of <c><anno>String</anno></c>, starting at
           position <c><anno>Start</anno></c>, and ending at the end of the
           string or at length <c><anno>Length</anno></c>.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="#slice/3"><c>slice/3</c></seealso>.</p>
         <p><em>Example:</em></p>
         <code type="none">
 > substr("Hello World", 4, 5).
@@ -281,6 +937,9 @@ sub_string("Hello World", 4, 8).
         <p>Returns the word in position <c><anno>Number</anno></c> of
           <c><anno>String</anno></c>. Words are separated by blanks or
           <c><anno>Character</anno></c>s.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="#nth_lexeme/3"><c>nth_lexeme/3</c></seealso>.</p>
         <p><em>Example:</em></p>
         <code type="none">
 > string:sub_word(" Hello old boy !",3,$o).
@@ -289,50 +948,6 @@ sub_string("Hello World", 4, 8).
     </func>
 
     <func>
-      <name name="to_float" arity="1"/>
-      <fsummary>Returns a float whose text representation is the integers
-        (ASCII values) in a string.</fsummary>
-      <desc>
-        <p>Argument <c><anno>String</anno></c> is expected to start with a
-          valid text represented float (the digits are ASCII values).
-          Remaining characters in the string after the float are returned in
-          <c><anno>Rest</anno></c>.</p>
-        <p><em>Example:</em></p>
-        <code type="none">
-> {F1,Fs} = string:to_float("1.0-1.0e-1"),
-> {F2,[]} = string:to_float(Fs),
-> F1+F2.
-0.9
-> string:to_float("3/2=1.5").
-{error,no_float}
-> string:to_float("-1.5eX").
-{-1.5,"eX"}</code>
-      </desc>
-    </func>
-
-    <func>
-      <name name="to_integer" arity="1"/>
-      <fsummary>Returns an integer whose text representation is the integers
-        (ASCII values) in a string.</fsummary>
-      <desc>
-        <p>Argument <c><anno>String</anno></c> is expected to start with a
-          valid text represented integer (the digits are ASCII values).
-          Remaining characters in the string after the integer are returned in
-          <c><anno>Rest</anno></c>.</p>
-        <p><em>Example:</em></p>
-        <code type="none">
-> {I1,Is} = string:to_integer("33+22"),
-> {I2,[]} = string:to_integer(Is),
-> I1-I2.
-11
-> string:to_integer("0.5").
-{0,".5"}
-> string:to_integer("x=2").
-{error,no_integer}</code>
-      </desc>
-    </func>
-
-    <func>
       <name name="to_lower" arity="1" clause_i="1"/>
       <name name="to_lower" arity="1" clause_i="2"/>
       <name name="to_upper" arity="1" clause_i="1"/>
@@ -346,6 +961,11 @@ sub_string("Hello World", 4, 8).
         <p>The specified string or character is case-converted. Notice that
           the supported character set is ISO/IEC 8859-1 (also called Latin 1);
           all values outside this set are unchanged</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>. Use
+	<seealso marker="#lowercase/1"><c>lowercase/1</c></seealso>,
+	<seealso marker="#uppercase/1"><c>uppercase/1</c></seealso>,
+	<seealso marker="#titlecase/1"><c>titlecase/1</c></seealso> or
+	<seealso marker="#casefold/1"><c>casefold/1</c></seealso>.</p>
       </desc>
     </func>
 
@@ -363,6 +983,9 @@ sub_string("Hello World", 4, 8).
           adjacent separator characters in <c><anno>String</anno></c>
           are treated as one. That is, there are no empty
           strings in the resulting list of tokens.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="#lexemes/2"><c>lexemes/2</c></seealso>.</p>
       </desc>
     </func>
 
@@ -373,6 +996,9 @@ sub_string("Hello World", 4, 8).
       <desc>
         <p>Returns the number of words in <c><anno>String</anno></c>, separated
           by blanks or <c><anno>Character</anno></c>.</p>
+	<p>This function is <seealso marker="#oldapi">obsolete</seealso>.
+	Use
+	<seealso marker="#lexemes/2"><c>lexemes/2</c></seealso>.</p>
         <p><em>Example:</em></p>
         <code type="none">
 > words(" Hello old boy!", $o).
@@ -387,10 +1013,7 @@ sub_string("Hello World", 4, 8).
       other. The reason is that this string package is the
       combination of two earlier packages and all functions of
       both packages have been retained.</p>
-
-    <note>
-      <p>Any undocumented functions in <c>string</c> are not to be used.</p>
-    </note>
   </section>
+
 </erlref>
 
diff --git a/lib/stdlib/doc/src/unicode_usage.xml b/lib/stdlib/doc/src/unicode_usage.xml
index a8ef8ff5c5..11b84f552a 100644
--- a/lib/stdlib/doc/src/unicode_usage.xml
+++ b/lib/stdlib/doc/src/unicode_usage.xml
@@ -65,7 +65,10 @@
 
 	<item><p>In Erlang/OTP 20.0, atoms and function can contain
 	Unicode characters. Module names are still restricted to
-	the ISO-Latin-1 range.</p></item>
+	the ISO-Latin-1 range.</p>
+	<p>Support was added for normalization forms in
+	<c>unicode</c> and the <c>string</c> module now handles
+	utf8-encoded binaries.</p></item>
       </list>
 
     <p>This section outlines the current Unicode support and gives some
@@ -110,23 +113,27 @@
       </item>
     </list>
 
-    <p>So, a conversion function must know not only one character at a time,
-      but possibly the whole sentence, the natural language to translate to,
-      the differences in input and output string length, and so on.
-      Erlang/OTP has currently no Unicode <c>to_upper</c>/<c>to_lower</c>
-      functionality, but publicly available libraries address these issues.</p>
-
-    <p>Another example is the accented characters, where the same glyph has two
-      different representations. The Swedish letter "ö" is one example.
-      The Unicode standard has a code point for it, but you can also write it
-      as "o" followed by "U+0308" (Combining Diaeresis, with the simplified
-      meaning that the last letter is to have "¨" above). They have the same
-      glyph. They are for most purposes the same, but have different
-      representations. For example, MacOS X converts all filenames to use
-      Combining Diaeresis, while most other programs (including Erlang) try to
-      hide that by doing the opposite when, for example, listing directories.
-      However it is done, it is usually important to normalize such
-      characters to avoid confusion.</p>
+    <p>So, a conversion function must know not only one character at a
+    time, but possibly the whole sentence, the natural language to
+    translate to, the differences in input and output string length,
+    and so on.  Erlang/OTP has currently no Unicode
+    <c>uppercase</c>/<c>lowercase</c> functionality with language
+    specific handling, but publicly available libraries address these
+    issues.</p>
+
+    <p>Another example is the accented characters, where the same
+    glyph has two different representations. The Swedish letter "ö" is
+    one example.  The Unicode standard has a code point for it, but
+    you can also write it as "o" followed by "U+0308" (Combining
+    Diaeresis, with the simplified meaning that the last letter is to
+    have "¨" above). They have the same glyph (user-perceived
+    character). They are for most purposes the same, but have different
+    representations. For example, MacOS X converts all filenames to
+    use Combining Diaeresis, while most other programs (including
+    Erlang) try to hide that by doing the opposite when, for example,
+    listing directories.  However it is done, it is usually important
+    to normalize such characters to avoid confusion.
+    </p>
 
     <p>The list of examples can be made long. One need a kind of knowledge that
       was not needed when programs only considered one or two languages. The
@@ -273,7 +280,7 @@
         them. In some cases functionality has been added to already
         existing interfaces (as the <seealso
         marker="stdlib:string"><c>string</c></seealso> module now can
-        handle lists with any code points). In some cases new
+        handle strings with any code points). In some cases new
         functionality or options have been added (as in the <seealso
         marker="stdlib:io"><c>io</c></seealso> module, the file
         handling, the <seealso
@@ -977,7 +984,7 @@ Eshell V5.10.1  (abort with ^G)
 
     <p>Fortunately, most textual data has been stored in lists and range
       checking has been sparse, so modules like <c>string</c> work well for
-      Unicode lists with little need for conversion or extension.</p>
+      Unicode strings with little need for conversion or extension.</p>
 
     <p>Some modules are, however, changed to be explicitly Unicode-aware. These
       modules include:</p>
@@ -1028,18 +1035,17 @@ Eshell V5.10.1  (abort with ^G)
           has extensive support for Unicode text.</p></item>
     </taglist>
 
-    <p>The <seealso marker="stdlib:string"><c>string</c></seealso> module works
-      perfectly for Unicode strings and ISO Latin-1 strings, except the
-      language-dependent functions
-      <seealso marker="stdlib:string#to_upper/1"><c>string:to_upper/1</c></seealso>
-      and
-      <seealso marker="stdlib:string#to_lower/1"><c>string:to_lower/1</c></seealso>,
-      which are only correct for the ISO Latin-1 character set. These two
-      functions can never function correctly for Unicode characters in their
-      current form, as there are language and locale issues as well as
-      multi-character mappings to consider when converting text between cases.
-      Converting case in an international environment is a large subject not
-      yet addressed in OTP.</p>
+    <p>The <seealso marker="stdlib:string"><c>string</c></seealso>
+    module works perfectly for Unicode strings and ISO Latin-1
+    strings, except the language-dependent functions <seealso
+    marker="stdlib:string#uppercase/1"><c>string:uppercase/1</c></seealso>
+    and <seealso
+    marker="stdlib:string#lowercase/1"><c>string:lowercase/1</c></seealso>.
+    These two functions can never function correctly for Unicode
+    characters in their current form, as there are language and locale
+    issues to consider when converting text between cases.  Converting
+    case in an international environment is a large subject not yet
+    addressed in OTP.</p>
   </section>
 
   <section>
diff --git a/lib/stdlib/src/string.erl b/lib/stdlib/src/string.erl
index c659db78bd..4fdfe99b66 100644
--- a/lib/stdlib/src/string.erl
+++ b/lib/stdlib/src/string.erl
@@ -17,22 +17,72 @@
 %% 
 %% %CopyrightEnd%
 %%
+%% A string library that works on grapheme clusters, with the exception
+%% of codepoints of class 'prepend' and non modern (or decomposed) Hangul.
+%% If these codepoints appear, functions like 'find/2' may return a string
+%% which starts inside a grapheme cluster.
+%% These exceptions are made because the codepoints classes are
+%% seldom used and require that we are able to look at previous codepoints in
+%% the stream, and are thus hard to implement efficiently.
+%%
+%% GC (grapheme cluster) implies that the length of string 'ß↑e̊' is 3 though
+%% it is represented by the codepoints [223,8593,101,778] or the
+%% utf8 binary <<195,159,226,134,145,101,204,138>>
+%%
+%% And that searching for strings or graphemes finds the correct positions:
+%%
+%% find("eeeee̊eee", "e̊") -> "e̊eee"
+%% find("1£4e̊abcdef", "e") -> "ef"
+%%
+%% Most functions expect all input to be normalized to one form,
+%% see unicode:characters_to_nfc and unicode:characters_to_nfd functions.
+%% When appending strings no checking is done to verify that the
+%% result is valid unicode strings.
+%%
+%% The functions may crash for invalid utf-8 input.
+%%
+%% Return value should be kept consistent when return type is
+%% unicode:chardata() i.e. binary input => binary output,
+%% list input => list output, and mixed input => mixed output
+%%
 -module(string).
 
--export([len/1,equal/2,concat/2,chr/2,rchr/2,str/2,rstr/2,
-	 span/2,cspan/2,substr/2,substr/3,tokens/2,chars/2,chars/3]).
+-export([is_empty/1, length/1, to_graphemes/1,
+         reverse/1,
+         equal/2, equal/3, equal/4,
+         slice/2, slice/3,
+         pad/2, pad/3, pad/4, trim/1, trim/2, trim/3, chomp/1,
+         take/2, take/3, take/4,
+         lexemes/2, nth_lexeme/3,
+         uppercase/1, lowercase/1, titlecase/1,casefold/1,
+         prefix/2,
+         split/2,split/3,replace/3,replace/4,
+         find/2,find/3,
+         next_codepoint/1, next_grapheme/1
+        ]).
+
+%% Old (will be deprecated) lists/string API kept for backwards compatibility
+-export([len/1, concat/2, % equal/2, (extended in the new api)
+         chr/2,rchr/2,str/2,rstr/2,
+	 span/2,cspan/2,substr/2,substr/3, tokens/2,
+         chars/2,chars/3]).
 -export([copies/2,words/1,words/2,strip/1,strip/2,strip/3,
 	 sub_word/2,sub_word/3,left/2,left/3,right/2,right/3,
 	 sub_string/2,sub_string/3,centre/2,centre/3, join/2]).
 -export([to_upper/1, to_lower/1]).
+%%
+-import(lists,[member/2]).
 
--import(lists,[reverse/1,member/2]).
+-compile({no_auto_import,[length/1]}).
 
-%%---------------------------------------------------------------------------
+-export_type([grapheme_cluster/0]).
 
-%%% BIFs
+-type grapheme_cluster() :: char() | [char()].
+-type direction() :: 'leading' | 'trailing'.
 
+%%% BIFs
 -export([to_float/1, to_integer/1]).
+-dialyzer({no_improper_lists, stack/2}).
 
 -spec to_float(String) -> {Float, Rest} | {error, Reason} when
       String :: string(),
@@ -54,6 +104,1180 @@ to_integer(_) ->
 
 %%% End of BIFs
 
+%% Check if string is the empty string
+-spec is_empty(String::unicode:chardata()) -> boolean().
+is_empty([]) -> true;
+is_empty(<<>>) -> true;
+is_empty([L|R]) -> is_empty(L) andalso is_empty(R);
+is_empty(_) -> false.
+
+%% Count the number of grapheme clusters in chardata
+-spec length(String::unicode:chardata()) -> non_neg_integer().
+length(CD) ->
+    length_1(unicode_util:gc(CD), 0).
+
+%% Convert a string to a list of grapheme clusters
+-spec to_graphemes(String::unicode:chardata()) -> [grapheme_cluster()].
+to_graphemes(CD0) ->
+    case unicode_util:gc(CD0) of
+        [GC|CD] -> [GC|to_graphemes(CD)];
+        [] -> []
+    end.
+
+%% Compare two strings and return a boolean; assumes that both inputs are
+%% normalized to the same form, see unicode:characters_to_nfX_xxx(..)
+-spec equal(A, B) -> boolean() when
+      A::unicode:chardata(),
+      B::unicode:chardata().
+equal(A,B) when is_binary(A), is_binary(B) ->
+    A =:= B;
+equal(A,B) ->
+    equal_1(A,B).
+
+%% Compare two strings and return a boolean; assumes that both inputs are
+%% normalized to the same form, see unicode:characters_to_nfX_xxx(..).
+%% Casefolds on the fly when IgnoreCase is true
+-spec equal(A, B, IgnoreCase) -> boolean() when
+      A::unicode:chardata(),
+      B::unicode:chardata(),
+      IgnoreCase :: boolean().
+equal(A, B, false) ->
+    equal(A,B);
+equal(A, B, true) ->
+    equal_nocase(A,B).
+
+%% Compare two strings and return a boolean;
+%% if specified, does casefolding and normalization on the fly
+-spec equal(A, B, IgnoreCase, Norm) -> boolean() when
+      A :: unicode:chardata(),
+      B :: unicode:chardata(),
+      IgnoreCase :: boolean(),
+      Norm :: 'none' | 'nfc' | 'nfd' | 'nfkc' | 'nfkd'.
+equal(A, B, Case, none) ->
+    equal(A,B,Case);
+equal(A, B, false, Norm) ->
+    equal_norm(A, B, Norm);
+equal(A, B, true, Norm) ->
+    equal_norm_nocase(A, B, Norm).
+
+%% Reverse grapheme clusters
+-spec reverse(String::unicode:chardata()) -> [grapheme_cluster()].
+reverse(CD) ->
+    reverse_1(CD, []).
+
+%% Skip the first Start grapheme clusters and return the rest of the string
+%% Note: counts grapheme clusters, not codepoints or bytes
+-spec slice(String, Start) -> Slice when
+      String::unicode:chardata(),
+      Start :: non_neg_integer(),
+      Slice :: unicode:chardata().
+slice(CD, N) when is_integer(N), N >= 0 ->
+    slice_l(CD, N, is_binary(CD)).
+
+-spec slice(String, Start, Length) -> Slice when
+      String::unicode:chardata(),
+      Start :: non_neg_integer(),
+      Length :: 'infinity' | non_neg_integer(),
+      Slice :: unicode:chardata().
+slice(CD, N, Length)
+  when is_integer(N), N >= 0, is_integer(Length), Length > 0 ->
+    slice_trail(slice_l(CD, N, is_binary(CD)), Length);
+slice(CD, N, infinity) ->
+    slice_l(CD, N, is_binary(CD));
+slice(CD, _, 0) ->
+    case is_binary(CD) of
+        true  -> <<>>;
+        false -> []
+    end.
+
+%% Pad a string to desired length
+-spec pad(String, Length) -> unicode:charlist() when
+      String ::unicode:chardata(),
+      Length :: integer().
+pad(CD, Length) ->
+    pad(CD, Length, trailing, $\s).
+
+-spec pad(String, Length, Dir) -> unicode:charlist() when
+      String ::unicode:chardata(),
+      Length :: integer(),
+      Dir :: direction() | 'both'.
+pad(CD, Length, Dir) ->
+    pad(CD, Length, Dir, $\s).
+
+-spec pad(String, Length, Dir, Char) -> unicode:charlist() when
+      String ::unicode:chardata(),
+      Length :: integer(),
+      Dir :: direction() | 'both',
+      Char :: grapheme_cluster().
+pad(CD, Length, leading, Char) when is_integer(Length) ->
+    Len = length(CD),
+    [lists:duplicate(max(0, Length-Len), Char), CD];
+pad(CD, Length, trailing, Char) when is_integer(Length) ->
+    Len = length(CD),
+    [CD|lists:duplicate(max(0, Length-Len), Char)];
+pad(CD, Length, both, Char) when is_integer(Length) ->
+    Len = length(CD),
+    Size = max(0, Length-Len),
+    Pre = lists:duplicate(Size div 2, Char),
+    Post = case Size rem 2 of
+               1 -> [Char];
+               _ -> []
+           end,
+    [Pre, CD, Pre|Post].
+
+%% Trim whitespace (trim/1,2) or the given Characters (trim/3) from String in direction Dir
+-spec trim(String) -> unicode:chardata() when
+      String :: unicode:chardata().
+trim(Str) ->
+    trim(Str, both, unicode_util:whitespace()).
+
+-spec trim(String, Dir) -> unicode:chardata() when
+      String :: unicode:chardata(),
+      Dir :: direction() | 'both'.
+trim(Str, Dir) ->
+    trim(Str, Dir, unicode_util:whitespace()).
+
+-spec trim(String, Dir, Characters) -> unicode:chardata() when
+      String :: unicode:chardata(),
+      Dir :: direction() | 'both',
+      Characters :: [grapheme_cluster()].
+trim(Str, _, []) -> Str;
+trim(Str, leading, Sep) when is_list(Sep) ->
+    trim_l(Str, search_pattern(Sep));
+trim(Str, trailing, Sep) when is_list(Sep) ->
+    trim_t(Str, 0, search_pattern(Sep));
+trim(Str, both, Sep0) when is_list(Sep0) ->
+    Sep = search_pattern(Sep0),
+    trim_t(trim_l(Str,Sep), 0, Sep).
+
+%% Delete trailing newlines or \r\n
+-spec chomp(String::unicode:chardata()) -> unicode:chardata().
+chomp(Str) ->
+    trim_t(Str,0, {[[$\r,$\n],$\n], [$\r,$\n], [<<$\r>>,<<$\n>>]}).
+
+%% Split String into two parts where the leading part consists of Characters
+-spec take(String, Characters) -> {Leading, Trailing} when
+      String::unicode:chardata(),
+      Characters::[grapheme_cluster()],
+      Leading::unicode:chardata(),
+      Trailing::unicode:chardata().
+take(Str, Sep) ->
+    take(Str, Sep, false, leading).
+-spec take(String, Characters, Complement) -> {Leading, Trailing} when
+      String::unicode:chardata(),
+      Characters::[grapheme_cluster()],
+      Complement::boolean(),
+      Leading::unicode:chardata(),
+      Trailing::unicode:chardata().
+take(Str, Sep, Complement) ->
+    take(Str, Sep, Complement, leading).
+-spec take(String, Characters, Complement, Dir) -> {Leading, Trailing} when
+      String::unicode:chardata(),
+      Characters::[grapheme_cluster()],
+      Complement::boolean(),
+      Dir::direction(),
+      Leading::unicode:chardata(),
+      Trailing::unicode:chardata().
+take(Str, [], Complement, Dir) ->
+    Empty = case is_binary(Str) of true -> <<>>; false -> [] end,
+    case {Complement,Dir} of
+        {false, leading} -> {Empty, Str};
+        {false, trailing} -> {Str, Empty};
+        {true,  leading} -> {Str, Empty};
+        {true,  trailing} -> {Empty, Str}
+    end;
+take(Str, Sep0, false, leading) ->
+    Sep = search_pattern(Sep0),
+    take_l(Str, Sep, []);
+take(Str, Sep0, true, leading) ->
+    Sep = search_pattern(Sep0),
+    take_lc(Str, Sep, []);
+take(Str, Sep0, false, trailing) ->
+    Sep = search_pattern(Sep0),
+    take_t(Str, 0, Sep);
+take(Str, Sep0, true, trailing) ->
+    Sep = search_pattern(Sep0),
+    take_tc(Str, 0, Sep).
+
+%% Uppercase all chars in Str
+-spec uppercase(String::unicode:chardata()) -> unicode:chardata().
+uppercase(CD) when is_list(CD) ->
+    uppercase_list(CD);
+uppercase(CD) when is_binary(CD) ->
+    uppercase_bin(CD,<<>>).
+
+%% Lowercase all chars in Str
+-spec lowercase(String::unicode:chardata()) -> unicode:chardata().
+lowercase(CD) when is_list(CD) ->
+    lowercase_list(CD);
+lowercase(CD) when is_binary(CD) ->
+    lowercase_bin(CD,<<>>).
+
+%% Make a titlecase of the first char in Str
+-spec titlecase(String::unicode:chardata()) -> unicode:chardata().
+titlecase(CD) when is_list(CD) ->
+    case unicode_util:titlecase(CD) of
+        [GC|Tail] -> append(GC,Tail);
+        Empty -> Empty
+    end;
+titlecase(CD) when is_binary(CD) ->
+    case unicode_util:titlecase(CD) of
+        [CP|Chars] when is_integer(CP) -> <<CP/utf8,Chars/binary>>;
+        [CPs|Chars] ->
+            << << <<CP/utf8>> || CP <- CPs>>/binary, Chars/binary>>;
+        [] -> <<>>
+    end.
+
+%% Casefold Str into a comparable string; the result should be used for equality tests only
+-spec casefold(String::unicode:chardata()) -> unicode:chardata().
+casefold(CD) when is_list(CD) ->
+    casefold_list(CD);
+casefold(CD) when is_binary(CD) ->
+    casefold_bin(CD,<<>>).
+
+%% Return the remaining string with prefix removed or else nomatch
+-spec prefix(String::unicode:chardata(), Prefix::unicode:chardata()) ->
+                    'nomatch' | unicode:chardata().
+prefix(Str, []) -> Str;
+prefix(Str, Prefix0) ->
+    Prefix = unicode:characters_to_list(Prefix0),
+    case prefix_1(Str, Prefix) of
+        [] when is_binary(Str) -> <<>>;
+        Res -> Res
+    end.
+
+%% split String with the first occurrence of SearchPattern, return list of splits
+-spec split(String, SearchPattern) -> [unicode:chardata()] when
+      String :: unicode:chardata(),
+      SearchPattern :: unicode:chardata().
+split(String, SearchPattern) ->
+    split(String, SearchPattern, leading).
+
+%% split String with SearchPattern, return list of splits
+-spec split(String, SearchPattern, Where) -> [unicode:chardata()] when
+      String :: unicode:chardata(),
+      SearchPattern :: unicode:chardata(),
+      Where :: direction() | 'all'.
+split(String, SearchPattern, Where) ->
+    case is_empty(SearchPattern) of
+        true -> [String];
+        false ->
+            SearchPatternCPs = unicode:characters_to_list(SearchPattern),
+            case split_1(String, SearchPatternCPs, 0, Where, [], []) of
+                {_Curr, []} -> [String];
+                {_Curr, Acc} when Where =:= trailing -> Acc;
+                {Curr, Acc} when Where =:= all -> lists:reverse([Curr|Acc]);
+                Acc when is_list(Acc) -> Acc
+            end
+    end.
+
+%% Replace the first SearchPattern in String with Replacement
+-spec replace(String, SearchPattern, Replacement) ->
+                     [unicode:chardata()] when
+      String :: unicode:chardata(),
+      SearchPattern :: unicode:chardata(),
+      Replacement :: unicode:chardata().
+replace(String, SearchPattern, Replacement) ->
+    lists:join(Replacement, split(String, SearchPattern)).
+
+%% Replace Where SearchPattern in String with Replacement
+-spec replace(String, SearchPattern, Replacement, Where) ->
+                     [unicode:chardata()] when
+      String :: unicode:chardata(),
+      SearchPattern :: unicode:chardata(),
+      Replacement :: unicode:chardata(),
+      Where :: direction() | 'all'.
+replace(String, SearchPattern, Replacement, Where) ->
+    lists:join(Replacement, split(String, SearchPattern, Where)).
+
+%% Split Str into a list of chardata separated by one of the grapheme
+%% clusters in Seps
+-spec lexemes(String::unicode:chardata(),
+              SeparatorList::[grapheme_cluster()]) ->
+                     [unicode:chardata()].
+lexemes([], _) -> [];
+lexemes(Str, Seps0) when is_list(Seps0) ->
+    Seps = search_pattern(Seps0),
+    lexemes_m(Str, Seps, []).
+
+-spec nth_lexeme(String, N, SeparatorList) -> unicode:chardata() when
+      String::unicode:chardata(),
+      N::non_neg_integer(),
+      SeparatorList::[grapheme_cluster()].
+
+nth_lexeme(Str, 1, []) -> Str;
+nth_lexeme(Str, N, Seps0) when is_list(Seps0), is_integer(N), N > 0 ->
+    Seps = search_pattern(Seps0),
+    nth_lexeme_m(Str, Seps, N).
+
+%% find first SearchPattern in String return rest of string
+-spec find(String, SearchPattern) -> unicode:chardata() | 'nomatch' when
+      String::unicode:chardata(),
+      SearchPattern::unicode:chardata().
+find(String, SearchPattern) ->
+    find(String, SearchPattern, leading).
+
+%% find SearchPattern in String (search in Dir direction) return rest of string
+-spec find(String, SearchPattern, Dir) -> unicode:chardata() | 'nomatch' when
+      String::unicode:chardata(),
+      SearchPattern::unicode:chardata(),
+      Dir::direction().
+find(String, "", _) -> String;
+find(String, <<>>, _) -> String;
+find(String, SearchPattern, leading) ->
+    find_l(String, unicode:characters_to_list(SearchPattern));
+find(String, SearchPattern, trailing) ->
+    find_r(String, unicode:characters_to_list(SearchPattern), nomatch).
+
+%% Fetch first grapheme cluster and return rest in tail
+-spec next_grapheme(String::unicode:chardata()) ->
+                           maybe_improper_list(grapheme_cluster(),unicode:chardata()).
+next_grapheme(CD) -> unicode_util:gc(CD).
+
+%% Fetch first codepoint and return rest in tail
+-spec next_codepoint(String::unicode:chardata()) ->
+                            maybe_improper_list(char(),unicode:chardata()).
+next_codepoint(CD) -> unicode_util:cp(CD).
+
+%% Internals
+
+length_1([_|Rest], N) ->
+    length_1(unicode_util:gc(Rest), N+1);
+length_1([], N) ->
+    N.
+
+equal_1([A|AR], [B|BR]) when is_integer(A), is_integer(B) ->
+    A =:= B andalso equal_1(AR, BR);
+equal_1([], BR) -> is_empty(BR);
+equal_1(A0,B0) ->
+    case {unicode_util:cp(A0), unicode_util:cp(B0)} of
+        {[CP|A],[CP|B]} -> equal_1(A,B);
+        {[], []} -> true;
+        _ -> false
+    end.
+
+equal_nocase(A, A) -> true;
+equal_nocase(A0, B0) ->
+    case {unicode_util:cp(unicode_util:casefold(A0)),
+          unicode_util:cp(unicode_util:casefold(B0))} of
+        {[CP|A],[CP|B]} -> equal_nocase(A,B);
+        {[], []} -> true;
+        _ -> false
+    end.
+
+equal_norm(A, A, _Norm) -> true;
+equal_norm(A0, B0, Norm) ->
+    case {unicode_util:cp(unicode_util:Norm(A0)),
+          unicode_util:cp(unicode_util:Norm(B0))} of
+        {[CP|A],[CP|B]} -> equal_norm(A,B, Norm);
+        {[], []} -> true;
+        _ -> false
+    end.
+
+equal_norm_nocase(A, A, _Norm) -> true;
+equal_norm_nocase(A0, B0, Norm) ->
+    case {unicode_util:cp(unicode_util:casefold(unicode_util:Norm(A0))),
+          unicode_util:cp(unicode_util:casefold(unicode_util:Norm(B0)))} of
+        {[CP|A],[CP|B]} -> equal_norm_nocase(A,B, Norm);
+        {[], []} -> true;
+        _ -> false
+    end.
+
+reverse_1(CD, Acc) ->
+    case unicode_util:gc(CD) of
+        [GC|Rest] -> reverse_1(Rest, [GC|Acc]);
+        [] -> Acc
+    end.
+
+slice_l(CD, N, Binary) when N > 0 ->
+    case unicode_util:gc(CD) of
+        [_|Cont] -> slice_l(Cont, N-1, Binary);
+        [] when Binary -> <<>>;
+        [] -> []
+    end;
+slice_l(Cont, 0, Binary) ->
+    case is_empty(Cont) of
+        true when Binary -> <<>>;
+        _ -> Cont
+    end.
+
+slice_trail(CD, N) when is_list(CD) ->
+    slice_list(CD, N);
+slice_trail(CD, N) when is_binary(CD) ->
+    slice_bin(CD, N, CD).
+
+slice_list(CD, N) when N > 0 ->
+    case unicode_util:gc(CD) of
+        [GC|Cont] -> append(GC, slice_list(Cont, N-1));
+        [] -> []
+    end;
+slice_list(_, 0) ->
+    [].
+
+slice_bin(CD, N, Orig) when N > 0 ->
+    case unicode_util:gc(CD) of
+        [_|Cont] -> slice_bin(Cont, N-1, Orig);
+        [] -> Orig
+    end;
+slice_bin([], 0, Orig) ->
+    Orig;
+slice_bin(CD, 0, Orig) ->
+    Sz = byte_size(Orig) - byte_size(CD),
+    <<Keep:Sz/binary, _/binary>> = Orig,
+    Keep.
+
+uppercase_list(CPs0) ->
+    case unicode_util:uppercase(CPs0) of
+        [Char|CPs] -> append(Char,uppercase_list(CPs));
+        [] -> []
+    end.
+
+uppercase_bin(CPs0, Acc) ->
+    case unicode_util:uppercase(CPs0) of
+        [Char|CPs] when is_integer(Char) ->
+            uppercase_bin(CPs, <<Acc/binary, Char/utf8>>);
+        [Chars|CPs] ->
+            uppercase_bin(CPs, <<Acc/binary,
+                                 << <<CP/utf8>> || CP <- Chars>>/binary >>);
+        [] -> Acc
+    end.
+
+lowercase_list(CPs0) ->
+    case unicode_util:lowercase(CPs0) of
+        [Char|CPs] -> append(Char,lowercase_list(CPs));
+        [] -> []
+    end.
+
+lowercase_bin(CPs0, Acc) ->
+    case unicode_util:lowercase(CPs0) of
+        [Char|CPs] when is_integer(Char) ->
+            lowercase_bin(CPs, <<Acc/binary, Char/utf8>>);
+        [Chars|CPs] ->
+            lowercase_bin(CPs, <<Acc/binary,
+                                 << <<CP/utf8>> || CP <- Chars>>/binary >>);
+        [] -> Acc
+    end.
+
+casefold_list(CPs0) ->
+    case unicode_util:casefold(CPs0) of
+        [Char|CPs] -> append(Char, casefold_list(CPs));
+        [] -> []
+    end.
+
+casefold_bin(CPs0, Acc) ->
+    case unicode_util:casefold(CPs0) of
+        [Char|CPs] when is_integer(Char) ->
+            casefold_bin(CPs, <<Acc/binary, Char/utf8>>);
+        [Chars|CPs] ->
+            casefold_bin(CPs, <<Acc/binary,
+                                << <<CP/utf8>> || CP <- Chars>>/binary >>);
+        [] -> Acc
+    end.
+
+
+trim_l([Bin|Cont0], Sep) when is_binary(Bin) ->
+    case bin_search_inv(Bin, Cont0, Sep) of
+        {nomatch, Cont} -> trim_l(Cont, Sep);
+        Keep -> Keep
+    end;
+trim_l(Str, {GCs, _, _}=Sep) when is_list(Str) ->
+    case unicode_util:gc(Str) of
+        [C|Cs] ->
+            case lists:member(C, GCs) of
+                true -> trim_l(Cs, Sep);
+                false -> Str
+            end;
+        [] -> []
+    end;
+trim_l(Bin, Sep) when is_binary(Bin) ->
+    case bin_search_inv(Bin, [], Sep) of
+        {nomatch,_} -> <<>>;
+        [Keep] -> Keep
+    end.
+
+trim_t([Bin|Cont0], N, Sep) when is_binary(Bin) ->
+    <<_:N/binary, Rest/binary>> = Bin,
+    case bin_search(Rest, Cont0, Sep) of
+        {nomatch,_} ->
+            stack(Bin, trim_t(Cont0, 0, Sep));
+        [SepStart|Cont1] ->
+            case bin_search_inv(SepStart, Cont1, Sep) of
+                {nomatch, Cont} ->
+                    Tail = trim_t(Cont, 0, Sep),
+                    case is_empty(Tail) of
+                        true ->
+                            KeepSz = byte_size(Bin) - byte_size(SepStart),
+                            <<Keep:KeepSz/binary, _/binary>> = Bin,
+                            Keep;
+                        false ->
+                            Used = cp_prefix(Cont0, Cont),
+                            stack(Bin, stack(Used, Tail))
+                    end;
+                [NonSep|Cont] when is_binary(NonSep) ->
+                    KeepSz = byte_size(Bin) - byte_size(NonSep),
+                    trim_t([Bin|Cont], KeepSz, Sep)
+            end
+    end;
+trim_t(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) ->
+    case unicode_util:cp(Str) of
+        [CP|Cs] ->
+            case lists:member(CP, CPs) of
+                true ->
+                    [GC|Cs1] = unicode_util:gc(Str),
+                    case lists:member(GC, GCs) of
+                        true ->
+                            Tail = trim_t(Cs1, 0, Sep),
+                            case is_empty(Tail) of
+                                true -> [];
+                                false -> append(GC,Tail)
+                            end;
+                        false ->
+                            append(GC,trim_t(Cs1, 0, Sep))
+                    end;
+                false ->
+                    append(CP,trim_t(Cs, 0, Sep))
+            end;
+        [] -> []
+    end;
+trim_t(Bin, N, Sep) when is_binary(Bin) ->
+    <<_:N/binary, Rest/binary>> = Bin,
+    case bin_search(Rest, Sep) of
+        {nomatch,_} -> Bin;
+        [SepStart] ->
+            case bin_search_inv(SepStart, [], Sep) of
+                {nomatch,_} ->
+                    KeepSz = byte_size(Bin) - byte_size(SepStart),
+                    <<Keep:KeepSz/binary, _/binary>> = Bin,
+                    Keep;
+                [NonSep] ->
+                    KeepSz = byte_size(Bin) - byte_size(NonSep),
+                    trim_t(Bin, KeepSz, Sep)
+            end
+    end.
+
+%% take_l(Str, Sep, Acc) -> {Taken, Rest}
+%%  Consumes leading grapheme clusters of Str for as long as they are
+%%  members of the separator set Sep = {GCs, CPs, BinPattern}.
+%%  Acc holds what has been taken so far, in reverse order.
+take_l([<<_/binary>> = Head|Tail0], Sep, Acc) ->
+    case bin_search_inv(Head, Tail0, Sep) of
+        {nomatch, Tail} ->
+            %% No non-separator was found in Head; cp_prefix/2 recovers
+            %% the part of Tail0 that the search consumed as well.
+            Consumed = cp_prefix(Tail0, Tail),
+            Chunk = unicode:characters_to_binary([Head|Consumed]),
+            take_l(Tail, Sep, [Chunk|Acc]);
+        [<<_/binary>> = Suffix|_] = Remaining ->
+            %% Suffix is the part of Head starting at the first
+            %% non-separator, so the bytes in front of it were taken.
+            TakenSz = byte_size(Head) - byte_size(Suffix),
+            <<Taken:TakenSz/binary, _/binary>> = Head,
+            {btoken(Taken, Acc), Remaining}
+    end;
+take_l(Str, {GCs, _, _} = Sep, Acc) when is_list(Str) ->
+    case unicode_util:gc(Str) of
+        [] ->
+            {rev(Acc), []};
+        [GC|Rest] ->
+            case lists:member(GC, GCs) of
+                false -> {rev(Acc), Str};
+                true -> take_l(Rest, Sep, append(rev(GC), Acc))
+            end
+    end;
+take_l(Bin, Sep, Acc) when is_binary(Bin) ->
+    case bin_search_inv(Bin, [], Sep) of
+        [Remaining] ->
+            TakenSz = byte_size(Bin) - byte_size(Remaining),
+            <<Taken:TakenSz/binary, _/binary>> = Bin,
+            {btoken(Taken, Acc), Remaining};
+        {nomatch, _} ->
+            %% The whole binary consists of separators.
+            {btoken(Bin, Acc), <<>>}
+    end.
+
+%% take_lc(Str, Sep, Acc) -> {Taken, Rest}
+%%  Complement of take_l/3: consumes leading grapheme clusters of Str for
+%%  as long as they are NOT members of the separator set
+%%  Sep = {GCs, CPs, BinPattern}. Acc holds what has been taken so far,
+%%  in reverse order.
+take_lc([<<_/binary>> = Head|Tail0], Sep, Acc) ->
+    case bin_search(Head, Tail0, Sep) of
+        {nomatch, Tail} ->
+            %% No separator found; cp_prefix/2 recovers the part of Tail0
+            %% that the search consumed as well.
+            Consumed = cp_prefix(Tail0, Tail),
+            Chunk = unicode:characters_to_binary([Head|Consumed]),
+            take_lc(Tail, Sep, [Chunk|Acc]);
+        [<<_/binary>> = Suffix|_] = Remaining ->
+            %% Suffix is the part of Head starting at the separator.
+            TakenSz = byte_size(Head) - byte_size(Suffix),
+            <<Taken:TakenSz/binary, _/binary>> = Head,
+            {btoken(Taken, Acc), Remaining}
+    end;
+take_lc(Str, {GCs, _, _} = Sep, Acc) when is_list(Str) ->
+    case unicode_util:gc(Str) of
+        [] ->
+            {rev(Acc), []};
+        [GC|Rest] ->
+            case lists:member(GC, GCs) of
+                true -> {rev(Acc), Str};
+                false -> take_lc(Rest, Sep, append(rev(GC), Acc))
+            end
+    end;
+take_lc(Bin, Sep, Acc) when is_binary(Bin) ->
+    case bin_search(Bin, [], Sep) of
+        [Remaining] ->
+            TakenSz = byte_size(Bin) - byte_size(Remaining),
+            <<Taken:TakenSz/binary, _/binary>> = Bin,
+            {btoken(Taken, Acc), Remaining};
+        {nomatch, _} ->
+            %% No separator anywhere in the binary.
+            {btoken(Bin, Acc), <<>>}
+    end.
+
+%% take_t(Str, N, Sep) -> {Head, Tail}
+%%  Splits Str so that Tail is the trailing run of grapheme clusters that
+%%  are members of the separator set Sep = {GCs, CPs, BinPattern} and Head
+%%  is everything in front of that run.
+%%  N is the byte offset into the leading binary at which the search is
+%%  resumed; the list clause only accepts N = 0.
+take_t([Bin|Cont0], N, Sep) when is_binary(Bin) ->
+    <<_:N/binary, Rest/binary>> = Bin,
+    case bin_search(Rest, Cont0, Sep) of
+        {nomatch,Cont} ->
+            %% No separator from offset N on: Bin and the consumed part
+            %% of Cont0 go to Head, continue with the remaining data.
+            Used = cp_prefix(Cont0, Cont),
+            {Head, Tail} = take_t(Cont, 0, Sep),
+            {stack(unicode:characters_to_binary([Bin|Used]), Head), Tail};
+        [SepStart|Cont1] ->
+            %% A separator starts at SepStart; find out where the run of
+            %% separators ends.
+            case bin_search_inv(SepStart, Cont1, Sep) of
+                {nomatch, Cont} ->
+                    {Head, Tail} = take_t(Cont, 0, Sep),
+                    Used = cp_prefix(Cont0, Cont),
+                    %% If the recursion returned all of Cont as Tail, the
+                    %% separators found here are part of the trailing run.
+                    case equal(Tail, Cont) of
+                        true ->
+                            KeepSz = byte_size(Bin) - byte_size(SepStart),
+                            <<Keep:KeepSz/binary, End/binary>> = Bin,
+                            {stack(Keep,Head), stack(stack(End,Used),Tail)};
+                        false ->
+                            {stack(unicode:characters_to_binary([Bin|Used]),Head), Tail}
+                    end;
+                [NonSep|Cont] when is_binary(NonSep) ->
+                    %% The run ended inside Bin: retry from the offset of
+                    %% the first non-separator.
+                    KeepSz = byte_size(Bin) - byte_size(NonSep),
+                    take_t([Bin|Cont], KeepSz, Sep)
+            end
+    end;
+take_t(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) ->
+    case unicode_util:cp(Str) of
+        [CP|Cs] ->
+            %% Cheap codepoint check first; the grapheme cluster is only
+            %% fetched when its first codepoint can start a separator.
+            case lists:member(CP, CPs) of
+                true ->
+                    [GC|Cs1] = unicode_util:gc(Str),
+                    case lists:member(GC, GCs) of
+                        true ->
+                            {Head, Tail} = take_t(Cs1, 0, Sep),
+                            %% Tail equal to Cs1 means everything after GC
+                            %% is trailing separators, so GC joins Tail.
+                            case equal(Tail, Cs1) of
+                                true -> {Head, append(GC,Tail)};
+                                false -> {append(GC,Head), Tail}
+                            end;
+                        false ->
+                            %% Only the first codepoint matched: step one
+                            %% codepoint (not one cluster) forward.
+                            {Head, Tail} = take_t(Cs, 0, Sep),
+                            {append(CP,Head), Tail}
+                    end;
+                false ->
+                    {Head, Tail} = take_t(Cs, 0, Sep),
+                    {append(CP,Head), Tail}
+            end;
+        [] -> {[],[]}
+    end;
+take_t(Bin, N, Sep) when is_binary(Bin) ->
+    <<_:N/binary, Rest/binary>> = Bin,
+    case bin_search(Rest, Sep) of
+        {nomatch,_} -> {Bin, <<>>};
+        [SepStart] ->
+            case bin_search_inv(SepStart, [], Sep) of
+                {nomatch,_} ->
+                    %% Only separators from SepStart to the end: split there.
+                    KeepSz = byte_size(Bin) - byte_size(SepStart),
+                    <<Before:KeepSz/binary, End/binary>> = Bin,
+                    {Before, End};
+                [NonSep] ->
+                    %% Separators were followed by other data: search again
+                    %% from the first non-separator.
+                    KeepSz = byte_size(Bin) - byte_size(NonSep),
+                    take_t(Bin, KeepSz, Sep)
+            end
+    end.
+
+%% take_tc(Str, N, Sep) -> {Head, Tail}
+%%  Complement of take_t/3: Tail is the trailing run of grapheme clusters
+%%  that are NOT members of the separator set Sep = {GCs, CPs, BinPattern}
+%%  and Head is everything in front of that run.
+%%  N is the byte offset into the leading binary at which the search is
+%%  resumed; the list clause only accepts N = 0.
+take_tc([Bin|Cont0], N, Sep) when is_binary(Bin) ->
+    <<_:N/binary, Rest/binary>> = Bin,
+    case bin_search_inv(Rest, Cont0, Sep) of
+        {nomatch,Cont} ->
+            %% Only separators from offset N on: Bin and the consumed part
+            %% of Cont0 go to Head, continue with the remaining data.
+            Used = cp_prefix(Cont0, Cont),
+            {Head, Tail} = take_tc(Cont, 0, Sep),
+            {stack(unicode:characters_to_binary([Bin|Used]), Head), Tail};
+        [SepStart|Cont1] ->
+            %% NOTE: despite its name, SepStart is where a run of
+            %% non-separators starts; look for the next separator.
+            case bin_search(SepStart, Cont1, Sep) of
+                {nomatch, Cont} ->
+                    {Head, Tail} = take_tc(Cont, 0, Sep),
+                    Used = cp_prefix(Cont0, Cont),
+                    %% If the recursion returned all of Cont as Tail, the
+                    %% run found here is part of the trailing run.
+                    case equal(Tail, Cont) of
+                        true ->
+                            KeepSz = byte_size(Bin) - byte_size(SepStart),
+                            <<Keep:KeepSz/binary, End/binary>> = Bin,
+                            {stack(Keep,Head), stack(stack(End,Used),Tail)};
+                        false ->
+                            {stack(unicode:characters_to_binary([Bin|Used]),Head), Tail}
+                    end;
+                [NonSep|Cont] when is_binary(NonSep) ->
+                    %% A separator follows inside Bin (NonSep starts at it):
+                    %% retry from that offset.
+                    KeepSz = byte_size(Bin) - byte_size(NonSep),
+                    take_tc([Bin|Cont], KeepSz, Sep)
+            end
+    end;
+take_tc(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) ->
+    case unicode_util:cp(Str) of
+        [CP|Cs] ->
+            %% Cheap codepoint check first; the grapheme cluster is only
+            %% fetched when its first codepoint can start a separator.
+            case lists:member(CP, CPs) of
+                true ->
+                    [GC|Cs1] = unicode_util:gc(Str),
+                    case lists:member(GC, GCs) of
+                        false ->
+                            {Head, Tail} = take_tc(Cs1, 0, Sep),
+                            %% Tail equal to Cs1 means everything after GC
+                            %% is non-separators, so GC joins Tail.
+                            case equal(Tail, Cs1) of
+                                true -> {Head, append(GC,Tail)};
+                                false -> {append(GC,Head), Tail}
+                            end;
+                        true ->
+                            %% A separator always belongs to Head.
+                            {Head, Tail} = take_tc(Cs1, 0, Sep),
+                            {append(GC,Head), Tail}
+                    end;
+                false ->
+                    {Head, Tail} = take_tc(Cs, 0, Sep),
+                    case equal(Tail, Cs) of
+                        true  -> {Head, append(CP,Tail)};
+                        false -> {append(CP,Head), Tail}
+                    end
+            end;
+        [] -> {[],[]}
+    end;
+take_tc(Bin, N, Sep) when is_binary(Bin) ->
+    <<_:N/binary, Rest/binary>> = Bin,
+    case bin_search_inv(Rest, [], Sep) of
+        {nomatch,_} -> {Bin, <<>>};
+        [SepStart] ->
+            case bin_search(SepStart, [], Sep) of
+                {nomatch,_} ->
+                    %% No separator from SepStart to the end: split there.
+                    KeepSz = byte_size(Bin) - byte_size(SepStart),
+                    <<Before:KeepSz/binary, End/binary>> = Bin,
+                    {Before, End};
+                [NonSep] ->
+                    %% A separator follows: search again from that offset.
+                    KeepSz = byte_size(Bin) - byte_size(NonSep),
+                    take_tc(Bin, KeepSz, Sep)
+            end
+    end.
+
+%% prefix_1(Str, Needle) -> Rest | nomatch
+%%  Returns what is left of Str once Needle has been matched at its start.
+%%  The last element of Needle is compared against a whole grapheme
+%%  cluster of Str, all earlier elements against single codepoints.
+prefix_1(Remaining, []) ->
+    Remaining;
+prefix_1(Str, [_Last] = Needle) ->
+    prefix_2(unicode_util:gc(Str), Needle);
+prefix_1(Str, Needle) ->
+    prefix_2(unicode_util:cp(Str), Needle).
+
+%% prefix_2(Split, Needle) -> Rest | nomatch
+%%  Split is the result of unicode_util:cp/1 or unicode_util:gc/1 on the
+%%  string. Continues in prefix_1/2 when its head is identical to the head
+%%  of Needle; any other input yields nomatch.
+prefix_2([Same|StrRest], [Same|NeedleRest]) ->
+    prefix_1(StrRest, NeedleRest);
+prefix_2(_Other, _Needle) ->
+    nomatch.
+
+%% split_1(Str, Needle, Start, Where, Curr, Acc)
+%%  Searches Str for Needle (a non-empty list, see the list and binary
+%%  clauses) and splits around it according to Where:
+%%    leading  - returns [Before, After] for the first match;
+%%    trailing - keeps searching and remembers the latest match in Acc;
+%%    all      - pushes every piece onto Acc and continues after the match.
+%%  Start is the byte offset into the leading binary where searching
+%%  resumes. Curr holds the pieces of the current part in reverse order.
+%%  When the input is exhausted {rev(Curr), Acc} is returned; Acc is in
+%%  reverse order for 'all' (presumably reassembled by the caller - verify).
+split_1([Bin|Cont0], Needle, Start, Where, Curr0, Acc)
+  when is_binary(Bin) ->
+    case bin_search_str(Bin, Start, Cont0, Needle) of
+        {nomatch,Sz,Cont} ->
+            %% Keep the first Sz bytes of Bin as part of the current piece.
+            <<Keep:Sz/binary, _/binary>> = Bin,
+            split_1(Cont, Needle, 0, Where, [Keep|Curr0], Acc);
+        {Before, [Cs0|Cont], After} ->
+            Curr = add_non_empty(Before,Curr0),
+            case Where of
+                leading ->
+                    [rev(Curr),After];
+                trailing ->
+                    %% Step one codepoint past the start of this match and
+                    %% search again; Curr0 is kept since Before is a prefix
+                    %% of Bin that a later result will contain anyway.
+                    <<_/utf8, Cs/binary>> = Cs0,
+                    Next = byte_size(Bin) - byte_size(Cs),
+                    split_1([Bin|Cont], Needle, Next, Where,
+                            Curr0, [rev(Curr),After]);
+                all ->
+                    split_1(After, Needle, 0, Where, [], [rev(Curr)|Acc])
+            end
+    end;
+split_1(Cs0, [C|_]=Needle, _, Where, Curr, Acc) when is_list(Cs0) ->
+    case unicode_util:cp(Cs0) of
+        [C|Cs] ->
+            %% First codepoint of Needle found: check the whole Needle.
+            case prefix_1(Cs0, Needle) of
+                nomatch -> split_1(Cs, Needle, 0, Where, append(C,Curr), Acc);
+                Rest when Where =:= leading ->
+                    [rev(Curr), Rest];
+                Rest when Where =:= trailing ->
+                    split_1(Cs, Needle, 0, Where, [C|Curr], [rev(Curr), Rest]);
+                Rest when Where =:= all ->
+                    split_1(Rest, Needle, 0, Where, [], [rev(Curr)|Acc])
+            end;
+        [Other|Cs] ->
+            split_1(Cs, Needle, 0, Where, append(Other,Curr), Acc);
+        [] ->
+            {rev(Curr), Acc}
+    end;
+split_1(Bin, [_C|_]=Needle, Start, Where, Curr0, Acc) ->
+    case bin_search_str(Bin, Start, [], Needle) of
+        {nomatch,_,_} ->
+            %% Nothing more to split: what follows offset Start is the
+            %% final piece.
+            <<_:Start/binary, Keep/binary>> = Bin,
+            {rev([Keep|Curr0]), Acc};
+        {Before, [Cs0], After} ->
+            case Where of
+                leading ->
+                    [rev([Before|Curr0]),After];
+                trailing ->
+                    %% Step one codepoint past the start of this match and
+                    %% search again for a later one.
+                    <<_/utf8, Cs/binary>> = Cs0,
+                    Next = byte_size(Bin) - byte_size(Cs),
+                    split_1(Bin, Needle, Next, Where, Curr0,
+                            [btoken(Before,Curr0),After]);
+                all ->
+                    %% Before counts from the start of Bin, so drop the
+                    %% first Start bytes that earlier pieces already cover.
+                    Next = byte_size(Bin) - byte_size(After),
+                    <<_:Start/binary, Keep/binary>> = Before,
+                    Curr = [Keep|Curr0],
+                    split_1(Bin, Needle, Next, Where, [], [rev(Curr)|Acc])
+            end
+    end.
+
+%% lexemes_m(Str, Seps, Lexemes) -> [Lexeme]
+%%  Collects the lexemes of Str, i.e. the pieces found between grapheme
+%%  clusters that are members of Seps = {GCs, CPs, BinPattern}. Leading
+%%  separators are skipped and each lexeme is extracted by lexeme_pick/3.
+%%  Lexemes accumulates the result in reverse order.
+lexemes_m([<<_/binary>> = Head|Tail0], Seps, Lexemes) ->
+    case bin_search_inv(Head, Tail0, Seps) of
+        {nomatch, Tail} ->
+            lexemes_m(Tail, Seps, Lexemes);
+        Start ->
+            {Lexeme, After} = lexeme_pick(Start, Seps, []),
+            lexemes_m(After, Seps, [Lexeme|Lexemes])
+    end;
+lexemes_m(Str, {GCs, _, _} = Seps, Lexemes) when is_list(Str) ->
+    case unicode_util:gc(Str) of
+        [] ->
+            lists:reverse(Lexemes);
+        [GC|Rest] ->
+            case lists:member(GC, GCs) of
+                false ->
+                    {Lexeme, After} = lexeme_pick(Str, Seps, []),
+                    lexemes_m(After, Seps, [Lexeme|Lexemes]);
+                true ->
+                    lexemes_m(Rest, Seps, Lexemes)
+            end
+    end;
+lexemes_m(Bin, Seps, Lexemes) when is_binary(Bin) ->
+    case bin_search_inv(Bin, [], Seps) of
+        [Start] ->
+            {Lexeme, After} = lexeme_pick(Start, Seps, []),
+            lexemes_m(After, Seps, add_non_empty(Lexeme, Lexemes));
+        {nomatch, _} ->
+            lists:reverse(Lexemes)
+    end.
+
+%% lexeme_pick(Str, Seps, Tkn) -> {Lexeme, Rest}
+%%  Reads one lexeme from the start of Str, stopping at the first grapheme
+%%  cluster that is a member of Seps = {GCs, CPs, BinPattern} or at the end
+%%  of the input. Tkn holds the lexeme built so far, in reverse order.
+%%  Note that the first clause returns Rest without the separator while
+%%  the other clauses return Rest starting at the separator.
+lexeme_pick([CP|Tail] = Str, {GCs, CPs, _} = Seps, Tkn) when is_integer(CP) ->
+    case lists:member(CP, CPs) of
+        false ->
+            lexeme_pick(Tail, Seps, [CP|Tkn]);
+        true ->
+            %% The codepoint can start a separator: compare the cluster.
+            [GC|AfterGC] = unicode_util:gc(Str),
+            case lists:member(GC, GCs) of
+                false -> lexeme_pick(AfterGC, Seps, append(rev(GC), Tkn));
+                true -> {rev(Tkn), AfterGC}
+            end
+    end;
+lexeme_pick([<<_/binary>> = Head|Tail0], Seps, Tkn) ->
+    case bin_search(Head, Tail0, Seps) of
+        {nomatch, _} ->
+            lexeme_pick(Tail0, Seps, [Head|Tkn]);
+        [AtSep|_] = Rest ->
+            LexemeSz = byte_size(Head) - byte_size(AtSep),
+            <<Lexeme:LexemeSz/binary, _/binary>> = Head,
+            {btoken(Lexeme, Tkn), Rest}
+    end;
+lexeme_pick(Str, {GCs, CPs, _} = Seps, Tkn) when is_list(Str) ->
+    case unicode_util:cp(Str) of
+        [] ->
+            {rev(Tkn), []};
+        [CP|Tail] ->
+            case lists:member(CP, CPs) of
+                false ->
+                    lexeme_pick(Tail, Seps, append(CP, Tkn));
+                true ->
+                    [GC|AfterGC] = unicode_util:gc(Str),
+                    case lists:member(GC, GCs) of
+                        false -> lexeme_pick(AfterGC, Seps, append(rev(GC), Tkn));
+                        true -> {rev(Tkn), Str}
+                    end
+            end
+    end;
+lexeme_pick(Bin, Seps, Tkn) when is_binary(Bin) ->
+    case bin_search(Bin, Seps) of
+        [AtSep] ->
+            LexemeSz = byte_size(Bin) - byte_size(AtSep),
+            <<Lexeme:LexemeSz/binary, _/binary>> = Bin,
+            {btoken(Lexeme, Tkn), AtSep};
+        {nomatch, _} ->
+            {btoken(Bin, Tkn), []}
+    end.
+
+%% nth_lexeme_m(Str, Seps, N) -> Lexeme
+%%  Returns lexeme number N (counted from 1) of Str, where lexemes are
+%%  delimited by members of Seps = {GCs, CPs, BinPattern}. Lexemes before
+%%  the wanted one are passed over with lexeme_skip/2. When the input ends
+%%  first, [] is returned for list input and <<>> for binary input.
+nth_lexeme_m([<<_/binary>> = Head|Tail0], Seps, N) ->
+    case bin_search_inv(Head, Tail0, Seps) of
+        {nomatch, Tail} ->
+            nth_lexeme_m(Tail, Seps, N);
+        Start when N > 1 ->
+            AfterLexeme = lexeme_skip(Start, Seps),
+            nth_lexeme_m(AfterLexeme, Seps, N-1);
+        Start ->
+            {Wanted, _} = lexeme_pick(Start, Seps, []),
+            Wanted
+    end;
+nth_lexeme_m(Str, {GCs, _, _} = Seps, N) when is_list(Str) ->
+    case unicode_util:gc(Str) of
+        [] ->
+            [];
+        [GC|Rest] ->
+            case lists:member(GC, GCs) of
+                true ->
+                    nth_lexeme_m(Rest, Seps, N);
+                false when N > 1 ->
+                    AfterLexeme = lexeme_skip(Rest, Seps),
+                    nth_lexeme_m(AfterLexeme, Seps, N-1);
+                false ->
+                    {Wanted, _} = lexeme_pick(Str, Seps, []),
+                    Wanted
+            end
+    end;
+nth_lexeme_m(Bin, Seps, N) when is_binary(Bin) ->
+    case bin_search_inv(Bin, [], Seps) of
+        [Start] when N > 1 ->
+            AfterLexeme = lexeme_skip(Start, Seps),
+            nth_lexeme_m(AfterLexeme, Seps, N-1);
+        [Start] ->
+            {Wanted, _} = lexeme_pick(Start, Seps, []),
+            Wanted;
+        {nomatch, _} ->
+            <<>>
+    end.
+
+%% lexeme_skip(Str, Seps) -> Rest
+%%  Drops data from the start of Str up to, but not including, the first
+%%  grapheme cluster that is a member of Seps = {GCs, CPs, BinPattern}.
+%%  When no separator is found the result is [] (list input) or <<>>
+%%  (binary input).
+lexeme_skip([CP|Tail] = Str, {GCs, CPs, _} = Seps) when is_integer(CP) ->
+    case lists:member(CP, CPs) of
+        false ->
+            lexeme_skip(Tail, Seps);
+        true ->
+            %% The codepoint can start a separator: compare the cluster.
+            [GC|AfterGC] = unicode_util:gc(Str),
+            case lists:member(GC, GCs) of
+                false -> lexeme_skip(AfterGC, Seps);
+                true -> Str
+            end
+    end;
+lexeme_skip([<<_/binary>> = Head|Tail0], Seps) ->
+    case bin_search(Head, Tail0, Seps) of
+        {nomatch, _} -> lexeme_skip(Tail0, Seps);
+        AtSep -> AtSep
+    end;
+lexeme_skip(Str, {GCs, CPs, _} = Seps) when is_list(Str) ->
+    case unicode_util:cp(Str) of
+        [] ->
+            [];
+        [CP|Tail] ->
+            case lists:member(CP, CPs) of
+                false ->
+                    lexeme_skip(Tail, Seps);
+                true ->
+                    [GC|AfterGC] = unicode_util:gc(Str),
+                    case lists:member(GC, GCs) of
+                        false -> lexeme_skip(AfterGC, Seps);
+                        true -> Str
+                    end
+            end
+    end;
+lexeme_skip(Bin, Seps) when is_binary(Bin) ->
+    case bin_search(Bin, Seps) of
+        [AtSep] -> AtSep;
+        {nomatch, _} -> <<>>
+    end.
+
+%% find_l(Str, Needle) -> Found | nomatch
+%%  Locates the first occurrence of Needle (a non-empty list, see the list
+%%  clause) in Str and returns Str from the start of that occurrence.
+find_l([<<_/binary>> = Head|Tail0], Needle) ->
+    case bin_search_str(Head, 0, Tail0, Needle) of
+        {nomatch, _, Tail} ->
+            find_l(Tail, Needle);
+        {_Before, Found, _After} ->
+            Found
+    end;
+find_l(Str, [First|_] = Needle) when is_list(Str) ->
+    case unicode_util:cp(Str) of
+        [] ->
+            nomatch;
+        [First|Rest] ->
+            %% First codepoint of Needle seen: check the whole Needle.
+            case prefix_1(Str, Needle) of
+                nomatch -> find_l(Rest, Needle);
+                _Match -> Str
+            end;
+        [_Other|Rest] ->
+            find_l(Rest, Needle)
+    end;
+find_l(Bin, Needle) ->
+    case bin_search_str(Bin, 0, [], Needle) of
+        {nomatch, _, _} -> nomatch;
+        {_Before, [Found], _After} -> Found
+    end.
+
+%% find_r(Str, Needle, Res) -> Found | Res
+%%  Locates the last occurrence of Needle in Str by searching repeatedly
+%%  from the left. Res is the best result so far and is returned when no
+%%  further occurrence is found.
+find_r([<<_/binary>> = Head|Tail0], Needle, Res) ->
+    case bin_search_str(Head, 0, Tail0, Needle) of
+        {nomatch, _, Tail} ->
+            find_r(Tail, Needle, Res);
+        {_, Found, _} ->
+            %% Step one grapheme cluster past the start of this match and
+            %% look for a later one.
+            [_|Next] = unicode_util:gc(Found),
+            find_r(Next, Needle, Found)
+    end;
+find_r(Str, [First|_] = Needle, Res) when is_list(Str) ->
+    case unicode_util:cp(Str) of
+        [] ->
+            Res;
+        [First|Rest] ->
+            case prefix_1(Str, Needle) of
+                nomatch -> find_r(Rest, Needle, Res);
+                _Match -> find_r(Rest, Needle, Str)
+            end;
+        [_Other|Rest] ->
+            find_r(Rest, Needle, Res)
+    end;
+find_r(Bin, Needle, Res) ->
+    case bin_search_str(Bin, 0, [], Needle) of
+        {nomatch, _, _} ->
+            Res;
+        {_Before, [Found], _After} ->
+            %% Step one codepoint past the start of this match.
+            <<_/utf8, Next/binary>> = Found,
+            find_r(Next, Needle, Found)
+    end.
+
+%% These helpers are used to avoid creating lists around binaries.
+%% They might be unnecessary; is there a better solution?
+%% btoken(BinPart, RevAcc) -> Token
+%%  Joins the reversed accumulator RevAcc with the trailing binary BinPart
+%%  using the smallest wrapping possible: the binary alone when nothing
+%%  was accumulated, one binary when only a single codepoint was
+%%  accumulated, the re-reversed accumulator when BinPart is empty, and
+%%  otherwise a two-element list.
+btoken(Bin, []) ->
+    Bin;
+btoken(Bin, [CP]) when is_integer(CP) ->
+    <<CP/utf8, Bin/binary>>;
+btoken(<<>>, RevAcc) ->
+    lists:reverse(RevAcc);
+btoken(Bin, RevAcc) ->
+    [lists:reverse(RevAcc), Bin].
+
+%% rev(Acc) -> Result
+%%  Undoes the reversed accumulation: a list holding a single binary is
+%%  unwrapped, any other list is reversed and a lone codepoint is
+%%  returned unchanged.
+rev([Bin]) when is_binary(Bin) ->
+    Bin;
+rev(List) when is_list(List) ->
+    lists:reverse(List);
+rev(CP) when is_integer(CP) ->
+    CP.
+
+%% append(Char, Str) -> Chardata
+%%  Puts Char (a codepoint, or a grapheme cluster given as a list) in
+%%  front of Str. An empty binary is dropped and a non-empty binary is
+%%  kept as the second element of a two-element list.
+append(CP, <<>>) when is_integer(CP) ->
+    [CP];
+append(Cluster, <<>>) when is_list(Cluster) ->
+    Cluster;
+append(Front, Bin) when is_binary(Bin) ->
+    [Front, Bin];
+append(CP, Tail) when is_integer(CP) ->
+    [CP|Tail];
+append(Cluster, Tail) when is_list(Cluster) ->
+    Cluster ++ Tail.
+
+%% stack(Item, Stack) -> Chardata
+%%  Puts Item in front of Stack without creating needless wrapping: Item
+%%  is returned as is when Stack is [], and an empty Item (<<>> or [])
+%%  leaves Stack unchanged.
+stack(Item, []) ->
+    Item;
+stack(<<>>, Stack) ->
+    Stack;
+stack([], Stack) ->
+    Stack;
+stack(Item, Stack) ->
+    [Item|Stack].
+
+%% add_non_empty(Token, Acc) -> Acc1
+%%  Pushes Token onto Acc unless Token is the empty binary.
+add_non_empty(<<>>, Acc) ->
+    Acc;
+add_non_empty(Token, Acc) ->
+    [Token|Acc].
+
+%% cp_prefix(Orig, Cont) -> Prefix
+%%  Cont is expected to be a tail of Orig. Returns the codepoints of Orig
+%%  that come before Cont; when Cont is empty, Orig itself is returned.
+cp_prefix(Orig, Cont) ->
+    case unicode_util:cp(Cont) of
+        [Marker|AfterMarker] ->
+            cp_prefix_1(Orig, Marker, AfterMarker);
+        [] ->
+            Orig
+    end.
+
+%% cp_prefix_1(Orig, Until, Cont) -> Prefix
+%%  Collects codepoints from Orig until reaching the codepoint Until that
+%%  is directly followed by data equal to Cont. That codepoint and
+%%  everything after it are left out of the result.
+cp_prefix_1(Orig, Until, Cont) ->
+    case unicode_util:cp(Orig) of
+        [CP|Rest] ->
+            %% equal/2 is only consulted when the codepoint itself matches.
+            AtEnd = (CP =:= Until) andalso equal(Rest, Cont),
+            case AtEnd of
+                false -> [CP|cp_prefix_1(Rest, Until, Cont)];
+                true -> []
+            end
+    end.
+
+
+%% Binary special
+%% bin_search(Bin, Seps)
+%%  Same as bin_search/3 with an empty continuation.
+bin_search(Bin, SearchPattern) ->
+    bin_search(Bin, [], SearchPattern).
+
+%% bin_search(Bin, Cont, {GCs, CPs, BinPattern})
+%%  Searches Bin, with Cont as the data following it, for the first
+%%  grapheme cluster that is a member of GCs. With no separators at all
+%%  the answer is {nomatch, Cont}; otherwise the search is carried out by
+%%  bin_search_loop/5 starting at byte offset 0.
+bin_search(_Bin, Cont, {[], _, _}) ->
+    {nomatch, Cont};
+bin_search(Bin, Cont, {GCs, _, BinPattern}) ->
+    bin_search_loop(Bin, 0, BinPattern, Cont, GCs).
+
+%% Needs to work with input such as [<<$a>>, <<778/utf8>>],
+%% i.e. å in NFD form: $a followed by "COMBINING RING ABOVE".
+%% PREPEND characters, like "ARABIC NUMBER SIGN" 1536 <<216,128>>,
+%% combined with other characters are currently ignored.
+%% search_pattern(Seps) -> {Seps, CPs, BinPattern}
+%%  Builds the search triple used by the search functions: the separators
+%%  themselves, the first codepoint of every separator and those first
+%%  codepoints as UTF-8 encoded binaries.
+search_pattern(Seps) ->
+    FirstCPs = search_cp(Seps),
+    {Seps, FirstCPs, bin_pattern(FirstCPs)}.
+
+%% search_cp(Seps) -> [CP]
+%%  Maps every separator to its first codepoint. A separator that already
+%%  is a codepoint is used as is; for any other separator the first
+%%  codepoint is fetched with unicode_util:cp/1.
+search_cp([]) ->
+    [];
+search_cp([Sep|Rest]) when is_integer(Sep) ->
+    [Sep|search_cp(Rest)];
+search_cp([Sep|Rest]) ->
+    [First|_] = unicode_util:cp(Sep),
+    [First|search_cp(Rest)].
+
+%% bin_pattern(CPs) -> [binary()]
+%%  Encodes every codepoint in CPs as a UTF-8 binary, keeping the order.
+bin_pattern([]) ->
+    [];
+bin_pattern([CP|Rest]) ->
+    [<<CP/utf8>>|bin_pattern(Rest)].
+
+%% bin_search_loop(Bin0, Start, BinSeps, Cont, Seps)
+%%  Searches Bin0 from byte offset Start for a grapheme cluster that is a
+%%  member of Seps. BinSeps is the binary:match/2 pattern used to find
+%%  candidate positions and Cont is the data that follows Bin0.
+%%  Returns {nomatch, Rest} when no separator is found. On a match the
+%%  data starting at the separator is returned: a list with the rest of
+%%  Bin0 as head, followed by Cont when Cont is non-empty.
+bin_search_loop(Bin0, Start, _, Cont, _Seps)
+  when byte_size(Bin0) =< Start; Start < 0 ->
+    {nomatch, Cont};
+bin_search_loop(Bin0, Start, BinSeps, Cont, Seps) ->
+    <<_:Start/binary, Bin/binary>> = Bin0,
+    case binary:match(Bin, BinSeps) of
+        nomatch ->
+            {nomatch,Cont};
+        {Where, _CL} ->
+            %% Candidate found. Cont is stacked on so that the cluster can
+            %% extend past the end of the binary.
+            <<_:Where/binary, Cont0/binary>> = Bin,
+            Cont1 = stack(Cont0, Cont),
+            [GC|Cont2] = unicode_util:gc(Cont1),
+            case lists:member(GC, Seps) of
+                false ->
+                    %% Cont is already bound, so the patterns below only
+                    %% match when the cluster ended inside Bin0.
+                    case Cont2 of
+                        [BinR|Cont] when is_binary(BinR) ->
+                            Next = byte_size(Bin0) - byte_size(BinR),
+                            bin_search_loop(Bin0, Next, BinSeps, Cont, Seps);
+                        BinR when is_binary(BinR), Cont =:= [] ->
+                            Next = byte_size(Bin0) - byte_size(BinR),
+                            bin_search_loop(Bin0, Next, BinSeps, Cont, Seps);
+                        _ ->
+                            %% The cluster consumed data beyond Bin0.
+                            {nomatch, Cont2}
+                    end;
+                true when is_list(Cont1) ->
+                    Cont1;
+                true ->
+                    %% stack/2 returned the bare binary (Cont was []).
+                    [Cont1]
+            end
+    end.
+
+%% bin_search_inv(Bin, Cont, {GCs, CPs, BinPattern})
+%%  Inverse search: skips leading grapheme clusters that are members of
+%%  GCs and returns the data starting at the first one that is not, or
+%%  {nomatch, Rest} when none is found. With no separators the input is
+%%  returned as [Bin|Cont]; a single separator takes the specialised
+%%  bin_search_inv_1/2.
+bin_search_inv(Bin, Cont, {[], _, _}) ->
+    [Bin|Cont];
+bin_search_inv(Bin, Cont, {[OnlySep], _, _}) ->
+    bin_search_inv_1([Bin|Cont], OnlySep);
+bin_search_inv(Bin, Cont, {GCs, _, _}) ->
+    bin_search_inv_n([Bin|Cont], GCs).
+
+%% bin_search_inv_1([Bin|Cont], Sep)
+%%  Skips leading grapheme clusters equal to the single separator Sep for
+%%  as long as the data left is still part of the leading binary.
+%%  Returns the input from the first non-separator on, or {nomatch, Rest}
+%%  when the binary is used up or a cluster reached into Cont.
+bin_search_inv_1([<<>>|Rest], _Sep) ->
+    {nomatch, Rest};
+bin_search_inv_1([<<_/binary>>|Tail] = Input, Sep) ->
+    case unicode_util:gc(Input) of
+        [Sep|BinRest] when is_binary(BinRest), Tail =:= [] ->
+            bin_search_inv_1([BinRest], Sep);
+        %% Tail is already bound: this only matches when the continuation
+        %% is untouched, i.e. the cluster ended inside the binary.
+        [Sep|[BinRest|Tail] = Next] when is_binary(BinRest) ->
+            bin_search_inv_1(Next, Sep);
+        [Sep|Other] ->
+            {nomatch, Other};
+        _NonSep ->
+            Input
+    end.
+
+%% bin_search_inv_n([Bin|Cont], Seps)
+%%  Skips leading grapheme clusters that are members of the list Seps for
+%%  as long as the data left is still part of the leading binary.
+%%  Returns the input from the first non-separator on, or {nomatch, Rest}
+%%  when the binary is used up or a cluster reached into Cont.
+bin_search_inv_n([<<>>|Rest], _Seps) ->
+    {nomatch, Rest};
+bin_search_inv_n([<<_/binary>>|Tail] = Input, Seps) ->
+    [GC|AfterGC] = unicode_util:gc(Input),
+    case lists:member(GC, Seps) of
+        false ->
+            Input;
+        true ->
+            case AfterGC of
+                BinRest when is_binary(BinRest), Tail =:= [] ->
+                    bin_search_inv_n([BinRest], Seps);
+                %% Tail is already bound: this only matches when the
+                %% continuation is untouched.
+                [BinRest|Tail] when is_binary(BinRest) ->
+                    bin_search_inv_n(AfterGC, Seps);
+                _ ->
+                    {nomatch, AfterGC}
+            end
+    end.
+
+%% bin_search_str(Bin0, Start, Cont, SearchCPs)
+%%  Searches Bin0 from byte offset Start for the string SearchCPs (a
+%%  non-empty list); Cont is the data that follows Bin0.
+%%  Returns {nomatch, Size, Rest} when the string is not found, otherwise
+%%  {Before, [AtMatch|Cont], After} where Before is Bin0 up to the match,
+%%  AtMatch is Bin0 from the match on, and After is what follows the match.
+bin_search_str(Bin0, Start, Cont, [CP|_]=SearchCPs) ->
+    <<_:Start/binary, Bin/binary>> = Bin0,
+    %% Locate the first codepoint with binary:match/2, then verify the
+    %% whole string with prefix_1/2.
+    case binary:match(Bin, <<CP/utf8>>) of
+        nomatch -> {nomatch, byte_size(Bin0), Cont};
+        {Where0, _} ->
+            Where = Start+Where0,
+            <<Keep:Where/binary, Cs0/binary>> = Bin0,
+            [GC|Cs]=unicode_util:gc(Cs0),
+            case prefix_1(stack(Cs0,Cont), SearchCPs) of
+                nomatch when is_binary(Cs) ->
+                    %% False hit: continue after the grapheme cluster.
+                    KeepSz = byte_size(Bin0) - byte_size(Cs),
+                    bin_search_str(Bin0, KeepSz, Cont, SearchCPs);
+                nomatch ->
+                    %% The rest after the cluster is not a plain binary;
+                    %% hand the remaining data back to the caller.
+                    {nomatch, Where, stack([GC|Cs],Cont)};
+                [] ->
+                    %% Normalise an empty rest to the empty binary.
+                    {Keep, [Cs0|Cont], <<>>};
+                Rest ->
+                    {Keep, [Cs0|Cont], Rest}
+            end
+    end.
+
+
+%%---------------------------------------------------------------------------
+%% OLD lists API kept for backwards compatibility
+%%---------------------------------------------------------------------------
+
 %% Robert's bit
 
 %% len(String)
@@ -68,12 +1292,12 @@ len(S) -> length(S).
 %% equal(String1, String2)
 %%  Test if 2 strings are equal.
 
--spec equal(String1, String2) -> boolean() when
-      String1 :: string(),
-      String2 :: string().
+%% -spec equal(String1, String2) -> boolean() when
+%%       String1 :: string(),
+%%       String2 :: string().
 
-equal(S, S) -> true;
-equal(_, _) -> false.
+%% equal(S, S) -> true;
+%% equal(_, _) -> false.
 
 %% concat(String1, String2)
 %%  Concatenate 2 strings.
@@ -127,7 +1351,7 @@ rchr([], _C, _I, L) -> L.
 str(S, Sub) when is_list(Sub) -> str(S, Sub, 1).
 
 str([C|S], [C|Sub], I) ->
-    case prefix(Sub, S) of
+    case l_prefix(Sub, S) of
 	true -> I;
 	false -> str(S, [C|Sub], I+1)
     end;
@@ -142,16 +1366,16 @@ str([], _Sub, _I) -> 0.
 rstr(S, Sub) when is_list(Sub) -> rstr(S, Sub, 1, 0).
 
 rstr([C|S], [C|Sub], I, L) ->
-    case prefix(Sub, S) of
+    case l_prefix(Sub, S) of
 	true -> rstr(S, [C|Sub], I+1, I);
 	false -> rstr(S, [C|Sub], I+1, L)
     end;
 rstr([_|S], Sub, I, L) -> rstr(S, Sub, I+1, L);
 rstr([], _Sub, _I, L) -> L.
 
-prefix([C|Pre], [C|String]) -> prefix(Pre, String);
-prefix([], String) when is_list(String) -> true;
-prefix(Pre, String) when is_list(Pre), is_list(String) -> false.
+%% l_prefix(Pre, String) -> boolean()
+%%  True when the list Pre is a prefix of the list String.
+l_prefix([], Str) when is_list(Str) -> true;
+l_prefix([Same|PreRest], [Same|StrRest]) -> l_prefix(PreRest, StrRest);
+l_prefix(PreRest, Str) when is_list(PreRest), is_list(Str) -> false.
 
 %% span(String, Chars) -> Length.
 %% cspan(String, Chars) -> Length.
@@ -229,9 +1453,9 @@ tokens(S, Seps) ->
 		[_|_] -> [S]
 	    end;
 	[C] ->
-	    tokens_single_1(reverse(S), C, []);
+	    tokens_single_1(lists:reverse(S), C, []);
 	[_|_] ->
-	    tokens_multiple_1(reverse(S), Seps, [])
+	    tokens_multiple_1(lists:reverse(S), Seps, [])
     end.
 
 tokens_single_1([Sep|S], Sep, Toks) ->
@@ -342,8 +1566,8 @@ sub_word(String, Index, Char) when is_integer(Index), is_integer(Char) ->
 	    s_word(strip(String, left, Char), Index, Char, 1, [])
     end.
 
-s_word([], _, _, _,Res) -> reverse(Res);
-s_word([Char|_],Index,Char,Index,Res) -> reverse(Res);
+s_word([], _, _, _,Res) -> lists:reverse(Res);
+s_word([Char|_],Index,Char,Index,Res) -> lists:reverse(Res);
 s_word([H|T],Index,Char,Index,Res) -> s_word(T,Index,Char,Index,[H|Res]);
 s_word([Char|T],Stop,Char,Index,Res) when Index < Stop -> 
     s_word(strip(T,left,Char),Stop,Char,Index+1,Res);
@@ -359,7 +1583,7 @@ strip(String) -> strip(String, both).
 -spec strip(String, Direction) -> Stripped when
       String :: string(),
       Stripped :: string(),
-      Direction :: left | right | both.
+      Direction :: 'left' | 'right' | 'both'.
 
 strip(String, left) -> strip_left(String, $\s);
 strip(String, right) -> strip_right(String, $\s);
@@ -369,7 +1593,7 @@ strip(String, both) ->
 -spec strip(String, Direction, Character) -> Stripped when
       String :: string(),
       Stripped :: string(),
-      Direction :: left | right | both,
+      Direction :: 'left' | 'right' | 'both',
       Character :: char().
 
 strip(String, right, Char) -> strip_right(String, Char);
diff --git a/lib/stdlib/test/string_SUITE.erl b/lib/stdlib/test/string_SUITE.erl
index 836f9e5142..a78ddf761b 100644
--- a/lib/stdlib/test/string_SUITE.erl
+++ b/lib/stdlib/test/string_SUITE.erl
@@ -29,25 +29,46 @@
 -export([init_per_testcase/2, end_per_testcase/2]).
 
 %% Test cases must be exported.
--export([len/1,equal/1,concat/1,chr_rchr/1,str_rstr/1]).
--export([span_cspan/1,substr/1,tokens/1,chars/1]).
+-export([is_empty/1, length/1, to_graphemes/1,
+         reverse/1, slice/1,
+         equal/1,
+         pad/1, trim/1, chomp/1, take/1,
+         uppercase/1, lowercase/1, titlecase/1, casefold/1,
+         prefix/1, split/1, replace/1, find/1,
+         lexemes/1, nth_lexeme/1, cd_gc/1, meas/1
+        ]).
+
+-export([len/1,old_equal/1,old_concat/1,chr_rchr/1,str_rstr/1]).
+-export([span_cspan/1,substr/1,old_tokens/1,chars/1]).
 -export([copies/1,words/1,strip/1,sub_word/1,left_right/1]).
 -export([sub_string/1,centre/1, join/1]).
 -export([to_integer/1,to_float/1]).
 -export([to_upper_to_lower/1]).
 
+%% Run tests when debugging them
+-export([debug/0]).
+
 suite() ->
     [{ct_hooks,[ts_install_cth]},
      {timetrap,{minutes,1}}].
 
-all() -> 
-    [len, equal, concat, chr_rchr, str_rstr, span_cspan,
-     substr, tokens, chars, copies, words, strip, sub_word,
-     left_right, sub_string, centre, join, to_integer,
-     to_float, to_upper_to_lower].
+%% Lists what the suite runs: the two test case groups defined in groups/0.
+all() ->
+    [{group, chardata}, {group, list_string}].
 
-groups() -> 
-    [].
+%% Test case groups, each written as {GroupName, TestCases}.
+%% NOTE(review): chardata presumably covers the new unicode:chardata() API
+%% and list_string the old list-based API (note the old_ prefixed cases) -
+%% confirm against the test case bodies.
+groups() ->
+    [{chardata,
+      [is_empty, length, to_graphemes,
+       equal, reverse, slice,
+       pad, trim, chomp, take,
+       lexemes, nth_lexeme,
+       uppercase, lowercase, titlecase, casefold,
+       prefix, find, split, replace, cd_gc,
+       meas]},
+     {list_string,
+      [len, old_equal, old_concat, chr_rchr, str_rstr, span_cspan,
+       substr, old_tokens, chars, copies, words, strip, sub_word,
+       left_right, sub_string, centre, join, to_integer,
+       to_float, to_upper_to_lower]}].
 
 init_per_suite(Config) ->
     Config.
@@ -68,8 +89,839 @@ init_per_testcase(_Case, Config) ->
 end_per_testcase(_Case, _Config) ->
     ok.
 
+debug() -> %% Shell helper: run every test case listed in groups() and print Test:Result for each.
+    Config = [{data_dir, ?MODULE_STRING++"_data"}], %% minimal Config; only data_dir is provided
+    [io:format("~p:~p~n",[Test,?MODULE:Test(Config)]) ||
+        {_,Tests} <- groups(), Test <- Tests]. %% groups() yields {GroupName, TestCases} pairs
+
+-define(TEST(B,C,D), test(?LINE,?FUNCTION_NAME,B,C,D, true)). %% B = subject, C = extra args, D = expected; NFC/NFD variants included
+-define(TEST_EQ(B,C,D), %% as ?TEST, then once more with B and hd(C) swapped
+        test(?LINE,?FUNCTION_NAME,B,C,D, true),
+        test(?LINE,?FUNCTION_NAME,hd(C),[B|tl(C)],D, true)). %% list is now closed, matching ?TEST_NN
+
+-define(TEST_NN(B,C,D), %% both argument orders like ?TEST_EQ, but without the NFC/NFD variants
+        test(?LINE,?FUNCTION_NAME,B,C,D, false),
+        test(?LINE,?FUNCTION_NAME,hd(C),[B|tl(C)],D, false)).
+
+
+is_empty(_) ->
+    ?TEST("", [], true),
+    ?TEST([""|<<>>], [], true),
+    ?TEST("a", [], false),
+    ?TEST([""|<<$a>>], [], false),
+    ?TEST(["",[<<>>]], [], true),
+    ok.
+
+length(_) -> %% Test case for string:length/1 (counts grapheme clusters, see the last case).
+    %% invalid arg type
+    {'EXIT',_} = (catch string:length({})),
+    {'EXIT',_} = (catch string:length(foo)),
+    %% Valid input
+    ?TEST("", [], 0),
+    ?TEST([""|<<>>], [], 0),
+    L = tuple_size(list_to_tuple(atom_to_list(?MODULE))), %% character count of the module name, obtained via a tuple
+    ?TEST(atom_to_list(?MODULE), [], L),
+    ?TEST("Hello", [], 5),
+    ?TEST("UC Ω ßð", [], 7),
+    ?TEST(["abc"|<<"abc">>], [], 6),
+    ?TEST(["abc",["def"]], [], 6),
+    ?TEST([<<97/utf8, 778/utf8, 98/utf8>>, [776,111,776]], [], 3), %% three decomposed clusters: a+ring, b+diaeresis, o+diaeresis
+    ok.
+
+equal(_) ->
+    %% invalid arg type
+    {'EXIT',_} = (catch string:equal(1, 2)),
+    {'EXIT',_} = (catch string:equal(1, 2, foo)),
+    {'EXIT',_} = (catch string:equal(1, 2, true, foo)),
+
+    ?TEST("", [<<"">>], true),
+    ?TEST("Hello", ["Hello"], true),
+    ?TEST("Hello", ["Hell"], false),
+    ?TEST("Hello", ["Hello!"], false),
+    ?TEST("Hello", [<<"Hello"/utf8>>], true),
+    ?TEST("Hello", [<<"Mello"/utf8>>], false),
+    ?TEST("Hello", [<<"Hello!"/utf8>>], false),
+    ?TEST(["Hello",[" deep"]], ["Hello deep"], true),
+    ?TEST(["Hello",[<<" deep"/utf8>>]], ["Hello deep"], true),
+    ?TEST("Hello deep", [["Hello", [" deep"]]], true),
+    ?TEST("Hello deep", [["Hello", [" d!eep"]]], false),
+    ?TEST("Hello deep", [["Hello", [<<" deep"/utf8>>]]], true),
+    false = string:equal("Åäö", [<<97/utf8, 778/utf8, 98/utf8>>, [776,111,776]]), %% nfc vs nfd
+
+    %% case_insensitive_equal()
+    ?TEST("", ["", true], true),
+    ?TEST("a", ["b", true], false),
+    ?TEST("", [<<>>, true], true),
+    ?TEST("", [[<<>>,[]], true], true),
+    ?TEST("", [[<<>>,[$a]], true], false),
+    ?TEST("123", ["123", true], true),
+    ?TEST("abc", ["abc", true], true),
+    ?TEST([[],<<>>,"ABC"|<<>>], [["abc",[]], true], true),
+    ?TEST("ABCa", ["abcå", true], false),
+    ?TEST("åäö", [{norm,"åäö"}, true], true),
+    ?TEST("ÅÄÖ", [{norm,"åäö"}, true], true),
+    ?TEST("MICHAŁ", ["michał", true], true),
+    ?TEST(["Mic",<<"HAŁ"/utf8>>], ["michał", true], true),
+    ?TEST("ß SHARP S", ["ss sharp s", true], true),
+    ?TEST("ẞ SHARP S", [[<<$ß/utf8, $\s>>,"SHARP S"], true], true),
+    ?TEST("ẞ SHARP ß", ["ss sharp s", true], false),
+    ?TEST(<<"İ I WITH DOT ABOVE"/utf8>>, ["i̇ i with dot above", true], true),
+    %% These should be equivalent with the above
+    true = string:equal(string:casefold(["Mic",<<"HAŁ"/utf8>>]), string:casefold("michał")),
+    true = string:equal(string:casefold("ẞ SHARP S"), string:casefold([<<$ß/utf8, $\s>>,"SHARP S"])),
+    false = string:equal(string:casefold("ẞ SHARP ß"), string:casefold("ss sharp s")),
+
+    %% Normalization
+    ?TEST_NN("", ["", true, none], true),
+    ?TEST_NN("a", ["b", true, nfc], false),
+    ?TEST_NN("a", ["b", true, nfd], false),
+    ?TEST_NN("a", ["b", true, nfkc], false),
+    ?TEST_NN("a", ["b", true, nfkd], false),
+
+    ?TEST_NN("a", ["A", false, nfc], false),
+    ?TEST_NN("a", ["A", false, nfd], false),
+    ?TEST_NN([<<>>,"a"|<<>>], ["A", true, nfkc], true),
+    ?TEST_NN(<<"a">>, ["A", true, nfkd], true),
+
+    ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, none], false),
+    ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfc], true),
+    ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfd], true),
+    ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfkc], true),
+    ?TEST_NN([$a, <<$b>>, [97,776]], ["abä", false, nfkd], true),
+
+    ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, none], false),
+    ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", false, nfc], false),
+    ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfc], true),
+    ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfd], true),
+    ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfkc], true),
+    ?TEST_NN([$a, <<$b>>, [97,776]], ["abÄ", true, nfkd], true),
+
+    ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abﾎﾝﾀﾞ", true, none], false),
+    ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abﾎﾝﾀﾞ", true, nfc], false),
+    ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abﾎﾝﾀﾞ", true, nfd], false),
+    ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abﾎﾝﾀﾞ", true, nfkc], true),
+    ?TEST_NN([$a, <<$b>>, "ホンダ"], ["abﾎﾝﾀﾞ", true, nfkd], true),
+
+    ?TEST_NN([$a, <<$b>>, "32"], ["ab３２", true, none], false),
+    ?TEST_NN([$a, <<$b>>, "32"], ["ab３２", true, nfc], false),
+    ?TEST_NN([$a, <<$b>>, "32"], ["ab３２", true, nfd], false),
+    ?TEST_NN([$a, <<$b>>, "32"], ["ab３２", true, nfkc], true),
+    ?TEST_NN([$a, <<$b>>, "32"], ["ab３２", true, nfkd], true),
+
+    %% Coverage.
+    ?TEST("", [<<"">>, false, nfc], true),
+    ?TEST("", [<<"">>, true, nfc], true),
+
+    ok.
+
+to_graphemes(_) -> %% Test case: string:to_graphemes/1 yields as many clusters as string:length/1 reports, for raw, NFD and NFC input.
+    %% More tests are in unicode_util_SUITE.erl
+    {'EXIT', _} = (catch unicode:characters_to_nfd_binary(["asdåäö", an_atom])), %% NOTE(review): exercises the unicode module, not string:to_graphemes/1 -- confirm intended
+    String = ["abc..åäö", $e, 788, <<"Ωµe`è"/utf8>>, "œŒþæÆħ§ß"],
+    NFD = unicode:characters_to_nfd_list(String),
+    [] = string:to_graphemes([]),
+    [] = string:to_graphemes(<<>>),
+    GCs = string:to_graphemes(String),
+    true = erlang:length(GCs) =:= string:length(String), %% erlang: prefix needed, length/1 is a test case in this module
+    true = erlang:length(GCs) =:= erlang:length(string:to_graphemes(NFD)),
+    true = erlang:length(GCs) =:=
+        erlang:length(string:to_graphemes(unicode:characters_to_nfc_list(String))),
+    ok.
+
+reverse(_) ->
+    {'EXIT',_} = (catch string:reverse(2)),
+    Str1 = "Hello ",
+    Str2 = "Ω ßð",
+    Str3 = "åäö",
+    ?TEST("", [], ""),
+    ?TEST(Str1, [], lists:reverse(Str1)),
+    ?TEST(Str2, [], lists:reverse(Str2)),
+    ?TEST(Str3, [], lists:reverse(Str3)),
+    true = string:reverse(Str3) =:= lists:reverse(string:to_graphemes(Str3)),
+    ok.
+
+slice(_) ->
+    {'EXIT',_} = (catch string:slice(2, 2, 2)),
+    {'EXIT',_} = (catch string:slice("asd", foo, 2)),
+    {'EXIT',_} = (catch string:slice("asd", 2, -1)),
+    ?TEST("", [3], ""),
+    ?TEST("aåä", [1, 0], ""),
+    ?TEST("aåä", [3], ""),
+    ?TEST("aåäöbcd", [3], "öbcd"),
+    ?TEST([<<"aå"/utf8>>,"äöbcd"], [3], "öbcd"),
+    ?TEST([<<"aåä"/utf8>>,"öbcd"], [3], "öbcd"),
+    ?TEST([<<"aåä"/utf8>>,"öbcd"], [3, infinity], "öbcd"),
+
+    ?TEST("", [3, 2], ""),
+    ?TEST("aåä", [3, 2], ""),
+    ?TEST("aåäöbcd", [3,2], "öb"),
+    ?TEST([<<"aå"/utf8>>,"äöbcd"], [3,3], "öbc"),
+    ?TEST([<<"aåä"/utf8>>,"öbcd"], [3,10], "öbcd"),
+
+    ok.
+
+pad(_) ->
+    Str = "Hallå",
+    ?TEST(Str, [7], "Hallå  "),
+    ?TEST(Str, [7, leading], "  Hallå"),
+    ?TEST(Str, [4, both, $.], "Hallå"),
+    ?TEST(Str, [10, both, $.], "..Hallå..."),
+    ?TEST(Str, [10, leading, $.], ".....Hallå"),
+    ?TEST(Str, [10, trailing, $.], "Hallå....."),
+    ?TEST(Str++["f"], [10, trailing, $.], "Hallåf...."),
+    ?TEST(Str++[" flåwer"], [10, trailing, $.], "Hallå flåwer"),
+    ok.
+
+trim(_) ->
+    Str = "\t\s..Ha\s.llå..\t\n\r",
+    ?TEST("", [], ""),
+    ?TEST(Str, [both, "x"], Str),
+    ?TEST(Str, [leading], "..Ha\s.llå..\t\n\r"),
+    ?TEST(Str, [trailing], "\t\s..Ha\s.llå.."),
+    ?TEST(Str, [], "..Ha .llå.."),
+    ?TEST(".. ", [both, ""], ".. "),
+    ?TEST([<<".. ">>], [both, ". "], ""),
+    ?TEST(".. h.ej ..", [leading, ". "], "h.ej .."),
+    ?TEST(".. h.ej ..", [trailing, ". "], ".. h.ej"),
+    ?TEST(".. h.ej ..", [both, ". "], "h.ej"),
+    ?TEST(["..", <<"h.ej">>, ".."], [both, ". "], "h.ej"),
+    ?TEST([[], "..", " h.ej ", <<"..">>], [both, ". "], "h.ej"),
+    ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [both, ". "], "h.ej"),
+    ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [trailing, ". "], ".. h.ej"),
+    ?TEST([<<"..  h.ej .">>, <<"..">>], [both, ". "], "h.ej"),
+    ?TEST(["..h", ".e", <<"j..">>], [both, ". "], "h.ej"),
+    ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [both, ". "], "h.ejsan"),
+    %% Test that it behaves with graphemes (i.e. nfd tests are the hard part)
+    ?TEST("aaåaa", [both, "a"], "å"),
+    ?TEST(["aaa",778,"äöoo"], [both, "ao"], "åäö"),
+    ?TEST([<<"aaa">>,778,"äöoo"], [both, "ao"], "åäö"),
+    ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [both, [[$e,778]]], "åäö"),
+    ?TEST([[<<"!v">>|<<204,128,$v,204,129>>]],[trailing, [[$v,769]]], [$!,$v,768]),
+    ?TEST([[[<<"v">>|<<204,129,118,204,128,118>>],769,118,769]], [trailing, [[118,769]]], [$v,769,$v,768]),
+    ?TEST([<<"vv">>|<<204,128,118,204,128>>], [trailing, [[118,768]]], "v"),
+    ok.
+
+chomp(_) ->
+    Str = "åäö\na\r\nsd\n",
+    Res = "åäö\na\r\nsd",
+    ?TEST("", [], ""),
+    ?TEST("\n", [], ""),
+    ?TEST("str \t", [], "str \t"),
+    ?TEST("str \t\n\r", [], "str \t\n\r"),
+    ?TEST(Str, [], Res),
+    ?TEST([Str,$\n], [], Res),
+    ?TEST([Str|"\n"], [], Res),
+    ?TEST([Str|<<"\n">>], [], Res),
+    ?TEST([Str,$\r|<<"\n">>], [], Res),
+    ?TEST([Str, <<$\r>>|"\n"], [], Res),
+    ?TEST([<<$a,$\r>>,"\na\n"], [], "a\r\na"),
+    ok.
+
+take(_) ->
+    Str = "\t\s..Ha\s.llå..\t\n\r",
+    WS = "\t\s\n\r",
+    Chars = lists:seq($a,$z)++lists:seq($A,$Z),
+    %% complement=false, dir=leading
+    ?TEST("", ["abc"], {"",""}),
+    ?TEST(Str, ["x"], {[], Str}),
+    ?TEST(Str, [WS], {"\t\s","..Ha\s.llå..\t\n\r"}),
+    ?TEST(".. ", ["", false], {"", ".. "}),
+    ?TEST([<<".. ">>], [". ", false, leading], {".. ", ""}),
+    ?TEST(".. h.ej ..", [". ", false, leading], {".. ", "h.ej .."}),
+    ?TEST(["..", <<"h.ej">>, ".."], [". ", false, leading], {"..", "h.ej.."}),
+    ?TEST([[], "..", " h.ej ", <<"..">>], [". ", false, leading], {".. ","h.ej .."}),
+    ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [". ", false, leading], {".. ", "h.ej .."}),
+    ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [". ", false, leading], {"..", "h.ejsan.."}),
+    ?TEST([[<<101,204,138,33>>]], [[[$e,778]]], {[$e,778], "!"}),
+    %% Test that it behaves with graphemes (i.e. nfd tests are the hard part)
+    ?TEST("aaåaa", ["a", false, leading], {"aa", "åaa"}),
+    ?TEST(["aaa",778,"äöoo"], ["ao", false, leading], {"aa", "åäöoo"}),
+    ?TEST([<<"aaa">>,778,"äöoo"], ["ao",false,leading], {"aa", "åäöoo"}),
+    ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [[[$e,778]], false, leading], {[$e,778],"åäöe"++[778]}),
+
+    %% complement=true, dir=leading
+    ?TEST("", ["abc", true], {"",""}),
+    ?TEST(Str, ["x", true], {Str, []}),
+    ?TEST(Str, [Chars, true], {"\t\s..","Ha\s.llå..\t\n\r"}),
+    ?TEST(".. ", ["",true], {".. ", ""}),
+    ?TEST([<<".. ">>], [Chars, true, leading], {".. ", ""}),
+    ?TEST(".. h.ej ..", [Chars, true, leading], {".. ", "h.ej .."}),
+    ?TEST(["..", <<"h.ej">>, ".."], [Chars, true, leading], {"..", "h.ej.."}),
+    ?TEST([[], "..", " h.ej ", <<"..">>], [Chars, true, leading], {".. ","h.ej .."}),
+    ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [Chars, true, leading], {".. ", "h.ej .."}),
+    ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [Chars, true, leading], {"..", "h.ejsan.."}),
+    %% Test that it behaves with graphemes (i.e. nfd tests are the hard part)
+    ?TEST(["aaee",778,"äöoo"], [[[$e,778]], true, leading], {"aae", [$e,778|"äöoo"]}),
+    ?TEST([<<"aae">>,778,"äöoo"], [[[$e,778]],true,leading], {"aa", [$e,778|"äöoo"]}),
+    ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [[[$e,778]], true, leading], {[], [$e,778]++"åäöe"++[778]}),
+
+    %% complement=false, dir=trailing
+    ?TEST(Str, ["", false, trailing], {Str, []}),
+    ?TEST(Str, ["x", false, trailing], {Str, []}),
+    ?TEST(Str, [WS, false,trailing], {"\t\s..Ha\s.llå..", "\t\n\r"}),
+    ?TEST(".. h.ej ..", [". ", false, trailing], {".. h.ej", " .."}),
+    ?TEST(["..", <<"h.ej">>, ".."], [". ", false, trailing], {"..h.ej", ".."}),
+    ?TEST([[], "..", " h.ej ", <<"..">>], [". ", false, trailing], {".. h.ej", " .."}),
+    ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [". ", false, trailing], {".. h.ej", " .."}),
+    ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [". ", false, trailing], {"..h.ejsan", ".."}),
+    ?TEST("aaåaa", ["a", false, trailing], {"aaå", "aa"}),
+    ?TEST([<<"KMÐ¨"/utf8>>], [[1064], false, trailing], {"KMÐ¨",[]}),
+    ?TEST([[<<"!\"">>|<<"\"">>]], ["\"", false, trailing], {"!", "\"\""}),
+    ?TEST([<<$v>>, 769], [[[$v,769]], false, trailing], {"", [$v,769]}),
+    ?TEST(["aaa",778,"äöoo"], ["ao", false, trailing], {"aaåäö", "oo"}),
+    ?TEST([<<"aaa">>,778,"äöoo"], ["ao", false, trailing], {"aaåäö", "oo"}),
+    ?TEST([<<"e">>,778,"åäöee", <<778/utf8>>], [[[$e,778]], false, trailing], {[$e,778|"åäöe"], [$e,778]}),
+
+    %% complement=true, dir=trailing
+    ?TEST("", ["abc", true, trailing], {"",""}),
+    ?TEST(Str, ["x", true, trailing], {[], Str}),
+    %?TEST(Str, [{norm,Chars}, true, trailing], {"\t\s..Ha\s.ll","å..\t\n\r"}),
+    ?TEST(".. ", ["", true, trailing], {"", ".. "}),
+    ?TEST([<<".. ">>], [Chars, true, trailing], {"", ".. "}),
+    ?TEST(".. h.ej ..", [Chars, true, trailing], {".. h.ej", " .."}),
+    ?TEST(["..", <<"h.ej">>, ".."], [Chars, true, trailing], {"..h.ej", ".."}),
+    ?TEST([[], "..", " h.ej ", <<"..">>], [Chars, true, trailing], {".. h.ej"," .."}),
+    ?TEST([<<>>,<<"..">>, " h.ej", <<" ..">>], [Chars, true, trailing], {".. h.ej"," .."}),
+    ?TEST(["..h", <<".ejsa"/utf8>>, "n.."], [Chars, true, trailing], {"..h.ejsan", ".."}),
+    ?TEST([[<<101,204,138,33>>]], [[[$e,778]], true, trailing], {[$e,778], "!"}),
+    ?TEST([<<"Fa">>], [[$F], true, trailing], {"F", "a"}),
+    ?TEST([[<<101,101,204,138>>,1045,778]], ["e", true, trailing], {"e", [101,778,1045,778]}),
+    ?TEST([[<<101,101,204,138>>,<<1045/utf8,778/utf8>>]], ["e", true, trailing], {"e", [101,778,1045,778]}),
+    ?TEST([[[118,769,118],<<204,129,118,204,129,120,204,128,118>>,768,120,768]],
+          [[[118,769]], true, trailing], {[118,769,118,769,118,769],[120,768,118,768,120,768]}),
+    ?TEST([[<<118,204,128,118>>|<<204,128,118,204,128,118,204,128,206,132,204,129,206,132,204,129>>]],
+          [[[118,768]], true, trailing], {[118,768,118,768,118,768,118,768], [900,769,900,769]}),
+    %% Test that it behaves with graphemes (i.e. nfd tests are the hard part)
+    ?TEST(["aaee",778,"äöoo"], [[[$e,778]], true, trailing], {"aae"++[$e,778], "äöoo"}),
+    ?TEST([<<"aae">>,778,"äöoo"], [[[$e,778]],true,trailing], {"aa"++[$e,778], "äöoo"}),
+    ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>], [[[$e,778]], true, trailing], {[$e,778]++"åäöe"++[778], []}),
+    ?TEST([<<"e">>,778,"åäöe", <<778/utf8>>, $e, 779], [[[$e,778]], true, trailing],
+          {[$e,778]++"åäöe"++[778], [$e,779]}),
+
+    ok.
+
+
+uppercase(_) ->
+    ?TEST("", [], ""),
+    ?TEST("123", [], "123"),
+    ?TEST("abc", [], "ABC"),
+    ?TEST("ABC", [], "ABC"),
+    ?TEST("abcdefghiljklmnopqrstvxyzåäö",[], "ABCDEFGHILJKLMNOPQRSTVXYZÅÄÖ"),
+    ?TEST("åäö", [], "ÅÄÖ"),
+    ?TEST("ÅÄÖ", [], "ÅÄÖ"),
+    ?TEST("Michał", [], "MICHAŁ"),
+    ?TEST(["Mic",<<"hał"/utf8>>], [], "MICHAŁ"),
+    ?TEST("ǉǇ", [], "ǇǇ"),
+    ?TEST("Ǉǉ", [], "ǇǇ"),
+    ?TEST("ß sharp s", [], "SS SHARP S"),
+    ok.
+
+lowercase(_) ->
+    ?TEST("", [], ""),
+    ?TEST("123", [], "123"),
+    ?TEST("abc", [], "abc"),
+    ?TEST("ABC", [], "abc"),
+    ?TEST("åäö", [], "åäö"),
+    ?TEST("ÅÄÖ", [], "åäö"),
+    ?TEST("MICHAŁ", [], "michał"),
+    ?TEST(["Mic",<<"HAŁ"/utf8>>], [], "michał"),
+    ?TEST("ß SHARP S", [], "ß sharp s"),
+    ?TEST("İ I WITH DOT ABOVE", [], "i̇ i with dot above"),
+    ok.
+
+titlecase(_) ->
+    ?TEST("", [], ""),
+    ?TEST("123", [], "123"),
+    %% Titlecase is the same as uppercase for most chars
+    [?TEST([C,$x], [], string:uppercase([C])++[$x]) ||
+        C <-"abcdefghiljklmnopqrstvxyzåäö"],
+    %% Example of a different mapping
+    ?TEST("ǉusad", [],"ǈusad"),
+    ?TEST("ǉǇ", [], "ǈǇ"),
+    ?TEST("Ǉǉ", [], "ǈǉ"),
+    ?TEST("ß sharp s", [], "Ss sharp s"),
+    ok.
+
+casefold(_) ->
+    ?TEST("", [], ""),
+    ?TEST("123", [], "123"),
+    ?TEST("abc", [], "abc"),
+    ?TEST("ABC", [], "abc"),
+    ?TEST("åäö", [], "åäö"),
+    ?TEST("ÅÄÖ", [], "åäö"),
+    ?TEST("MICHAŁ", [], "michał"),
+    ?TEST(["Mic",<<"HAŁ"/utf8>>], [], "michał"),
+    ?TEST("ß SHARP S", [], "ss sharp s"),
+    ?TEST("ẞ SHARP S", [], "ss sharp s"),
+    ?TEST("İ I WITH DOT ABOVE", [], "i̇ i with dot above"),
+    ok.
+
+prefix(_) ->
+    ?TEST("", ["a"], nomatch),
+    ?TEST("a", [""], "a"),
+    ?TEST("b", ["a"], nomatch),
+    ?TEST("a", ["a"], ""),
+    ?TEST("å", ["a"], nomatch),
+    ?TEST(["a",<<778/utf8>>], ["a"], nomatch),
+    ?TEST([<<"a"/utf8>>,778], ["a"], nomatch),
+    ?TEST("hejsan", [""], "hejsan"),
+    ?TEST("hejsan", ["hej"], "san"),
+    ?TEST("hejsan", ["hes"], nomatch),
+    ?TEST(["h", "ejsan"], ["hej"], "san"),
+    ?TEST(["h", "e", "jsan"], ["hej"], "san"),
+    ?TEST(["h", "e", "san"], ["hej"], nomatch),
+    ?TEST(["h", <<"ejsan">>], ["hej"], "san"),
+    ?TEST(["h", <<"e">>, "jsan"], ["hej"], "san"),
+    ?TEST(["h", "e", <<"jsan">>], ["hej"], "san"),
+    ok.
+
+split(_) ->
+    Mod = fun(Res) ->
+                  [lists:flatten(unicode:characters_to_nfc_list(io_lib:format("~ts", [Str])))
+                   || Str <- Res] end,
+    ?TEST("..", ["", leading], {Mod, [".."]}),
+    ?TEST("..", ["..", leading], {Mod, [[],[]]}),
+    ?TEST("abcd", ["..", leading], {Mod, ["abcd"]}),
+    ?TEST("ab..bc", ["..", leading], {Mod, ["ab","bc"]}),
+    ?TEST("ab..bc..cd", ["..", leading], {Mod, ["ab","bc..cd"]}),
+    ?TEST("..ab", [".."], {Mod, [[],"ab"]}),
+    ?TEST("ab..", ["..", leading], {Mod, ["ab",[]]}),
+    ?TEST(["ab..bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}),
+    ?TEST(["ab","..bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}),
+    ?TEST(["ab",<<"..bc..cd">>], ["..", leading], {Mod, ["ab","bc..cd"]}),
+    ?TEST(["ab.",".bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}),
+    ?TEST(["ab.",<<".bc..cd">>], ["..", leading], {Mod, ["ab","bc..cd"]}),
+    ?TEST(["ab..","bc..cd"], ["..", leading], {Mod, ["ab","bc..cd"]}),
+    ?TEST(["ab..",<<"bc..cd">>], ["..", leading], {Mod, ["ab","bc..cd"]}),
+    ?TEST(["ab.","bc..cd"], ["..", leading], {Mod, ["ab.bc","cd"]}),
+    ?TEST("ab...bc", ["..", leading], {Mod, ["ab",".bc"]}),
+
+    ?TEST("..", ["", trailing], {Mod, [".."]}),
+    ?TEST("..", ["..", trailing], {Mod, [[],[]]}),
+    ?TEST("abcd", ["..", trailing], {Mod, ["abcd"]}),
+    ?TEST("ab..bc", ["..", trailing], {Mod, ["ab","bc"]}),
+    ?TEST("ab..bc..cd", ["..", trailing], {Mod, ["ab..bc","cd"]}),
+    ?TEST("..ab", ["..", trailing], {Mod, [[],"ab"]}),
+    ?TEST("ab..", ["..", trailing], {Mod, ["ab",[]]}),
+    ?TEST(["ab..bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+    ?TEST(["ab","..bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+    ?TEST(["ab"|<<"a">>], ["a", trailing], {Mod, ["ab",[]]}),
+    ?TEST(["ab",<<"..bc..cd">>], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+    ?TEST([<<"ab.">>,".bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+    ?TEST(["ab.",<<".bc..cd">>], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+    ?TEST(["ab..","bc..cd"], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+    ?TEST(["ab..",<<"bc..cd">>], ["..", trailing], {Mod, ["ab..bc","cd"]}),
+    ?TEST(["ab.","bc..cd"], ["..", trailing], {Mod, ["ab.bc","cd"]}),
+    ?TEST("ab...bc", ["..", trailing], {Mod, ["ab.","bc"]}),
+
+    ?TEST("..", ["..", all], {Mod, [[],[]]}),
+    ?TEST("abcd", ["..", all], {Mod, ["abcd"]}),
+    ?TEST("a..b", ["..", all], {Mod, ["a","b"]}),
+    ?TEST("a..b..c", ["..", all], {Mod, ["a","b","c"]}),
+    ?TEST("a..", ["..", all], {Mod, ["a",[]]}),
+    ?TEST(["a..b..c"], ["..", all], {Mod, ["a","b","c"]}),
+    ?TEST(["a","..b..c"], ["..", all], {Mod, ["a","b","c"]}),
+    ?TEST(["a",<<"..b..c">>], ["..", all], {Mod, ["a","b","c"]}),
+    ?TEST(["a.",".b..c"], ["..", all], {Mod, ["a","b","c"]}),
+    ?TEST(["a.",<<".b..c">>], ["..", all], {Mod, ["a","b","c"]}),
+    ?TEST(["a..","b..c"], ["..", all], {Mod, ["a","b","c"]}),
+    ?TEST(["a..",<<"b..c">>], ["..", all], {Mod, ["a","b","c"]}),
+    ?TEST(["a.","b..c"], ["..", all], {Mod, ["a.b","c"]}),
+    ?TEST("a...b", ["..", all], {Mod, ["a",".b"]}),
+
+    %% Grapheme (split) tests
+    ?TEST("aΩΩb", ["Ω", all], {Mod, ["a","","b"]}),
+    ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], leading], {Mod, ["aa","äöoo"]}),
+    ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], trailing], {Mod, ["aa","äöoo"]}),
+    ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], all], {Mod, ["aa","äöoo"]}),
+    ?TEST([<<"aae">>,778,"öeeåäö"], ["e", leading], {Mod, [[$a, $a, $e,778,$ö],"eåäö"]}),
+    ?TEST([<<"aae">>,778,"öeeåäö"], ["e", trailing], {Mod, [[$a, $a, $e,778,$ö, $e],"åäö"]}),
+    ?TEST([<<"aae">>,778,"öeeåäö"], ["e", all], {Mod, [[$a, $a, $e,778,$ö],"", "åäö"]}),
+
+    ok.
+
+replace(_) ->
+    ?TEST(["a..b.", [".c"]], ["xxx", "::"], "a..b..c"),
+    ?TEST(["a..b.", [".c"]], ["..", "::"], "a::b..c"),
+    ?TEST([<<"a..b.">>, [".c"]], ["..", "::", trailing], "a..b::c"),
+    ?TEST(["a..b.", [".c"]], ["..", "::", all], "a::b::c"),
+    ok.
+
+cd_gc(_) -> %% Test case for string:next_codepoint/1 and string:next_grapheme/1 on lists and binaries.
+    [] = string:next_codepoint(""),
+    [] = string:next_codepoint(<<>>),
+    [] = string:next_codepoint([<<>>]),
+    "abcd" = string:next_codepoint("abcd"),
+    [$e,778] = string:next_codepoint([$e,778]),
+    [$e|<<204,138>>] = string:next_codepoint(<<$e,778/utf8>>), %% tail stays a binary: 204,138 is 778 in UTF-8
+    [778|_] = string:next_codepoint(tl(string:next_codepoint(<<$e,778/utf8>>))),
+
+    [] = string:next_grapheme(""),
+    [] = string:next_grapheme(<<>>),
+    [] = string:next_grapheme([<<>>]),
+    "abcd" = string:next_grapheme("abcd"),
+    [[$e,778]] = string:next_grapheme([$e,778]), %% base char plus combining mark come back as one nested cluster
+    [[$e,778]] = string:next_grapheme(<<$e,778/utf8>>),
+
+    ok.
+
+
+find(_) ->
+    ?TEST(["h", "ejsan"], [""], "hejsan"),
+    ?TEST(["h", "ejsan"], [<<>>], "hejsan"),
+    ?TEST([], [""], ""),
+    ?TEST([], ["hej"], nomatch),
+    ?TEST(["h", "ejsan"], ["hej"], "hejsan"),
+    ?TEST(["h", "e", "jsan"], ["hej"], "hejsan"),
+    ?TEST(["xh", "e", "san"], ["hej"], nomatch),
+    ?TEST([<<"xh">>, <<"ejsan">>], ["hej"], "hejsan"),
+    ?TEST(["xh", <<"ejsan">>], ["hej"], "hejsan"),
+    ?TEST(["xh", <<"e">>, "jsan"], ["hej"], "hejsan"),
+    ?TEST(["xh", "e", <<"jsan">>], ["hej"], "hejsan"),
+    ?TEST(["xh", "er", <<"ljsane">>, "rlang"], ["erl", leading], "erljsanerlang"),
+    ?TEST("aΩΩb", ["Ω", leading], "ΩΩb"),
+    ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], leading], [$e,778]++"äöoo"),
+    ?TEST([<<"aae">>,778,"öeeåäö"], ["e", leading], "eeåäö"),
+
+    ?TEST(["h", "ejsan"], ["", trailing], "hejsan"),
+    ?TEST([], ["", trailing], ""),
+    ?TEST([], ["hej", trailing], nomatch),
+    ?TEST(["h", "ejsan"], ["hej", trailing], "hejsan"),
+    ?TEST(["h", "e", "jsan"], ["hej", trailing], "hejsan"),
+    ?TEST(["xh", "e", "san"], ["hej", trailing], nomatch),
+    ?TEST([<<"xh">>, <<"ejsan">>], ["hej", trailing], "hejsan"),
+    ?TEST(["xh", <<"ejsan">>], ["hej", trailing], "hejsan"),
+    ?TEST(["xh", <<"e">>, "jsan"], ["hej", trailing], "hejsan"),
+    ?TEST(["xh", "e", <<"jsan">>], ["hej", trailing], "hejsan"),
+    ?TEST(["xh", "er", <<"ljsane">>, "rlang"], ["erl", trailing], "erlang"),
+    ?TEST("aΩΩb", ["Ω", trailing], "Ωb"),
+    ?TEST([<<"aae">>,778,"äöoo"], [[$e,778], trailing], [$e,778]++"äöoo"),
+    ?TEST([<<"aeae">>,778,"äö"], ["e", trailing], "eae"++[778,$ä,$ö]),
+
+    ok.
+
+lexemes(_) ->
+    Mod = fun(Res) ->
+                  [unicode:characters_to_nfc_list(io_lib:format("~ts", [Str]))|| Str <- Res]
+          end,
+    Res = ["Hej", "san", "Hopp", "san"],
+    ?TEST("", [" ,."],  {Mod, []}),
+    ?TEST("Hej san", [""],  {Mod, ["Hej san"]}),
+    ?TEST("  ,., ", [" ,."],  {Mod, []}),
+    ?TEST( "Hej san Hopp san", [" ,."], {Mod, Res}),
+    ?TEST(" Hej san Hopp san ", [" ,."], {Mod, Res}),
+    ?TEST(" Hej san, .Hopp san ", [" ,."], {Mod, Res}),
+
+    ?TEST([" Hej san",", .Hopp san "], [" ,."], {Mod, Res}),
+    ?TEST([" Hej sa","n, .Hopp san "], [" ,."], {Mod, Res}),
+    ?TEST([" Hej san,"," .Hopp san "], [" ,."], {Mod, Res}),
+
+    ?TEST([" Hej san",[", .Hopp san "]], [" ,."], {Mod, Res}),
+    ?TEST([" Hej sa",["n, .Hopp san "]], [" ,."], {Mod, Res}),
+    ?TEST([" Hej san,",[" .Hopp san "]], [" ,."], {Mod, Res}),
+
+    ?TEST([" H",<<"ej san, .Hopp "/utf8>>, "san"], [" ,."], {Mod, Res}),
+    ?TEST([" Hej san",<<", .Hopp "/utf8>>, "san"], [" ,."], {Mod, Res}),
+    ?TEST([" Hej sa",<<"n, .Hopp"/utf8>>, " san"], [" ,."], {Mod, Res}),
+    ?TEST([" Hej san,",<<" .Hopp s"/utf8>>, "an"], [" ,."], {Mod, Res}),
+    ?TEST([" Hej san",[<<", .Hopp san "/utf8>>]], [" ,."], {Mod, Res}),
+    ?TEST([" Hej sa",[<<"n, .Hopp san "/utf8>>]], [" ,."], {Mod, Res}),
+    ?TEST([" Hej san,",[<<" .Hopp san "/utf8>>], <<"  ">>], [" ,."], {Mod, Res}),
+
+    ?TEST(" Hej\r\nsan\nnl", ["\r\n\s"], {Mod, ["Hej\r\nsan", "nl"]}),
+
+    ?TEST(["b1ec1e",778,"äöo21"], ["eo"], {Mod, ["b1",[$c,$1,$e,778,$ä,$ö],"21"]}),
+    ?TEST([<<"b1ec1e">>,778,"äöo21"], ["eo"], {Mod, ["b1",[$c,$1,$e,778,$ä,$ö],"21"]}),
+    %% Grapheme (split) tests
+    Str10 = [[[<<"Ã·"/utf8>>,1101],<<"Ã«"/utf8>>|<<"\"">>]],
+    ?TEST(Str10, [[1076]], {Mod, [unicode:characters_to_nfc_list(Str10)]}),
+    ?TEST("a1Ωb1Ωc1", ["Ω"], {Mod, ["a1","b1","c1"]}),
+    ?TEST([<<"aae">>,778,"äöoo"], [[[$e,778]]], {Mod, ["aa","äöoo"]}),
+    ?TEST([<<"aae">>,778,"äöo21"], [[[$e,778],$o]], {Mod, ["aa","äö","21"]}),
+    ?TEST([<<"aae">>,778,"öeeåäö"], ["e"], {Mod, [[$a, $a, $e,778,$ö],"åäö"]}),
+    ok.
+
+nth_lexeme(_) ->
+    {'EXIT', _} = (catch string:nth_lexeme("test test", 0, [])),
+    {'EXIT', _} = (catch string:nth_lexeme(<<"test test">>, 0, [])),
+    ?TEST( "", [1, " ,."],  []),
+    ?TEST( "Hej san", [1, ""],  "Hej san"),
+    ?TEST( "  ,., ", [1, " ,."],  []),
+    ?TEST( "  ,., ", [3, " ,."],  []),
+    ?TEST("Hej san Hopp san", [1, " ,."], "Hej"),
+    ?TEST("...Hej san Hopp san", [1, " ,."], "Hej"),
+    ?TEST("Hej san Hopp san", [3, " ,."], "Hopp"),
+    ?TEST(" Hej san Hopp san ", [3, " ,."], "Hopp"),
+    ?TEST(" Hej san, .Hopp san ", [3, " ,."], "Hopp"),
+    ?TEST("ab cd", [3, " "], ""),
+
+    ?TEST([" Hej san",", .Hopp san "], [3, " ,."], "Hopp"),
+    ?TEST([" Hej sa","n, .Hopp san "], [3, " ,."], "Hopp"),
+    ?TEST([" Hej san,"," .Hopp san "], [3, " ,."], "Hopp"),
+
+    ?TEST([" Hej san",[", .Hopp san "]], [3," ,."], "Hopp"),
+    ?TEST([" Hej sa",["n, .Hopp san "]], [3, " ,."], "Hopp"),
+    ?TEST([" Hej san,",[" .Hopp san "]], [3, " ,."], "Hopp"),
+
+    ?TEST([" Hej san",<<", .Hopp "/utf8>>, "san"], [3, " ,."], "Hopp"),
+    ?TEST([" Hej sa",<<"n, .Hopp"/utf8>>, " san"], [3, " ,."], "Hopp"),
+    ?TEST([" Hej san,",<<" .Hopp s"/utf8>>, "an"], [3, " ,."], "Hopp"),
+    ?TEST([" Hej san,",<<" .Hopp s"/utf8>>, "an"], [4, " ,."], "san"),
+    ?TEST([" Hej san",[<<", .Hopp san "/utf8>>]], [3, " ,."], "Hopp"),
+    ?TEST([" Hej sa",[<<"n, .Hopp san "/utf8>>]], [3, " ,."], "Hopp"),
+    ?TEST([" Hej san,",[<<" .Hopp san "/utf8>>], <<"  ">>], [3, " ,."], "Hopp"),
+
+    ?TEST(["b1ec1e",778,"äöo21"], [3,"eo"], "21"),
+    ?TEST([<<"b1ec1e">>,778,"äöo21"], [3, "eo"], "21"),
+    %% Grapheme (split) tests
+    ?TEST("a1Ωb1Ωc1", [1, "Ω"], "a1"),
+    ?TEST([<<"aae">>,778,"äöoo"], [2,[[$e,778]]], "äöoo"),
+    ?TEST([<<"aae">>,778,"äöo21"], [2,[[$e,778],$o]], "äö"),
+    ?TEST([<<"aae">>,778,"öeeåäö"], [2,"e"], "åäö"),
+    ok.
+
+
+meas(Config) -> %% Timing test case: delegates to do_measure/1 unless the timetrap is scaled.
+    case ct:get_timetrap_info() of
+        {_,{_,Scale}} when Scale > 1 ->
+            {skip,{will_not_run_in_debug,Scale}}; %% scaled timetrap: skip the measurement
+        _ -> % No scaling
+            DataDir = proplists:get_value(data_dir, Config),
+            TestDir = filename:dirname(string:trim(DataDir, trailing, "/")), %% parent directory of data_dir
+            do_measure(TestDir)
+    end.
+
+do_measure(TestDir) -> %% Time string:tokens/2 and string:lexemes/2 on this suite's own source file and print the figures.
+    File =  filename:join(TestDir, ?MODULE_STRING ++ ".erl"),
+    io:format("File ~ts ",[File]), %% ~ts, not ~s: a path with code points above 255 would make ~s fail with badarg
+    {ok, Bin} = file:read_file(File),
+    io:format("~p~n",[byte_size(Bin)]),
+    Do = fun(Name, Func, Mode) ->
+                 {N, Mean, Stddev, _} = time_func(Func, Mode, Bin), %% time_func/3 is defined outside this view
+                 io:format("~10w ~6w ~6.2fms ±~4.2fms #~.2w gc included~n",
+                           [Name, Mode, Mean/1000, Stddev/1000, N])
+         end,
+    io:format("----------------------~n"),
+    Do(tokens, fun(Str) -> string:tokens(Str, [$\n,$\r]) end, list), %% old API, list input only
+    Tokens = {lexemes, fun(Str) -> string:lexemes(Str, [$\n,$\r]) end},
+    [Do(Name,Fun,Mode) || {Name,Fun} <- [Tokens], Mode <- [list, binary]], %% new API, list and binary input
+    ok.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% internal functions
+
+test(Line, Func, Str, Args, Res, Norm) -> %% Run string:Func on Str in several representations; each run must yield Res.
+    %%io:format("~p: ~p ~w ~w~n",[Line, Func, Str, Args]),
+    test_1(Line, Func, Str, [Str|norm(none,Args)], Res), %% input exactly as written in the test
+    %%io:format("~p: ~p bin ",[Line, Func]),
+    test_1({Line,list}, Func, Str, %% flat codepoint list
+           [unicode:characters_to_list(Str)|norm(none,Args)], Res),
+    Norm andalso %% NFC list, only when Norm is true
+        test_1({Line,clist}, Func, Str,
+               [unicode:characters_to_nfc_list(Str)|norm(nfc,Args)], Res),
+    Norm andalso %% NFD list, only when Norm is true
+        test_1({Line,dlist}, Func, Str,
+               [unicode:characters_to_nfd_list(Str)|norm(nfd,Args)], Res),
+    test_1({Line,bin}, Func, Str, %% UTF-8 binary
+           [unicode:characters_to_binary(Str)|norm(none, Args)], Res),
+    Norm andalso %% NFC binary, only when Norm is true
+        test_1({Line,cbin}, Func, Str,
+               [unicode:characters_to_nfc_binary(Str)|norm(nfc,Args)], Res),
+    Norm andalso %% NFD binary, only when Norm is true
+        test_1({Line,dbin}, Func, Str,
+               [unicode:characters_to_nfd_binary(Str)|norm(nfd,Args)], Res),
+    %%io:format("~n",[]),
+    ok.
+
+test_1(Line, Func, Str, Args, Exp) -> %% Apply string:Func to Args and compare with Exp; prints and exits with {error, Func} on mismatch.
+    try
+        Res = apply(string, Func, Args),
+        check_types(Line, Func, Args, Res), %% NOTE(review): return value is discarded, so an 'error' result is only printed
+        case res(Res, Exp) of
+            true -> ok;
+            {Res1,Exp1} when is_tuple(Exp1) -> %% tuple expectation: print with ~w only
+                io:format("~p~n",[Args]),
+                io:format("~p:~p: ~ts~w =>~n  :~w:~w~n",
+                          [Func,Line, Str,Str,Res1,Exp1]),
+                exit({error, Func});
+            {Res1,Exp1} ->
+                io:format("~p:~p: ~ts~w =>~n  :~ts~w:~ts~w~n",
+                          [Func,Line, Str,Str, Res1,Res1, Exp1,Exp1]),
+                exit({error, Func})
+        end
+    catch
+        error:Exp -> %% Exp is already bound: an error whose reason equals the expectation counts as a pass
+            ok;
+        error:Reason ->
+            io:format("~p:~p: Crash ~p ~p~n",
+                      [?MODULE,Line, Reason, erlang:get_stacktrace()]),
+            exit({error, Func})
+    end.
+
+%% Replace each {norm, Chars} element of Args by Chars normalized per Type (nfc | nfd | none).
+norm(Type, Args) ->
+    Convert = case Type of
+                  none -> fun(Chars) -> Chars end;
+                  nfd -> fun unicode:characters_to_nfd_list/1;
+                  nfc -> fun unicode:characters_to_nfc_list/1
+              end,
+    Expand = fun({norm, Chars}) -> Convert(Chars); (Arg) -> Arg end,
+    lists:map(Expand, Args).
+
+res(Str, Str) -> true; %% Compare actual with expected: true on match, otherwise a {Got, Expected} tuple. Exact equality first.
+res(Str, Exp) when is_list(Str), is_list(Exp) -> %% list result: compare its NFC codepoint list with Exp
+    A = unicode:characters_to_nfc_list(Str),
+    A==Exp orelse {A,Exp};
+res(Str, Exp) when is_binary(Str), is_list(Exp) -> %% binary result: same comparison after conversion
+    A = unicode:characters_to_nfc_list(Str),
+    A==Exp orelse {A,Exp};
+res(What, {Fun, Exp}) when is_function(Fun) -> %% expectation carries a fun that post-processes the result
+    Fun(What) == Exp orelse {Fun(What), Exp};
+res({S1,S2}=S, {Exp1,Exp2}=E) -> %% For take
+    case {res(S1,Exp1), res(S2,Exp2)} of
+        {true, true} -> true;
+        _ -> {S, E} %% report the original pair, not the per-element results
+    end;
+res(Int, Exp) -> %% anything else (integers, booleans, nomatch, ...)
+    Int == Exp orelse {Int, Exp}.
+
+
+check_types(_Line, _Func, _Str, Res) %% Check that the type of the result is compatible with the type of the input string.
+  when is_integer(Res); is_boolean(Res); Res =:= nomatch ->
+    %% scalar results (e.g. from length or equal, or a nomatch): nothing to check
+    ok;
+check_types(Line, Func, [S1,S2], Res)
+  when Func =:= concat ->
+    case check_types_1(type(S1),type(S2)) of
+        ok -> %% only when both inputs have compatible types is the result type checked
+            case check_types_1(type(S1),type(Res)) of
+                ok -> ok;
+                {T1,T2} ->
+                    io:format("Failed: ~p ~p ~p ~p~n",[Line, Func, T1, T2]),
+                    io:format("  ~p ~p  => ~p~n", [S1, S2, Res]),
+                    error
+            end;
+        _ -> ok
+    end;
+check_types(Line, Func, [Str|_], Res)  -> %% general case: the first argument is the subject string
+    AddList = fun(mixed) -> mixed; %% adjusts the input type before it is compared with the result type
+                 ({list,{list,_}}) -> {list, deep};
+                 (R) ->
+                      case lists:member(Func, [lexemes, tokens, split]) of
+                          true -> {list, R}; %% for these functions the result is compared against {list, InputType}
+                          false -> R
+                      end
+              end,
+    try needs_check(Func) andalso (ok = check_types_1(AddList(type(Str)), type(Res))) of %% needs_check/1 is defined outside this view
+        ok -> ok;
+        false -> ok
+    catch _:{badmatch, {T1,T2}} -> %% check_types_1/2 returned a {T1,T2} mismatch
+            io:format("Failed: ~p ~p: ~p ~p~n",[Line, Func, T1, T2]),
+            io:format("  ~p  => ~p~n", [Str, Res]),
+            error;
+          _:Reason ->
+            io:format("Crash: ~p in~n ~p~n",[Reason, erlang:get_stacktrace()]),
+            io:format("Failed: ~p ~p: ~p => ~p~n", [Line, Func, Str, Res]),
+            exit({Reason, erlang:get_stacktrace()})
+    end.
+
+%% Decide whether a result type is acceptable for a given input type.
+%%
+%% Both arguments are type descriptors.  Returns ok for an accepted
+%% combination, otherwise the offending pair as a two-tuple of
+%% {InputType, ResultType}.
+check_types_1(Expected, Actual) ->
+    case {Expected, Actual} of
+        {Same, Same} -> ok;
+        {In, Out} when is_binary(In), is_binary(Out) -> ok;
+        {{list, _}, {list, undefined}} -> ok;
+        {{list, _}, {list, codepoints}} -> ok;
+        {{list, _}, {list, {list, codepoints}}} -> ok;
+        {{list, {list, _}}, {list, {list, codepoints}}} -> ok;
+        {mixed, _} -> ok;
+        {{list, binary}, binary} -> ok;
+        {{list, binary}, {other, _, _}} -> ok; %% take
+        {{list, deep}, _} -> ok;
+        {{list, {list, deep}}, _} -> ok;
+        Mismatch -> Mismatch
+    end.
+
+%% Classify a term into a type descriptor used by the type checks.
+%%
+%%   binary                     a binary
+%%   {list, undefined}          the empty list
+%%   {list, codepoints}         a list of integers only
+%%   {list, {list, codepoints}} a binary-free list whose sublists contain
+%%                              neither lists nor binaries
+%%   {list, deep}               a binary-free list where some sublist holds
+%%                              a list or a binary
+%%   {list, binary}             a list of binaries only
+%%   mixed                      a list mixing binaries and other elements
+%%   {other, T1, T2}            a pair whose element types differ (a pair of
+%%                              equal types, or of an empty list and a
+%%                              codepoint list, collapses to one type)
+%%   {other, Term}              anything else
+type(Bin) when is_binary(Bin) ->
+    binary;
+type([]) ->
+    {list, undefined};
+type(List) when is_list(List) ->
+    case all(fun(E) -> not is_binary(E) end, List) of
+        false ->
+            case all(fun(E) -> is_binary(E) end, List) of
+                true -> {list, binary};
+                false -> mixed
+            end;
+        true ->
+            case all(fun(E) -> is_integer(E) end, List) of
+                true ->
+                    {list, codepoints};
+                false ->
+                    %% A sublist counts as deep when it contains a further
+                    %% list or a binary.
+                    HasNested =
+                        fun(Sub) when is_list(Sub) ->
+                                lists:any(fun(E) ->
+                                                  is_list(E) orelse is_binary(E)
+                                          end, Sub);
+                           (_) ->
+                                false
+                        end,
+                    case lists:filter(HasNested, List) of
+                        [] -> {list, {list, codepoints}};
+                        [_|_] -> {list, deep}
+                    end
+            end
+    end;
+type({R1,R2}) ->
+    Left = type(R1),
+    Right = type(R2),
+    Empty = {list,undefined},
+    Codepoints = {list,codepoints},
+    case {Left, Right} of
+        {Same, Same} -> Same;
+        {Empty, Codepoints} -> Codepoints;
+        {Codepoints, Empty} -> Codepoints;
+        _ -> {other, Left, Right}
+    end;
+type(Other) ->
+    {other, Other}.
+
+%% Like lists:all/2, but also accepts an improper list that ends in a
+%% binary: the binary tail is passed to Check as one final element.
+all(Check, [Head|Tail]) ->
+    case Check(Head) of
+        true -> all(Check, Tail);
+        false -> false
+    end;
+all(_Check, []) ->
+    true;
+all(Check, Bin) when is_binary(Bin) ->
+    Check(Bin).
+
+%% Return false for the functions exempt from result type checking
+%% (reverse, pad and replace) and true for everything else.
+needs_check(Func) ->
+    not lists:member(Func, [reverse, pad, replace]).
+
+%%%% Timer stuff
+
+%% Benchmark Fun on Bin, converted to the representation given by Mode.
+%%
+%% The measurement runs in a freshly spawned, linked process, and its
+%% result is sent back to the caller.  Returns whatever time_func/6
+%% returns: {Runs, Mean, Stdev, LastResult}.
+time_func(Fun, Mode, Bin) ->
+    timer:sleep(100), %% Let emulator catch up and clean things before test runs
+    Parent = self(),
+    Worker = fun() ->
+                     Input = mode(Mode, Bin),
+                     Stats = time_func(0,0,0, Fun, Input, undefined),
+                     Parent ! {self(), Stats}
+             end,
+    Pid = spawn_link(Worker),
+    receive
+        {Pid, Stats} -> Stats
+    end.
+
+%% Timing loop: run Fun(Str) until 50 runs have been counted, accumulating
+%% the sum and the sum of squares of the timer:tc/1 timings.
+%%
+%% Returns {Runs, Mean, Stdev, Last}, where Mean and Stdev are rounded to
+%% integers, Stdev is the sample standard deviation (divisor Runs - 1) and
+%% Last is the value returned by the final call to Fun.
+time_func(Runs, Total, TotalSq, Fun, Str, _Prev) when Runs < 50 ->
+    {Elapsed, Res} = timer:tc(fun() -> Fun(Str) end),
+    time_func(Runs + 1, Total + Elapsed, TotalSq + Elapsed * Elapsed,
+              Fun, Str, Res);
+time_func(Runs, Total, TotalSq, _Fun, _Str, Last) ->
+    Mean = round(Total / Runs),
+    Variance = (TotalSq - (Total * Total / Runs)) / (Runs - 1),
+    Stdev = round(math:sqrt(Variance)),
+    {Runs, Mean, Stdev, Last}.
+
+%% Produce the benchmark input in the requested representation: list
+%% converts the data with unicode:characters_to_list/1, binary returns it
+%% unchanged.
+mode(list, Data) ->
+    unicode:characters_to_list(Data);
+mode(binary, Data) ->
+    Data.
+
 %%
-%% Test cases starts here.
+%% Test cases for the old list-based string functions start here.
 %%
 
 len(Config) when is_list(Config) ->
@@ -80,16 +932,14 @@ len(Config) when is_list(Config) ->
     {'EXIT',_} = (catch string:len({})),
     ok.
 
-equal(Config) when is_list(Config) ->
+old_equal(Config) when is_list(Config) ->
     true = string:equal("", ""),
     false = string:equal("", " "),
     true = string:equal("laban", "laban"),
     false = string:equal("skvimp", "skvump"),
-    %% invalid arg type
-    true = string:equal(2, 2),			% not good, should crash
     ok.
 
-concat(Config) when is_list(Config) ->
+old_concat(Config) when is_list(Config) ->
     "erlang rules" = string:concat("erlang ", "rules"),
     "" = string:concat("", ""),
     "x" = string:concat("x", ""),
@@ -130,6 +980,7 @@ str_rstr(Config) when is_list(Config) ->
     3 = string:rstr("xxxx", "xx"),
     3 = string:str("xy z yx", " z"),
     3 = string:rstr("xy z yx", " z"),
+    3 = string:str("aaab", "ab"),
     %% invalid arg type
     {'EXIT',_} = (catch string:str(hello, "he")),
     %% invalid arg type
@@ -184,7 +1035,7 @@ substr(Config) when is_list(Config) ->
     {'EXIT',_} = (catch string:substr("1234", "1")),
     ok.
 
-tokens(Config) when is_list(Config) ->
+old_tokens(Config) when is_list(Config) ->
     [] = string:tokens("",""),
     [] = string:tokens("abc","abc"),
     ["abc"] = string:tokens("abc", ""),
@@ -221,7 +1072,7 @@ replace_sep(C, Seps, New) ->
 chars(Config) when is_list(Config) ->
     [] = string:chars($., 0),
     [] = string:chars($., 0, []),
-    10 = length(string:chars(32, 10, [])),
+    10 = erlang:length(string:chars(32, 10, [])),
     "aaargh" = string:chars($a, 3, "rgh"),
     %% invalid arg type
     {'EXIT',_} = (catch string:chars($x, [])),
@@ -231,7 +1082,7 @@ copies(Config) when is_list(Config) ->
     "" = string:copies("", 10),
     "" = string:copies(".", 0),
     "." = string:copies(".", 1),
-    30 = length(string:copies("123", 10)),
+    30 = erlang:length(string:copies("123", 10)),
     %% invalid arg type
     {'EXIT',_} = (catch string:copies("hej", -1)),
     {'EXIT',_} = (catch string:copies("hej", 2.0)),
@@ -360,7 +1211,7 @@ to_integer(Config) when is_list(Config) ->
     ok.
 
 test_to_integer(Str) ->
-    io:format("Checking ~p~n", [Str]),
+    %% io:format("Checking ~p~n", [Str]),
     case string:to_integer(Str) of
 	{error,_Reason} = Bad ->
 	    {'EXIT',_} = (catch list_to_integer(Str)),
@@ -403,7 +1254,7 @@ to_float(Config) when is_list(Config) ->
     ok.
 
 test_to_float(Str) ->
-    io:format("Checking ~p~n", [Str]),
+    %% io:format("Checking ~p~n", [Str]),
     case string:to_float(Str) of
 	{error,_Reason} = Bad ->
 	    {'EXIT',_} = (catch list_to_float(Str)),
@@ -419,7 +1270,7 @@ to_upper_to_lower(Config) when is_list(Config) ->
     All = lists:seq(0, 255),
 
     UC = string:to_upper(All),
-    256 = length(UC),
+    256 = erlang:length(UC),
     all_upper_latin1(UC, 0),
 
     LC = string:to_lower(All),
@@ -450,7 +1301,7 @@ all_lower_latin1([C|T], C) when 0 =< C, C < $A;
 all_lower_latin1([H|T], C) when $A =< C, C =< $Z;
 				16#C0 =< C, C =< 16#F6;
 				16#C8 =< C, C =< 16#DE ->
-    io:format("~p\n", [{H,C}]),
+    % io:format("~p\n", [{H,C}]),
     H = C + 32,
     all_lower_latin1(T, C+1);
 all_lower_latin1([], 256) -> ok.
author	Dan Gudmundsson <[email protected]>	2017-04-03 12:19:21 +0200
committer	Dan Gudmundsson <[email protected]>	2017-04-24 12:16:56 +0200
commit	2c72e662bad11a41839780f86680d4bb05367c78 (patch)
tree	01e9ae9b32fdb953392e571a0773fb2cd059c498 /lib
parent	75fc94b8b462d7b7f6dd4b706bbe32cff77ee575 (diff)
download	otp-2c72e662bad11a41839780f86680d4bb05367c78.tar.gz otp-2c72e662bad11a41839780f86680d4bb05367c78.tar.bz2 otp-2c72e662bad11a41839780f86680d4bb05367c78.zip