Add nf(k)d, nf(k)c conversion functions to unicode module

author: Dan Gudmundsson <[email protected]> 2017-01-27 15:27:37 +0100
committer: Dan Gudmundsson <[email protected]> 2017-04-24 12:16:56 +0200
commit: 75fc94b8b462d7b7f6dd4b706bbe32cff77ee575 (patch)
tree: da96f42d849544e674cb40c9376f2ccf57a14385
parent: e6a31e66961314a5b74cdb5ffcdab89092c31bc7 (diff)
download: otp-75fc94b8b462d7b7f6dd4b706bbe32cff77ee575.tar.gz
otp-75fc94b8b462d7b7f6dd4b706bbe32cff77ee575.tar.bz2
otp-75fc94b8b462d7b7f6dd4b706bbe32cff77ee575.zip
3 files changed, 351 insertions, 4 deletions
diff --git a/lib/stdlib/doc/src/unicode.xml b/lib/stdlib/doc/src/unicode.xml
index 93d0d37456..382b253ba1 100644
--- a/lib/stdlib/doc/src/unicode.xml
+++ b/lib/stdlib/doc/src/unicode.xml
@@ -50,8 +50,35 @@
     external entities where this is required. When working inside the
     Erlang/OTP environment, it is recommended to keep binaries in UTF-8 when
     representing Unicode characters. ISO Latin-1 encoding is supported both
-    for backward compatibility and for communication 
-    with external entities not supporting Unicode character sets.</p>
+    for backward compatibility and for communication
+  with external entities not supporting Unicode character sets.</p>
+  <p>Programs should always operate on a normalized form and compare
+  canonical-equivalent Unicode characters as equal. All characters
+  should thus be normalized to one form once on the system borders.
+  One of the following functions can convert characters to their
+  normalized forms <seealso marker="#characters_to_nfc_list/1">
+  <c>characters_to_nfc_list/1</c></seealso>,
+  <seealso marker="#characters_to_nfc_binary/1">
+    <c>characters_to_nfc_binary/1</c></seealso>,
+    <seealso marker="#characters_to_nfd_list/1">
+    <c>characters_to_nfd_list/1</c></seealso> or
+    <seealso marker="#characters_to_nfd_binary/1">
+      <c>characters_to_nfd_binary/1</c></seealso>.
+  For general text
+  <seealso marker="#characters_to_nfc_list/1">
+    <c>characters_to_nfc_list/1</c></seealso> or
+    <seealso marker="#characters_to_nfc_binary/1">
+      <c>characters_to_nfc_binary/1</c></seealso> is preferred, and
+      for identifiers one of the compatibility normalization
+      functions, such as
+      <seealso marker="#characters_to_nfkc_list/1">
+      <c>characters_to_nfkc_list/1</c></seealso>,
+      is preferred for security reasons.
+      The normalization functions were introduced in OTP 20.
+      Additional information on normalization can be found in the
+      <url href="http://unicode.org/faq/normalization.html">Unicode FAQ</url>.
+  </p>
+
   </description>
 
   <datatypes>
@@ -335,6 +362,154 @@ decode_data(Data) ->
     </func>
 
     <func>
+      <name name="characters_to_nfc_list" arity="1"/>
+      <fsummary>Normalize characters to a list of canonical equivalent
+      composed Unicode characters.</fsummary>
+      <desc>
+        <p>Converts a possibly deep list of characters and binaries
+        into a Normalized Form of canonical equivalent Composed
+        characters according to the Unicode standard.</p>
+	<p>Any binaries in the input must be encoded with utf8
+        encoding.
+	</p>
+	<p>The result is a list of characters.</p>
+        <code>
+3> unicode:characters_to_nfc_list([&lt;&lt;"abc..a">>,[778],$a,[776],$o,[776]]).
+"abc..åäö"
+</code>
+      </desc>
+    </func>
+
+    <func>
+      <name name="characters_to_nfc_binary" arity="1"/>
+      <fsummary>Normalize characters to a utf8 binary of canonical equivalent
+      composed Unicode characters.</fsummary>
+      <desc>
+        <p>Converts a possibly deep list of characters and binaries
+        into a Normalized Form of canonical equivalent Composed
+        characters according to the Unicode standard.</p>
+	<p>Any binaries in the input must be encoded with utf8
+        encoding.</p>
+	<p>The result is an utf8 encoded binary.</p>
+        <code>
+4> unicode:characters_to_nfc_binary([&lt;&lt;"abc..a">>,[778],$a,[776],$o,[776]]).
+&lt;&lt;"abc..åäö"/utf8>>
+</code>
+      </desc>
+    </func>
+
+    <func>
+      <name name="characters_to_nfd_list" arity="1"/>
+      <fsummary>Normalize characters to a list of canonical equivalent
+      decomposed Unicode characters.</fsummary>
+      <desc>
+        <p>Converts a possibly deep list of characters and binaries
+        into a Normalized Form of canonical equivalent Decomposed
+        characters according to the Unicode standard.</p>
+	<p>Any binaries in the input must be encoded with utf8
+        encoding.
+	</p>
+	<p>The result is a list of characters.</p>
+        <code>
+1> unicode:characters_to_nfd_list("abc..åäö").
+[97,98,99,46,46,97,778,97,776,111,776]
+</code>
+      </desc>
+    </func>
+
+    <func>
+      <name name="characters_to_nfd_binary" arity="1"/>
+      <fsummary>Normalize characters to a utf8 binary of canonical equivalent
+      decomposed Unicode characters.</fsummary>
+      <desc>
+        <p>Converts a possibly deep list of characters and binaries
+        into a Normalized Form of canonical equivalent Decomposed
+        characters according to the Unicode standard.</p>
+	<p>Any binaries in the input must be encoded with utf8
+        encoding.</p>
+	<p>The result is an utf8 encoded binary.</p>
+        <code>
+2> unicode:characters_to_nfd_binary("abc..åäö").
+&lt;&lt;97,98,99,46,46,97,204,138,97,204,136,111,204,136>>
+</code>
+      </desc>
+    </func>
+
+    <func>
+      <name name="characters_to_nfkc_list" arity="1"/>
+      <fsummary>Normalize characters to a list of compatibly equivalent
+      composed Unicode characters.</fsummary>
+      <desc>
+        <p>Converts a possibly deep list of characters and binaries
+        into a Normalized Form of compatibly equivalent Composed
+        characters according to the Unicode standard.</p>
+	<p>Any binaries in the input must be encoded with utf8
+        encoding.
+	</p>
+	<p>The result is a list of characters.</p>
+        <code>
+3> unicode:characters_to_nfkc_list([&lt;&lt;"abc..a">>,[778],$a,[776],$o,[776],[65299,65298]]).
+"abc..åäö32"
+</code>
+      </desc>
+    </func>
+
+    <func>
+      <name name="characters_to_nfkc_binary" arity="1"/>
+      <fsummary>Normalize characters to a utf8 binary of compatibly equivalent
+      composed Unicode characters.</fsummary>
+      <desc>
+        <p>Converts a possibly deep list of characters and binaries
+        into a Normalized Form of compatibly equivalent Composed
+        characters according to the Unicode standard.</p>
+	<p>Any binaries in the input must be encoded with utf8
+        encoding.</p>
+	<p>The result is an utf8 encoded binary.</p>
+        <code>
+4> unicode:characters_to_nfkc_binary([&lt;&lt;"abc..a">>,[778],$a,[776],$o,[776],[65299,65298]]).
+&lt;&lt;"abc..åäö32"/utf8>>
+</code>
+      </desc>
+    </func>
+
+    <func>
+      <name name="characters_to_nfkd_list" arity="1"/>
+      <fsummary>Normalize characters to a list of compatibly equivalent
+      decomposed Unicode characters.</fsummary>
+      <desc>
+        <p>Converts a possibly deep list of characters and binaries
+        into a Normalized Form of compatibly equivalent Decomposed
+        characters according to the Unicode standard.</p>
+	<p>Any binaries in the input must be encoded with utf8
+        encoding.
+	</p>
+	<p>The result is a list of characters.</p>
+        <code>
+1> unicode:characters_to_nfkd_list(["abc..åäö",[65299,65298]]).
+[97,98,99,46,46,97,778,97,776,111,776,51,50]
+</code>
+      </desc>
+    </func>
+
+    <func>
+      <name name="characters_to_nfkd_binary" arity="1"/>
+      <fsummary>Normalize characters to a utf8 binary of compatibly equivalent
+      decomposed Unicode characters.</fsummary>
+      <desc>
+        <p>Converts a possibly deep list of characters and binaries
+        into a Normalized Form of compatibly equivalent Decomposed
+        characters according to the Unicode standard.</p>
+	<p>Any binaries in the input must be encoded with utf8
+        encoding.</p>
+	<p>The result is an utf8 encoded binary.</p>
+        <code>
+2> unicode:characters_to_nfkd_binary(["abc..åäö",[65299,65298]]).
+&lt;&lt;97,98,99,46,46,97,204,138,97,204,136,111,204,136,51,50>>
+</code>
+      </desc>
+    </func>
+
+    <func>
       <name name="encoding_to_bom" arity="1"/>
       <fsummary>Create a binary UTF byte order mark from encoding.</fsummary>
       <type_desc variable="Bin">
diff --git a/lib/stdlib/src/unicode.erl b/lib/stdlib/src/unicode.erl
index cb2020c21b..59499021cb 100644
--- a/lib/stdlib/src/unicode.erl
+++ b/lib/stdlib/src/unicode.erl
@@ -22,7 +22,12 @@
 -export([characters_to_list/1, characters_to_list_int/2,
 	 characters_to_binary/1, characters_to_binary_int/2,
 	 characters_to_binary/3,
-	 bom_to_encoding/1, encoding_to_bom/1]).
+	 bom_to_encoding/1, encoding_to_bom/1,
+         characters_to_nfd_list/1, characters_to_nfd_binary/1,
+         characters_to_nfc_list/1, characters_to_nfc_binary/1,
+         characters_to_nfkd_list/1, characters_to_nfkd_binary/1,
+         characters_to_nfkc_list/1, characters_to_nfkc_binary/1
+        ]).
 
 -export_type([chardata/0, charlist/0, encoding/0, external_chardata/0,
               external_charlist/0, latin1_char/0, latin1_chardata/0,
@@ -242,6 +247,92 @@ encoding_to_bom({utf32,little}) ->
 encoding_to_bom(latin1) ->
     <<>>.
 
+-define(GC_N, 200). %% arbitrary number
+
+%% NFD: flatten the elements produced by unicode_util:nfd/1 into one list of chars.
+-spec characters_to_nfd_list(chardata()) -> [char()].
+characters_to_nfd_list(Chars) ->
+    case unicode_util:nfd(Chars) of
+        [] -> [];
+        [Head|Rest] when is_list(Head) -> Head ++ characters_to_nfd_list(Rest);
+        [Head|Rest] -> [Head|characters_to_nfd_list(Rest)]
+    end.
+
+-spec characters_to_nfd_binary(chardata()) -> unicode_binary().
+characters_to_nfd_binary(Chars) ->
+    list_to_binary(characters_to_nfd_binary(Chars, ?GC_N, [])).
+%% Left counts the nfd/1 elements still allowed in the current chunk; Acc holds them reversed.
+characters_to_nfd_binary(Chars, Left, Acc) when Left > 0 ->
+    case unicode_util:nfd(Chars) of
+        [] -> [characters_to_binary(lists:reverse(Acc))];
+        [Elem|Rest] -> characters_to_nfd_binary(Rest, Left-1, [Elem|Acc])
+    end;
+characters_to_nfd_binary(Chars, _, Acc) ->
+    [characters_to_binary(lists:reverse(Acc))|characters_to_nfd_binary(Chars, ?GC_N, [])].
+
+%% NFKD: flatten the elements produced by unicode_util:nfkd/1 into one list of chars.
+-spec characters_to_nfkd_list(chardata()) -> [char()].
+characters_to_nfkd_list(Chars) ->
+    case unicode_util:nfkd(Chars) of
+        [] -> [];
+        [Head|Rest] when is_list(Head) -> Head ++ characters_to_nfkd_list(Rest);
+        [Head|Rest] -> [Head|characters_to_nfkd_list(Rest)]
+    end.
+
+-spec characters_to_nfkd_binary(chardata()) -> unicode_binary().
+characters_to_nfkd_binary(Chars) ->
+    list_to_binary(characters_to_nfkd_binary(Chars, ?GC_N, [])).
+%% Left counts the nfkd/1 elements still allowed in the current chunk; Acc holds them reversed.
+characters_to_nfkd_binary(Chars, Left, Acc) when Left > 0 ->
+    case unicode_util:nfkd(Chars) of
+        [] -> [characters_to_binary(lists:reverse(Acc))];
+        [Elem|Rest] -> characters_to_nfkd_binary(Rest, Left-1, [Elem|Acc])
+    end;
+characters_to_nfkd_binary(Chars, _, Acc) ->
+    [characters_to_binary(lists:reverse(Acc))|characters_to_nfkd_binary(Chars, ?GC_N, [])].
+
+
+%% NFC: flatten the elements produced by unicode_util:nfc/1 into one list of chars.
+-spec characters_to_nfc_list(chardata()) -> [char()].
+characters_to_nfc_list(Chars) ->
+    case unicode_util:nfc(Chars) of
+        [] -> [];
+        [Head|Rest] when is_list(Head) -> Head ++ characters_to_nfc_list(Rest);
+        [Head|Rest] -> [Head|characters_to_nfc_list(Rest)]
+    end.
+
+-spec characters_to_nfc_binary(chardata()) -> unicode_binary().
+characters_to_nfc_binary(Chars) ->
+    list_to_binary(characters_to_nfc_binary(Chars, ?GC_N, [])).
+%% Left counts the nfc/1 elements still allowed in the current chunk; Acc holds them reversed.
+characters_to_nfc_binary(Chars, Left, Acc) when Left > 0 ->
+    case unicode_util:nfc(Chars) of
+        [] -> [characters_to_binary(lists:reverse(Acc))];
+        [Elem|Rest] -> characters_to_nfc_binary(Rest, Left-1, [Elem|Acc])
+    end;
+characters_to_nfc_binary(Chars, _, Acc) ->
+    [characters_to_binary(lists:reverse(Acc))|characters_to_nfc_binary(Chars, ?GC_N, [])].
+
+%% NFKC: flatten the elements produced by unicode_util:nfkc/1 into one list of chars.
+-spec characters_to_nfkc_list(chardata()) -> [char()].
+characters_to_nfkc_list(Chars) ->
+    case unicode_util:nfkc(Chars) of
+        [] -> [];
+        [Head|Rest] when is_list(Head) -> Head ++ characters_to_nfkc_list(Rest);
+        [Head|Rest] -> [Head|characters_to_nfkc_list(Rest)]
+    end.
+
+-spec characters_to_nfkc_binary(chardata()) -> unicode_binary().
+characters_to_nfkc_binary(Chars) ->
+    list_to_binary(characters_to_nfkc_binary(Chars, ?GC_N, [])).
+%% Left counts the nfkc/1 elements still allowed in the current chunk; Acc holds them reversed.
+characters_to_nfkc_binary(Chars, Left, Acc) when Left > 0 ->
+    case unicode_util:nfkc(Chars) of
+        [] -> [characters_to_binary(lists:reverse(Acc))];
+        [Elem|Rest] -> characters_to_nfkc_binary(Rest, Left-1, [Elem|Acc])
+    end;
+characters_to_nfkc_binary(Chars, _, Acc) ->
+    [characters_to_binary(lists:reverse(Acc))|characters_to_nfkc_binary(Chars, ?GC_N, [])].
 
 %% internals
 
diff --git a/lib/stdlib/test/unicode_SUITE.erl b/lib/stdlib/test/unicode_SUITE.erl
index 07d63bdf22..52f2cd5202 100644
--- a/lib/stdlib/test/unicode_SUITE.erl
+++ b/lib/stdlib/test/unicode_SUITE.erl
@@ -33,7 +33,9 @@
 	 ex_binaries_errors_utf16_little/1,
 	 ex_binaries_errors_utf16_big/1,
 	 ex_binaries_errors_utf32_little/1,
-	 ex_binaries_errors_utf32_big/1]).
+	 ex_binaries_errors_utf32_big/1,
+         normalize/1
+        ]).
 
 suite() ->
     [{ct_hooks,[ts_install_cth]},
@@ -44,6 +46,7 @@ all() ->
      utf16_illegal_sequences_bif, random_lists, roundtrips,
      latin1, exceptions,
      binaries_errors_limit,
+     normalize,
      {group,binaries_errors}].
 
 groups() -> 
@@ -920,6 +923,84 @@ fail_bif_1(Bin,Coding) ->
 	    ok
     end.
 
+
+normalize(_) ->
+    %% Smoke test of the normalization API; see also unicode_util_SUITE.erl and str_SUITE.erl
+    {'EXIT', _} = (catch unicode:characters_to_nfc_list({tuple})),
+    {'EXIT', _} = (catch unicode:characters_to_nfd_list({tuple})),
+    {'EXIT', _} = (catch unicode:characters_to_nfkc_list({tuple})),
+    {'EXIT', _} = (catch unicode:characters_to_nfkd_list({tuple})),
+    {'EXIT', _} = (catch unicode:characters_to_nfc_binary({tuple})),
+    {'EXIT', _} = (catch unicode:characters_to_nfd_binary({tuple})),
+    {'EXIT', _} = (catch unicode:characters_to_nfkc_binary({tuple})),
+    {'EXIT', _} = (catch unicode:characters_to_nfkd_binary({tuple})),
+    String = ["abc..åäö", <<"Ωµe`è"/utf8>>, "œŒþæÆħ§ß ホンダ"],
+    NFD_l = unicode:characters_to_nfd_list(String),
+    NFD_b = unicode:characters_to_nfd_binary(String),
+    NFC_l = unicode:characters_to_nfc_list(String),
+    NFC_b = unicode:characters_to_nfc_binary(String),
+    %% NFD/NFC must be idempotent and reachable from either form, given as list or binary.
+    NFD_l = unicode:characters_to_nfd_list(NFD_l),
+    NFD_l = unicode:characters_to_nfd_list(NFD_b),
+    NFD_l = unicode:characters_to_nfd_list(NFC_l),
+    NFD_l = unicode:characters_to_nfd_list(NFC_b),
+
+    NFD_b = unicode:characters_to_nfd_binary(NFD_b),
+    NFD_b = unicode:characters_to_nfd_binary(NFD_l),
+    NFD_b = unicode:characters_to_nfd_binary(NFC_b),
+    NFD_b = unicode:characters_to_nfd_binary(NFC_l),
+
+    NFC_l = unicode:characters_to_nfc_list(NFD_l),
+    NFC_l = unicode:characters_to_nfc_list(NFD_b),
+    NFC_l = unicode:characters_to_nfc_list(NFC_l),
+    NFC_l = unicode:characters_to_nfc_list(NFC_b),
+
+    NFC_b = unicode:characters_to_nfc_binary(NFD_b),
+    NFC_b = unicode:characters_to_nfc_binary(NFD_l),
+    NFC_b = unicode:characters_to_nfc_binary(NFC_b),
+    NFC_b = unicode:characters_to_nfc_binary(NFC_l),
+    %% 340 extra chars: longer than the 200-element chunk size of the *_binary functions.
+    Str = [lists:duplicate(20,lists:seq($a, $q))|String],
+    StrD_bin = unicode:characters_to_binary(unicode:characters_to_nfd_list(Str)),
+    StrD_bin = unicode:characters_to_nfd_binary(Str),
+    StrC_bin = unicode:characters_to_binary(unicode:characters_to_nfc_list(StrD_bin)),
+    StrC_bin = unicode:characters_to_nfc_binary(Str),
+    %% Same round trips for the compatibility forms NFKD/NFKC.
+    NFKD_l = unicode:characters_to_nfkd_list(String),
+    NFKD_b = unicode:characters_to_nfkd_binary(String),
+    NFKC_l = unicode:characters_to_nfkc_list(String),
+    NFKC_b = unicode:characters_to_nfkc_binary(String),
+
+    NFKD_l = unicode:characters_to_nfkd_list(NFKD_l),
+    NFKD_l = unicode:characters_to_nfkd_list(NFKD_b),
+    NFKD_l = unicode:characters_to_nfkd_list(NFKC_l),
+    NFKD_l = unicode:characters_to_nfkd_list(NFKC_b),
+
+    NFKD_b = unicode:characters_to_nfkd_binary(NFKD_b),
+    NFKD_b = unicode:characters_to_nfkd_binary(NFKD_l),
+    NFKD_b = unicode:characters_to_nfkd_binary(NFKC_b),
+    NFKD_b = unicode:characters_to_nfkd_binary(NFKC_l),
+
+    NFKC_l = unicode:characters_to_nfkc_list(NFKD_l),
+    NFKC_l = unicode:characters_to_nfkc_list(NFKD_b),
+    NFKC_l = unicode:characters_to_nfkc_list(NFKC_l),
+    NFKC_l = unicode:characters_to_nfkc_list(NFKC_b),
+
+    NFKC_b = unicode:characters_to_nfkc_binary(NFKD_b),
+    NFKC_b = unicode:characters_to_nfkc_binary(NFKD_l),
+    NFKC_b = unicode:characters_to_nfkc_binary(NFKC_b),
+    NFKC_b = unicode:characters_to_nfkc_binary(NFKC_l),
+
+    StrKD_bin = unicode:characters_to_binary(unicode:characters_to_nfkd_list(Str)),
+    StrKD_bin = unicode:characters_to_nfkd_binary(Str),
+    StrKC_bin = unicode:characters_to_binary(unicode:characters_to_nfkc_list(StrD_bin)),
+    StrKC_bin = unicode:characters_to_nfkc_binary(Str),
+    %% Compatibility mapping: halfwidth katakana and fullwidth digits equal their plain forms.
+    true = unicode:characters_to_nfkc_list("ホンダ") =:= unicode:characters_to_nfkc_list("ﾎﾝﾀﾞ"),
+    true = unicode:characters_to_nfkd_list("32") =:= unicode:characters_to_nfkd_list("３２"),
+    ok.
+
+
 %%
 %% Diverse utilities
 %%
author	Dan Gudmundsson <[email protected]>	2017-01-27 15:27:37 +0100
committer	Dan Gudmundsson <[email protected]>	2017-04-24 12:16:56 +0200
commit	75fc94b8b462d7b7f6dd4b706bbe32cff77ee575 (patch)
tree	da96f42d849544e674cb40c9376f2ccf57a14385
parent	e6a31e66961314a5b74cdb5ffcdab89092c31bc7 (diff)
download	otp-75fc94b8b462d7b7f6dd4b706bbe32cff77ee575.tar.gz otp-75fc94b8b462d7b7f6dd4b706bbe32cff77ee575.tar.bz2 otp-75fc94b8b462d7b7f6dd4b706bbe32cff77ee575.zip