From 75fc94b8b462d7b7f6dd4b706bbe32cff77ee575 Mon Sep 17 00:00:00 2001 From: Dan Gudmundsson Date: Fri, 27 Jan 2017 15:27:37 +0100 Subject: Add nf(k)d, nf(k)c conversion functions to unicode module --- lib/stdlib/doc/src/unicode.xml | 179 +++++++++++++++++++++++++++++++++++++- lib/stdlib/src/unicode.erl | 93 +++++++++++++++++++- lib/stdlib/test/unicode_SUITE.erl | 83 +++++++++++++++++- 3 files changed, 351 insertions(+), 4 deletions(-) diff --git a/lib/stdlib/doc/src/unicode.xml b/lib/stdlib/doc/src/unicode.xml index 93d0d37456..382b253ba1 100644 --- a/lib/stdlib/doc/src/unicode.xml +++ b/lib/stdlib/doc/src/unicode.xml @@ -50,8 +50,35 @@ external entities where this is required. When working inside the Erlang/OTP environment, it is recommended to keep binaries in UTF-8 when representing Unicode characters. ISO Latin-1 encoding is supported both - for backward compatibility and for communication - with external entities not supporting Unicode character sets.

+ for backward compatibility and for communication + with external entities not supporting Unicode character sets.

+

Programs should always operate on a normalized form and compare + canonical-equivalent Unicode characters as equal. All characters + should thus be normalized to one form once on the system borders. + One of the following functions can convert characters to their + normalized forms + characters_to_nfc_list/1, + + characters_to_nfc_binary/1, + + characters_to_nfd_list/1 or + + characters_to_nfd_binary/1. + For general text + + characters_to_nfc_list/1 or + + characters_to_nfc_binary/1 is preferred, and + for identifiers one of the compatibility normalization + functions, such as + + characters_to_nfkc_list/1, + is preferred for security reasons. + The normalization functions were introduced in OTP 20. + Additional information on normalization can be found in the + Unicode FAQ. +

+ @@ -334,6 +361,154 @@ decode_data(Data) -> + + + Normalize characters to a list of canonical equivalent + composed Unicode characters. + +

Converts a possibly deep list of characters and binaries + into a Normalized Form of canonical equivalent Composed + characters according to the Unicode standard.

+

Any binaries in the input must be encoded with utf8 + encoding. +

+

The result is a list of characters.

+ +3> unicode:characters_to_nfc_list([<<"abc..a">>,[778],$a,[776],$o,[776]]). +"abc..åäö" + +
+
+ + + + Normalize characters to a utf8 binary of canonical equivalent + composed Unicode characters. + +

Converts a possibly deep list of characters and binaries + into a Normalized Form of canonical equivalent Composed + characters according to the Unicode standard.

+

Any binaries in the input must be encoded with utf8 + encoding.

+

The result is a utf8 encoded binary.

+ +4> unicode:characters_to_nfc_binary([<<"abc..a">>,[778],$a,[776],$o,[776]]). +<<"abc..åäö"/utf8>> + +
+
+ + + + Normalize characters to a list of canonical equivalent + decomposed Unicode characters. + +

Converts a possibly deep list of characters and binaries + into a Normalized Form of canonical equivalent Decomposed + characters according to the Unicode standard.

+

Any binaries in the input must be encoded with utf8 + encoding. +

+

The result is a list of characters.

+ +1> unicode:characters_to_nfd_list("abc..åäö"). +[97,98,99,46,46,97,778,97,776,111,776] + +
+
+ + + + Normalize characters to a utf8 binary of canonical equivalent + decomposed Unicode characters. + +

Converts a possibly deep list of characters and binaries + into a Normalized Form of canonical equivalent Decomposed + characters according to the Unicode standard.

+

Any binaries in the input must be encoded with utf8 + encoding.

+

The result is a utf8 encoded binary.

+ +2> unicode:characters_to_nfd_binary("abc..åäö"). +<<97,98,99,46,46,97,204,138,97,204,136,111,204,136>> + +
+
+ + + + Normalize characters to a list of compatibly equivalent + composed Unicode characters. + +

Converts a possibly deep list of characters and binaries + into a Normalized Form of compatibly equivalent Composed + characters according to the Unicode standard.

+

Any binaries in the input must be encoded with utf8 + encoding. +

+

The result is a list of characters.

+ +3> unicode:characters_to_nfkc_list([<<"abc..a">>,[778],$a,[776],$o,[776],[65299,65298]]). +"abc..åäö32" + +
+
+ + + + Normalize characters to a utf8 binary of compatibly equivalent + composed Unicode characters. + +

Converts a possibly deep list of characters and binaries + into a Normalized Form of compatibly equivalent Composed + characters according to the Unicode standard.

+

Any binaries in the input must be encoded with utf8 + encoding.

+

The result is a utf8 encoded binary.

+ +4> unicode:characters_to_nfkc_binary([<<"abc..a">>,[778],$a,[776],$o,[776],[65299,65298]]). +<<"abc..åäö32"/utf8>> + +
+
+ + + + Normalize characters to a list of compatibly equivalent + decomposed Unicode characters. + +

Converts a possibly deep list of characters and binaries + into a Normalized Form of compatibly equivalent Decomposed + characters according to the Unicode standard.

+

Any binaries in the input must be encoded with utf8 + encoding. +

+

The result is a list of characters.

+ +1> unicode:characters_to_nfkd_list(["abc..åäö",[65299,65298]]). +[97,98,99,46,46,97,778,97,776,111,776,51,50] + +
+
+ + + + Normalize characters to a utf8 binary of compatibly equivalent + decomposed Unicode characters. + +

Converts a possibly deep list of characters and binaries + into a Normalized Form of compatibly equivalent Decomposed + characters according to the Unicode standard.

+

Any binaries in the input must be encoded with utf8 + encoding.

+

The result is a utf8 encoded binary.

+ +2> unicode:characters_to_nfkd_binary(["abc..åäö",[65299,65298]]). +<<97,98,99,46,46,97,204,138,97,204,136,111,204,136,51,50>> + +
+
+ Create a binary UTF byte order mark from encoding. diff --git a/lib/stdlib/src/unicode.erl b/lib/stdlib/src/unicode.erl index cb2020c21b..59499021cb 100644 --- a/lib/stdlib/src/unicode.erl +++ b/lib/stdlib/src/unicode.erl @@ -22,7 +22,12 @@ -export([characters_to_list/1, characters_to_list_int/2, characters_to_binary/1, characters_to_binary_int/2, characters_to_binary/3, - bom_to_encoding/1, encoding_to_bom/1]). + bom_to_encoding/1, encoding_to_bom/1, + characters_to_nfd_list/1, characters_to_nfd_binary/1, + characters_to_nfc_list/1, characters_to_nfc_binary/1, + characters_to_nfkd_list/1, characters_to_nfkd_binary/1, + characters_to_nfkc_list/1, characters_to_nfkc_binary/1 + ]). -export_type([chardata/0, charlist/0, encoding/0, external_chardata/0, external_charlist/0, latin1_char/0, latin1_chardata/0, @@ -242,6 +247,92 @@ encoding_to_bom({utf32,little}) -> encoding_to_bom(latin1) -> <<>>. +-define(GC_N, 200). %% arbitrary number + +%% Canonical decompose string to list of chars +-spec characters_to_nfd_list(chardata()) -> [char()]. +characters_to_nfd_list(CD) -> + case unicode_util:nfd(CD) of + [GC|Str] when is_list(GC) -> GC++characters_to_nfd_list(Str); + [CP|Str] -> [CP|characters_to_nfd_list(Str)]; + [] -> [] + end. + +-spec characters_to_nfd_binary(chardata()) -> unicode_binary(). +characters_to_nfd_binary(CD) -> + list_to_binary(characters_to_nfd_binary(CD, ?GC_N, [])). + +characters_to_nfd_binary(CD, N, Row) when N > 0 -> + case unicode_util:nfd(CD) of + [GC|Str] -> characters_to_nfd_binary(Str, N-1, [GC|Row]); + [] -> [characters_to_binary(lists:reverse(Row))] + end; +characters_to_nfd_binary(CD, _, Row) -> + [characters_to_binary(lists:reverse(Row))|characters_to_nfd_binary(CD,?GC_N,[])]. + +%% Compability Canonical decompose string to list of chars. +-spec characters_to_nfkd_list(chardata()) -> [char()]. 
+characters_to_nfkd_list(CD) -> + case unicode_util:nfkd(CD) of + [GC|Str] when is_list(GC) -> GC++characters_to_nfkd_list(Str); + [CP|Str] -> [CP|characters_to_nfkd_list(Str)]; + [] -> [] + end. + +-spec characters_to_nfkd_binary(chardata()) -> unicode_binary(). +characters_to_nfkd_binary(CD) -> + list_to_binary(characters_to_nfkd_binary(CD, ?GC_N, [])). + +characters_to_nfkd_binary(CD, N, Row) when N > 0 -> + case unicode_util:nfkd(CD) of + [GC|Str] -> characters_to_nfkd_binary(Str, N-1, [GC|Row]); + [] -> [characters_to_binary(lists:reverse(Row))] + end; +characters_to_nfkd_binary(CD, _, Row) -> + [characters_to_binary(lists:reverse(Row))|characters_to_nfkd_binary(CD,?GC_N,[])]. + + +%% Canonical compose string to list of chars +-spec characters_to_nfc_list(chardata()) -> [char()]. +characters_to_nfc_list(CD) -> + case unicode_util:nfc(CD) of + [CPs|Str] when is_list(CPs) -> CPs ++ characters_to_nfc_list(Str); + [CP|Str] -> [CP|characters_to_nfc_list(Str)]; + [] -> [] + end. + +-spec characters_to_nfc_binary(chardata()) -> unicode_binary(). +characters_to_nfc_binary(CD) -> + list_to_binary(characters_to_nfc_binary(CD, ?GC_N, [])). + +characters_to_nfc_binary(CD, N, Row) when N > 0 -> + case unicode_util:nfc(CD) of + [GC|Str] -> characters_to_nfc_binary(Str, N-1, [GC|Row]); + [] -> [characters_to_binary(lists:reverse(Row))] + end; +characters_to_nfc_binary(CD, _, Row) -> + [characters_to_binary(lists:reverse(Row))|characters_to_nfc_binary(CD,?GC_N,[])]. + +%% Compability Canonical compose string to list of chars +-spec characters_to_nfkc_list(chardata()) -> [char()]. +characters_to_nfkc_list(CD) -> + case unicode_util:nfkc(CD) of + [CPs|Str] when is_list(CPs) -> CPs ++ characters_to_nfkc_list(Str); + [CP|Str] -> [CP|characters_to_nfkc_list(Str)]; + [] -> [] + end. + +-spec characters_to_nfkc_binary(chardata()) -> unicode_binary(). +characters_to_nfkc_binary(CD) -> + list_to_binary(characters_to_nfkc_binary(CD, ?GC_N, [])). 
+ +characters_to_nfkc_binary(CD, N, Row) when N > 0 -> + case unicode_util:nfkc(CD) of + [GC|Str] -> characters_to_nfkc_binary(Str, N-1, [GC|Row]); + [] -> [characters_to_binary(lists:reverse(Row))] + end; +characters_to_nfkc_binary(CD, _, Row) -> + [characters_to_binary(lists:reverse(Row))|characters_to_nfkc_binary(CD,?GC_N,[])]. %% internals diff --git a/lib/stdlib/test/unicode_SUITE.erl b/lib/stdlib/test/unicode_SUITE.erl index 07d63bdf22..52f2cd5202 100644 --- a/lib/stdlib/test/unicode_SUITE.erl +++ b/lib/stdlib/test/unicode_SUITE.erl @@ -33,7 +33,9 @@ ex_binaries_errors_utf16_little/1, ex_binaries_errors_utf16_big/1, ex_binaries_errors_utf32_little/1, - ex_binaries_errors_utf32_big/1]). + ex_binaries_errors_utf32_big/1, + normalize/1 + ]). suite() -> [{ct_hooks,[ts_install_cth]}, @@ -44,6 +46,7 @@ all() -> utf16_illegal_sequences_bif, random_lists, roundtrips, latin1, exceptions, binaries_errors_limit, + normalize, {group,binaries_errors}]. groups() -> @@ -920,6 +923,84 @@ fail_bif_1(Bin,Coding) -> ok end. 
+%% Invalid chardata must crash; NFD/NFC/NFKD/NFKC results must agree for list and binary input.
+normalize(_) ->
+    %% More tests are in unicode_util_SUITE.erl and str_SUITE.erl
+    {'EXIT', _} = (catch unicode:characters_to_nfc_list({tuple})),
+    {'EXIT', _} = (catch unicode:characters_to_nfd_list({tuple})),
+    {'EXIT', _} = (catch unicode:characters_to_nfkc_list({tuple})),
+    {'EXIT', _} = (catch unicode:characters_to_nfkd_list({tuple})),
+    {'EXIT', _} = (catch unicode:characters_to_nfc_binary({tuple})),
+    {'EXIT', _} = (catch unicode:characters_to_nfd_binary({tuple})),
+    {'EXIT', _} = (catch unicode:characters_to_nfkc_binary({tuple})),
+    {'EXIT', _} = (catch unicode:characters_to_nfkd_binary({tuple})),
+    String = ["abc..åäö", <<"Ωµe`è"/utf8>>, "œŒþæÆħ§ß ホンダ"],
+    NFD_l = unicode:characters_to_nfd_list(String),
+    NFD_b = unicode:characters_to_nfd_binary(String),
+    NFC_l = unicode:characters_to_nfc_list(String),
+    NFC_b = unicode:characters_to_nfc_binary(String),
+    %% Each canonical form must be reproducible from any of the four results above.
+    NFD_l = unicode:characters_to_nfd_list(NFD_l),
+    NFD_l = unicode:characters_to_nfd_list(NFD_b),
+    NFD_l = unicode:characters_to_nfd_list(NFC_l),
+    NFD_l = unicode:characters_to_nfd_list(NFC_b),
+
+    NFD_b = unicode:characters_to_nfd_binary(NFD_b),
+    NFD_b = unicode:characters_to_nfd_binary(NFD_l),
+    NFD_b = unicode:characters_to_nfd_binary(NFC_b),
+    NFD_b = unicode:characters_to_nfd_binary(NFC_l),
+
+    NFC_l = unicode:characters_to_nfc_list(NFD_l),
+    NFC_l = unicode:characters_to_nfc_list(NFD_b),
+    NFC_l = unicode:characters_to_nfc_list(NFC_l),
+    NFC_l = unicode:characters_to_nfc_list(NFC_b),
+
+    NFC_b = unicode:characters_to_nfc_binary(NFD_b),
+    NFC_b = unicode:characters_to_nfc_binary(NFD_l),
+    NFC_b = unicode:characters_to_nfc_binary(NFC_b),
+    NFC_b = unicode:characters_to_nfc_binary(NFC_l),
+    %% Str holds 340 ASCII chars plus String: more than the 200 clusters (?GC_N) a *_binary chunk takes.
+    Str = [lists:duplicate(20,lists:seq($a, $q))|String],
+    StrD_bin = unicode:characters_to_binary(unicode:characters_to_nfd_list(Str)),
+    StrD_bin = unicode:characters_to_nfd_binary(Str),
+    StrC_bin = unicode:characters_to_binary(unicode:characters_to_nfc_list(StrD_bin)),
+    StrC_bin = unicode:characters_to_nfc_binary(Str),
+
+    NFKD_l = unicode:characters_to_nfkd_list(String),
+    NFKD_b = unicode:characters_to_nfkd_binary(String),
+    NFKC_l = unicode:characters_to_nfkc_list(String),
+    NFKC_b = unicode:characters_to_nfkc_binary(String),
+    %% Same round trips for the compatibility forms, exercising the nfkd/nfkc functions themselves.
+    NFKD_l = unicode:characters_to_nfkd_list(NFKD_l),
+    NFKD_l = unicode:characters_to_nfkd_list(NFKD_b),
+    NFKD_l = unicode:characters_to_nfkd_list(NFKC_l),
+    NFKD_l = unicode:characters_to_nfkd_list(NFKC_b),
+
+    NFKD_b = unicode:characters_to_nfkd_binary(NFKD_b),
+    NFKD_b = unicode:characters_to_nfkd_binary(NFKD_l),
+    NFKD_b = unicode:characters_to_nfkd_binary(NFKC_b),
+    NFKD_b = unicode:characters_to_nfkd_binary(NFKC_l),
+
+    NFKC_l = unicode:characters_to_nfkc_list(NFKD_l),
+    NFKC_l = unicode:characters_to_nfkc_list(NFKD_b),
+    NFKC_l = unicode:characters_to_nfkc_list(NFKC_l),
+    NFKC_l = unicode:characters_to_nfkc_list(NFKC_b),
+
+    NFKC_b = unicode:characters_to_nfkc_binary(NFKD_b),
+    NFKC_b = unicode:characters_to_nfkc_binary(NFKD_l),
+    NFKC_b = unicode:characters_to_nfkc_binary(NFKC_b),
+    NFKC_b = unicode:characters_to_nfkc_binary(NFKC_l),
+
+    StrKD_bin = unicode:characters_to_binary(unicode:characters_to_nfkd_list(Str)),
+    StrKD_bin = unicode:characters_to_nfkd_binary(Str),
+    StrKC_bin = unicode:characters_to_binary(unicode:characters_to_nfkc_list(StrD_bin)),
+    StrKC_bin = unicode:characters_to_nfkc_binary(Str),
+    %% Compatibility variants as explicit code points: halfwidth katakana and fullwidth digits.
+    true = unicode:characters_to_nfkc_list([16#FF8E,16#FF9D,16#FF80,16#FF9E]) =:= unicode:characters_to_nfkc_list("ホンダ"),
+    true = unicode:characters_to_nfkd_list([65299,65298]) =:= unicode:characters_to_nfkd_list("32"),
+    ok.
+
+
+
 %%
 %% Diverse utilities
 %%
--
cgit v1.2.3