From 75fc94b8b462d7b7f6dd4b706bbe32cff77ee575 Mon Sep 17 00:00:00 2001 From: Dan Gudmundsson Date: Fri, 27 Jan 2017 15:27:37 +0100 Subject: Add nf(k)d, nf(k)c conversion functions to unicode module --- lib/stdlib/doc/src/unicode.xml | 179 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 177 insertions(+), 2 deletions(-) (limited to 'lib/stdlib/doc') diff --git a/lib/stdlib/doc/src/unicode.xml b/lib/stdlib/doc/src/unicode.xml index 93d0d37456..382b253ba1 100644 --- a/lib/stdlib/doc/src/unicode.xml +++ b/lib/stdlib/doc/src/unicode.xml @@ -50,8 +50,35 @@ external entities where this is required. When working inside the Erlang/OTP environment, it is recommended to keep binaries in UTF-8 when representing Unicode characters. ISO Latin-1 encoding is supported both - for backward compatibility and for communication - with external entities not supporting Unicode character sets.

+ for backward compatibility and for communication + with external entities not supporting Unicode character sets.

+

Programs should always operate on a normalized form and compare + canonical-equivalent Unicode characters as equal. All characters + should thus be normalized to one form once on the system borders. + One of the following functions can convert characters to their + normalized forms: + characters_to_nfc_list/1, + + characters_to_nfc_binary/1, + + characters_to_nfd_list/1 or + + characters_to_nfd_binary/1. + For general text + + characters_to_nfc_list/1 or + + characters_to_nfc_binary/1 is preferred, and + for identifiers one of the compatibility normalization + functions, such as + + characters_to_nfkc_list/1, + is preferred for security reasons. + The normalization functions were introduced in OTP 20. + Additional information on normalization can be found in the + Unicode FAQ. +

+ @@ -334,6 +361,154 @@ decode_data(Data) -> + + + Normalize characters to a list of canonical equivalent + composed Unicode characters. + +

Converts a possibly deep list of characters and binaries + into a Normalized Form of canonical equivalent Composed + characters according to the Unicode standard.

+

Any binaries in the input must be encoded with utf8 + encoding. +

+

The result is a list of characters.

+ +3> unicode:characters_to_nfc_list([<<"abc..a">>,[778],$a,[776],$o,[776]]). +"abc..åäö" + +
+
+ + + + Normalize characters to a utf8 binary of canonical equivalent + composed Unicode characters. + +

Converts a possibly deep list of characters and binaries + into a Normalized Form of canonical equivalent Composed + characters according to the Unicode standard.

+

Any binaries in the input must be encoded with utf8 + encoding.

+

The result is a utf8 encoded binary.

+ +4> unicode:characters_to_nfc_binary([<<"abc..a">>,[778],$a,[776],$o,[776]]). +<<"abc..åäö"/utf8>> + +
+
+ + + + Normalize characters to a list of canonical equivalent + decomposed Unicode characters. + +

Converts a possibly deep list of characters and binaries + into a Normalized Form of canonical equivalent Decomposed + characters according to the Unicode standard.

+

Any binaries in the input must be encoded with utf8 + encoding. +

+

The result is a list of characters.

+ +1> unicode:characters_to_nfd_list("abc..åäö"). +[97,98,99,46,46,97,778,97,776,111,776] + +
+
+ + + + Normalize characters to a utf8 binary of canonical equivalent + decomposed Unicode characters. + +

Converts a possibly deep list of characters and binaries + into a Normalized Form of canonical equivalent Decomposed + characters according to the Unicode standard.

+

Any binaries in the input must be encoded with utf8 + encoding.

+

The result is a utf8 encoded binary.

+ +2> unicode:characters_to_nfd_binary("abc..åäö"). +<<97,98,99,46,46,97,204,138,97,204,136,111,204,136>> + +
+
+ + + + Normalize characters to a list of compatibly equivalent + composed Unicode characters. + +

Converts a possibly deep list of characters and binaries + into a Normalized Form of compatibly equivalent Composed + characters according to the Unicode standard.

+

Any binaries in the input must be encoded with utf8 + encoding. +

+

The result is a list of characters.

+ +3> unicode:characters_to_nfkc_list([<<"abc..a">>,[778],$a,[776],$o,[776],[65299,65298]]). +"abc..åäö32" + +
+
+ + + + Normalize characters to a utf8 binary of compatibly equivalent + composed Unicode characters. + +

Converts a possibly deep list of characters and binaries + into a Normalized Form of compatibly equivalent Composed + characters according to the Unicode standard.

+

Any binaries in the input must be encoded with utf8 + encoding.

+

The result is a utf8 encoded binary.

+ +4> unicode:characters_to_nfkc_binary([<<"abc..a">>,[778],$a,[776],$o,[776],[65299,65298]]). +<<"abc..åäö32"/utf8>> + +
+
+ + + + Normalize characters to a list of compatibly equivalent + decomposed Unicode characters. + +

Converts a possibly deep list of characters and binaries + into a Normalized Form of compatibly equivalent Decomposed + characters according to the Unicode standard.

+

Any binaries in the input must be encoded with utf8 + encoding. +

+

The result is a list of characters.

+ +1> unicode:characters_to_nfkd_list(["abc..åäö",[65299,65298]]). +[97,98,99,46,46,97,778,97,776,111,776,51,50] + +
+
+ + + + Normalize characters to a utf8 binary of compatibly equivalent + decomposed Unicode characters. + +

Converts a possibly deep list of characters and binaries + into a Normalized Form of compatibly equivalent Decomposed + characters according to the Unicode standard.

+

Any binaries in the input must be encoded with utf8 + encoding.

+

The result is a utf8 encoded binary.

+ +2> unicode:characters_to_nfkd_binary(["abc..åäö",[65299,65298]]). +<<97,98,99,46,46,97,204,138,97,204,136,111,204,136,51,50>> + +
+
+ Create a binary UTF byte order mark from encoding. -- cgit v1.2.3