aboutsummaryrefslogtreecommitdiffstats
path: root/lib/stdlib/doc/src/unicode.xml
diff options
context:
space:
mode:
Diffstat (limited to 'lib/stdlib/doc/src/unicode.xml')
-rw-r--r--lib/stdlib/doc/src/unicode.xml190
1 files changed, 185 insertions, 5 deletions
diff --git a/lib/stdlib/doc/src/unicode.xml b/lib/stdlib/doc/src/unicode.xml
index 93d0d37456..d822aca89c 100644
--- a/lib/stdlib/doc/src/unicode.xml
+++ b/lib/stdlib/doc/src/unicode.xml
@@ -5,7 +5,7 @@
<header>
<copyright>
<year>1996</year>
- <year>2016</year>
+ <year>2017</year>
<holder>Ericsson AB, All Rights Reserved</holder>
</copyright>
<legalnotice>
@@ -50,8 +50,35 @@
external entities where this is required. When working inside the
Erlang/OTP environment, it is recommended to keep binaries in UTF-8 when
representing Unicode characters. ISO Latin-1 encoding is supported both
- for backward compatibility and for communication
- with external entities not supporting Unicode character sets.</p>
+ for backward compatibility and for communication
+ with external entities not supporting Unicode character sets.</p>
+ <p>Programs should always operate on a normalized form and compare
+ canonical-equivalent Unicode characters as equal. All characters
+ should thus be normalized to one form once on the system borders.
+ One of the following functions can convert characters to their
+ normalized forms: <seealso marker="#characters_to_nfc_list/1">
+ <c>characters_to_nfc_list/1</c></seealso>,
+ <seealso marker="#characters_to_nfc_binary/1">
+ <c>characters_to_nfc_binary/1</c></seealso>,
+ <seealso marker="#characters_to_nfd_list/1">
+ <c>characters_to_nfd_list/1</c></seealso> or
+ <seealso marker="#characters_to_nfd_binary/1">
+ <c>characters_to_nfd_binary/1</c></seealso>.
+ For general text
+ <seealso marker="#characters_to_nfc_list/1">
+ <c>characters_to_nfc_list/1</c></seealso> or
+ <seealso marker="#characters_to_nfc_binary/1">
+ <c>characters_to_nfc_binary/1</c></seealso> is preferred, and
+ for identifiers one of the compatibility normalization
+ functions, such as
+ <seealso marker="#characters_to_nfkc_list/1">
+ <c>characters_to_nfkc_list/1</c></seealso>,
+ is preferred for security reasons.
+ The normalization functions were introduced in OTP 20.
+ Additional information on normalization can be found in the
+ <url href="http://unicode.org/faq/normalization.html">Unicode FAQ</url>.
+ </p>
+
</description>
<datatypes>
@@ -212,8 +239,13 @@
<c><anno>InEncoding</anno></c>.</p>
</item>
</list>
- <p>Only when <c><anno>InEncoding</anno></c> is one of the UTF
- encodings, integers in the list are allowed to be &gt; 255.</p>
+ <p>
+ Note that integers in the list always represent code points
+ regardless of the <c><anno>InEncoding</anno></c> passed. If
+ <c><anno>InEncoding</anno></c> is <c>latin1</c>, only code
+ points &lt; 256 are allowed; otherwise, all valid Unicode code
+ points are allowed.
+ </p>
<p>If <c><anno>InEncoding</anno></c> is <c>latin1</c>, parameter
<c><anno>Data</anno></c> corresponds to the <c>iodata()</c> type,
but for <c>unicode</c>, parameter <c><anno>Data</anno></c> can
@@ -335,6 +367,154 @@ decode_data(Data) ->
</func>
<func>
+ <name name="characters_to_nfc_list" arity="1"/>
+ <fsummary>Normalize characters to a list of canonical equivalent
+ composed Unicode characters.</fsummary>
+ <desc>
+ <p>Converts a possibly deep list of characters and binaries
+ into a Normalized Form of canonical equivalent Composed
+ characters according to the Unicode standard.</p>
+ <p>Any binaries in the input must be encoded with utf8
+ encoding.
+ </p>
+ <p>The result is a list of characters.</p>
+ <code>
+3> unicode:characters_to_nfc_list([&lt;&lt;"abc..a">>,[778],$a,[776],$o,[776]]).
+"abc..åäö"
+</code>
+ </desc>
+ </func>
+
+ <func>
+ <name name="characters_to_nfc_binary" arity="1"/>
+ <fsummary>Normalize characters to a utf8 binary of canonical equivalent
+ composed Unicode characters.</fsummary>
+ <desc>
+ <p>Converts a possibly deep list of characters and binaries
+ into a Normalized Form of canonical equivalent Composed
+ characters according to the Unicode standard.</p>
+ <p>Any binaries in the input must be encoded with utf8
+ encoding.</p>
+ <p>The result is a UTF-8 encoded binary.</p>
+ <code>
+4> unicode:characters_to_nfc_binary([&lt;&lt;"abc..a">>,[778],$a,[776],$o,[776]]).
+&lt;&lt;"abc..åäö"/utf8>>
+</code>
+ </desc>
+ </func>
+
+ <func>
+ <name name="characters_to_nfd_list" arity="1"/>
+ <fsummary>Normalize characters to a list of canonical equivalent
+ decomposed Unicode characters.</fsummary>
+ <desc>
+ <p>Converts a possibly deep list of characters and binaries
+ into a Normalized Form of canonical equivalent Decomposed
+ characters according to the Unicode standard.</p>
+ <p>Any binaries in the input must be encoded with utf8
+ encoding.
+ </p>
+ <p>The result is a list of characters.</p>
+ <code>
+1> unicode:characters_to_nfd_list("abc..åäö").
+[97,98,99,46,46,97,778,97,776,111,776]
+</code>
+ </desc>
+ </func>
+
+ <func>
+ <name name="characters_to_nfd_binary" arity="1"/>
+ <fsummary>Normalize characters to a utf8 binary of canonical equivalent
+ decomposed Unicode characters.</fsummary>
+ <desc>
+ <p>Converts a possibly deep list of characters and binaries
+ into a Normalized Form of canonical equivalent Decomposed
+ characters according to the Unicode standard.</p>
+ <p>Any binaries in the input must be encoded with utf8
+ encoding.</p>
+ <p>The result is a UTF-8 encoded binary.</p>
+ <code>
+2> unicode:characters_to_nfd_binary("abc..åäö").
+&lt;&lt;97,98,99,46,46,97,204,138,97,204,136,111,204,136>>
+</code>
+ </desc>
+ </func>
+
+ <func>
+ <name name="characters_to_nfkc_list" arity="1"/>
+ <fsummary>Normalize characters to a list of compatibly equivalent
+ composed Unicode characters.</fsummary>
+ <desc>
+ <p>Converts a possibly deep list of characters and binaries
+ into a Normalized Form of compatibly equivalent Composed
+ characters according to the Unicode standard.</p>
+ <p>Any binaries in the input must be encoded with utf8
+ encoding.
+ </p>
+ <p>The result is a list of characters.</p>
+ <code>
+3> unicode:characters_to_nfkc_list([&lt;&lt;"abc..a">>,[778],$a,[776],$o,[776],[65299,65298]]).
+"abc..åäö32"
+</code>
+ </desc>
+ </func>
+
+ <func>
+ <name name="characters_to_nfkc_binary" arity="1"/>
+ <fsummary>Normalize characters to a utf8 binary of compatibly equivalent
+ composed Unicode characters.</fsummary>
+ <desc>
+ <p>Converts a possibly deep list of characters and binaries
+ into a Normalized Form of compatibly equivalent Composed
+ characters according to the Unicode standard.</p>
+ <p>Any binaries in the input must be encoded with utf8
+ encoding.</p>
+ <p>The result is a UTF-8 encoded binary.</p>
+ <code>
+4> unicode:characters_to_nfkc_binary([&lt;&lt;"abc..a">>,[778],$a,[776],$o,[776],[65299,65298]]).
+&lt;&lt;"abc..åäö32"/utf8>>
+</code>
+ </desc>
+ </func>
+
+ <func>
+ <name name="characters_to_nfkd_list" arity="1"/>
+ <fsummary>Normalize characters to a list of compatibly equivalent
+ decomposed Unicode characters.</fsummary>
+ <desc>
+ <p>Converts a possibly deep list of characters and binaries
+ into a Normalized Form of compatibly equivalent Decomposed
+ characters according to the Unicode standard.</p>
+ <p>Any binaries in the input must be encoded with utf8
+ encoding.
+ </p>
+ <p>The result is a list of characters.</p>
+ <code>
+1> unicode:characters_to_nfkd_list(["abc..åäö",[65299,65298]]).
+[97,98,99,46,46,97,778,97,776,111,776,51,50]
+</code>
+ </desc>
+ </func>
+
+ <func>
+ <name name="characters_to_nfkd_binary" arity="1"/>
+ <fsummary>Normalize characters to a utf8 binary of compatibly equivalent
+ decomposed Unicode characters.</fsummary>
+ <desc>
+ <p>Converts a possibly deep list of characters and binaries
+ into a Normalized Form of compatibly equivalent Decomposed
+ characters according to the Unicode standard.</p>
+ <p>Any binaries in the input must be encoded with utf8
+ encoding.</p>
+ <p>The result is a UTF-8 encoded binary.</p>
+ <code>
+2> unicode:characters_to_nfkd_binary(["abc..åäö",[65299,65298]]).
+&lt;&lt;97,98,99,46,46,97,204,138,97,204,136,111,204,136,51,50>>
+</code>
+ </desc>
+ </func>
+
+ <func>
<name name="encoding_to_bom" arity="1"/>
<fsummary>Create a binary UTF byte order mark from encoding.</fsummary>
<type_desc variable="Bin">