aboutsummaryrefslogtreecommitdiffstats
path: root/lib/stdlib/doc/src/unicode.xml
diff options
context:
space:
mode:
Diffstat (limited to 'lib/stdlib/doc/src/unicode.xml')
-rw-r--r--lib/stdlib/doc/src/unicode.xml187
1 files changed, 110 insertions, 77 deletions
diff --git a/lib/stdlib/doc/src/unicode.xml b/lib/stdlib/doc/src/unicode.xml
index e3a25a407b..1001ebbae4 100644
--- a/lib/stdlib/doc/src/unicode.xml
+++ b/lib/stdlib/doc/src/unicode.xml
@@ -38,50 +38,83 @@
<p>It is recommended to only use external encodings for communication with external entities where this is required. When working inside the Erlang/OTP environment, it is recommended to keep binaries in UTF-8 when representing Unicode characters. Latin1 encoding is supported both for backward compatibility and for communication with external entities not supporting Unicode character sets.</p>
</description>
- <section>
- <title>DATA TYPES</title>
- <marker id="type-charlist"></marker>
- <code type="none">
-unicode_binary() = binary() with characters encoded in UTF-8 coding standard
-unicode_char() = integer() representing valid unicode codepoint
-
-chardata() = charlist() | unicode_binary()
-
-charlist() = [unicode_char() | unicode_binary() | charlist()]
- a unicode_binary is allowed as the tail of the list</code>
-
- <code type="none">
-external_unicode_binary() = binary()
- with characters coded in a user specified Unicode encoding other
- than UTF-8 (UTF-16 or UTF-32)
-
-external_chardata() = external_charlist() | external_unicode_binary()
-
-external_charlist() = [unicode_char() | external_unicode_binary() | external_charlist()]
- an external_unicode_binary is allowed as the tail of the list</code>
-
- <code type="none">
-latin1_binary() = binary() with characters coded in iso-latin-1
-latin1_char() = integer() representing valid latin1 character (0-255)
-
-latin1_chardata() = latin1_charlist() | latin1_binary()
+ <datatypes>
+ <datatype>
+ <name name="encoding"/>
+ </datatype>
+ <datatype>
+ <name name="endian"/>
+ </datatype>
+ <datatype>
+ <name name="unicode_binary"/>
+ <desc>
+ <p>A <c>binary()</c> with characters encoded in the UTF-8 coding standard.</p>
+ </desc>
+ </datatype>
+ <datatype>
+ <name name="unicode_char"/>
+ <desc>
+ <p>An <c>integer()</c> representing a valid Unicode code point.</p>
+ </desc>
+ </datatype>
+ <datatype>
+ <name name="chardata"/>
+ </datatype>
+ <datatype>
+ <name name="charlist"/>
+ <desc>
+ <p>A <c>unicode_binary()</c> is allowed as the tail of the list.</p>
+ </desc>
+ </datatype>
+ <datatype>
+ <name name="external_unicode_binary"/>
+ <desc>
+ <p>A <c>binary()</c> with characters coded in a user specified Unicode
+ encoding other than UTF-8 (UTF-16 or UTF-32).</p>
+ </desc>
+ </datatype>
+ <datatype>
+ <name name="external_chardata"/>
+ </datatype>
+ <datatype>
+ <name name="external_charlist"/>
+ <desc>
+ <p>An <c>external_unicode_binary()</c> is allowed as the tail
+ of the list.</p>
+ </desc>
+ </datatype>
+ <datatype>
+ <name name="latin1_binary"/>
+ <desc><p>A <c>binary()</c> with characters coded in iso-latin-1.</p>
+ </desc>
+ </datatype>
+ <datatype>
+ <name name="latin1_char"/>
+ <desc><p>An <c>integer()</c> representing a valid latin1
+ character (0-255).</p>
+ </desc>
+ </datatype>
+ <datatype>
+ <name name="latin1_chardata"/>
+ </datatype>
+ <datatype>
+ <name name="latin1_charlist"/>
+ <desc><p>A <c>latin1_binary()</c> is allowed as the tail of
+ the list.</p>
+ </desc>
+ </datatype>
+ </datatypes>
-latin1_charlist() = [latin1_char() | latin1_binary() | latin1_charlist()]
- a latin1_binary is allowed as the tail of the list</code>
- </section>
<funcs>
<func>
- <name>bom_to_encoding(Bin) -> {Encoding,Length}</name>
+ <name name="bom_to_encoding" arity="1"/>
<fsummary>Identify UTF byte order marks in a binary.</fsummary>
- <type>
- <v>Bin = binary() of byte_size 4 or more</v>
- <v>Encoding = latin1 | utf8 | {utf16,little} | {utf16,big} | {utf32,little} | {utf32,big}</v>
- <v>Length = int()</v>
- </type>
+ <type name="endian"/>
+ <type_desc variable="Bin">A binary() of byte_size 4 or more.</type_desc>
<desc>
<p>Check for a UTF byte order mark (BOM) in the beginning of a
- binary. If the supplied binary <c>Bin</c> begins with a valid
+ binary. If the supplied binary <c><anno>Bin</anno></c> begins with a valid
byte order mark for either UTF-8, UTF-16 or UTF-32, the function
returns the encoding identified along with the length of the BOM
in bytes.</p>
@@ -90,23 +123,24 @@ latin1_charlist() = [latin1_char() | latin1_binary() | latin1_charlist()]
</desc>
</func>
<func>
- <name>characters_to_list(Data) -> list() | {error, list(), RestData} | {incomplete, list(), binary()} </name>
+ <name name="characters_to_list" arity="1"/>
<fsummary>Convert a collection of characters to list of Unicode characters</fsummary>
- <type>
- <v>Data = latin1_chardata() | chardata() | external_chardata()</v>
- <v>RestData = latin1_chardata() | chardata() | external_chardata()</v>
- </type>
<desc>
- <p>Same as characters_to_list(Data,unicode).</p>
+ <p>Same as characters_to_list(<anno>Data</anno>,unicode).</p>
</desc>
</func>
<func>
- <name>characters_to_list(Data, InEncoding) -> list() | {error, list(), RestData} | {incomplete, list(), binary()} </name>
+ <name>characters_to_list(Data, InEncoding) -> Result</name>
<fsummary>Convert a collection of characters to list of Unicode characters</fsummary>
<type>
- <v>Data = latin1_chardata() | chardata() | external_chardata()</v>
- <v>RestData = latin1_chardata() | chardata() | external_chardata()</v>
- <v>InEncoding = latin1 | unicode | utf8 | utf16 | utf32 | {utf16,little} | {utf16,big} | {utf32,little} | {utf32,big}</v>
+ <v>Data = <seealso marker="#type-latin1_chardata">latin1_chardata()</seealso>
+ | <seealso marker="#type-chardata">chardata()</seealso>
+ | <seealso marker="#type-external_chardata">external_chardata()</seealso></v>
+ <v>Result = list() | {error, list(), RestData} | {incomplete, list(), binary()}</v>
+ <v>RestData = <seealso marker="#type-latin1_chardata">latin1_chardata()</seealso>
+ | <seealso marker="#type-chardata">chardata()</seealso>
+ | <seealso marker="#type-external_chardata">external_chardata()</seealso></v>
+ <v>InEncoding = <seealso marker="#type-encoding">encoding()</seealso></v>
</type>
<desc>
@@ -164,10 +198,15 @@ latin1_charlist() = [latin1_char() | latin1_binary() | latin1_charlist()]
<item>Integers out of range - If <c>InEncoding</c> is
<c>latin1</c>, an error occurs whenever an integer greater
than 255 is found in the lists. If <c>InEncoding</c> is
- of a Unicode type, error occurs whenever an integer greater than
- <c>16#10FFFF</c> (the maximum unicode character) or in the
- range <c>16#D800</c> to <c>16#DFFF</c> (invalid unicode
- range) is found.</item>
+ of a Unicode type, an error occurs whenever an integer
+ <list type="bulleted">
+ <item>greater than <c>16#10FFFF</c>
+ (the maximum Unicode character), or</item>
+ <item>in the range <c>16#D800</c> to <c>16#DFFF</c>
+ (invalid range reserved for UTF-16 surrogate pairs)</item>
+ </list>
+ is found.
+ </item>
<item>UTF encoding incorrect - If <c>InEncoding</c> is
one of the UTF types, the bytes in any binaries have to be valid
@@ -228,44 +267,42 @@ latin1_charlist() = [latin1_char() | latin1_binary() | latin1_charlist()]
</desc>
</func>
<func>
- <name>characters_to_binary(Data) -> binary() | {error, binary(), RestData} | {incomplete, binary(), binary()} </name>
- <fsummary>Convert a collection of characters to an UTF-8 binary</fsummary> <type>
- <v>Data = latin1_chardata() | chardata() | external_chardata()</v>
- <v>RestData = latin1_chardata() | chardata() | external_chardata()</v>
- </type>
+ <name name="characters_to_binary" arity="1"/>
+ <fsummary>Convert a collection of characters to a UTF-8 binary</fsummary>
<desc>
<p>Same as characters_to_binary(<anno>Data</anno>, unicode, unicode).</p>
</desc>
</func>
<func>
- <name>characters_to_binary(Data,InEncoding) -> binary() | {error, binary(), RestData} | {incomplete, binary(), binary()} </name>
- <fsummary>Convert a collection of characters to an UTF-8 binary</fsummary> <type>
- <v>Data = latin1_chardata() | chardata() | external_chardata()</v>
- <v>RestData = latin1_chardata() | chardata() | external_chardata()</v>
- <v>InEncoding = latin1 | unicode | utf8 | utf16 | utf32 | {utf16,little} | {utf16,big} | {utf32,little} | {utf32,big}</v>
+ <name>characters_to_binary(Data,InEncoding) -> Result</name>
+ <fsummary>Convert a collection of characters to a UTF-8 binary</fsummary>
+
+ <type>
+ <v>Data = <seealso marker="#type-latin1_chardata">latin1_chardata()</seealso>
+ | <seealso marker="#type-chardata">chardata()</seealso>
+ | <seealso marker="#type-external_chardata">external_chardata()</seealso></v>
+ <v>Result = binary() | {error, binary(), RestData} | {incomplete, binary(), binary()}</v>
+ <v>RestData = <seealso marker="#type-latin1_chardata">latin1_chardata()</seealso>
+ | <seealso marker="#type-chardata">chardata()</seealso>
+ | <seealso marker="#type-external_chardata">external_chardata()</seealso></v>
+ <v>InEncoding = <seealso marker="#type-encoding">encoding()</seealso></v>
</type>
<desc>
<p>Same as characters_to_binary(Data, InEncoding, unicode).</p>
</desc>
</func>
<func>
- <name>characters_to_binary(Data, InEncoding, OutEncoding) -> binary() | {error, binary(), RestData} | {incomplete, binary(), binary()} </name>
+ <name name="characters_to_binary" arity="3"/>
<fsummary>Convert a collection of characters to a UTF-8 binary</fsummary>
- <type>
- <v>Data = latin1_chardata() | chardata() | external_chardata()</v>
- <v>RestData = latin1_chardata() | chardata() | external_chardata()</v>
- <v>InEncoding = latin1 | unicode | utf8 | utf16 | utf32 | {utf16,little} | {utf16,big} | {utf32,little} | {utf32,big}</v>
- <v>OutEncoding = latin1 | unicode | utf8 | utf16 | utf32| {utf16,little} | {utf16,big} | {utf32,little} | {utf32,big}</v>
- </type>
<desc>
<p>This function behaves as <seealso
marker="#characters_to_list/2">
characters_to_list/2</seealso>, but produces a binary
instead of a unicode list. The
- <c>InEncoding</c> defines how input is to be interpreted if
+ <c><anno>InEncoding</anno></c> defines how input is to be interpreted if
binaries are present in the <c><anno>Data</anno></c>, while
- <c>OutEncoding</c> defines in what format output is to be
+ <c><anno>OutEncoding</anno></c> defines in what format output is to be
generated.</p>
<p>The option <c>unicode</c> is an alias for <c>utf8</c>, as this is the
@@ -285,17 +322,13 @@ latin1_charlist() = [latin1_char() | latin1_binary() | latin1_charlist()]
</desc>
</func>
<func>
- <name>encoding_to_bom(InEncoding) -> Bin</name>
+ <name name="encoding_to_bom" arity="1"/>
<fsummary>Create a binary UTF byte order mark from encoding.</fsummary>
- <type>
- <v>Bin = binary() of byte_size 4 or less</v>
- <v>InEncoding = latin1 | unicode | utf8 | utf16 | utf32 | {utf16,little} | {utf16,big} | {utf32,little} | {utf32,big}</v>
- <v>Length = int()</v>
- </type>
+ <type_desc variable="Bin">A binary() of byte_size 4 or less.</type_desc>
<desc>
<p>Create a UTF byte order mark (BOM) as a binary from the
- supplied <c>InEncoding</c>. The BOM is, if supported at all,
+ supplied <c><anno>InEncoding</anno></c>. The BOM is, if supported at all,
expected to be placed first in UTF encoded files or
messages.</p>