aboutsummaryrefslogtreecommitdiffstats
path: root/lib/stdlib/doc/src/unicode.xml
diff options
context:
space:
mode:
Diffstat (limited to 'lib/stdlib/doc/src/unicode.xml')
-rw-r--r--lib/stdlib/doc/src/unicode.xml174
1 files changed, 101 insertions, 73 deletions
diff --git a/lib/stdlib/doc/src/unicode.xml b/lib/stdlib/doc/src/unicode.xml
index cb1cfa8ed0..d02763f75c 100644
--- a/lib/stdlib/doc/src/unicode.xml
+++ b/lib/stdlib/doc/src/unicode.xml
@@ -38,50 +38,83 @@
<p>It is recommended to only use external encodings for communication with external entities where this is required. When working inside the Erlang/OTP environment, it is recommended to keep binaries in UTF-8 when representing Unicode characters. Latin1 encoding is supported both for backward compatibility and for communication with external entities not supporting Unicode character sets.</p>
</description>
- <section>
- <title>DATA TYPES</title>
- <marker id="type-charlist"></marker>
- <code type="none">
-unicode_binary() = binary() with characters encoded in UTF-8 coding standard
-unicode_char() = integer() representing valid unicode codepoint
-
-chardata() = charlist() | unicode_binary()
-
-charlist() = [unicode_char() | unicode_binary() | charlist()]
- a unicode_binary is allowed as the tail of the list</code>
-
- <code type="none">
-external_unicode_binary() = binary()
- with characters coded in a user specified Unicode encoding other
- than UTF-8 (UTF-16 or UTF-32)
-
-external_chardata() = external_charlist() | external_unicode_binary()
-
-external_charlist() = [unicode_char() | external_unicode_binary() | external_charlist()]
- an external_unicode_binary is allowed as the tail of the list</code>
-
- <code type="none">
-latin1_binary() = binary() with characters coded in iso-latin-1
-latin1_char() = integer() representing valid latin1 character (0-255)
-
-latin1_chardata() = latin1_charlist() | latin1_binary()
+ <datatypes>
+ <datatype>
+ <name name="encoding"/>
+ </datatype>
+ <datatype>
+ <name name="endian"/>
+ </datatype>
+ <datatype>
+ <name name="unicode_binary"/>
+ <desc>
+ <p>A binary() with characters encoded in the UTF-8 coding standard.</p>
+ </desc>
+ </datatype>
+ <datatype>
+ <name name="unicode_char"/>
+ <desc>
+ <p>An integer() representing a valid unicode codepoint.</p>
+ </desc>
+ </datatype>
+ <datatype>
+ <name name="chardata"/>
+ </datatype>
+ <datatype>
+ <name name="charlist"/>
+ <desc>
+ <p>A unicode_binary is allowed as the tail of the list.</p>
+ </desc>
+ </datatype>
+ <datatype>
+ <name name="external_unicode_binary"/>
+ <desc>
+ <p>A <c>binary()</c> with characters coded in a user specified Unicode
+ encoding other than UTF-8 (UTF-16 or UTF-32).</p>
+ </desc>
+ </datatype>
+ <datatype>
+ <name name="external_chardata"/>
+ </datatype>
+ <datatype>
+ <name name="external_charlist"/>
+ <desc>
+ <p>An <c>external_unicode_binary()</c> is allowed as the tail
+ of the list.</p>
+ </desc>
+ </datatype>
+ <datatype>
+ <name name="latin1_binary"/>
+ <desc><p>A <c>binary()</c> with characters coded in iso-latin-1.</p>
+ </desc>
+ </datatype>
+ <datatype>
+ <name name="latin1_char"/>
+ <desc><p>An <c>integer()</c> representing valid latin1
+ character (0-255).</p>
+ </desc>
+ </datatype>
+ <datatype>
+ <name name="latin1_chardata"/>
+ </datatype>
+ <datatype>
+ <name name="latin1_charlist"/>
+ <desc><p>A <c>latin1_binary()</c> is allowed as the tail of
+ the list.</p>
+ </desc>
+ </datatype>
+ </datatypes>
-latin1_charlist() = [latin1_char() | latin1_binary() | latin1_charlist()]
- a latin1_binary is allowed as the tail of the list</code>
- </section>
<funcs>
<func>
- <name>bom_to_encoding(Bin) -> {Encoding,Length}</name>
+ <name name="bom_to_encoding" arity="1"/>
<fsummary>Identify UTF byte order marks in a binary.</fsummary>
- <type>
- <v>Bin = binary() of byte_size 4 or more</v>
- <v>Encoding = latin1 | utf8 | {utf16,little} | {utf16,big} | {utf32,little} | {utf32,big}</v>
- <v>Length = int()</v>
- </type>
+ <type name="endian"/>
+ <type_desc variable="Bin">A binary() of byte_size 4 or more.</type_desc>
<desc>
<p>Check for a UTF byte order mark (BOM) in the beginning of a
- binary. If the supplied binary <c>Bin</c> begins with a valid
+ binary. If the supplied binary <c><anno>Bin</anno></c> begins with a valid
byte order mark for either UTF-8, UTF-16 or UTF-32, the function
returns the encoding identified along with the length of the BOM
in bytes.</p>
@@ -90,23 +123,24 @@ latin1_charlist() = [latin1_char() | latin1_binary() | latin1_charlist()]
</desc>
</func>
<func>
- <name>characters_to_list(Data) -> list() | {error, list(), RestData} | {incomplete, list(), binary()} </name>
+ <name name="characters_to_list" arity="1"/>
<fsummary>Convert a collection of characters to list of Unicode characters</fsummary>
- <type>
- <v>Data = latin1_chardata() | chardata() | external_chardata()</v>
- <v>RestData = latin1_chardata() | chardata() | external_chardata()</v>
- </type>
<desc>
- <p>Same as characters_to_list(Data,unicode).</p>
+ <p>Same as characters_to_list(<anno>Data</anno>,unicode).</p>
</desc>
</func>
<func>
- <name>characters_to_list(Data, InEncoding) -> list() | {error, list(), RestData} | {incomplete, list(), binary()} </name>
+ <name>characters_to_list(Data, InEncoding) -> Result</name>
<fsummary>Convert a collection of characters to list of Unicode characters</fsummary>
<type>
- <v>Data = latin1_chardata() | chardata() | external_chardata()</v>
- <v>RestData = latin1_chardata() | chardata() | external_chardata()</v>
- <v>InEncoding = latin1 | unicode | utf8 | utf16 | utf32 | {utf16,little} | {utf16,big} | {utf32,little} | {utf32,big}</v>
+ <v>Data = <seealso marker="#type-latin1_chardata">latin1_chardata()</seealso>
+ | <seealso marker="#type-chardata">chardata()</seealso>
+ | <seealso marker="#type-external_chardata">external_chardata()</seealso></v>
+ <v>Result = list() | {error, list(), RestData} | {incomplete, list(), binary()}</v>
+ <v>RestData = <seealso marker="#type-latin1_chardata">latin1_chardata()</seealso>
+ | <seealso marker="#type-chardata">chardata()</seealso>
+ | <seealso marker="#type-external_chardata">external_chardata()</seealso></v>
+ <v>InEncoding = <seealso marker="#type-encoding">encoding()</seealso></v>
</type>
<desc>
@@ -234,44 +268,42 @@ latin1_charlist() = [latin1_char() | latin1_binary() | latin1_charlist()]
</desc>
</func>
<func>
- <name>characters_to_binary(Data) -> binary() | {error, binary(), RestData} | {incomplete, binary(), binary()} </name>
- <fsummary>Convert a collection of characters to an UTF-8 binary</fsummary> <type>
- <v>Data = latin1_chardata() | chardata() | external_chardata()</v>
- <v>RestData = latin1_chardata() | chardata() | external_chardata()</v>
- </type>
+ <name name="characters_to_binary" arity="1"/>
+ <fsummary>Convert a collection of characters to an UTF-8 binary</fsummary>
<desc>
<p>Same as characters_to_binary(Data, unicode, unicode).</p>
</desc>
</func>
<func>
- <name>characters_to_binary(Data,InEncoding) -> binary() | {error, binary(), RestData} | {incomplete, binary(), binary()} </name>
- <fsummary>Convert a collection of characters to an UTF-8 binary</fsummary> <type>
- <v>Data = latin1_chardata() | chardata() | external_chardata()</v>
- <v>RestData = latin1_chardata() | chardata() | external_chardata()</v>
- <v>InEncoding = latin1 | unicode | utf8 | utf16 | utf32 | {utf16,little} | {utf16,big} | {utf32,little} | {utf32,big}</v>
+ <name>characters_to_binary(Data,InEncoding) -> Result</name>
+ <fsummary>Convert a collection of characters to an UTF-8 binary</fsummary>
+
+ <type>
+ <v>Data = <seealso marker="#type-latin1_chardata">latin1_chardata()</seealso>
+ | <seealso marker="#type-chardata">chardata()</seealso>
+ | <seealso marker="#type-external_chardata">external_chardata()</seealso></v>
+ <v>Result = binary() | {error, binary(), RestData} | {incomplete, binary(), binary()}</v>
+ <v>RestData = <seealso marker="#type-latin1_chardata">latin1_chardata()</seealso>
+ | <seealso marker="#type-chardata">chardata()</seealso>
+ | <seealso marker="#type-external_chardata">external_chardata()</seealso></v>
+ <v>InEncoding = <seealso marker="#type-encoding">encoding()</seealso></v>
</type>
<desc>
<p>Same as characters_to_binary(Data, InEncoding, unicode).</p>
</desc>
</func>
<func>
- <name>characters_to_binary(Data, InEncoding, OutEncoding) -> binary() | {error, binary(), RestData} | {incomplete, binary(), binary()} </name>
+ <name name="characters_to_binary" arity="3"/>
<fsummary>Convert a collection of characters to an UTF-8 binary</fsummary>
- <type>
- <v>Data = latin1_chardata() | chardata() | external_chardata()</v>
- <v>RestData = latin1_chardata() | chardata() | external_chardata()</v>
- <v>InEncoding = latin1 | unicode | utf8 | utf16 | utf32 | {utf16,little} | {utf16,big} | {utf32,little} | {utf32,big}</v>
- <v>OutEncoding = latin1 | unicode | utf8 | utf16 | utf32| {utf16,little} | {utf16,big} | {utf32,little} | {utf32,big}</v>
- </type>
<desc>
<p>This function behaves as <seealso
marker="#characters_to_list/2">
characters_to_list/2</seealso>, but produces an binary
instead of a unicode list. The
- <c>InEncoding</c> defines how input is to be interpreted if
+ <c><anno>InEncoding</anno></c> defines how input is to be interpreted if
binaries are present in the <c>Data</c>, while
- <c>OutEncoding</c> defines in what format output is to be
+ <c><anno>OutEncoding</anno></c> defines in what format output is to be
generated.</p>
<p>The option <c>unicode</c> is an alias for <c>utf8</c>, as this is the
@@ -291,17 +323,13 @@ latin1_charlist() = [latin1_char() | latin1_binary() | latin1_charlist()]
</desc>
</func>
<func>
- <name>encoding_to_bom(InEncoding) -> Bin</name>
+ <name name="encoding_to_bom" arity="1"/>
<fsummary>Create a binary UTF byte order mark from encoding.</fsummary>
- <type>
- <v>Bin = binary() of byte_size 4 or less</v>
- <v>InEncoding = latin1 | unicode | utf8 | utf16 | utf32 | {utf16,little} | {utf16,big} | {utf32,little} | {utf32,big}</v>
- <v>Length = int()</v>
- </type>
+ <type_desc variable="Bin">A binary() of byte_size 4 or more.</type_desc>
<desc>
<p>Create an UTF byte order mark (BOM) as a binary from the
- supplied <c>InEncoding</c>. The BOM is, if supported at all,
+ supplied <c><anno>InEncoding</anno></c>. The BOM is, if supported at all,
expected to be placed first in UTF encoded files or
messages.</p>