aboutsummaryrefslogtreecommitdiffstats
path: root/erts
diff options
context:
space:
mode:
authorSverker Eriksson <[email protected]>2013-01-23 18:09:35 +0100
committerSverker Eriksson <[email protected]>2013-01-23 18:09:35 +0100
commitb8e623410d1c22fe6d5fdeb8ccb0b2305533f033 (patch)
tree708d64e36e18b61ae1801c02ec3aeef42a697be3 /erts
parente99df74bee7c245ec76678e336fcd09d4b51a089 (diff)
parentd6e3e256b850050b7a86323b2948009d5fcc30a9 (diff)
downloadotp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.tar.gz
otp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.tar.bz2
otp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.zip
Merge branch 'sverk/r16/utf8-atoms'
* sverk/r16/utf8-atoms: erl_interface: Fix bug when transcoding atoms from and to UTF8 erl_interface: Changed erlang_char_encoding interface erts: Testcase doing unicode atom printout with ~w erl_interface: even more utf8 atom stuff erts: Fix bug in analyze_utf8 causing faulty latin1 detection Add UTF-8 node name support for epmd workaround... Fix merge conflict with hasse UTF-8 atom documentation test case erl_interface: utf8 atoms continued Add utf8 atom distribution test cases atom fixes for NIFs and atom_to_binary UTF-8 support for distribution Implement UTF-8 atom support for jinterface erl_interface: Enable decode of unicode atoms stdlib: Fix printing of unicode atoms erts: Change internal representation of atoms to utf8 erts: Refactor rename DFLAG(S)_INTERNAL_TAGS for conformity Conflicts: erts/emulator/beam/io.c OTP-10753
Diffstat (limited to 'erts')
-rw-r--r--erts/doc/src/erl_dist_protocol.xml288
-rw-r--r--erts/doc/src/erl_ext_dist.xml121
-rw-r--r--erts/doc/src/erlang.xml10
-rw-r--r--erts/emulator/beam/atom.c169
-rw-r--r--erts/emulator/beam/atom.h62
-rw-r--r--erts/emulator/beam/atom.names2
-rw-r--r--erts/emulator/beam/beam_load.c6
-rw-r--r--erts/emulator/beam/bif.c38
-rw-r--r--erts/emulator/beam/dist.c8
-rw-r--r--erts/emulator/beam/dist.h3
-rw-r--r--erts/emulator/beam/erl_alloc.c6
-rw-r--r--erts/emulator/beam/erl_alloc.types2
-rw-r--r--erts/emulator/beam/erl_alloc_util.c4
-rw-r--r--erts/emulator/beam/erl_bif_ddll.c5
-rwxr-xr-xerts/emulator/beam/erl_bif_info.c22
-rw-r--r--erts/emulator/beam/erl_bif_port.c2
-rw-r--r--erts/emulator/beam/erl_db.c2
-rw-r--r--erts/emulator/beam/erl_db_util.c3
-rw-r--r--erts/emulator/beam/erl_init.c4
-rw-r--r--erts/emulator/beam/erl_nif.c32
-rw-r--r--erts/emulator/beam/erl_unicode.c228
-rw-r--r--erts/emulator/beam/external.c292
-rw-r--r--erts/emulator/beam/external.h7
-rwxr-xr-xerts/emulator/beam/global.h7
-rw-r--r--erts/emulator/beam/io.c35
-rw-r--r--erts/emulator/beam/utils.c2
-rw-r--r--erts/emulator/test/bif_SUITE.erl4
-rw-r--r--erts/emulator/test/distribution_SUITE.erl325
-rw-r--r--erts/epmd/src/epmd_cli.c13
-rw-r--r--erts/epmd/src/epmd_int.h20
-rw-r--r--erts/epmd/src/epmd_srv.c215
-rw-r--r--erts/epmd/test/epmd_SUITE.erl47
32 files changed, 1608 insertions, 376 deletions
diff --git a/erts/doc/src/erl_dist_protocol.xml b/erts/doc/src/erl_dist_protocol.xml
index 6c725fc82d..0252187be5 100644
--- a/erts/doc/src/erl_dist_protocol.xml
+++ b/erts/doc/src/erl_dist_protocol.xml
@@ -547,13 +547,289 @@ If Result > 0, the packet only consists of [119, Result].
-->
</section>
-
+ <marker id="distribution_handshake"/>
<section>
- <title>Handshake</title>
- <p>
- The handshake is discussed in detail in the internal documentation for
- the kernel (Erlang) application.
- </p>
+ <title>Distribution Handshake</title>
+ <p>
+ This section describes the distribution handshake protocol
+ introduced in the OTP-R6 release of Erlang/OTP. This
+ description was previously located in
+ <c>$ERL_TOP/lib/kernel/internal_doc/distribution_handshake.txt</c>,
+ and has more or less been copied and "formatted" here. It has been
+ more or less unchanged since the year 1999, but the handshake
+ should not have changed much since then either.
+ </p>
+ <section>
+ <title>General</title>
+ <p>
+ The TCP/IP distribution uses a handshake which expects a
+ connection based protocol, i.e. the protocol does not include
+ any authentication after the handshake procedure.
+ </p>
+ <p>
+ This is not entirely safe, as it is vulnerable against takeover
+ attacks, but it is a tradeoff between fair safety and performance.
+ </p>
+ <p>
+ The cookies are never sent in cleartext and the handshake procedure
+ expects the client (called A) to be the first one to prove that it can
+ generate a sufficient digest. The digest is generated with the
+ MD5 message digest algorithm and the challenges are expected to be very
+ random numbers.
+ </p>
+ </section>
+ <section>
+ <title>Definitions</title>
+ <p>
+ A challenge is a 32 bit integer number in big endian order. Below the function
+ <c>gen_challenge()</c> returns a random 32 bit integer used as a challenge.
+ </p>
+ <p>
+ A digest is a (16 bytes) MD5 hash of the Challenge (as text) concatenated
+ with the cookie (as text). Below, the function <c>gen_digest(Challenge, Cookie)</c>
+ generates a digest as described above.
+ </p>
+ <p>
+ An out_cookie is the cookie used in outgoing communication to a certain node,
+ so that A's out_cookie for B should correspond with B's in_cookie for A and
+ the other way around. A's out_cookie for B and A's in_cookie for B need <em>NOT</em>
+ be the same. Below the function <c>out_cookie(Node)</c> returns the current
+ node's out_cookie for <c>Node</c>.
+ </p>
+ <p>
+ An in_cookie is the cookie expected to be used by another node when
+ communicating with us, so that A's in_cookie for B corresponds with B's
+ out_cookie for A. Below the function <c>in_cookie(Node)</c> returns the current
+ node's <c>in_cookie</c> for <c>Node</c>.
+ </p>
+ <p>
+ The cookies are text strings that can be viewed as passwords.
+ </p>
+ <p>
+ Every message in the handshake starts with a 16 bit big endian integer
+ which contains the length of the message (not counting the two initial bytes).
+ In erlang this corresponds to the <c>gen_tcp</c> option <c>{packet, 2}</c>. Note that after
+ the handshake, the distribution switches to 4 byte packet headers.
+ </p>
+
+ </section>
+ <section>
+ <title>The Handshake in Detail</title>
+ <p>
+ Imagine two nodes, node A, which initiates the handshake and node B, which
+ accepts the connection.
+ </p>
+ <taglist>
+ <tag>1) connect/accept</tag>
+ <item><p>A connects to B via TCP/IP and B accepts the connection.</p></item>
+ <tag>2) send_name/receive_name</tag>
+ <item><p>A sends an initial identification to B. B receives the message.
+ The message looks like this (every "square" being one byte and the packet
+ header removed):
+ </p>
+<pre>
++---+--------+--------+-----+-----+-----+-----+-----+-----+-...-+-----+
+|'n'|Version0|Version1|Flag0|Flag1|Flag2|Flag3|Name0|Name1| ... |NameN|
++---+--------+--------+-----+-----+-----+-----+-----+-----+-... +-----+
+</pre>
+ <p>
+ The 'n' is just a message tag.
+ Version0 and Version1 is the distribution version selected by node A,
+ based on information from EPMD. (16 bit big endian)
+ Flag0 ... Flag3 are capability flags, the capabilities defined in
+ <c>$ERL_TOP/lib/kernel/include/dist.hrl</c>.
+ (32 bit big endian)
+ Name0 ... NameN is the full nodename of A, as a string of bytes (the
+ packet length denotes how long it is).
+ </p></item>
+ <tag>3) recv_status/send_status</tag>
+ <item><p>B sends a status message to A, which indicates
+ if the connection is allowed. The following status codes are defined:</p>
+ <taglist>
+ <tag><c>ok</c></tag>
+ <item>The handshake will continue.</item>
+ <tag><c>ok_simultaneous</c></tag>
+ <item>The handshake will continue, but A is informed that B
+ has another ongoing connection attempt that will be
+ shut down (simultaneous connect where A's name is
+ greater than B's name, compared literally).</item>
+ <tag><c>nok</c></tag>
+ <item>The handshake will not continue, as B already has an ongoing handshake
+ which it itself has initiated. (simultaneous connect where B's name is
+ greater than A's).</item>
+ <tag><c>not_allowed</c></tag>
+ <item>The connection is disallowed for some (unspecified) security
+ reason.</item>
+ <tag><c>alive</c></tag>
+ <item>A connection to the node is already active, which either means
+ that node A is confused or that the TCP connection breakdown
+ of a previous node with this name has not yet reached node B.
+ See 3B below.</item>
+ </taglist>
+ <p>This is the format of the status message:</p>
+<pre>
++---+-------+-------+-...-+-------+
+|'s'|Status0|Status1| ... |StatusN|
++---+-------+-------+-...-+-------+
+</pre>
+ <p>
+ 's' is the message tag Status0 ... StatusN is the status as a string (not terminated)
+ </p>
+ </item>
+ <tag>3B) send_status/recv_status</tag>
+ <item><p>If status was 'alive', node A will answer with
+ another status message containing either 'true' which means that the
+ connection should continue (The old connection from this node is broken), or
+ <c>'false'</c>, which simply means that the connection should be closed, the
+ connection attempt was a mistake.</p></item>
+ <tag>4) recv_challenge/send_challenge</tag>
+ <item><p>If the status was <c>ok</c> or <c>ok_simultaneous</c>,
+ The handshake continues with B sending A another message, the challenge.
+ The challenge contains the same type of information as the "name" message
+ initially sent from A to B, with the addition of a 32 bit challenge:</p>
+<pre>
++---+--------+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-...-+-----+
+|'n'|Version0|Version1|Flag0|Flag1|Flag2|Flag3|Chal0|Chal1|Chal2|Chal3|Name0|Name1| ... |NameN|
++---+--------+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-... +-----+
+</pre>
+ <p>
+ Where Chal0 ... Chal3 is the challenge as a 32 bit big endian integer
+ and the other fields are B's version, flags and full nodename.
+ </p></item>
+ <tag>5) send_challenge_reply/recv_challenge_reply</tag>
+ <item><p>Now A has generated a digest and its own challenge. Those are
+ sent together in a package to B:</p>
+<pre>
++---+-----+-----+-----+-----+-----+-----+-----+-----+-...-+------+
+|'r'|Chal0|Chal1|Chal2|Chal3|Dige0|Dige1|Dige2|Dige3| ... |Dige15|
++---+-----+-----+-----+-----+-----+-----+-----+-----+-...-+------+
+</pre>
+ <p>
+ Where 'r' is the tag, Chal0 ... Chal3 is A's challenge for B to handle and
+ Dige0 ... Dige15 is the digest that A constructed from the challenge B sent
+ in the previous step.
+ </p></item>
+ <tag>6) recv_challenge_ack/send_challenge_ack</tag>
+ <item><p>B checks that the digest received from A is correct and generates a
+ digest from the challenge received from A. The digest is then sent to A. The
+ message looks like this:</p>
+<pre>
++---+-----+-----+-----+-----+-...-+------+
+|'a'|Dige0|Dige1|Dige2|Dige3| ... |Dige15|
++---+-----+-----+-----+-----+-...-+------+
+</pre>
+ <p>
+ Where 'a' is the tag and Dige0 ... Dige15 is the digest calculated by B
+ for A's challenge.</p></item>
+ <tag>7)</tag>
+ <item><p>A checks the digest from B and the connection is up.</p></item>
+ </taglist>
+ </section>
+ <section>
+ <title>Semigraphic View</title>
+<pre>
+A (initiator) B (acceptor)
+
+TCP connect -----------------------------------------&gt;
+ TCP accept
+
+send_name -----------------------------------------&gt;
+ recv_name
+
+ &lt;---------------------------------------- send_status
+recv_status
+(if status was 'alive'
+ send_status - - - - - - - - - - - - - - - - - - - -&gt;
+ recv_status)
+ ChB = gen_challenge()
+ (ChB)
+ &lt;---------------------------------------- send_challenge
+recv_challenge
+
+ChA = gen_challenge(),
+OCA = out_cookie(B),
+DiA = gen_digest(ChB,OCA)
+ (ChA, DiA)
+send_challenge_reply --------------------------------&gt;
+ recv_challenge_reply
+ ICB = in_cookie(A),
+ check:
+ DiA == gen_digest
+ (ChB, ICB) ?
+ - if OK:
+ OCB = out_cookie(A),
+ DiB = gen_digest
+ (DiB) (ChA, OCB)
+ &lt;----------------------------------------- send_challenge_ack
+recv_challenge_ack DONE
+ICA = in_cookie(B), - else
+check: CLOSE
+DiB == gen_digest(ChA,ICA) ?
+- if OK
+ DONE
+- else
+ CLOSE
+</pre>
+ </section>
+ <marker id="dflags"/>
+ <section>
+ <title>The Currently Defined Distribution Flags</title>
+ <p>
+ Currently (OTP-R16) the following capability flags are defined:
+ </p>
+<pre>
+%% The node should be published and part of the global namespace
+-define(DFLAG_PUBLISHED,1).
+
+%% The node implements an atom cache (obsolete)
+-define(DFLAG_ATOM_CACHE,2).
+
+%% The node implements extended (3 * 32 bits) references. This is
+%% required today. If not present connection will be refused.
+-define(DFLAG_EXTENDED_REFERENCES,4).
+
+%% The node implements distributed process monitoring.
+-define(DFLAG_DIST_MONITOR,8).
+
+%% The node uses separate tag for fun's (lambdas) in the distribution protocol.
+-define(DFLAG_FUN_TAGS,16#10).
+
+%% The node implements distributed named process monitoring.
+-define(DFLAG_DIST_MONITOR_NAME,16#20).
+
+%% The (hidden) node implements atom cache (obsolete)
+-define(DFLAG_HIDDEN_ATOM_CACHE,16#40).
+
+%% The node understand new fun-tags
+-define(DFLAG_NEW_FUN_TAGS,16#80).
+
+%% The node is capable of handling extended pids and ports. This is
+%% required today. If not present connection will be refused.
+-define(DFLAG_EXTENDED_PIDS_PORTS,16#100).
+
+%%
+-define(DFLAG_EXPORT_PTR_TAG,16#200).
+
+%%
+-define(DFLAG_BIT_BINARIES,16#400).
+
+%% The node understands new float format
+-define(DFLAG_NEW_FLOATS,16#800).
+
+%%
+-define(DFLAG_UNICODE_IO,16#1000).
+
+%% The node implements atom cache in distribution header.
+-define(DFLAG_DIST_HDR_ATOM_CACHE,16#2000).
+
+%% The node understand the SMALL_ATOM_EXT tag
+-define(DFLAG_SMALL_ATOM_TAGS, 16#4000).
+
+%% The node understand UTF-8 encoded atoms
+-define(DFLAG_UTF8_ATOMS, 16#10000).
+
+</pre>
+ </section>
</section>
<section>
diff --git a/erts/doc/src/erl_ext_dist.xml b/erts/doc/src/erl_ext_dist.xml
index fd2da2cfe3..28afea8b29 100644
--- a/erts/doc/src/erl_ext_dist.xml
+++ b/erts/doc/src/erl_ext_dist.xml
@@ -119,10 +119,39 @@
<cell align="center">Data</cell>
</row>
<tcaption></tcaption></table>
+ <marker id="utf8_atoms"/>
+ <note>
+ <p>As of ERTS version 5.10 (OTP-R16) support
+ for UTF-8 encoded atoms has been introduced in the external format.
+ However, only characters that can be encoded using Latin1 (ISO-8859-1)
+ are currently supported in atoms. The support for UTF-8 encoded atoms
+ in the external format has been implemented in order to be able to support
+ all Unicode characters in atoms in <em>some future release</em>. Full
+ support for Unicode atoms will not happen before OTP-R18, and might
+ be introduced even later than that. Until full Unicode support for
+ atoms has been introduced, it is an <em>error</em> to pass atoms containing
+ characters that cannot be encoded in Latin1, and <em>the behavior is
+ undefined</em>.</p>
+ <p>When the
+ <seealso marker="erl_dist_protocol#dflags"><c>DFLAG_UTF8_ATOMS</c></seealso>
+ distribution flag has been exchanged between both nodes in the
+ <seealso marker="erl_dist_protocol#distribution_handshake">distribution handshake</seealso>,
+ all atoms in the distribution header will be encoded in UTF-8; otherwise,
+ all atoms in the distribution header will be encoded in Latin1. The two
+ new tags <seealso marker="#ATOM_UTF8_EXT">ATOM_UTF8_EXT</seealso>, and
+ <seealso marker="#SMALL_ATOM_UTF8_EXT">SMALL_ATOM_UTF8_EXT</seealso>
+ will only be used if the <c>DFLAG_UTF8_ATOMS</c> distribution flag has
+ been exchanged between nodes, or if an atom containing characters
+ that cannot be encoded in Latin1 is encountered.
+ </p>
+ <p>The maximum number of allowed characters in an atom is 255. In the
+ UTF-8 case each character may need 4 bytes to be encoded.
+ </p>
+ </note>
</section>
- <section>
- <marker id="distribution_header"/>
+ <marker id="distribution_header"/>
+ <section>
<title>Distribution header</title>
<p>
As of erts version 5.7.2 the old atom cache protocol was
@@ -219,8 +248,7 @@
<p>
The least significant bit in that half byte is the <c>LongAtoms</c>
flag. If it is set, 2 bytes are used for atom lengths instead of
- 1 byte in the distribution header. However, the current emulator
- cannot handle long atoms, so it will currently always be 0.
+ 1 byte in the distribution header.
</p>
<p>
After the <c>Flags</c> field follow the <c>AtomCacheRefs</c>. The
@@ -247,16 +275,26 @@
<p>
<c>InternalSegmentIndex</c> together with the <c>SegmentIndex</c>
completely identify the location of an atom cache entry in the
- atom cache. <c>Length</c> is number of one byte characters that
- the atom text consists of. Length is a two byte big endian integer
+ atom cache. <c>Length</c> is number of bytes that <c>AtomText</c>
+ consists of. Length is a two byte big endian integer
if the <c>LongAtoms</c> flag has been set, otherwise a one byte
- integer. Subsequent <c>CachedAtomRef</c>s with the same
+ integer. When the
+ <seealso marker="erl_dist_protocol#dflags"><c>DFLAG_UTF8_ATOMS</c></seealso>
+ distribution flag has been exchanged between both nodes in the
+ <seealso marker="erl_dist_protocol#distribution_handshake">distribution handshake</seealso>,
+ characters in <c>AtomText</c> is encoded in UTF-8; otherwise,
+ encoded in Latin1. Subsequent <c>CachedAtomRef</c>s with the same
<c>SegmentIndex</c> and <c>InternalSegmentIndex</c> as this
<c>NewAtomCacheRef</c> will refer to this atom until a new
<c>NewAtomCacheRef</c> with the same <c>SegmentIndex</c>
and <c>InternalSegmentIndex</c> appear.
</p>
<p>
+ For more information on encoding of atoms, see
+ <seealso marker="#utf8_atoms">note on UTF-8 encoded atoms</seealso>
+ in the beginning of this document.
+ </p>
+ <p>
If the <c>NewCacheEntryFlag</c> for the next <c>AtomCacheRef</c>
has not been set, a <c>CachedAtomRef</c> on the following format
will follow:
@@ -383,9 +421,9 @@
<tcaption></tcaption></table>
<p>
An atom is stored with a 2 byte unsigned length in big-endian order,
- followed by <c>Len</c> numbers of 8 bit characters that forms the
- <c>AtomName</c>.
- Note: The maximum allowed value for <c>Len</c> is 255.
+ followed by <c>Len</c> numbers of 8 bit Latin1 characters that forms
+ the <c>AtomName</c>.
+ <em>Note</em>: The maximum allowed value for <c>Len</c> is 255.
</p>
</section>
@@ -754,12 +792,14 @@
<tcaption></tcaption></table>
<p>
An atom is stored with a 1 byte unsigned length,
- followed by <c>Len</c> numbers of 8 bit characters that
+ followed by <c>Len</c> numbers of 8 bit Latin1 characters that
forms the <c>AtomName</c>. Longer atoms can be represented
by <seealso marker="#ATOM_EXT">ATOM_EXT</seealso>. <em>Note</em>
the <c>SMALL_ATOM_EXT</c> was introduced in erts version 5.7.2 and
- require a small atom distribution flag exchanged in the distribution
- handshake.
+ require an exchange of the
+ <seealso marker="erl_dist_protocol#dflags"><c>DFLAG_SMALL_ATOM_TAGS</c></seealso>
+ distribution flag in the
+ <seealso marker="erl_dist_protocol#distribution_handshake">distribution handshake</seealso>.
</p>
</section>
@@ -1007,7 +1047,62 @@
This term is used in minor version 1 of the external format.
</p>
</section>
+ <section>
+ <marker id="ATOM_UTF8_EXT"/>
+ <title>ATOM_UTF8_EXT</title>
+
+ <table align="left">
+ <row>
+ <cell align="center">1</cell>
+ <cell align="center">2</cell>
+ <cell align="center">Len</cell>
+ </row>
+ <row>
+ <cell align="center"><c>118</c></cell>
+ <cell align="center"><c>Len</c></cell>
+ <cell align="center"><c>AtomName</c></cell>
+ </row>
+ <tcaption></tcaption></table>
+ <p>
+ An atom is stored with a 2 byte unsigned length in big-endian order,
+ followed by <c>Len</c> bytes containing the <c>AtomName</c> encoded
+ in UTF-8.
+ </p>
+ <p>
+ For more information on encoding of atoms, see
+ <seealso marker="#utf8_atoms">note on UTF-8 encoded atoms</seealso>
+ in the beginning of this document.
+ </p>
+ </section>
+ <section>
+ <marker id="SMALL_ATOM_UTF8_EXT"/>
+ <title>SMALL_ATOM_UTF8_EXT</title>
+
+ <table align="left">
+ <row>
+ <cell align="center">1</cell>
+ <cell align="center">1</cell>
+ <cell align="center">Len</cell>
+ </row>
+ <row>
+ <cell align="center"><c>119</c></cell>
+ <cell align="center"><c>Len</c></cell>
+ <cell align="center"><c>AtomName</c></cell>
+ </row>
+ <tcaption></tcaption></table>
+ <p>
+ An atom is stored with a 1 byte unsigned length,
+ followed by <c>Len</c> bytes containing the <c>AtomName</c> encoded
+ in UTF-8. Longer atoms encoded in UTF-8 can be represented using
+ <seealso marker="#ATOM_UTF8_EXT">ATOM_UTF8_EXT</seealso>.
+ </p>
+ <p>
+ For more information on encoding of atoms, see
+ <seealso marker="#utf8_atoms">note on UTF-8 encoded atoms</seealso>
+ in the beginning of this document.
+ </p>
+ </section>
</chapter>
diff --git a/erts/doc/src/erlang.xml b/erts/doc/src/erlang.xml
index d09f286e36..315dc323ba 100644
--- a/erts/doc/src/erlang.xml
+++ b/erts/doc/src/erlang.xml
@@ -277,7 +277,9 @@
the binary contains Unicode characters greater than 16#FF.
In a future release, such Unicode characters might be allowed
and <c>binary_to_atom(<anno>Binary</anno>, utf8)</c>
- will not fail in that case.</p></note>
+ will not fail in that case. For more information on Unicode support in atoms
+ see <seealso marker="erl_ext_dist#utf8_atoms">note on UTF-8 encoded atoms</seealso>
+ in the chapter about the external term format in the ERTS User's Guide.</p></note>
<pre>
> <input>binary_to_atom(&lt;&lt;"Erlang"&gt;&gt;, latin1).</input>
@@ -1712,9 +1714,11 @@ os_prompt% </pre>
<desc>
<p>Returns the atom whose text representation is <c><anno>String</anno></c>.</p>
<p><c><anno>String</anno></c> may only contain ISO-latin-1
- characterns (i.e. numbers below 256) as the current
+ characters (i.e. numbers below 256) as the current
implementation does not allow unicode characters >= 256 in
- atoms.</p>
+ atoms. For more information on Unicode support in atoms
+ see <seealso marker="erl_ext_dist#utf8_atoms">note on UTF-8 encoded atoms</seealso>
+ in the chapter about the external term format in the ERTS User's Guide.</p>
<pre>
> <input>list_to_atom("Erlang").</input>
'Erlang'</pre>
diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c
index d7c7f117cf..82dd320ea9 100644
--- a/erts/emulator/beam/atom.c
+++ b/erts/emulator/beam/atom.c
@@ -111,7 +111,7 @@ atom_text_alloc(int bytes)
{
byte *res;
- ASSERT(bytes <= MAX_ATOM_LENGTH);
+ ASSERT(bytes <= MAX_ATOM_SZ_LIMIT);
if (atom_text_pos + bytes >= atom_text_end) {
more_atom_space();
}
@@ -162,6 +162,7 @@ atom_alloc(Atom* tmpl)
obj->name = atom_text_alloc(tmpl->len);
sys_memcpy(obj->name, tmpl->name, tmpl->len);
obj->len = tmpl->len;
+ obj->latin1_chars = tmpl->latin1_chars;
obj->slot.index = -1;
/*
@@ -192,44 +193,146 @@ atom_free(Atom* obj)
erts_free(ERTS_ALC_T_ATOM, (void*) obj);
}
+static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp)
+{
+ byte* dst;
+ const byte* src = *srcp;
+ int i, len = *lenp;
+
+ for (i=0 ; i < len; ++i) {
+ if (src[i] & 0x80) {
+ goto need_convertion;
+ }
+ }
+ return;
+
+need_convertion:
+ sys_memcpy(conv_buf, src, i);
+ dst = conv_buf + i;
+ for ( ; i < len; ++i) {
+ unsigned char chr = src[i];
+ if (!(chr & 0x80)) {
+ *dst++ = chr;
+ }
+ else {
+ *dst++ = 0xC0 | (chr >> 6);
+ *dst++ = 0x80 | (chr & 0x3F);
+ }
+ }
+ *srcp = conv_buf;
+ *lenp = dst - conv_buf;
+}
+
+/*
+ * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned!
+ */
Eterm
-am_atom_put(const char* name, int len)
+erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
{
+ byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
+ const byte *text = name;
+ int tlen = len;
+ Sint no_latin1_chars;
Atom a;
- Eterm ret;
int aix;
- /*
- * Silently truncate the atom if it is too long. Overlong atoms
- * could occur in situations where we have no good way to return
- * an error, such as in the I/O system. (Unfortunately, many
- * drivers don't check for errors.)
- *
- * If an error should be produced for overlong atoms (such in
- * list_to_atom/1), the caller should check the length before
- * calling this function.
- */
- if (len > MAX_ATOM_LENGTH) {
- len = MAX_ATOM_LENGTH;
- }
#ifdef ERTS_ATOM_PUT_OPS_STAT
erts_smp_atomic_inc_nob(&atom_put_ops);
#endif
- a.len = len;
- a.name = (byte*)name;
+
+ if (tlen < 0) {
+ if (trunc)
+ tlen = 0;
+ else
+ return THE_NON_VALUE;
+ }
+
+ switch (enc) {
+ case ERTS_ATOM_ENC_7BIT_ASCII:
+ if (tlen > MAX_ATOM_CHARACTERS) {
+ if (trunc)
+ tlen = MAX_ATOM_CHARACTERS;
+ else
+ return THE_NON_VALUE;
+ }
+#ifdef DEBUG
+ for (aix = 0; aix < len; aix++) {
+ ASSERT((name[aix] & 0x80) == 0);
+ }
+#endif
+ no_latin1_chars = tlen;
+ break;
+ case ERTS_ATOM_ENC_LATIN1:
+ if (tlen > MAX_ATOM_CHARACTERS) {
+ if (trunc)
+ tlen = MAX_ATOM_CHARACTERS;
+ else
+ return THE_NON_VALUE;
+ }
+ no_latin1_chars = tlen;
+ latin1_to_utf8(utf8_copy, &text, &tlen);
+ break;
+ case ERTS_ATOM_ENC_UTF8:
+ /* First sanity check; need to verify later */
+ if (tlen > MAX_ATOM_SZ_LIMIT && !trunc)
+ return THE_NON_VALUE;
+ break;
+ }
+
+ a.len = tlen;
+ a.name = (byte *) text;
atom_read_lock();
aix = index_get(&erts_atom_table, (void*) &a);
atom_read_unlock();
- if (aix >= 0)
- ret = make_atom(aix);
- else {
- atom_write_lock();
- ret = make_atom(index_put(&erts_atom_table, (void*) &a));
- atom_write_unlock();
+ if (aix >= 0) {
+ /* Already in table no need to verify it */
+ return make_atom(aix);
}
- return ret;
+
+ if (enc == ERTS_ATOM_ENC_UTF8) {
+ /* Need to verify encoding and length */
+ byte *err_pos;
+ Uint no_chars;
+ switch (erts_analyze_utf8_x((byte *) text,
+ (Uint) tlen,
+ &err_pos,
+ &no_chars, NULL,
+ &no_latin1_chars,
+ MAX_ATOM_CHARACTERS)) {
+ case ERTS_UTF8_OK:
+ ASSERT(no_chars <= MAX_ATOM_CHARACTERS);
+ break;
+ case ERTS_UTF8_OK_MAX_CHARS:
+ /* Truncated... */
+ if (!trunc)
+ return THE_NON_VALUE;
+ ASSERT(no_chars == MAX_ATOM_CHARACTERS);
+ tlen = err_pos - text;
+ break;
+ default:
+ /* Bad utf8... */
+ return THE_NON_VALUE;
+ }
+ }
+
+ ASSERT(tlen <= MAX_ATOM_SZ_LIMIT);
+ ASSERT(-1 <= no_latin1_chars && no_latin1_chars <= MAX_ATOM_CHARACTERS);
+
+ a.len = tlen;
+ a.latin1_chars = (Sint16) no_latin1_chars;
+ a.name = (byte *) text;
+ atom_write_lock();
+ aix = index_put(&erts_atom_table, (void*) &a);
+ atom_write_unlock();
+ return make_atom(aix);
}
+Eterm
+am_atom_put(const char* name, int len)
+{
+ /* Assumes 7-bit ascii; use erts_atom_put() for other encodings... */
+ return erts_atom_put((byte *) name, len, ERTS_ATOM_ENC_7BIT_ASCII, 1);
+}
int atom_table_size(void)
{
@@ -264,14 +367,19 @@ int atom_table_sz(void)
}
int
-erts_atom_get(const char *name, int len, Eterm* ap)
+erts_atom_get(const char *name, int len, Eterm* ap, int is_latin1)
{
+ byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
Atom a;
int i;
int res;
- a.len = len;
+ a.len = (Sint16) len;
a.name = (byte *)name;
+ if (is_latin1) {
+ latin1_to_utf8(utf8_copy, (const byte**)&a.name, &len);
+ a.len = (Sint16) len;
+ }
atom_read_lock();
i = index_get(&erts_atom_table, (void*) &a);
res = i < 0 ? 0 : (*ap = make_atom(i), 1);
@@ -333,8 +441,15 @@ init_atom_table(void)
for (i = 0; erl_atom_names[i] != 0; i++) {
int ix;
a.len = strlen(erl_atom_names[i]);
+ a.latin1_chars = a.len;
a.name = (byte*)erl_atom_names[i];
a.slot.index = i;
+#ifdef DEBUG
+ /* Verify 7-bit ascii */
+ for (ix = 0; ix < a.len; ix++) {
+ ASSERT((a.name[ix] & 0x80) == 0);
+ }
+#endif
ix = index_put(&erts_atom_table, (void*) &a);
atom_text_pos -= a.len;
atom_space -= a.len;
diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h
index fd9c04d3d0..f721999a4c 100644
--- a/erts/emulator/beam/atom.h
+++ b/erts/emulator/beam/atom.h
@@ -26,7 +26,9 @@
#include "erl_atom_table.h"
-#define MAX_ATOM_LENGTH 255
+#define MAX_ATOM_CHARACTERS 255
+#define MAX_ATOM_SZ_FROM_LATIN1 (2*MAX_ATOM_CHARACTERS)
+#define MAX_ATOM_SZ_LIMIT (4*MAX_ATOM_CHARACTERS) /* theoretical byte limit */
#define ATOM_LIMIT (1024*1024)
#define MIN_ATOM_TABLE_SIZE 8192
@@ -45,7 +47,8 @@
*/
typedef struct atom {
IndexSlot slot; /* MUST BE LOCATED AT TOP OF STRUCT!!! */
- int len; /* length of atom name */
+ Sint16 len; /* length of atom name (UTF-8 encoded) */
+ Sint16 latin1_chars; /* 0-255 if atom can be encoded in latin1; otherwise, -1 */
int ord0; /* ordinal value of first 3 bytes + 7 bits */
byte* name; /* name of atom */
} Atom;
@@ -53,8 +56,8 @@ typedef struct atom {
extern IndexTable erts_atom_table;
ERTS_GLB_INLINE Atom* atom_tab(Uint i);
-ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term);
-ERTS_GLB_INLINE int erts_is_atom_str(char *str, Eterm term);
+ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term);
+ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1);
#if ERTS_GLB_INLINE_INCL_FUNC_DEF
ERTS_GLB_INLINE Atom*
@@ -63,7 +66,7 @@ atom_tab(Uint i)
return (Atom *) erts_index_lookup(&erts_atom_table, i);
}
-ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term)
+ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term)
{
Atom *a;
if (!is_atom(term))
@@ -73,43 +76,70 @@ ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term)
&& sys_memcmp((void *) a->name, (void *) text, len) == 0);
}
-ERTS_GLB_INLINE int erts_is_atom_str(char *str, Eterm term)
+ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1)
{
Atom *a;
int i, len;
- char *aname;
+ const byte* aname;
+ const byte* s = (const byte*) str;
+
if (!is_atom(term))
return 0;
a = atom_tab(atom_val(term));
len = a->len;
- aname = (char *) a->name;
- for (i = 0; i < len; i++)
- if (aname[i] != str[i] || str[i] == '\0')
- return 0;
- return str[len] == '\0';
+ aname = a->name;
+ if (is_latin1) {
+ for (i = 0; i < len; s++) {
+ if (aname[i] < 0x80) {
+ if (aname[i] != *s || *s == '\0')
+ return 0;
+ i++;
+ }
+ else {
+ if (aname[i] != (0xC0 | (*s >> 6)) ||
+ aname[i+1] != (0x80 | (*s & 0x3F))) {
+ return 0;
+ }
+ i += 2;
+ }
+ }
+ }
+ else {
+ for (i = 0; i < len; i++, s++)
+ if (aname[i] != *s || *s == '\0')
+ return 0;
+ }
+ return *s == '\0';
}
#endif
+typedef enum {
+ ERTS_ATOM_ENC_7BIT_ASCII,
+ ERTS_ATOM_ENC_LATIN1,
+ ERTS_ATOM_ENC_UTF8
+} ErtsAtomEncoding;
+
/*
* Note, ERTS_IS_ATOM_STR() expects the first argument to be a
- * string literal.
+ * 7-bit ASCII string literal.
*/
#define ERTS_IS_ATOM_STR(LSTR, TERM) \
- (erts_is_atom_bytes((byte *) LSTR, sizeof(LSTR) - 1, (TERM)))
+ (erts_is_atom_utf8_bytes((byte *) LSTR, sizeof(LSTR) - 1, (TERM)))
#define ERTS_DECL_AM(S) Eterm AM_ ## S = am_atom_put(#S, sizeof(#S) - 1)
#define ERTS_INIT_AM(S) AM_ ## S = am_atom_put(#S, sizeof(#S) - 1)
int atom_table_size(void); /* number of elements */
int atom_table_sz(void); /* table size in bytes, excluding stored objects */
-Eterm am_atom_put(const char*, int); /* most callers pass plain char*'s */
+Eterm am_atom_put(const char*, int); /* ONLY 7-bit ascii! */
+Eterm erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc);
int atom_erase(byte*, int);
int atom_static_put(byte*, int);
void init_atom_table(void);
void atom_info(int, void *);
void dump_atoms(int, void *);
-int erts_atom_get(const char* name, int len, Eterm* ap);
+int erts_atom_get(const char* name, int len, Eterm* ap, int is_latin1);
void erts_atom_get_text_space_sizes(Uint *reserved, Uint *used);
#endif
diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names
index b74e2785e5..590b2fc960 100644
--- a/erts/emulator/beam/atom.names
+++ b/erts/emulator/beam/atom.names
@@ -18,6 +18,8 @@
#
#
+# IMPORTANT! All atoms defined here *need* to be in 7-bit ascii!
+#
# File format:
#
# Lines starting with '#' are ignored.
diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c
index b51f076a5d..8b4135e21d 100644
--- a/erts/emulator/beam/beam_load.c
+++ b/erts/emulator/beam/beam_load.c
@@ -1230,7 +1230,7 @@ load_atom_table(LoaderState* stp)
GetByte(stp, n);
GetString(stp, atom, n);
- stp->atom[i] = am_atom_put((char*)atom, n);
+ stp->atom[i] = erts_atom_put(atom, n, ERTS_ATOM_ENC_LATIN1, 1);
}
/*
@@ -1240,7 +1240,7 @@ load_atom_table(LoaderState* stp)
if (is_nil(stp->module)) {
stp->module = stp->atom[1];
} else if (stp->atom[1] != stp->module) {
- char sbuf[256];
+ char sbuf[MAX_ATOM_SZ_FROM_LATIN1];
Atom* ap;
ap = atom_tab(atom_val(stp->atom[1]));
@@ -1620,7 +1620,7 @@ read_line_table(LoaderState* stp)
GetInt(stp, 2, n);
GetString(stp, fname, n);
- stp->fname[i] = am_atom_put((char*)fname, n);
+ stp->fname[i] = erts_atom_put(fname, n, ERTS_ATOM_ENC_LATIN1, 1);
}
}
diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c
index 182129cd36..bde70911a2 100644
--- a/erts/emulator/beam/bif.c
+++ b/erts/emulator/beam/bif.c
@@ -2604,9 +2604,13 @@ BIF_RETTYPE delete_element_2(BIF_ALIST_3)
BIF_RETTYPE atom_to_list_1(BIF_ALIST_1)
{
- Uint need;
- Eterm* hp;
Atom* ap;
+ Uint num_chars, num_built, num_eaten;
+ byte* err_pos;
+ Eterm res;
+#ifdef DEBUG
+ int ares;
+#endif
if (is_not_atom(BIF_ARG_1))
BIF_ERROR(BIF_P, BADARG);
@@ -2615,9 +2619,18 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1)
ap = atom_tab(atom_val(BIF_ARG_1));
if (ap->len == 0)
BIF_RET(NIL); /* the empty atom */
- need = ap->len*2;
- hp = HAlloc(BIF_P, need);
- BIF_RET(buf_to_intlist(&hp,(char*)ap->name,ap->len, NIL));
+
+#ifdef DEBUG
+ ares =
+#endif
+ erts_analyze_utf8(ap->name, ap->len, &err_pos, &num_chars, NULL);
+ ASSERT(ares == ERTS_UTF8_OK);
+
+ res = erts_utf8_to_list(BIF_P, num_chars, ap->name, ap->len, ap->len,
+ &num_built, &num_eaten, NIL);
+ ASSERT(num_built == num_chars);
+ ASSERT(num_eaten == ap->len);
+ BIF_RET(res);
}
/**********************************************************************/
@@ -2627,18 +2640,19 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1)
BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)
{
Eterm res;
- char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_LENGTH);
- int i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_LENGTH);
+ char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS);
+ int i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS);
if (i < 0) {
erts_free(ERTS_ALC_T_TMP, (void *) buf);
i = list_length(BIF_ARG_1);
- if (i > MAX_ATOM_LENGTH) {
+ if (i > MAX_ATOM_CHARACTERS) {
BIF_ERROR(BIF_P, SYSTEM_LIMIT);
}
BIF_ERROR(BIF_P, BADARG);
}
- res = am_atom_put(buf, i);
+ res = erts_atom_put((byte *) buf, i, ERTS_ATOM_ENC_LATIN1, 1);
+ ASSERT(is_atom(res));
erts_free(ERTS_ALC_T_TMP, (void *) buf);
BIF_RET(res);
}
@@ -2648,16 +2662,16 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)
BIF_RETTYPE list_to_existing_atom_1(BIF_ALIST_1)
{
int i;
- char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_LENGTH);
+ char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS);
- if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_LENGTH)) < 0) {
+ if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS)) < 0) {
error:
erts_free(ERTS_ALC_T_TMP, (void *) buf);
BIF_ERROR(BIF_P, BADARG);
} else {
Eterm a;
- if (erts_atom_get(buf, i, &a)) {
+ if (erts_atom_get(buf, i, &a, 1)) {
erts_free(ERTS_ALC_T_TMP, (void *) buf);
BIF_RET(a);
} else {
diff --git a/erts/emulator/beam/dist.c b/erts/emulator/beam/dist.c
index 64cd93a100..145e6861f6 100644
--- a/erts/emulator/beam/dist.c
+++ b/erts/emulator/beam/dist.c
@@ -1729,7 +1729,7 @@ dsig_send(ErtsDSigData *dsdp, Eterm ctl, Eterm msg, int force_busy)
data_size += erts_encode_dist_ext_size(ctl, flags, acmp);
if (is_value(msg))
data_size += erts_encode_dist_ext_size(msg, flags, acmp);
- erts_finalize_atom_cache_map(acmp);
+ erts_finalize_atom_cache_map(acmp, flags);
dhdr_ext_size = erts_encode_ext_dist_header_size(acmp);
data_size += dhdr_ext_size;
@@ -2073,7 +2073,8 @@ erts_dist_command(Port *prt, int reds_limit)
ASSERT(ob);
do {
ob->extp = erts_encode_ext_dist_header_finalize(ob->extp,
- dep->cache);
+ dep->cache,
+ flags);
if (!(flags & DFLAG_DIST_HDR_ATOM_CACHE))
*--ob->extp = PASS_THROUGH; /* Old node; 'pass through'
needed */
@@ -2117,7 +2118,8 @@ erts_dist_command(Port *prt, int reds_limit)
Uint size;
oq.first->extp
= erts_encode_ext_dist_header_finalize(oq.first->extp,
- dep->cache);
+ dep->cache,
+ flags);
reds += ERTS_PORT_REDS_DIST_CMD_FINALIZE;
if (!(flags & DFLAG_DIST_HDR_ATOM_CACHE))
*--oq.first->extp = PASS_THROUGH; /* Old node; 'pass through'
diff --git a/erts/emulator/beam/dist.h b/erts/emulator/beam/dist.h
index 2bc3d9c881..310d09768d 100644
--- a/erts/emulator/beam/dist.h
+++ b/erts/emulator/beam/dist.h
@@ -38,7 +38,8 @@
#define DFLAG_UNICODE_IO 0x1000
#define DFLAG_DIST_HDR_ATOM_CACHE 0x2000
#define DFLAG_SMALL_ATOM_TAGS 0x4000
-#define DFLAGS_INTERNAL_TAGS 0x8000
+#define DFLAG_INTERNAL_TAGS 0x8000
+#define DFLAG_UTF8_ATOMS 0x10000
/* All flags that should be enabled when term_to_binary/1 is used. */
#define TERM_TO_BINARY_DFLAGS (DFLAG_EXTENDED_REFERENCES \
diff --git a/erts/emulator/beam/erl_alloc.c b/erts/emulator/beam/erl_alloc.c
index 5da8c69c03..007cdbdfa6 100644
--- a/erts/emulator/beam/erl_alloc.c
+++ b/erts/emulator/beam/erl_alloc.c
@@ -3062,13 +3062,13 @@ erts_request_alloc_info(struct process *c_p,
Eterm alloc = CAR(consp);
for (ai = ERTS_ALC_A_MIN; ai <= ERTS_ALC_A_MAX; ai++)
- if (erts_is_atom_str((char *) erts_alc_a2ad[ai], alloc))
+ if (erts_is_atom_str(erts_alc_a2ad[ai], alloc, 0))
goto save_alloc;
- if (erts_is_atom_str("mseg_alloc", alloc)) {
+ if (erts_is_atom_str("mseg_alloc", alloc, 0)) {
ai = ERTS_ALC_INFO_A_MSEG_ALLOC;
goto save_alloc;
}
- if (erts_is_atom_str("alloc_util", alloc)) {
+ if (erts_is_atom_str("alloc_util", alloc, 0)) {
ai = ERTS_ALC_INFO_A_ALLOC_UTIL;
save_alloc:
if (req_ai[ai])
diff --git a/erts/emulator/beam/erl_alloc.types b/erts/emulator/beam/erl_alloc.types
index d4de0d076a..3d437652ce 100644
--- a/erts/emulator/beam/erl_alloc.types
+++ b/erts/emulator/beam/erl_alloc.types
@@ -49,6 +49,8 @@
# true after a "+enable X" statement or if it has been passed as a
# command line argument to make_alloc_types. The variable X is false
# after a "+disable X" statement or if it has never been mentioned.
+#
+# IMPORTANT! Only use 7-bit ascii text in this file!
+if smp
+disable threads_no_smp
diff --git a/erts/emulator/beam/erl_alloc_util.c b/erts/emulator/beam/erl_alloc_util.c
index 3d6761339b..6de0099636 100644
--- a/erts/emulator/beam/erl_alloc_util.c
+++ b/erts/emulator/beam/erl_alloc_util.c
@@ -2906,10 +2906,10 @@ make_name_atoms(Allctr_t *allctr)
char alloc[] = "alloc";
char realloc[] = "realloc";
char free[] = "free";
- char buf[MAX_ATOM_LENGTH];
+ char buf[MAX_ATOM_CHARACTERS];
size_t prefix_len = strlen(allctr->name_prefix);
- if (prefix_len > MAX_ATOM_LENGTH + sizeof(realloc) - 1)
+ if (prefix_len > MAX_ATOM_CHARACTERS + sizeof(realloc) - 1)
erl_exit(1,"Too long allocator name: %salloc\n",allctr->name_prefix);
memcpy((void *) buf, (void *) allctr->name_prefix, prefix_len);
diff --git a/erts/emulator/beam/erl_bif_ddll.c b/erts/emulator/beam/erl_bif_ddll.c
index 7cbea55eac..889fefacfc 100644
--- a/erts/emulator/beam/erl_bif_ddll.c
+++ b/erts/emulator/beam/erl_bif_ddll.c
@@ -1835,7 +1835,10 @@ static Eterm build_load_error_hp(Eterm *hp, int code)
static Eterm mkatom(char *str)
{
- return am_atom_put(str, sys_strlen(str));
+ return erts_atom_put((byte *) str,
+ sys_strlen(str),
+ ERTS_ATOM_ENC_LATIN1,
+ 1);
}
static char *pick_list_or_atom(Eterm name_term)
diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c
index fabddffc68..b8026063e6 100755
--- a/erts/emulator/beam/erl_bif_info.c
+++ b/erts/emulator/beam/erl_bif_info.c
@@ -2298,8 +2298,10 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1)
for (i = num_instructions-1; i >= 0; i--) {
res = erts_bld_cons(hpp, hszp,
erts_bld_tuple(hpp, hszp, 2,
- am_atom_put(opc[i].name,
- strlen(opc[i].name)),
+ erts_atom_put(opc[i].name,
+ strlen(opc[i].name),
+ ERTS_ATOM_ENC_LATIN1,
+ 1),
erts_bld_uint(hpp, hszp,
opc[i].count)),
res);
@@ -3873,7 +3875,7 @@ static Eterm lcnt_build_lock_stats_term(Eterm **hpp, Uint *szp, erts_lcnt_lock_s
timer_ns = stats->timer.ns;
timer_n = stats->timer_n;
- af = am_atom_put(stats->file, strlen(stats->file));
+ af = erts_atom_put(stats->file, strlen(stats->file), ERTS_ATOM_ENC_LATIN1, 1);
uil = erts_bld_uint( hpp, szp, line);
tloc = erts_bld_tuple(hpp, szp, 2, af, uil);
@@ -3910,13 +3912,13 @@ static Eterm lcnt_build_lock_term(Eterm **hpp, Uint *szp, erts_lcnt_lock_t *lock
ASSERT(ltype);
- type = am_atom_put(ltype, strlen(ltype));
- name = am_atom_put(lock->name, strlen(lock->name));
+ type = erts_atom_put(ltype, strlen(ltype), ERTS_ATOM_ENC_LATIN1, 1);
+ name = erts_atom_put(lock->name, strlen(lock->name), ERTS_ATOM_ENC_LATIN1, 1);
if (lock->flag & ERTS_LCNT_LT_ALLOC) {
/* use allocator types names as id's for allocator locks */
ltype = (char *) ERTS_ALC_A2AD(signed_val(lock->id));
- id = am_atom_put(ltype, strlen(ltype));
+ id = erts_atom_put(ltype, strlen(ltype), ERTS_ATOM_ENC_LATIN1, 1);
} else if (lock->flag & ERTS_LCNT_LT_PROCLOCK) {
/* use registered names as id's for process locks if available */
proc = erts_proc_lookup(lock->id);
@@ -3956,12 +3958,12 @@ static Eterm lcnt_build_result_term(Eterm **hpp, Uint *szp, erts_lcnt_data_t *da
dtns = erts_bld_uint( hpp, szp, data->duration.ns);
tdt = erts_bld_tuple(hpp, szp, 2, dts, dtns);
- adur = am_atom_put(str_duration, strlen(str_duration));
+ adur = erts_atom_put(str_duration, strlen(str_duration), ERTS_ATOM_ENC_LATIN1, 1);
tdur = erts_bld_tuple(hpp, szp, 2, adur, tdt);
/* lock tuple */
- aloc = am_atom_put(str_locks, strlen(str_locks));
+ aloc = erts_atom_put(str_locks, strlen(str_locks), ERTS_ATOM_ENC_LATIN1, 1);
for (lock = data->current_locks->head; lock != NULL ; lock = lock->next ) {
lloc = lcnt_build_lock_term(hpp, szp, lock, lloc);
@@ -4097,14 +4099,14 @@ BIF_RETTYPE erts_debug_lock_counters_1(BIF_ALIST_1)
static void os_info_init(void)
{
- Eterm type = am_atom_put(os_type, strlen(os_type));
+ Eterm type = erts_atom_put((byte *) os_type, strlen(os_type), ERTS_ATOM_ENC_LATIN1, 1);
Eterm flav;
int major, minor, build;
char* buf = erts_alloc(ERTS_ALC_T_TMP, 1024); /* More than enough */
Eterm* hp;
os_flavor(buf, 1024);
- flav = am_atom_put(buf, strlen(buf));
+ flav = erts_atom_put((byte *) buf, strlen(buf), ERTS_ATOM_ENC_LATIN1, 1);
erts_free(ERTS_ALC_T_TMP, (void *) buf);
hp = erts_alloc(ERTS_ALC_T_LL_TEMP_TERM, (3+4)*sizeof(Eterm));
os_type_tuple = TUPLE2(hp, type, flav);
diff --git a/erts/emulator/beam/erl_bif_port.c b/erts/emulator/beam/erl_bif_port.c
index 81146e38d7..a4b837541b 100644
--- a/erts/emulator/beam/erl_bif_port.c
+++ b/erts/emulator/beam/erl_bif_port.c
@@ -66,7 +66,7 @@ BIF_RETTYPE open_port_2(BIF_ALIST_2)
} else {
str = "einval";
}
- BIF_P->fvalue = am_atom_put(str, strlen(str));
+ BIF_P->fvalue = erts_atom_put((byte *) str, strlen(str), ERTS_ATOM_ENC_LATIN1, 1);
BIF_ERROR(BIF_P, EXC_ERROR);
}
diff --git a/erts/emulator/beam/erl_db.c b/erts/emulator/beam/erl_db.c
index 48a95cdf32..7932966539 100644
--- a/erts/emulator/beam/erl_db.c
+++ b/erts/emulator/beam/erl_db.c
@@ -3830,7 +3830,7 @@ erts_ets_colliding_names(Process* p, Eterm name, Uint cnt)
while (index >= atom_table_size()) {
char tmp[20];
erts_snprintf(tmp, sizeof(tmp), "am%x", atom_table_size());
- am_atom_put(tmp,strlen(tmp));
+ erts_atom_put((byte *) tmp, strlen(tmp), ERTS_ATOM_ENC_LATIN1, 1);
}
list = CONS(hp, make_atom(index), list);
hp += 2;
diff --git a/erts/emulator/beam/erl_db_util.c b/erts/emulator/beam/erl_db_util.c
index bb08762b26..58c4d75c31 100644
--- a/erts/emulator/beam/erl_db_util.c
+++ b/erts/emulator/beam/erl_db_util.c
@@ -4773,7 +4773,8 @@ static int match_compact(ErlHeapFragment *expr, DMCErrInfo *err_info)
ASSERT(j < x);
erts_snprintf(buff+1, sizeof(buff) - 1, "%u", (unsigned) j);
/* Yes, writing directly into terms, they ARE off heap */
- *p = am_atom_put(buff, strlen(buff));
+ *p = erts_atom_put((byte *) buff, strlen(buff),
+ ERTS_ATOM_ENC_LATIN1, 1);
}
++p;
}
diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c
index b518683730..61b3c09d16 100644
--- a/erts/emulator/beam/erl_init.c
+++ b/erts/emulator/beam/erl_init.c
@@ -358,7 +358,7 @@ erl_first_process_otp(char* modname, void* code, unsigned size, int argc, char**
ErlSpawnOpts so;
Eterm env;
- start_mod = am_atom_put(modname, sys_strlen(modname));
+ start_mod = erts_atom_put((byte *) modname, sys_strlen(modname), ERTS_ATOM_ENC_LATIN1, 1);
if (erts_find_function(start_mod, am_start, 2,
erts_active_code_ix()) == NULL) {
erl_exit(5, "No function %s:start/2\n", modname);
@@ -455,7 +455,7 @@ load_preloaded(void)
i = 0;
while ((name = preload_p[i].name) != NULL) {
length = preload_p[i].size;
- module_name = am_atom_put(name, sys_strlen(name));
+ module_name = erts_atom_put((byte *) name, sys_strlen(name), ERTS_ATOM_ENC_LATIN1, 1);
if ((code = sys_preload_begin(&preload_p[i])) == 0)
erl_exit(1, "Failed to find preloaded code for module %s\n",
name);
diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c
index 1bd2d933b2..fb295c9a8a 100644
--- a/erts/emulator/beam/erl_nif.c
+++ b/erts/emulator/beam/erl_nif.c
@@ -743,16 +743,23 @@ int enif_get_atom(ErlNifEnv* env, Eterm atom, char* buf, unsigned len,
{
Atom* ap;
ASSERT(encoding == ERL_NIF_LATIN1);
- if (is_not_atom(atom)) {
+ if (is_not_atom(atom) || len==0) {
return 0;
}
ap = atom_tab(atom_val(atom));
- if (ap->len+1 > len) {
+
+ if (ap->latin1_chars < 0 || ap->latin1_chars >= len) {
return 0;
}
- sys_memcpy(buf, ap->name, ap->len);
- buf[ap->len] = '\0';
- return ap->len + 1;
+ if (ap->latin1_chars == ap->len) {
+ sys_memcpy(buf, ap->name, ap->len);
+ }
+ else {
+ int dlen = erts_utf8_to_latin1((byte*)buf, ap->name, ap->len);
+ ASSERT(dlen == ap->latin1_chars); (void)dlen;
+ }
+ buf[ap->latin1_chars] = '\0';
+ return ap->latin1_chars + 1;
}
int enif_get_int(ErlNifEnv* env, Eterm term, int* ip)
@@ -854,7 +861,10 @@ int enif_get_atom_length(ErlNifEnv* env, Eterm atom, unsigned* len,
ASSERT(enc == ERL_NIF_LATIN1);
if (is_not_atom(atom)) return 0;
ap = atom_tab(atom_val(atom));
- *len = ap->len;
+ if (ap->latin1_chars < 0) {
+ return 0;
+ }
+ *len = ap->latin1_chars;
return 1;
}
@@ -961,7 +971,7 @@ ERL_NIF_TERM enif_make_atom(ErlNifEnv* env, const char* name)
ERL_NIF_TERM enif_make_atom_len(ErlNifEnv* env, const char* name, size_t len)
{
- return am_atom_put(name, len);
+ return erts_atom_put((byte*)name, len, ERTS_ATOM_ENC_LATIN1, 1);
}
int enif_make_existing_atom(ErlNifEnv* env, const char* name, ERL_NIF_TERM* atom,
@@ -974,7 +984,7 @@ int enif_make_existing_atom_len(ErlNifEnv* env, const char* name, size_t len,
ERL_NIF_TERM* atom, ErlNifCharEncoding encoding)
{
ASSERT(encoding == ERL_NIF_LATIN1);
- return erts_atom_get(name, len, atom);
+ return erts_atom_get(name, len, atom, 1);
}
ERL_NIF_TERM enif_make_tuple(ErlNifEnv* env, unsigned cnt, ...)
@@ -1633,7 +1643,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
"this vm variant (%s).",
entry->vm_variant, ERL_NIF_VM_VARIANT);
}
- else if (!erts_is_atom_str((char*)entry->name, mod_atom)) {
+ else if (!erts_is_atom_str((char*)entry->name, mod_atom, 1)) {
ret = load_nif_error(BIF_P, bad_lib, "Library module name '%s' does not"
" match calling module '%T'", entry->name, mod_atom);
}
@@ -1643,7 +1653,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
for (i=0; i < entry->num_of_funcs && ret==am_ok; i++) {
BeamInstr** code_pp;
ErlNifFunc* f = &entry->funcs[i];
- if (!erts_atom_get(f->name, sys_strlen(f->name), &f_atom)
+ if (!erts_atom_get(f->name, sys_strlen(f->name), &f_atom, 1)
|| (code_pp = get_func_pp(mod->curr.code, f_atom, f->arity))==NULL) {
ret = load_nif_error(BIF_P,bad_lib,"Function not found %T:%s/%u",
mod_atom, f->name, f->arity);
@@ -1746,7 +1756,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
for (i=0; i < entry->num_of_funcs; i++)
{
BeamInstr* code_ptr;
- erts_atom_get(entry->funcs[i].name, sys_strlen(entry->funcs[i].name), &f_atom);
+ erts_atom_get(entry->funcs[i].name, sys_strlen(entry->funcs[i].name), &f_atom, 1);
code_ptr = *get_func_pp(mod->curr.code, f_atom, entry->funcs[i].arity);
if (code_ptr[1] == 0) {
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
index 51559aea1c..883405d066 100644
--- a/erts/emulator/beam/erl_unicode.c
+++ b/erts/emulator/beam/erl_unicode.c
@@ -1154,15 +1154,24 @@ BIF_RETTYPE unicode_characters_to_list_2(BIF_ALIST_2)
* When input to characters_to_list is a plain binary and the format is 'unicode', we do
* a faster analyze and size count with this function.
*/
-int erts_analyze_utf8(byte *source, Uint size,
- byte **err_pos, Uint *num_chars, int *left)
+static ERTS_INLINE int
+analyze_utf8(byte *source, Uint size, byte **err_pos, Uint *num_chars, int *left,
+ Sint *num_latin1_chars, Uint max_chars)
{
+ Uint latin1_count;
+ int is_latin1;
*err_pos = source;
+ if (num_latin1_chars) {
+ is_latin1 = 1;
+ latin1_count = 0;
+ }
*num_chars = 0;
while (size) {
if (((*source) & ((byte) 0x80)) == 0) {
source++;
- --size;
+ --size;
+ if (num_latin1_chars)
+ latin1_count++;
} else if (((*source) & ((byte) 0xE0)) == 0xC0) {
if (size < 2) {
return ERTS_UTF8_INCOMPLETE;
@@ -1171,6 +1180,11 @@ int erts_analyze_utf8(byte *source, Uint size,
((*source) < 0xC2) /* overlong */) {
return ERTS_UTF8_ERROR;
}
+ if (num_latin1_chars) {
+ latin1_count++;
+ if ((source[0] & ((byte) 0xFC)) != ((byte) 0xC0))
+ is_latin1 = 0;
+ }
source += 2;
size -= 2;
} else if (((*source) & ((byte) 0xF0)) == 0xE0) {
@@ -1188,6 +1202,8 @@ int erts_analyze_utf8(byte *source, Uint size,
}
source += 3;
size -= 3;
+ if (num_latin1_chars)
+ is_latin1 = 0;
} else if (((*source) & ((byte) 0xF8)) == 0xF0) {
if (size < 4) {
return ERTS_UTF8_INCOMPLETE;
@@ -1205,21 +1221,40 @@ int erts_analyze_utf8(byte *source, Uint size,
}
source += 4;
size -= 4;
+ if (num_latin1_chars)
+ is_latin1 = 0;
} else {
return ERTS_UTF8_ERROR;
}
++(*num_chars);
*err_pos = source;
- if (left && --(*left) <= 0) {
+ if (max_chars && size > 0 && *num_chars == max_chars)
+ return ERTS_UTF8_OK_MAX_CHARS;
+ if (left && --(*left) <= 0 && size) {
return ERTS_UTF8_ANALYZE_MORE;
}
}
+ if (num_latin1_chars)
+ *num_latin1_chars = is_latin1 ? latin1_count : -1;
return ERTS_UTF8_OK;
}
+int erts_analyze_utf8(byte *source, Uint size,
+ byte **err_pos, Uint *num_chars, int *left)
+{
+ return analyze_utf8(source, size, err_pos, num_chars, left, NULL, 0);
+}
+
+int erts_analyze_utf8_x(byte *source, Uint size,
+ byte **err_pos, Uint *num_chars, int *left,
+ Sint *num_latin1_chars, Uint max_chars)
+{
+ return analyze_utf8(source, size, err_pos, num_chars, left, num_latin1_chars, max_chars);
+}
+
/*
* No errors should be able to occur - no overlongs, no malformed, no nothing
- */
+ */
static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz,
Uint left,
Uint *num_built, Uint *num_eaten, Eterm tail)
@@ -1275,6 +1310,12 @@ static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz,
return ret;
}
+Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left,
+ Uint *num_built, Uint *num_eaten, Eterm tail)
+{
+ return do_utf8_to_list(p, num, bytes, sz, left, num_built, num_eaten, tail);
+}
+
static int is_candidate(Uint cp)
{
int index,pos;
@@ -1812,31 +1853,25 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2)
ap = atom_tab(atom_val(BIF_ARG_1));
if (BIF_ARG_2 == am_latin1) {
- BIF_RET(new_binary(BIF_P, ap->name, ap->len));
- } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) {
- int bin_size = 0;
- int i;
Eterm bin_term;
- byte* bin_p;
- for (i = 0; i < ap->len; i++) {
- bin_size += (ap->name[i] >= 0x80) ? 2 : 1;
+ if (ap->latin1_chars < 0) {
+ goto error;
}
- if (bin_size == ap->len) {
- BIF_RET(new_binary(BIF_P, ap->name, ap->len));
+ if (ap->latin1_chars == ap->len) {
+ bin_term = new_binary(BIF_P, ap->name, ap->len);
}
- bin_term = new_binary(BIF_P, 0, bin_size);
- bin_p = binary_bytes(bin_term);
- for (i = 0; i < ap->len; i++) {
- byte b = ap->name[i];
- if (b < 0x80) {
- *bin_p++ = b;
- } else {
- *bin_p++ = 0xC0 | (b >> 6);
- *bin_p++ = 0x80 | (b & 0x3F);
- }
+ else {
+ byte* bin_p;
+ int dbg_sz;
+ bin_term = new_binary(BIF_P, 0, ap->latin1_chars);
+ bin_p = binary_bytes(bin_term);
+ dbg_sz = erts_utf8_to_latin1(bin_p, ap->name, ap->len);
+ ASSERT(dbg_sz == ap->latin1_chars); (void)dbg_sz;
}
BIF_RET(bin_term);
+ } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) {
+ BIF_RET(new_binary(BIF_P, ap->name, ap->len));
} else {
error:
BIF_ERROR(BIF_P, BADARG);
@@ -1844,118 +1879,78 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2)
}
static BIF_RETTYPE
-binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist)
+binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist)
{
byte* bytes;
byte *temp_alloc = NULL;
Uint bin_size;
if ((bytes = erts_get_aligned_binary_bytes(bin, &temp_alloc)) == 0) {
- BIF_ERROR(p, BADARG);
+ BIF_ERROR(proc, BADARG);
}
bin_size = binary_size(bin);
if (enc == am_latin1) {
Eterm a;
- if (bin_size > MAX_ATOM_LENGTH) {
+ if (bin_size > MAX_ATOM_CHARACTERS) {
system_limit:
erts_free_aligned_binary_bytes(temp_alloc);
- BIF_ERROR(p, SYSTEM_LIMIT);
+ BIF_ERROR(proc, SYSTEM_LIMIT);
}
if (!must_exist) {
- a = am_atom_put((char *)bytes, bin_size);
- erts_free_aligned_binary_bytes(temp_alloc);
+ a = erts_atom_put((byte *) bytes,
+ bin_size,
+ ERTS_ATOM_ENC_LATIN1,
+ 0);
+ erts_free_aligned_binary_bytes(temp_alloc);
+ if (is_non_value(a))
+ goto badarg;
BIF_RET(a);
- } else if (erts_atom_get((char *)bytes, bin_size, &a)) {
+ } else if (erts_atom_get((char *)bytes, bin_size, &a, 1)) {
erts_free_aligned_binary_bytes(temp_alloc);
BIF_RET(a);
} else {
goto badarg;
}
} else if (enc == am_utf8 || enc == am_unicode) {
- char *buf;
- char *dst;
- int i;
- int num_chars;
Eterm res;
+ Uint num_chars = 0;
+ const byte* p = bytes;
+ Uint left = bin_size;
- if (bin_size > 2*MAX_ATOM_LENGTH) {
- byte* err_pos;
- Uint n;
- int reds_left = bin_size+1; /* Number of reductions left. */
-
- if (erts_analyze_utf8(bytes, bin_size, &err_pos,
- &n, &reds_left) == ERTS_UTF8_OK) {
- /*
- * Correct UTF-8 encoding, but too many characters to
- * fit in an atom.
- */
+ while (left) {
+ if (++num_chars > MAX_ATOM_CHARACTERS) {
goto system_limit;
- } else {
- /*
- * Something wrong in the UTF-8 encoding or Unicode code
- * points > 255.
- */
- goto badarg;
}
- }
-
- /*
- * Allocate a temporary buffer the same size as the binary,
- * so that we don't need an extra overflow test.
- */
- buf = (char *) erts_alloc(ERTS_ALC_T_TMP, bin_size);
- dst = buf;
- for (i = 0; i < bin_size; i++) {
- int c = bytes[i];
- if (c < 0x80) {
- *dst++ = c;
- } else if (i < bin_size-1) {
- int c2;
- if ((c & 0xE0) != 0xC0) {
- goto free_badarg;
- }
- i++;
- c = (c & 0x3F) << 6;
- c2 = bytes[i];
- if ((c2 & 0xC0) != 0x80) {
- goto free_badarg;
- }
- c = c | (c2 & 0x3F);
- if (0x80 <= c && c < 256) {
- *dst++ = c;
- } else {
- goto free_badarg;
- }
- } else {
- free_badarg:
- erts_free(ERTS_ALC_T_TMP, (void *) buf);
- goto badarg;
+ if ((p[0] & 0x80) == 0) {
+ ++p;
+ --left;
}
+ else if (left >= 2
+ && (p[0] & 0xFE) == 0xC2 /* only allow latin1 subset */
+ && (p[1] & 0xC0) == 0x80) {
+ p += 2;
+ left -= 2;
+ }
+ else goto badarg;
}
- num_chars = dst - buf;
- if (num_chars > MAX_ATOM_LENGTH) {
- erts_free(ERTS_ALC_T_TMP, (void *) buf);
- goto system_limit;
- }
+
if (!must_exist) {
- res = am_atom_put(buf, num_chars);
- erts_free(ERTS_ALC_T_TMP, (void *) buf);
- erts_free_aligned_binary_bytes(temp_alloc);
- BIF_RET(res);
- } else {
- int exists = erts_atom_get(buf, num_chars, &res);
- erts_free(ERTS_ALC_T_TMP, (void *) buf);
- if (exists) {
- erts_free_aligned_binary_bytes(temp_alloc);
- BIF_RET(res);
- } else {
- goto badarg;
- }
+ res = erts_atom_put((byte *) bytes,
+ bin_size,
+ ERTS_ATOM_ENC_UTF8,
+ 0);
+ }
+ else if (!erts_atom_get((char*)bytes, bin_size, &res, 0)) {
+ goto badarg;
}
+ erts_free_aligned_binary_bytes(temp_alloc);
+ if (is_non_value(res))
+ goto badarg;
+ BIF_RET(res);
} else {
badarg:
erts_free_aligned_binary_bytes(temp_alloc);
- BIF_ERROR(p, BADARG);
+ BIF_ERROR(proc, BADARG);
}
}
@@ -2670,3 +2665,28 @@ BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0)
}
}
+int erts_utf8_to_latin1(byte* dest, const byte* source, int slen)
+{
+ /*
+ * Assumes source contains valid utf8 that can be encoded as latin1,
+ * and that dest has enough room.
+ */
+ byte* dp = dest;
+
+ while (slen > 0) {
+ if ((source[0] & 0x80) == 0) {
+ *dp++ = *source++;
+ --slen;
+ }
+ else {
+ ASSERT(slen > 1);
+ ASSERT((source[0] & 0xFE) == 0xC2);
+ ASSERT((source[1] & 0xC0) == 0x80);
+ *dp++ = (char) ((source[0] << 6) | (source[1] & 0x3F));
+ source += 2;
+ slen -= 2;
+ }
+ }
+ return dp - dest;
+}
+
diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c
index ab1065aaa1..8c4d9108d4 100644
--- a/erts/emulator/beam/external.c
+++ b/erts/emulator/beam/external.c
@@ -142,6 +142,7 @@ erts_init_atom_cache_map(ErtsAtomCacheMap *acmp)
{
if (acmp) {
int ix;
+ acmp->long_atoms = 0;
for (ix = 0; ix < ERTS_ATOM_CACHE_SIZE; ix++)
acmp->cache[ix].iix = -1;
acmp->sz = 0;
@@ -154,6 +155,7 @@ erts_reset_atom_cache_map(ErtsAtomCacheMap *acmp)
{
if (acmp) {
int i;
+ acmp->long_atoms = 0;
for (i = 0; i < acmp->sz; i++) {
ASSERT(0 <= acmp->cix[i] && acmp->cix[i] < ERTS_ATOM_CACHE_SIZE);
acmp->cache[acmp->cix[i]].iix = -1;
@@ -175,9 +177,23 @@ erts_destroy_atom_cache_map(ErtsAtomCacheMap *acmp)
}
static ERTS_INLINE void
-insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
+insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom, Uint32 dflags)
{
- if (acmp && acmp->sz < ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES) {
+ /*
+ * If the receiver do not understand utf8 atoms
+ * and this atom cannot be represented in latin1,
+ * we are not allowed to cache it.
+ *
+ * In this case all atoms are assumed to have
+ * latin1 encoding in the cache. By refusing it
+ * in the cache we will instead encode it using
+ * ATOM_UTF8_EXT/SMALL_ATOM_UTF8_EXT which the
+ * receiver do not recognize and tear down the
+ * connection.
+ */
+ if (acmp && acmp->sz < ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES
+ && ((dflags & DFLAG_UTF8_ATOMS)
+ || atom_tab(atom_val(atom))->latin1_chars >= 0)) {
int ix;
ASSERT(acmp->hdr_sz < 0);
ix = atom2cix(atom);
@@ -190,7 +206,7 @@ insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
}
static ERTS_INLINE int
-get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
+get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom, Uint32 dflags)
{
if (!acmp)
return -1;
@@ -199,7 +215,9 @@ get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
ASSERT(is_atom(atom));
ix = atom2cix(atom);
if (acmp->cache[ix].iix < 0) {
- ASSERT(acmp->sz == ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES);
+ ASSERT(acmp->sz == ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES
+ || (!(dflags & DFLAG_UTF8_ATOMS)
+ && atom_tab(atom_val(atom))->latin1_chars < 0));
return -1;
}
else {
@@ -210,18 +228,17 @@ get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
}
void
-erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp)
+erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp, Uint32 dflags)
{
if (acmp) {
-#if MAX_ATOM_LENGTH > 255
-#error "This code is not complete; long_atoms info need to be passed to the following stages."
- int long_atoms = 0; /* !0 if one or more atoms are long than 255. */
-#endif
+ int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS);
+ int long_atoms = 0; /* !0 if one or more atoms are longer than 255. */
int i;
int sz;
int fix_sz
= 1 /* VERSION_MAGIC */
+ 1 /* DIST_HEADER */
+ + 1 /* dist header flags */
+ 1 /* number of internal cache entries */
;
int min_sz;
@@ -230,22 +247,23 @@ erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp)
min_sz = fix_sz+(2+4)*acmp->sz;
sz = fix_sz;
for (i = 0; i < acmp->sz; i++) {
+ Atom *a;
Eterm atom;
int len;
atom = acmp->cache[acmp->cix[i]].atom;
ASSERT(is_atom(atom));
- len = atom_tab(atom_val(atom))->len;
-#if MAX_ATOM_LENGTH > 255
+ a = atom_tab(atom_val(atom));
+ len = (int) (utf8_atoms ? a->len : a->latin1_chars);
+ ASSERT(len >= 0);
if (!long_atoms && len > 255)
long_atoms = 1;
-#endif
/* Enough for a new atom cache value */
sz += 1 /* cix */ + 1 /* length */ + len /* text */;
}
-#if MAX_ATOM_LENGTH > 255
- if (long_atoms)
+ if (long_atoms) {
+ acmp->long_atoms = 1;
sz += acmp->sz; /* we need 2 bytes per atom for length */
-#endif
+ }
/* Dynamically sized flag field */
sz += ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTES(acmp->sz);
if (sz < min_sz)
@@ -274,6 +292,7 @@ byte *erts_encode_ext_dist_header_setup(byte *ctl_ext, ErtsAtomCacheMap *acmp)
else {
int i;
byte *ep = ctl_ext;
+ byte dist_hdr_flags = acmp->long_atoms ? ERTS_DIST_HDR_LONG_ATOMS_FLG : 0;
ASSERT(acmp->hdr_sz >= 0);
/*
* Write cache update instructions. Note that this is a purely
@@ -296,28 +315,36 @@ byte *erts_encode_ext_dist_header_setup(byte *ctl_ext, ErtsAtomCacheMap *acmp)
}
--ep;
put_int8(acmp->sz, ep);
+ --ep;
+ put_int8(dist_hdr_flags, ep);
*--ep = DIST_HEADER;
*--ep = VERSION_MAGIC;
return ep;
}
}
-byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache)
+byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache, Uint32 dflags)
{
byte *ip;
byte instr_buf[(2+4)*ERTS_ATOM_CACHE_SIZE];
int ci, sz;
+ byte dist_hdr_flags;
+ int long_atoms;
+ int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS);
register byte *ep = ext;
ASSERT(ep[0] == VERSION_MAGIC);
if (ep[1] != DIST_HEADER)
return ext;
+ dist_hdr_flags = ep[2];
+ long_atoms = ERTS_DIST_HDR_LONG_ATOMS_FLG & ((int) dist_hdr_flags);
+
/*
* Update output atom cache and write the external version of
* the dist header. We write the header backwards just
* before the actual term(s).
*/
- ep += 2;
+ ep += 3;
ci = (int) get_int8(ep);
ASSERT(0 <= ci && ci < ERTS_ATOM_CACHE_SIZE);
ep += 1;
@@ -342,12 +369,7 @@ byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache)
flgs_bytes = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTES(ci);
ASSERT(flgs_bytes <= sizeof(flgs_buf));
-#if MAX_ATOM_LENGTH > 255
- /* long_atoms info needs to be passed from previous stages */
- if (long_atoms)
- flgs |= ERTS_DIST_HDR_LONG_ATOMS_FLG;
-#endif
- flgs = 0;
+ flgs = (Uint32) dist_hdr_flags;
flgs_buf_ix = 0;
if ((ci & 1) == 0)
used_half_bytes = 2;
@@ -382,17 +404,22 @@ byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache)
Atom *a;
cache->out_arr[cix] = atom;
a = atom_tab(atom_val(atom));
- sz = a->len;
- ep -= sz;
- sys_memcpy((void *) ep, (void *) a->name, sz);
-#if MAX_ATOM_LENGTH > 255
+ if (utf8_atoms) {
+ sz = a->len;
+ ep -= sz;
+ sys_memcpy((void *) ep, (void *) a->name, sz);
+ }
+ else {
+ ASSERT(0 <= a->latin1_chars && a->latin1_chars <= MAX_ATOM_CHARACTERS);
+ ep -= a->latin1_chars;
+ sz = erts_utf8_to_latin1(ep, a->name, a->len);
+ ASSERT(a->latin1_chars == sz);
+ }
if (long_atoms) {
ep -= 2;
put_int16(sz, ep);
}
- else
-#endif
- {
+ else {
ASSERT(0 <= sz && sz <= 255);
--ep;
put_int8(sz, ep);
@@ -467,7 +494,7 @@ Uint erts_encode_ext_size_2(Eterm term, unsigned dflags)
Uint erts_encode_ext_size_ets(Eterm term)
{
- return encode_size_struct2(NULL, term, TERM_TO_BINARY_DFLAGS|DFLAGS_INTERNAL_TAGS);
+ return encode_size_struct2(NULL, term, TERM_TO_BINARY_DFLAGS|DFLAG_INTERNAL_TAGS);
}
@@ -500,7 +527,7 @@ void erts_encode_ext(Eterm term, byte **ext)
byte* erts_encode_ext_ets(Eterm term, byte *ep, struct erl_off_heap_header** off_heap)
{
- return enc_term(NULL, term, ep, TERM_TO_BINARY_DFLAGS|DFLAGS_INTERNAL_TAGS,
+ return enc_term(NULL, term, ep, TERM_TO_BINARY_DFLAGS|DFLAG_INTERNAL_TAGS,
off_heap);
}
@@ -553,6 +580,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
#endif
register byte *ep = ext;
+ int utf8_atoms = (int) (dep->flags & DFLAG_UTF8_ATOMS);
edep->heap_size = -1;
edep->ext_endp = ext+size;
@@ -611,9 +639,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
ERTS_EXT_HDR_FAIL;
ep++;
if (no_atoms) {
-#if MAX_ATOM_LENGTH > 255
int long_atoms = 0;
-#endif
#ifdef DEBUG
byte *flgs_buf = ep;
#endif
@@ -632,14 +658,8 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
*/
byte_ix = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTE_IX(no_atoms);
bit_ix = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BIT_IX(no_atoms);
- if (flgsp[byte_ix] & (((byte) ERTS_DIST_HDR_LONG_ATOMS_FLG)
- << bit_ix)) {
-#if MAX_ATOM_LENGTH > 255
+ if (flgsp[byte_ix] & (((byte) ERTS_DIST_HDR_LONG_ATOMS_FLG) << bit_ix))
long_atoms = 1;
-#else
- ERTS_EXT_HDR_FAIL; /* Long atoms not supported yet */
-#endif
- }
#ifdef DEBUG
byte_ix = 0;
@@ -707,23 +727,25 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
if (cix >= ERTS_ATOM_CACHE_SIZE)
ERTS_EXT_HDR_FAIL;
ep++;
-#if MAX_ATOM_LENGTH > 255
if (long_atoms) {
CHKSIZE(2);
len = get_int16(ep);
ep += 2;
}
- else
-#endif
- {
+ else {
CHKSIZE(1);
len = get_int8(ep);
ep++;
}
- if (len > MAX_ATOM_LENGTH)
- ERTS_EXT_HDR_FAIL; /* Too long atom */
CHKSIZE(len);
- atom = am_atom_put((char *) ep, len);
+ atom = erts_atom_put((byte *) ep,
+ len,
+ (utf8_atoms
+ ? ERTS_ATOM_ENC_UTF8
+ : ERTS_ATOM_ENC_LATIN1),
+ 0);
+ if (is_non_value(atom))
+ ERTS_EXT_HDR_FAIL;
ep += len;
cache->in_arr[cix] = atom;
edep->attab.atom[tix] = atom;
@@ -1404,11 +1426,12 @@ static byte*
enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags)
{
int iix;
- int i, j;
+ int len;
+ int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS);
ASSERT(is_atom(atom));
- if (dflags & DFLAGS_INTERNAL_TAGS) {
+ if (dflags & DFLAG_INTERNAL_TAGS) {
Uint aval = atom_val(atom);
ASSERT(aval < (1<<24));
if (aval >= (1 << 16)) {
@@ -1423,27 +1446,46 @@ enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags)
}
return ep;
}
+
/*
* term_to_binary/1,2 and the initial distribution message
* don't use the cache.
*/
- iix = get_iix_acache_map(acmp, atom);
- if (iix < 0) {
- i = atom_val(atom);
- j = atom_tab(i)->len;
- if ((MAX_ATOM_LENGTH <= 255 || j <= 255)
- && (dflags & DFLAG_SMALL_ATOM_TAGS)) {
- *ep++ = SMALL_ATOM_EXT;
- put_int8(j, ep);
- ep++;
+
+ iix = get_iix_acache_map(acmp, atom, dflags);
+ if (iix < 0) {
+ Atom *a = atom_tab(atom_val(atom));
+ if (utf8_atoms || a->latin1_chars < 0) {
+ len = a->len;
+ if (len > 255) {
+ *ep++ = ATOM_UTF8_EXT;
+ put_int16(len, ep);
+ ep += 2;
+ }
+ else {
+ *ep++ = SMALL_ATOM_UTF8_EXT;
+ put_int8(len, ep);
+ ep += 1;
+ }
+ sys_memcpy((char *) ep, (char *) a->name, len);
}
else {
- *ep++ = ATOM_EXT;
- put_int16(j, ep);
- ep += 2;
+ if (a->latin1_chars <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) {
+ *ep++ = SMALL_ATOM_EXT;
+ len = erts_utf8_to_latin1(ep+1, a->name, a->len);
+ ASSERT(len == a->latin1_chars);
+ put_int8(len, ep);
+ ep++;
+ }
+ else {
+ *ep++ = ATOM_EXT;
+ len = erts_utf8_to_latin1(ep+2, a->name, a->len);
+ ASSERT(len == a->latin1_chars);
+ put_int16(len, ep);
+ ep += 2;
+ }
}
- sys_memcpy((char *) ep, (char*)atom_tab(i)->name, (int) j);
- ep += j;
+ ep += len;
return ep;
}
@@ -1472,7 +1514,7 @@ enc_pid(ErtsAtomCacheMap *acmp, Eterm pid, byte* ep, Uint32 dflags)
ep += 4;
put_int32(os, ep);
ep += 4;
- *ep++ = (is_internal_pid(pid) && (dflags & DFLAGS_INTERNAL_TAGS)) ?
+ *ep++ = (is_internal_pid(pid) && (dflags & DFLAG_INTERNAL_TAGS)) ?
INTERNAL_CREATION : pid_creation(pid);
return ep;
}
@@ -1482,7 +1524,7 @@ static byte*
dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp)
{
Uint len;
- int n;
+ int n, is_latin1;
switch (*ep++) {
case ATOM_CACHE_REF:
@@ -1498,17 +1540,37 @@ dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp)
case ATOM_EXT:
len = get_int16(ep),
ep += 2;
+ is_latin1 = 1;
goto dec_atom_common;
case SMALL_ATOM_EXT:
len = get_int8(ep);
ep++;
+ is_latin1 = 1;
+ goto dec_atom_common;
+ case ATOM_UTF8_EXT:
+ len = get_int16(ep),
+ ep += 2;
+ is_latin1 = 0;
+ goto dec_atom_common;
+ case SMALL_ATOM_UTF8_EXT:
+ len = get_int8(ep),
+ ep++;
+ is_latin1 = 0;
dec_atom_common:
if (edep && (edep->flags & ERTS_DIST_EXT_BTT_SAFE)) {
- if (!erts_atom_get((char*)ep, len, objp)) {
+ if (!erts_atom_get((char*)ep, len, objp, is_latin1)) {
goto error;
}
} else {
- *objp = am_atom_put((char*)ep, len);
+ Eterm atom = erts_atom_put(ep,
+ len,
+ (is_latin1
+ ? ERTS_ATOM_ENC_LATIN1
+ : ERTS_ATOM_ENC_UTF8),
+ 0);
+ if (is_non_value(atom))
+ goto error;
+ *objp = atom;
}
ep += len;
break;
@@ -1770,7 +1832,7 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags,
put_int16(i, ep);
ep += 2;
ep = enc_atom(acmp,ref_node_name(obj),ep,dflags);
- *ep++ = ((dflags & DFLAGS_INTERNAL_TAGS) && is_internal_ref(obj)) ?
+ *ep++ = ((dflags & DFLAG_INTERNAL_TAGS) && is_internal_ref(obj)) ?
INTERNAL_CREATION : ref_creation(obj);
ref_num = ref_numbers(obj);
for (j = 0; j < i; j++) {
@@ -1787,7 +1849,7 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags,
j = port_number(obj);
put_int32(j, ep);
ep += 4;
- *ep++ = ((dflags & DFLAGS_INTERNAL_TAGS) && is_internal_port(obj)) ?
+ *ep++ = ((dflags & DFLAG_INTERNAL_TAGS) && is_internal_port(obj)) ?
INTERNAL_CREATION : port_creation(obj);
break;
@@ -1868,7 +1930,7 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags,
byte* bytes;
ERTS_GET_BINARY_BYTES(obj, bytes, bitoffs, bitsize);
- if (dflags & DFLAGS_INTERNAL_TAGS) {
+ if (dflags & DFLAG_INTERNAL_TAGS) {
ProcBin* pb = (ProcBin*) binary_val(obj);
Uint bytesize = pb->size;
if (pb->thing_word == HEADER_SUB_BIN) {
@@ -2113,7 +2175,7 @@ static byte*
dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Eterm* objp)
{
Eterm* hp_saved = *hpp;
- int n;
+ int n, is_latin1;
register Eterm* hp = *hpp; /* Please don't take the address of hp */
Eterm* next = objp;
@@ -2199,17 +2261,37 @@ dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Et
case ATOM_EXT:
n = get_int16(ep);
ep += 2;
- goto dec_term_atom_common;
+ is_latin1 = 1;
+ goto dec_term_atom_common;
case SMALL_ATOM_EXT:
n = get_int8(ep);
ep++;
+ is_latin1 = 1;
+ goto dec_term_atom_common;
+ case ATOM_UTF8_EXT:
+ n = get_int16(ep);
+ ep += 2;
+ is_latin1 = 0;
+ goto dec_term_atom_common;
+ case SMALL_ATOM_UTF8_EXT:
+ n = get_int8(ep);
+ ep++;
+ is_latin1 = 0;
dec_term_atom_common:
if (edep && (edep->flags & ERTS_DIST_EXT_BTT_SAFE)) {
- if (!erts_atom_get((char*)ep, n, objp)) {
+ if (!erts_atom_get((char*)ep, n, objp, is_latin1)) {
goto error;
}
} else {
- *objp = am_atom_put((char*)ep, n);
+ Eterm atom = erts_atom_put(ep,
+ n,
+ (is_latin1
+ ? ERTS_ATOM_ENC_LATIN1
+ : ERTS_ATOM_ENC_UTF8),
+ 0);
+ if (is_non_value(atom))
+ goto error;
+ *objp = atom;
}
ep += n;
break;
@@ -2869,7 +2951,7 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags)
result++;
break;
case ATOM_DEF:
- if (dflags & DFLAGS_INTERNAL_TAGS) {
+ if (dflags & DFLAG_INTERNAL_TAGS) {
if (atom_val(obj) >= (1<<16)) {
result += 1 + 3;
}
@@ -2878,17 +2960,22 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags)
}
}
else {
- int alen = atom_tab(atom_val(obj))->len;
- if ((MAX_ATOM_LENGTH <= 255 || alen <= 255)
- && (dflags & DFLAG_SMALL_ATOM_TAGS)) {
- /* Make sure a SMALL_ATOM_EXT fits: SMALL_ATOM_EXT l t1 t2... */
- result += 1 + 1 + alen;
+ Atom *a = atom_tab(atom_val(obj));
+ int alen;
+ if ((dflags & DFLAG_UTF8_ATOMS) || a->latin1_chars < 0) {
+ alen = a->len;
+ result += 1 + 1 + alen;
+ if (alen > 255) {
+ result++; /* ATOM_UTF8_EXT (not small) */
+ }
}
else {
- /* Make sure an ATOM_EXT fits: ATOM_EXT l1 l0 t1 t2... */
- result += 1 + 2 + alen;
+ alen = a->latin1_chars;
+ result += 1 + 1 + alen;
+ if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS))
+ result++; /* ATOM_EXT (not small) */
}
- insert_acache_map(acmp, obj);
+ insert_acache_map(acmp, obj, dflags);
}
break;
case SMALL_DEF:
@@ -2969,7 +3056,7 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags)
}
break;
case BINARY_DEF:
- if (dflags & DFLAGS_INTERNAL_TAGS) {
+ if (dflags & DFLAG_INTERNAL_TAGS) {
ProcBin* pb = (ProcBin*) binary_val(obj);
Uint sub_extra = 0;
Uint tot_bytes = pb->size;
@@ -3058,6 +3145,17 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags)
return result;
}
+static int is_valid_utf8_atom(byte* bytes, Uint nbytes)
+{
+ byte* err_pos;
+ Uint num_chars;
+
+ /*SVERK Do we really need to validate correct utf8? */
+ return nbytes <= MAX_ATOM_SZ_LIMIT
+ && erts_analyze_utf8(bytes, nbytes, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK
+ && num_chars <= MAX_ATOM_CHARACTERS;
+}
+
static Sint
decoded_size(byte *ep, byte* endp, int internal_tags)
{
@@ -3125,21 +3223,41 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
case ATOM_EXT:
CHKSIZE(2);
n = get_int16(ep);
- if (n > MAX_ATOM_LENGTH) {
+ if (n > MAX_ATOM_CHARACTERS) {
return -1;
}
SKIP(n+2+atom_extra_skip);
atom_extra_skip = 0;
break;
+ case ATOM_UTF8_EXT:
+ CHKSIZE(2);
+ n = get_int16(ep);
+ ep += 2;
+ if (!is_valid_utf8_atom(ep, n)) {
+ return -1;
+ }
+ SKIP(n+atom_extra_skip);
+ atom_extra_skip = 0;
+ break;
case SMALL_ATOM_EXT:
CHKSIZE(1);
n = get_int8(ep);
- if (n > MAX_ATOM_LENGTH) {
+ if (n > MAX_ATOM_CHARACTERS) {
return -1;
}
SKIP(n+1+atom_extra_skip);
atom_extra_skip = 0;
break;
+ case SMALL_ATOM_UTF8_EXT:
+ CHKSIZE(1);
+ n = get_int8(ep);
+ ep++;
+ if (!is_valid_utf8_atom(ep, n)) {
+ return -1;
+ }
+ SKIP(n+atom_extra_skip);
+ atom_extra_skip = 0;
+ break;
case ATOM_CACHE_REF:
SKIP(1+atom_extra_skip);
atom_extra_skip = 0;
diff --git a/erts/emulator/beam/external.h b/erts/emulator/beam/external.h
index eddd4571dd..ad430117c8 100644
--- a/erts/emulator/beam/external.h
+++ b/erts/emulator/beam/external.h
@@ -51,6 +51,8 @@
#define NEW_FUN_EXT 'p'
#define EXPORT_EXT 'q'
#define FUN_EXT 'u'
+#define ATOM_UTF8_EXT 'v'
+#define SMALL_ATOM_UTF8_EXT 'w'
#define DIST_HEADER 'D'
#define ATOM_CACHE_REF 'R'
@@ -90,6 +92,7 @@ typedef struct cache {
typedef struct {
int hdr_sz;
int sz;
+ int long_atoms;
int cix[ERTS_ATOM_CACHE_SIZE];
struct {
Eterm atom;
@@ -150,12 +153,12 @@ typedef struct {
void erts_init_atom_cache_map(ErtsAtomCacheMap *);
void erts_reset_atom_cache_map(ErtsAtomCacheMap *);
void erts_destroy_atom_cache_map(ErtsAtomCacheMap *);
-void erts_finalize_atom_cache_map(ErtsAtomCacheMap *);
+void erts_finalize_atom_cache_map(ErtsAtomCacheMap *, Uint32);
Uint erts_encode_ext_dist_header_size(ErtsAtomCacheMap *);
Uint erts_encode_ext_dist_header_size(ErtsAtomCacheMap *);
byte *erts_encode_ext_dist_header_setup(byte *, ErtsAtomCacheMap *);
-byte *erts_encode_ext_dist_header_finalize(byte *, ErtsAtomCache *);
+byte *erts_encode_ext_dist_header_finalize(byte *, ErtsAtomCache *, Uint32);
Uint erts_encode_dist_ext_size(Eterm, Uint32, ErtsAtomCacheMap *);
void erts_encode_dist_ext(Eterm, byte **, Uint32, ErtsAtomCacheMap *);
diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h
index 298241618f..41c2a0f2b9 100755
--- a/erts/emulator/beam/global.h
+++ b/erts/emulator/beam/global.h
@@ -786,16 +786,23 @@ Sint erts_native_filename_need(Eterm ioterm, int encoding);
void erts_copy_utf8_to_utf16_little(byte *target, byte *bytes, int num_chars);
int erts_analyze_utf8(byte *source, Uint size,
byte **err_pos, Uint *num_chars, int *left);
+int erts_analyze_utf8_x(byte *source, Uint size,
+ byte **err_pos, Uint *num_chars, int *left,
+ Sint *num_latin1_chars, Uint max_chars);
char *erts_convert_filename_to_native(Eterm name, char *statbuf,
size_t statbuf_size,
ErtsAlcType_t alloc_type,
int allow_empty, int allow_atom,
Sint *used /* out */);
Eterm erts_convert_native_to_filename(Process *p, byte *bytes);
+Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left,
+ Uint *num_built, Uint *num_eaten, Eterm tail);
+int erts_utf8_to_latin1(byte* dest, const byte* source, int slen);
#define ERTS_UTF8_OK 0
#define ERTS_UTF8_INCOMPLETE 1
#define ERTS_UTF8_ERROR 2
#define ERTS_UTF8_ANALYZE_MORE 3
+#define ERTS_UTF8_OK_MAX_CHARS 4
void bin_write(int, void*, byte*, size_t);
int intlist_to_buf(Eterm, char*, int); /* most callers pass plain char*'s */
diff --git a/erts/emulator/beam/io.c b/erts/emulator/beam/io.c
index 04dc9bb236..536a3cc819 100644
--- a/erts/emulator/beam/io.c
+++ b/erts/emulator/beam/io.c
@@ -663,8 +663,11 @@ erts_open_driver(erts_driver_t* driver, /* Pointer to driver. */
if (IS_TRACED_FL(port, F_TRACE_PORTS)) {
trace_port_open(port,
- pid,
- am_atom_put(port->name, strlen(port->name)));
+ pid,
+ erts_atom_put((byte *) port->name,
+ strlen(port->name),
+ ERTS_ATOM_ENC_LATIN1,
+ 1));
}
error_number = error_type = 0;
@@ -2677,8 +2680,11 @@ static ERTS_INLINE void lcnt_enable_drv_lock_count(erts_driver_t *dp, int enable
erts_lcnt_init_lock_x(&dp->lock->lcnt,
"driver_lock",
ERTS_LCNT_LT_MUTEX,
- am_atom_put(dp->name,
- sys_strlen(dp->name)));
+ erts_atom_put((byte*)dp->name,
+ sys_strlen(dp->name),
+ ERTS_ATOM_ENC_LATIN1,
+ 1));
+
else
erts_lcnt_destroy_lock(&dp->lock->lcnt);
@@ -7043,7 +7049,8 @@ int driver_exit(ErlDrvPort ix, int err)
return driver_failure_term(ix, am_normal, 0);
else {
char* err_str = erl_errno_id(err);
- Eterm am_err = am_atom_put(err_str, sys_strlen(err_str));
+ Eterm am_err = erts_atom_put((byte *) err_str, sys_strlen(err_str),
+ ERTS_ATOM_ENC_LATIN1, 1);
return driver_failure_term(ix, am_err, 0);
}
}
@@ -7056,8 +7063,12 @@ int driver_failure(ErlDrvPort ix, int code)
int driver_failure_atom(ErlDrvPort ix, char* string)
{
- Eterm am = am_atom_put(string, strlen(string));
- return driver_failure_term(ix, am, 0);
+ return driver_failure_term(ix,
+ erts_atom_put((byte *) string,
+ strlen(string),
+ ERTS_ATOM_ENC_LATIN1,
+ 1),
+ 0);
}
int driver_failure_posix(ErlDrvPort ix, int err)
@@ -7074,7 +7085,10 @@ int driver_failure_eof(ErlDrvPort ix)
ErlDrvTermData driver_mk_atom(char* string)
{
- Eterm am = am_atom_put(string, sys_strlen(string));
+ Eterm am = erts_atom_put((byte *) string,
+ sys_strlen(string),
+ ERTS_ATOM_ENC_LATIN1,
+ 1);
ERTS_SMP_CHK_NO_PROC_LOCKS;
return (ErlDrvTermData) am;
}
@@ -7369,7 +7383,10 @@ init_driver(erts_driver_t *drv, ErlDrvEntry *de, DE_Handle *handle)
erts_mtx_init_x(drv->lock,
"driver_lock",
#if defined(ERTS_ENABLE_LOCK_CHECK) || defined(ERTS_ENABLE_LOCK_COUNT)
- am_atom_put(drv->name, sys_strlen(drv->name))
+ erts_atom_put((byte *) drv->name,
+ sys_strlen(drv->name),
+ ERTS_ATOM_ENC_LATIN1,
+ 1)
#else
NIL
#endif
diff --git a/erts/emulator/beam/utils.c b/erts/emulator/beam/utils.c
index 5261effef9..2a6a8efd4d 100644
--- a/erts/emulator/beam/utils.c
+++ b/erts/emulator/beam/utils.c
@@ -371,7 +371,7 @@ Eterm
erts_bld_atom(Uint **hpp, Uint *szp, char *str)
{
if (hpp)
- return am_atom_put(str, sys_strlen(str));
+ return erts_atom_put((byte *) str, sys_strlen(str), ERTS_ATOM_ENC_LATIN1, 1);
else
return THE_NON_VALUE;
}
diff --git a/erts/emulator/test/bif_SUITE.erl b/erts/emulator/test/bif_SUITE.erl
index e2442861c7..02c6de8cb1 100644
--- a/erts/emulator/test/bif_SUITE.erl
+++ b/erts/emulator/test/bif_SUITE.erl
@@ -481,8 +481,6 @@ binary_to_atom(Config) when is_list(Config) ->
%% Bad UTF8 sequences.
?line ?BADARG(binary_to_atom(id(<<255>>), utf8)),
?line ?BADARG(binary_to_atom(id(<<255,0>>), utf8)),
- ?line ?BADARG(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)),
- ?line ?BADARG(binary_to_atom(id(<<0:512/unit:8,255,0>>), utf8)),
?line ?BADARG(binary_to_atom(id(<<16#C0,16#80>>), utf8)), %Overlong 0.
?line [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) ||
C <- lists:seq(256, 16#D7FF)],
@@ -494,6 +492,8 @@ binary_to_atom(Config) when is_list(Config) ->
C <- lists:seq(16#90000, 16#10FFFF)],
%% system_limit failures.
+ ?line ?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)),
+ ?line ?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255,0>>), utf8)),
?line ?SYS_LIMIT(binary_to_atom(<<0:256/unit:8>>, latin1)),
?line ?SYS_LIMIT(binary_to_atom(<<0:257/unit:8>>, latin1)),
?line ?SYS_LIMIT(binary_to_atom(<<0:512/unit:8>>, latin1)),
diff --git a/erts/emulator/test/distribution_SUITE.erl b/erts/emulator/test/distribution_SUITE.erl
index f3a177faf2..101007c288 100644
--- a/erts/emulator/test/distribution_SUITE.erl
+++ b/erts/emulator/test/distribution_SUITE.erl
@@ -18,7 +18,17 @@
%%
-module(distribution_SUITE).
--compile(r13).
+-compile(r15).
+
+-define(VERSION_MAGIC, 131).
+
+-define(ATOM_EXT, 100).
+-define(REFERENCE_EXT, 101).
+-define(PORT_EXT, 102).
+-define(PID_EXT, 103).
+-define(NEW_REFERENCE_EXT, 114).
+-define(ATOM_UTF8_EXT, 118).
+-define(SMALL_ATOM_UTF8_EXT, 119).
%% Tests distribution and the tcp driver.
@@ -37,8 +47,10 @@
dist_auto_connect_never/1, dist_auto_connect_once/1,
dist_parallel_send/1,
atom_roundtrip/1,
- atom_roundtrip_r13b/1,
+ unicode_atom_roundtrip/1,
+ atom_roundtrip_r15b/1,
contended_atom_cache_entry/1,
+ contended_unicode_atom_cache_entry/1,
bad_dist_structure/1,
bad_dist_ext_receive/1,
bad_dist_ext_process_info/1,
@@ -62,8 +74,9 @@ all() ->
link_to_dead_new_node, applied_monitor_node,
ref_port_roundtrip, nil_roundtrip, stop_dist,
{group, trap_bif}, {group, dist_auto_connect},
- dist_parallel_send, atom_roundtrip, atom_roundtrip_r13b,
- contended_atom_cache_entry, bad_dist_structure, {group, bad_dist_ext}].
+ dist_parallel_send, atom_roundtrip, unicode_atom_roundtrip, atom_roundtrip_r15b,
+ contended_atom_cache_entry, contended_unicode_atom_cache_entry,
+ bad_dist_structure, {group, bad_dist_ext}].
groups() ->
[{bulk_send, [], [bulk_send_small, bulk_send_big, bulk_send_bigbig]},
@@ -1085,19 +1098,27 @@ atom_roundtrip(Config) when is_list(Config) ->
?line stop_node(Node),
?line ok.
-atom_roundtrip_r13b(Config) when is_list(Config) ->
- case ?t:is_release_available("r13b") of
+atom_roundtrip_r15b(Config) when is_list(Config) ->
+ case ?t:is_release_available("r15b") of
true ->
?line AtomData = atom_data(),
?line verify_atom_data(AtomData),
- ?line {ok, Node} = start_node(Config, [], "r13b"),
+ ?line {ok, Node} = start_node(Config, [], "r15b"),
?line do_atom_roundtrip(Node, AtomData),
?line stop_node(Node),
?line ok;
false ->
- ?line {skip,"No OTP R13B available"}
+ ?line {skip,"No OTP R15B available"}
end.
+unicode_atom_roundtrip(Config) when is_list(Config) ->
+ ?line AtomData = unicode_atom_data(),
+ ?line verify_atom_data(AtomData),
+ ?line {ok, Node} = start_node(Config),
+ ?line do_atom_roundtrip(Node, AtomData),
+ ?line stop_node(Node),
+ ?line ok.
+
do_atom_roundtrip(Node, AtomData) ->
?line Parent = self(),
?line Proc = spawn_link(Node, fun () -> verify_atom_data_loop(Parent) end),
@@ -1128,12 +1149,76 @@ atom_data() ->
lists:seq(1, 2000)).
verify_atom_data(AtomData) ->
- lists:foreach(fun ({Atom, AtomTxt}) ->
- AtomTxt = atom_to_list(Atom)
+ lists:foreach(fun ({Atom, AtomTxt}) when is_atom(Atom) ->
+ AtomTxt = atom_to_list(Atom);
+ ({PPR, AtomTxt}) ->
+ % Pid, Port, or Ref
+ AtomTxt = atom_to_list(node(PPR))
end,
AtomData).
+uc_atom_tup(ATxt) ->
+ Atom = string_to_atom(ATxt),
+ ATxt = atom_to_list(Atom),
+ {Atom, ATxt}.
+
+uc_pid_tup(ATxt) ->
+ ATxtExt = string_to_atom_ext(ATxt),
+ Pid = mk_pid({ATxtExt, 1}, 4711,17),
+ true = is_pid(Pid),
+ Atom = node(Pid),
+ true = is_atom(Atom),
+ ATxt = atom_to_list(Atom),
+ {Pid, ATxt}.
+
+uc_port_tup(ATxt) ->
+ ATxtExt = string_to_atom_ext(ATxt),
+ Port = mk_port({ATxtExt, 2}, 4711),
+ true = is_port(Port),
+ Atom = node(Port),
+ true = is_atom(Atom),
+ ATxt = atom_to_list(Atom),
+ {Port, ATxt}.
+
+uc_ref_tup(ATxt) ->
+ ATxtExt = string_to_atom_ext(ATxt),
+ Ref = mk_ref({ATxtExt, 3}, [4711,17, 4711]),
+ true = is_reference(Ref),
+ Atom = node(Ref),
+ true = is_atom(Atom),
+ ATxt = atom_to_list(Atom),
+ {Ref, ATxt}.
+
+
+unicode_atom_data() ->
+ [uc_pid_tup(lists:seq(16#1f600, 16#1f600+249) ++ "@host"),
+ uc_pid_tup(lists:seq(16#1f600, 16#1f600+30) ++ "@host"),
+ uc_port_tup(lists:seq(16#1f600, 16#1f600+249) ++ "@host"),
+ uc_port_tup(lists:seq(16#1f600, 16#1f600+30) ++ "@host"),
+ uc_ref_tup(lists:seq(16#1f600, 16#1f600+249) ++ "@host"),
+ uc_ref_tup(lists:seq(16#1f600, 16#1f600+30) ++ "@host"),
+ uc_atom_tup(lists:seq(16#1f600, 16#1f600+254)),
+ uc_atom_tup(lists:seq(16#1f600, 16#1f600+63)),
+ uc_atom_tup(lists:seq(0, 254)),
+ uc_atom_tup(lists:seq(100, 163)),
+ uc_atom_tup(lists:seq(200, 354)),
+ uc_atom_tup(lists:seq(200, 263)),
+ uc_atom_tup(lists:seq(2000, 2254)),
+ uc_atom_tup(lists:seq(2000, 2063)),
+ uc_atom_tup(lists:seq(65500, 65754)),
+ uc_atom_tup(lists:seq(65500, 65563))
+ | lists:map(fun (N) ->
+ uc_atom_tup(lists:seq(64000+N, 64254+N))
+ end,
+ lists:seq(1, 2000))].
+
contended_atom_cache_entry(Config) when is_list(Config) ->
+ contended_atom_cache_entry_test(Config, latin1).
+
+contended_unicode_atom_cache_entry(Config) when is_list(Config) ->
+ contended_atom_cache_entry_test(Config, unicode).
+
+contended_atom_cache_entry_test(Config, Type) ->
?line TestServer = self(),
?line ProcessPairs = 10,
?line Msgs = 100000,
@@ -1147,9 +1232,16 @@ contended_atom_cache_entry(Config) when is_list(Config) ->
true),
Master = self(),
CIX = get_cix(),
- TestAtoms = get_conflicting_atoms(CIX, ProcessPairs),
+ TestAtoms = case Type of
+ latin1 ->
+ get_conflicting_atoms(CIX,
+ ProcessPairs);
+ unicode ->
+ get_conflicting_unicode_atoms(CIX,
+ ProcessPairs)
+ end,
io:format("Testing with the following atoms all using "
- "cache index ~p:~n ~p~n",
+ "cache index ~p:~n ~w~n",
[CIX, TestAtoms]),
Ps = lists:map(
fun (A) ->
@@ -1159,8 +1251,12 @@ contended_atom_cache_entry(Config) when is_list(Config) ->
fun () ->
Atom = receive
{Ref, txt, ATxt} ->
- list_to_atom(
- ATxt)
+ case Type of
+ latin1 ->
+ list_to_atom(ATxt);
+ unicode ->
+ string_to_atom(ATxt)
+ end
end,
receive_ref_atom(Ref,
Atom,
@@ -1252,6 +1348,20 @@ get_conflicting_atoms(CIX, N) ->
get_conflicting_atoms(CIX, N)
end.
+get_conflicting_unicode_atoms(_CIX, 0) ->
+ [];
+get_conflicting_unicode_atoms(CIX, N) ->
+ {A, B, C} = now(),
+ Atom = string_to_atom([16#1f608] ++ "atom" ++ integer_to_list(A*1000000000000
+ + B*1000000
+ + C)),
+ case erts_debug:get_internal_state({atom_out_cache_index, Atom}) of
+ CIX ->
+ [Atom|get_conflicting_unicode_atoms(CIX, N-1)];
+ _ ->
+ get_conflicting_unicode_atoms(CIX, N)
+ end.
+
-define(COOKIE, '').
-define(DOP_LINK, 1).
-define(DOP_SEND, 2).
@@ -2131,3 +2241,190 @@ repeat(_Fun, 0) ->
repeat(Fun, N) ->
Fun(),
repeat(Fun, N-1).
+
+string_to_atom_ext(String) ->
+ Utf8List = string_to_utf8_list(String),
+ Len = length(Utf8List),
+ case Len < 256 of
+ true ->
+ [?SMALL_ATOM_UTF8_EXT, Len | Utf8List];
+ false ->
+ [?ATOM_UTF8_EXT, Len bsr 8, Len band 16#ff | Utf8List]
+ end.
+
+string_to_atom(String) ->
+ binary_to_term(list_to_binary([?VERSION_MAGIC
+ | string_to_atom_ext(String)])).
+
+string_to_utf8_list([]) ->
+ [];
+string_to_utf8_list([CP|CPs]) when is_integer(CP),
+ 0 =< CP,
+ CP =< 16#7F ->
+ [CP | string_to_utf8_list(CPs)];
+string_to_utf8_list([CP|CPs]) when is_integer(CP),
+ 16#80 =< CP,
+ CP =< 16#7FF ->
+ [16#C0 bor (CP bsr 6),
+ 16#80 bor (16#3F band CP)
+ | string_to_utf8_list(CPs)];
+string_to_utf8_list([CP|CPs]) when is_integer(CP),
+ 16#800 =< CP,
+ CP =< 16#FFFF ->
+ [16#E0 bor (CP bsr 12),
+ 16#80 bor (16#3F band (CP bsr 6)),
+ 16#80 bor (16#3F band CP)
+ | string_to_utf8_list(CPs)];
+string_to_utf8_list([CP|CPs]) when is_integer(CP),
+ 16#10000 =< CP,
+ CP =< 16#10FFFF ->
+ [16#F0 bor (CP bsr 18),
+ 16#80 bor (16#3F band (CP bsr 12)),
+ 16#80 bor (16#3F band (CP bsr 6)),
+ 16#80 bor (16#3F band CP)
+ | string_to_utf8_list(CPs)].
+
+utf8_list_to_string([]) ->
+ [];
+utf8_list_to_string([B|Bs]) when is_integer(B),
+ 0 =< B,
+ B =< 16#7F ->
+ [B | utf8_list_to_string(Bs)];
+utf8_list_to_string([B0, B1 | Bs]) when is_integer(B0),
+ 16#C0 =< B0,
+ B0 =< 16#DF,
+ is_integer(B1),
+ 16#80 =< B1,
+ B1 =< 16#BF ->
+ [(((B0 band 16#1F) bsl 6)
+ bor (B1 band 16#3F))
+ | utf8_list_to_string(Bs)];
+utf8_list_to_string([B0, B1, B2 | Bs]) when is_integer(B0),
+ 16#E0 =< B0,
+ B0 =< 16#EF,
+ is_integer(B1),
+ 16#80 =< B1,
+ B1 =< 16#BF,
+ is_integer(B2),
+ 16#80 =< B2,
+ B2 =< 16#BF ->
+ [(((B0 band 16#F) bsl 12)
+ bor ((B1 band 16#3F) bsl 6)
+ bor (B2 band 16#3F))
+ | utf8_list_to_string(Bs)];
+utf8_list_to_string([B0, B1, B2, B3 | Bs]) when is_integer(B0),
+ 16#F0 =< B0,
+ B0 =< 16#F7,
+ is_integer(B1),
+ 16#80 =< B1,
+ B1 =< 16#BF,
+ is_integer(B2),
+ 16#80 =< B2,
+ B2 =< 16#BF,
+ is_integer(B3),
+ 16#80 =< B3,
+ B3 =< 16#BF ->
+ [(((B0 band 16#7) bsl 18)
+ bor ((B1 band 16#3F) bsl 12)
+ bor ((B2 band 16#3F) bsl 6)
+ bor (B3 band 16#3F))
+ | utf8_list_to_string(Bs)].
+
+mk_pid({NodeName, Creation}, Number, Serial) when is_atom(NodeName) ->
+ <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+ mk_pid({NodeNameExt, Creation}, Number, Serial);
+mk_pid({NodeNameExt, Creation}, Number, Serial) ->
+ case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
+ ?PID_EXT,
+ NodeNameExt,
+ uint32_be(Number),
+ uint32_be(Serial),
+ uint8(Creation)])) of
+ Pid when is_pid(Pid) ->
+ Pid;
+ {'EXIT', {badarg, _}} ->
+ exit({badarg, mk_pid, [{NodeNameExt, Creation}, Number, Serial]});
+ Other ->
+ exit({unexpected_binary_to_term_result, Other})
+ end.
+
+mk_port({NodeName, Creation}, Number) when is_atom(NodeName) ->
+ <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+ mk_port({NodeNameExt, Creation}, Number);
+mk_port({NodeNameExt, Creation}, Number) ->
+ case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
+ ?PORT_EXT,
+ NodeNameExt,
+ uint32_be(Number),
+ uint8(Creation)])) of
+ Port when is_port(Port) ->
+ Port;
+ {'EXIT', {badarg, _}} ->
+ exit({badarg, mk_port, [{NodeNameExt, Creation}, Number]});
+ Other ->
+ exit({unexpected_binary_to_term_result, Other})
+ end.
+
+mk_ref({NodeName, Creation}, [Number] = NL) when is_atom(NodeName),
+ is_integer(Creation),
+ is_integer(Number) ->
+ <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+ mk_ref({NodeNameExt, Creation}, NL);
+mk_ref({NodeNameExt, Creation}, [Number]) when is_integer(Creation),
+ is_integer(Number) ->
+ case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
+ ?REFERENCE_EXT,
+ NodeNameExt,
+ uint32_be(Number),
+ uint8(Creation)])) of
+ Ref when is_reference(Ref) ->
+ Ref;
+ {'EXIT', {badarg, _}} ->
+ exit({badarg, mk_ref, [{NodeNameExt, Creation}, [Number]]});
+ Other ->
+ exit({unexpected_binary_to_term_result, Other})
+ end;
+mk_ref({NodeName, Creation}, Numbers) when is_atom(NodeName),
+ is_integer(Creation),
+ is_list(Numbers) ->
+ <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+ mk_ref({NodeNameExt, Creation}, Numbers);
+mk_ref({NodeNameExt, Creation}, Numbers) when is_integer(Creation),
+ is_list(Numbers) ->
+ case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
+ ?NEW_REFERENCE_EXT,
+ uint16_be(length(Numbers)),
+ NodeNameExt,
+ uint8(Creation),
+ lists:map(fun (N) ->
+ uint32_be(N)
+ end,
+ Numbers)])) of
+ Ref when is_reference(Ref) ->
+ Ref;
+ {'EXIT', {badarg, _}} ->
+ exit({badarg, mk_ref, [{NodeNameExt, Creation}, Numbers]});
+ Other ->
+ exit({unexpected_binary_to_term_result, Other})
+ end.
+
+
+uint32_be(Uint) when is_integer(Uint), 0 =< Uint, Uint < 1 bsl 32 ->
+ [(Uint bsr 24) band 16#ff,
+ (Uint bsr 16) band 16#ff,
+ (Uint bsr 8) band 16#ff,
+ Uint band 16#ff];
+uint32_be(Uint) ->
+ exit({badarg, uint32_be, [Uint]}).
+
+
+uint16_be(Uint) when is_integer(Uint), 0 =< Uint, Uint < 1 bsl 16 ->
+ [(Uint bsr 8) band 16#ff,
+ Uint band 16#ff];
+uint16_be(Uint) ->
+ exit({badarg, uint16_be, [Uint]}).
+
+uint8(Uint) when is_integer(Uint), 0 =< Uint, Uint < 1 bsl 8 ->
+ Uint band 16#ff;
+uint8(Uint) ->
+ exit({badarg, uint8, [Uint]}).
diff --git a/erts/epmd/src/epmd_cli.c b/erts/epmd/src/epmd_cli.c
index 74408e3ebe..1d4de64b63 100644
--- a/erts/epmd/src/epmd_cli.c
+++ b/erts/epmd/src/epmd_cli.c
@@ -22,6 +22,7 @@
#endif
#include "epmd.h" /* Renamed from 'epmd_r4.h' */
#include "epmd_int.h"
+#include "erl_printf.h" /* erts_snprintf */
/* forward declarations */
@@ -114,16 +115,18 @@ void epmd_call(EpmdVars *g,int what)
epmd_cleanup_exit(g,1);
}
j = ntohl(i);
- if (!g->silent)
- printf("epmd: up and running on port %d with data:\n", j);
+ if (!g->silent) {
+ rval = erts_snprintf(buf, OUTBUF_SIZE,
+ "epmd: up and running on port %d with data:\n", j);
+ write(1, buf, rval);
+ }
while(1) {
- if ((rval = read(fd,buf,1)) <= 0) {
+ if ((rval = read(fd,buf,OUTBUF_SIZE)) <= 0) {
close(fd);
epmd_cleanup_exit(g,0);
}
- buf[rval] = '\0';
if (!g->silent)
- printf("%s",buf);
+ write(1, buf, rval); /* Potentially UTF-8 encoded */
}
}
diff --git a/erts/epmd/src/epmd_int.h b/erts/epmd/src/epmd_int.h
index 14d05c3f19..b25412c905 100644
--- a/erts/epmd/src/epmd_int.h
+++ b/erts/epmd/src/epmd_int.h
@@ -226,13 +226,25 @@
#define MAX_UNREG_COUNT 1000
#define DEBUG_MAX_UNREG_COUNT 5
-/* Maximum length of a node name == atom name */
-#define MAXSYMLEN 255
+/*
+ * Maximum length of a node name == atom name
+ * 255 characters; UTF-8 encoded -> max 255*4
+ */
+#define MAXSYMLEN (255*4)
#define MAX_LISTEN_SOCKETS 16
-#define INBUF_SIZE 1024
-#define OUTBUF_SIZE 1024
+/*
+ * Largest request: ALIVE2_REQ
+ * 2 + 13 + 2*MAXSYMLEN
+ * Largest response: PORT2_RESP
+ * 2 + 14 + 2*MAXSYMLEN
+ *
+ * That is, 3*MAXSYMLEN should be large enough
+ */
+
+#define INBUF_SIZE (3*MAXSYMLEN)
+#define OUTBUF_SIZE (3*MAXSYMLEN)
#define get_int16(s) ((((unsigned char*) (s))[0] << 8) | \
(((unsigned char*) (s))[1]))
diff --git a/erts/epmd/src/epmd_srv.c b/erts/epmd/src/epmd_srv.c
index 36565b7438..2a74c4955e 100644
--- a/erts/epmd/src/epmd_srv.c
+++ b/erts/epmd/src/epmd_srv.c
@@ -73,7 +73,7 @@ static int conn_open(EpmdVars*,int);
static int conn_close_fd(EpmdVars*,int);
static void node_init(EpmdVars*);
-static Node *node_reg2(EpmdVars*,char*, int, int, unsigned char, unsigned char, int, int, int, char*);
+static Node *node_reg2(EpmdVars*, int, char*, int, int, unsigned char, unsigned char, int, int, int, char*);
static int node_unreg(EpmdVars*,char*);
static int node_unreg_sock(EpmdVars*,int);
@@ -81,6 +81,113 @@ static int reply(EpmdVars*,int,char *,int);
static void dbg_print_buf(EpmdVars*,char *,int);
static void print_names(EpmdVars*);
+static int is_same_str(char *x, char *y)
+{
+ int i = 0;
+ /*
+ * Using strcmp() == 0 is probably ok, but just to be sure,
+ * since we got UTF-8 strings, we do it ourselves.
+ *
+ * We assume null-terminated correctly encoded UTF-8.
+ */
+ while (x[i] == y[i]) {
+ if (x[i] == '\0')
+ return 1;
+ i++;
+ }
+ return 0;
+}
+
+static int copy_str(char *x, char *y)
+{
+ int i = 0;
+ /*
+ * Using strcpy() is probably ok, but just to be sure,
+ * since we got UTF-8 strings, we do it ourselves.
+ *
+ * We assume null-terminated correctly encoded UTF-8.
+ */
+ while (1) {
+ x[i] = y[i];
+ if (y[i] == '\0')
+ return i;
+ i++;
+ }
+}
+
+static int length_str(char *x)
+{
+ int i = 0;
+ /*
+ * Using strlen is probably ok, but just to be sure,
+ * since we got UTF-8 strings, we do it ourselves.
+ *
+ * We assume null-terminated correctly encoded UTF-8.
+ */
+ while (x[i])
+ i++;
+ return i;
+}
+
+static int verify_utf8(const char *src, int sz, int null_term)
+{
+ unsigned char *source = (unsigned char *) src;
+ int size = sz;
+ int num_chars = 0;
+ while (size) {
+ if (null_term && (*source) == 0)
+ return num_chars;
+ if (((*source) & ((unsigned char) 0x80)) == 0) {
+ source++;
+ --size;
+ } else if (((*source) & ((unsigned char) 0xE0)) == 0xC0) {
+ if (size < 2)
+ return -1;
+ if (((source[1] & ((unsigned char) 0xC0)) != 0x80) ||
+ ((*source) < 0xC2) /* overlong */) {
+ return -1;
+ }
+ source += 2;
+ size -= 2;
+ } else if (((*source) & ((unsigned char) 0xF0)) == 0xE0) {
+ if (size < 3)
+ return -1;
+ if (((source[1] & ((unsigned char) 0xC0)) != 0x80) ||
+ ((source[2] & ((unsigned char) 0xC0)) != 0x80) ||
+ (((*source) == 0xE0) && (source[1] < 0xA0)) /* overlong */ ) {
+ return -1;
+ }
+ if ((((*source) & ((unsigned char) 0xF)) == 0xD) &&
+ ((source[1] & 0x20) != 0)) {
+ return -1;
+ }
+ source += 3;
+ size -= 3;
+ } else if (((*source) & ((unsigned char) 0xF8)) == 0xF0) {
+ if (size < 4)
+ return -1;
+ if (((source[1] & ((unsigned char) 0xC0)) != 0x80) ||
+ ((source[2] & ((unsigned char) 0xC0)) != 0x80) ||
+ ((source[3] & ((unsigned char) 0xC0)) != 0x80) ||
+ (((*source) == 0xF0) && (source[1] < 0x90)) /* overlong */) {
+ return -1;
+ }
+ if ((((*source) & ((unsigned char)0x7)) > 0x4U) ||
+ ((((*source) & ((unsigned char)0x7)) == 0x4U) &&
+ ((source[1] & ((unsigned char)0x3F)) > 0xFU))) {
+ return -1;
+ }
+ source += 4;
+ size -= 4;
+ } else {
+ return -1;
+ }
+ ++num_chars;
+ }
+ return num_chars;
+}
+
+
static EPMD_INLINE void select_fd_set(EpmdVars* g, int fd)
{
FD_SET(fd, &g->orig_read_mask);
@@ -525,10 +632,11 @@ static void do_request(g, fd, s, buf, bsize)
}
name = &buf[11];
name[namelen]='\000';
+
extra = &buf[11+namelen+2];
extra[extralen]='\000';
wbuf[0] = EPMD_ALIVE2_RESP;
- if ((node = node_reg2(g, name, fd, eport, nodetype, protocol,
+ if ((node = node_reg2(g, namelen, name, fd, eport, nodetype, protocol,
highvsn, lowvsn, extralen, extra)) == NULL) {
wbuf[1] = 1; /* error */
put_int16(99, wbuf+2);
@@ -573,22 +681,28 @@ static void do_request(g, fd, s, buf, bsize)
{
char *name = &buf[1]; /* Points to node name */
+ int nsz;
Node *node;
-
+
+ nsz = verify_utf8(name, bsize, 0);
+ if (nsz < 1 || 255 < nsz) {
+ dbg_printf(g,0,"invalid node name in PORT2_REQ");
+ return;
+ }
+
wbuf[0] = EPMD_PORT2_RESP;
for (node = g->nodes.reg; node; node = node->next) {
int offset;
- if (strcmp(node->symname, name) == 0) {
+ if (is_same_str(node->symname, name)) {
wbuf[1] = 0; /* ok */
put_int16(node->port,wbuf+2);
wbuf[4] = node->nodetype;
wbuf[5] = node->protocol;
put_int16(node->highvsn,wbuf+6);
put_int16(node->lowvsn,wbuf+8);
- put_int16(strlen(node->symname),wbuf+10);
+ put_int16(length_str(node->symname),wbuf+10);
offset = 12;
- strcpy(wbuf + offset,node->symname);
- offset += strlen(node->symname);
+ offset += copy_str(wbuf + offset,node->symname);
put_int16(node->extralen,wbuf + offset);
offset += 2;
memcpy(wbuf + offset,node->extra,node->extralen);
@@ -629,15 +743,22 @@ static void do_request(g, fd, s, buf, bsize)
for (node = g->nodes.reg; node; node = node->next)
{
- int len;
+ int len = 0;
+ int r;
/* CAREFUL!!! These are parsed by "erl_epmd.erl" so a slight
change in syntax will break < OTP R3A */
- erts_snprintf(wbuf, sizeof(wbuf), "name %s at port %d\n",node->symname, node->port);
- len = strlen(wbuf);
+ len += copy_str(&wbuf[len], "name ");
+ len += copy_str(&wbuf[len], node->symname);
+ r = erts_snprintf(&wbuf[len], sizeof(wbuf)-len,
+ " at port %d\n", node->port);
+ if (r < 0)
+ goto failed_names_resp;
+ len += r;
if (reply(g, fd, wbuf, len) != len)
{
+ failed_names_resp:
dbg_tty_printf(g,1,"failed to send NAMES_RESP");
return;
}
@@ -665,16 +786,22 @@ static void do_request(g, fd, s, buf, bsize)
for (node = g->nodes.reg; node; node = node->next)
{
- int len;
+ int len = 0, r;
/* CAREFUL!!! These are parsed by "erl_epmd.erl" so a slight
change in syntax will break < OTP R3A */
- erts_snprintf(wbuf, sizeof(wbuf), "active name <%s> at port %d, fd = %d\n",
- node->symname, node->port, node->fd);
- len = strlen(wbuf) + 1;
- if (reply(g, fd,wbuf,len) != len)
+ len += copy_str(&wbuf[len], "active name <");
+ len += copy_str(&wbuf[len], node->symname);
+ r = erts_snprintf(&wbuf[len], sizeof(wbuf)-len,
+ "> at port %d, fd = %d\n",
+ node->port, node->fd);
+ if (r < 0)
+ goto failed_dump_resp;
+ len += r + 1;
+ if (reply(g, fd,wbuf,len) != len)
{
+ failed_dump_resp:
dbg_tty_printf(g,1,"failed to send DUMP_RESP");
return;
}
@@ -682,16 +809,22 @@ static void do_request(g, fd, s, buf, bsize)
for (node = g->nodes.unreg; node; node = node->next)
{
- int len;
+ int len = 0, r;
/* CAREFUL!!! These are parsed by "erl_epmd.erl" so a slight
change in syntax will break < OTP R3A */
- erts_snprintf(wbuf, sizeof(wbuf), "old/unused name <%s>, port = %d, fd = %d \n",
- node->symname,node->port, node->fd);
- len = strlen(wbuf) + 1;
- if (reply(g, fd,wbuf,len) != len)
+ len += copy_str(&wbuf[len], "old/unused name <");
+ len += copy_str(&wbuf[len], node->symname);
+ r = erts_snprintf(&wbuf[len], sizeof(wbuf)-len,
+ ">, port = %d, fd = %d \n",
+ node->port, node->fd);
+ if (r < 0)
+ goto failed_dump_resp2;
+ len += r + 1;
+ if (reply(g, fd,wbuf,len) != len)
{
+ failed_dump_resp2:
dbg_tty_printf(g,1,"failed to send DUMP_RESP");
return;
}
@@ -933,7 +1066,7 @@ static int node_unreg(EpmdVars *g,char *name)
Node *node = g->nodes.reg; /* Point to first node */
for (; node; prev = &node->next, node = node->next)
- if (strcmp(node->symname, name) == 0)
+ if (is_same_str(node->symname, name))
{
dbg_tty_printf(g,1,"unregistering '%s:%d', port %d",
node->symname, node->creation, node->port);
@@ -1013,6 +1146,7 @@ static int node_unreg_sock(EpmdVars *g,int fd)
*/
static Node *node_reg2(EpmdVars *g,
+ int namelen,
char* name,
int fd,
int port,
@@ -1025,6 +1159,7 @@ static Node *node_reg2(EpmdVars *g,
{
Node *prev; /* Point to previous node or NULL */
Node *node; /* Point to first node */
+ int sz;
/* Can be NULL; means old style */
if (extra == NULL)
@@ -1032,21 +1167,47 @@ static Node *node_reg2(EpmdVars *g,
/* Fail if node name is too long */
- if (strlen(name) > MAXSYMLEN)
+
+ if (namelen > MAXSYMLEN)
{
- dbg_printf(g,0,"node name is too long (%d) %s", strlen(name), name);
+ too_long_name:
+ dbg_printf(g,0,"node name is too long (%d) %s", namelen, name);
return NULL;
}
+
+ sz = verify_utf8(name, namelen, 0);
+ if (sz > 255)
+ goto too_long_name;
+
+ if (sz < 0) {
+ dbg_printf(g,0,"invalid node name encoding");
+ return NULL;
+ }
+
if (extralen > MAXSYMLEN)
{
- dbg_printf(g,0,"extra data is too long (%d) %s", strlen(name), name);
+#if 0
+ too_long_extra:
+#endif
+ dbg_printf(g,0,"extra data is too long (%d) %s", extralen, extra);
return NULL;
}
+#if 0 /* Should we require valid utf8 here? */
+ sz = verify_utf8(extra, extralen, 0);
+ if (sz > 255)
+ goto too_long_extra;
+
+ if (sz < 0) {
+ dbg_printf(g,0,"invalid extra data encoding");
+ return NULL;
+ }
+#endif
+
/* Fail if it is already registered */
for (node = g->nodes.reg; node; node = node->next)
- if (strcmp(node->symname, name) == 0)
+ if (is_same_str(node->symname, name))
{
dbg_printf(g,0,"node name already occupied %s", name);
return NULL;
@@ -1058,7 +1219,7 @@ static Node *node_reg2(EpmdVars *g,
prev = NULL;
for (node = g->nodes.unreg; node; prev = node, node = node->next)
- if (strcmp(node->symname, name) == 0)
+ if (is_same_str(node->symname, name))
{
dbg_tty_printf(g,1,"reusing slot with same name '%s'", node->symname);
@@ -1126,7 +1287,7 @@ static Node *node_reg2(EpmdVars *g,
node->lowvsn = lowvsn;
node->extralen = extralen;
memcpy(node->extra,extra,extralen);
- strcpy(node->symname,name);
+ copy_str(node->symname,name);
select_fd_set(g, fd);
if (highvsn == 0) {
diff --git a/erts/epmd/test/epmd_SUITE.erl b/erts/epmd/test/epmd_SUITE.erl
index fd9969ae2b..fc0abef400 100644
--- a/erts/epmd/test/epmd_SUITE.erl
+++ b/erts/epmd/test/epmd_SUITE.erl
@@ -45,6 +45,8 @@
register_names_1/1,
register_names_2/1,
register_duplicate_name/1,
+ unicode_name/1,
+ long_unicode_name/1,
get_port_nr/1,
slow_get_port_nr/1,
unregister_others_name_1/1,
@@ -107,7 +109,8 @@ suite() -> [{ct_hooks,[ts_install_cth]}].
all() ->
[register_name, register_names_1, register_names_2,
- register_duplicate_name, get_port_nr, slow_get_port_nr,
+ register_duplicate_name, unicode_name, long_unicode_name,
+ get_port_nr, slow_get_port_nr,
unregister_others_name_1, unregister_others_name_2,
register_overflow, name_with_null_inside,
name_null_terminated, stupid_names_req, no_data,
@@ -197,6 +200,37 @@ register_duplicate_name(Config) when is_list(Config) ->
?line ok = close(Sock), % Unregister
ok.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+unicode_name(doc) ->
+ ["Check that we can register and lookup a unicode name"];
+unicode_name(suite) ->
+ [];
+unicode_name(Config) when is_list(Config) ->
+ ok = epmdrun(),
+ NodeName = [16#1f608],
+ {ok,Sock} = register_node_v2(4711, 72, 0, 5, 5, NodeName, []),
+ {ok,NodeInfo} = port_please_v2(NodeName),
+ NodeName = NodeInfo#node_info.node_name,
+ ok = close(Sock),
+ ok.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+long_unicode_name(doc) ->
+ ["Check that we can register and lookup a long unicode name"];
+long_unicode_name(suite) ->
+ [];
+long_unicode_name(Config) when is_list(Config) ->
+ ok = epmdrun(),
+ BaseChar = 16#1f600,
+ NodeName = lists:seq(BaseChar, BaseChar+200), % will be 800 bytes long
+ {ok,Sock} = register_node_v2(4711, 72, 0, 5, 5, NodeName, []),
+ {ok,NodeInfo} = port_please_v2(NodeName),
+ NodeName = NodeInfo#node_info.node_name,
+ ok = close(Sock),
+ ok.
+
% Internal function to register a node name, no close, i.e. unregister
register_node(Name) ->
@@ -205,9 +239,10 @@ register_node(Name,Port) ->
register_node_v2(Port,$M,0,5,5,Name,"").
register_node_v2(Port, NodeType, Prot, HVsn, LVsn, Name, Extra) ->
+ Utf8Name = unicode:characters_to_binary(Name),
Req = [?EPMD_ALIVE2_REQ, put16(Port), NodeType, Prot,
put16(HVsn), put16(LVsn),
- size16(Name), Name,
+ put16(size(Utf8Name)), binary_to_list(Utf8Name),
size16(Extra), Extra],
case send_req(Req) of
{ok,Sock} ->
@@ -226,7 +261,8 @@ register_node_v2(Port, NodeType, Prot, HVsn, LVsn, Name, Extra) ->
% Internal function to fetch information about a node
port_please_v2(Name) ->
- case send_req([?EPMD_PORT_PLEASE2_REQ, Name]) of
+ case send_req([?EPMD_PORT_PLEASE2_REQ,
+ binary_to_list(unicode:characters_to_binary(Name))]) of
{ok,Sock} ->
case recv_until_sock_closes(Sock) of
{ok, Resp} ->
@@ -247,7 +283,7 @@ parse_port2_resp(Resp) ->
ELen:16,Extra:ELen/binary>> when Res =:= 0 ->
{ok, #node_info{port=Port,node_type=NodeType,prot=Prot,
hvsn=HVsn,lvsn=LVsn,
- node_name=binary_to_list(NodeName),
+ node_name=unicode:characters_to_list(NodeName),
extra=binary_to_list(Extra)}};
_Other ->
test_server:format("invalid port2 resp: ~p~n",
@@ -737,7 +773,7 @@ buffer_overrun_2(doc) ->
["Test security vulnerability in fake extra lengths in alive2_req"];
buffer_overrun_2(Config) when is_list(Config) ->
?line ok = epmdrun(),
- ?line [false | Rest] = [hostile2(N) || N <- lists:seq(255,10000)],
+ ?line [false | Rest] = [hostile2(N) || N <- lists:seq(255*4,10000)],
?line true = alltrue(Rest),
ok.
hostile(N) ->
@@ -880,6 +916,7 @@ no_live_killing(Config) when is_list(Config) ->
?line close(Sock3),
ok.
+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Terminate all tests with killing epmd.