diff options
author | Sverker Eriksson <sverker@erlang.org> | 2013-01-23 18:09:35 +0100 |
---|---|---|
committer | Sverker Eriksson <sverker@erlang.org> | 2013-01-23 18:09:35 +0100 |
commit | b8e623410d1c22fe6d5fdeb8ccb0b2305533f033 (patch) | |
tree | 708d64e36e18b61ae1801c02ec3aeef42a697be3 /erts | |
parent | e99df74bee7c245ec76678e336fcd09d4b51a089 (diff) | |
parent | d6e3e256b850050b7a86323b2948009d5fcc30a9 (diff) | |
download | otp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.tar.gz otp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.tar.bz2 otp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.zip |
Merge branch 'sverk/r16/utf8-atoms'
* sverk/r16/utf8-atoms:
erl_interface: Fix bug when transcoding atoms from and to UTF8
erl_interface: Changed erlang_char_encoding interface
erts: Testcase doing unicode atom printout with ~w
erl_interface: even more utf8 atom stuff
erts: Fix bug in analyze_utf8 causing faulty latin1 detection
Add UTF-8 node name support for epmd
workaround...
Fix merge conflict with hasse
UTF-8 atom documentation
test case
erl_interface: utf8 atoms continued
Add utf8 atom distribution test cases
atom fixes for NIFs and atom_to_binary
UTF-8 support for distribution
Implement UTF-8 atom support for jinterface
erl_interface: Enable decode of unicode atoms
stdlib: Fix printing of unicode atoms
erts: Change internal representation of atoms to utf8
erts: Refactor rename DFLAG(S)_INTERNAL_TAGS for conformity
Conflicts:
erts/emulator/beam/io.c
OTP-10753
Diffstat (limited to 'erts')
32 files changed, 1608 insertions, 376 deletions
diff --git a/erts/doc/src/erl_dist_protocol.xml b/erts/doc/src/erl_dist_protocol.xml index 6c725fc82d..0252187be5 100644 --- a/erts/doc/src/erl_dist_protocol.xml +++ b/erts/doc/src/erl_dist_protocol.xml @@ -547,13 +547,289 @@ If Result > 0, the packet only consists of [119, Result]. --> </section> - + <marker id="distribution_handshake"/> <section> - <title>Handshake</title> - <p> - The handshake is discussed in detail in the internal documentation for - the kernel (Erlang) application. - </p> + <title>Distribution Handshake</title> + <p> + This section describes the distribution handshake protocol + introduced in the OTP-R6 release of Erlang/OTP. This + description was previously located in + <c>$ERL_TOP/lib/kernel/internal_doc/distribution_handshake.txt</c>, + and has more or less been copied and "formatted" here. It has been + more or less unchanged since the year 1999, but the handshake + should not have changed much since then either. + </p> + <section> + <title>General</title> + <p> + The TCP/IP distribution uses a handshake which expects a + connection based protocol, i.e. the protocol does not include + any authentication after the handshake procedure. + </p> + <p> + This is not entirely safe, as it is vulnerable against takeover + attacks, but it is a tradeoff between fair safety and performance. + </p> + <p> + The cookies are never sent in cleartext and the handshake procedure + expects the client (called A) to be the first one to prove that it can + generate a sufficient digest. The digest is generated with the + MD5 message digest algorithm and the challenges are expected to be very + random numbers. + </p> + </section> + <section> + <title>Definitions</title> + <p> + A challenge is a 32 bit integer number in big endian order. Below the function + <c>gen_challenge()</c> returns a random 32 bit integer used as a challenge. + </p> + <p> + A digest is a (16 bytes) MD5 hash of the Challenge (as text) concatenated + with the cookie (as text). Below, the function <c>gen_digest(Challenge, Cookie)</c> + generates a digest as described above. + </p> + <p> + An out_cookie is the cookie used in outgoing communication to a certain node, + so that A's out_cookie for B should correspond with B's in_cookie for A and + the other way around. A's out_cookie for B and A's in_cookie for B need <em>NOT</em> + be the same. Below the function <c>out_cookie(Node)</c> returns the current + node's out_cookie for <c>Node</c>. + </p> + <p> + An in_cookie is the cookie expected to be used by another node when + communicating with us, so that A's in_cookie for B corresponds with B's + out_cookie for A. Below the function <c>in_cookie(Node)</c> returns the current + node's <c>in_cookie</c> for <c>Node</c>. + </p> + <p> + The cookies are text strings that can be viewed as passwords. + </p> + <p> + Every message in the handshake starts with a 16 bit big endian integer + which contains the length of the message (not counting the two initial bytes). + In erlang this corresponds to the <c>gen_tcp</c> option <c>{packet, 2}</c>. Note that after + the handshake, the distribution switches to 4 byte packet headers. + </p> + + </section> + <section> + <title>The Handshake in Detail</title> + <p> + Imagine two nodes, node A, which initiates the handshake and node B, which + accepts the connection. + </p> + <taglist> + <tag>1) connect/accept</tag> + <item><p>A connects to B via TCP/IP and B accepts the connection.</p></item> + <tag>2) send_name/receive_name</tag> + <item><p>A sends an initial identification to B. B receives the message. + The message looks like this (every "square" being one byte and the packet + header removed): + </p> +<pre> ++---+--------+--------+-----+-----+-----+-----+-----+-----+-...-+-----+ +|'n'|Version0|Version1|Flag0|Flag1|Flag2|Flag3|Name0|Name1| ... |NameN| ++---+--------+--------+-----+-----+-----+-----+-----+-----+-... +-----+ +</pre> + <p> + The 'n' is just a message tag. + Version0 and Version1 is the distribution version selected by node A, + based on information from EPMD. (16 bit big endian) + Flag0 ... Flag3 are capability flags, the capabilities defined in + <c>$ERL_TOP/lib/kernel/include/dist.hrl</c>. + (32 bit big endian) + Name0 ... NameN is the full nodename of A, as a string of bytes (the + packet length denotes how long it is). + </p></item> + <tag>3) recv_status/send_status</tag> + <item><p>B sends a status message to A, which indicates + if the connection is allowed. The following status codes are defined:</p> + <taglist> + <tag><c>ok</c></tag> + <item>The handshake will continue.</item> + <tag><c>ok_simultaneous</c></tag> + <item>The handshake will continue, but A is informed that B + has another ongoing connection attempt that will be + shut down (simultaneous connect where A's name is + greater than B's name, compared literally).</item> + <tag><c>nok</c></tag> + <item>The handshake will not continue, as B already has an ongoing handshake + which it itself has initiated. (simultaneous connect where B's name is + greater than A's).</item> + <tag><c>not_allowed</c></tag> + <item>The connection is disallowed for some (unspecified) security + reason.</item> + <tag><c>alive</c></tag> + <item>A connection to the node is already active, which either means + that node A is confused or that the TCP connection breakdown + of a previous node with this name has not yet reached node B. + See 3B below.</item> + </taglist> + <p>This is the format of the status message:</p> +<pre> ++---+-------+-------+-...-+-------+ +|'s'|Status0|Status1| ... |StatusN| ++---+-------+-------+-...-+-------+ +</pre> + <p> + 's' is the message tag Status0 ... StatusN is the status as a string (not terminated) + </p> + </item> + <tag>3B) send_status/recv_status</tag> + <item><p>If status was 'alive', node A will answer with + another status message containing either 'true' which means that the + connection should continue (The old connection from this node is broken), or + <c>'false'</c>, which simply means that the connection should be closed, the + connection attempt was a mistake.</p></item> + <tag>4) recv_challenge/send_challenge</tag> + <item><p>If the status was <c>ok</c> or <c>ok_simultaneous</c>, + The handshake continues with B sending A another message, the challenge. + The challenge contains the same type of information as the "name" message + initially sent from A to B, with the addition of a 32 bit challenge:</p> +<pre> ++---+--------+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-...-+-----+ +|'n'|Version0|Version1|Flag0|Flag1|Flag2|Flag3|Chal0|Chal1|Chal2|Chal3|Name0|Name1| ... |NameN| ++---+--------+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-... +-----+ +</pre> + <p> + Where Chal0 ... Chal3 is the challenge as a 32 bit big endian integer + and the other fields are B's version, flags and full nodename. + </p></item> + <tag>5) send_challenge_reply/recv_challenge_reply</tag> + <item><p>Now A has generated a digest and its own challenge. Those are + sent together in a package to B:</p> +<pre> ++---+-----+-----+-----+-----+-----+-----+-----+-----+-...-+------+ +|'r'|Chal0|Chal1|Chal2|Chal3|Dige0|Dige1|Dige2|Dige3| ... |Dige15| ++---+-----+-----+-----+-----+-----+-----+-----+-----+-...-+------+ +</pre> + <p> + Where 'r' is the tag, Chal0 ... Chal3 is A's challenge for B to handle and + Dige0 ... Dige15 is the digest that A constructed from the challenge B sent + in the previous step. + </p></item> + <tag>6) recv_challenge_ack/send_challenge_ack</tag> + <item><p>B checks that the digest received from A is correct and generates a + digest from the challenge received from A. The digest is then sent to A. The + message looks like this:</p> +<pre> ++---+-----+-----+-----+-----+-...-+------+ +|'a'|Dige0|Dige1|Dige2|Dige3| ... |Dige15| ++---+-----+-----+-----+-----+-...-+------+ +</pre> + <p> + Where 'a' is the tag and Dige0 ... Dige15 is the digest calculated by B + for A's challenge.</p></item> + <tag>7)</tag> + <item><p>A checks the digest from B and the connection is up.</p></item> + </taglist> + </section> + <section> + <title>Semigraphic View</title> +<pre> +A (initiator) B (acceptor) + +TCP connect -----------------------------------------> + TCP accept + +send_name -----------------------------------------> + recv_name + + <---------------------------------------- send_status +recv_status +(if status was 'alive' + send_status - - - - - - - - - - - - - - - - - - - -> + recv_status) + ChB = gen_challenge() + (ChB) + <---------------------------------------- send_challenge +recv_challenge + +ChA = gen_challenge(), +OCA = out_cookie(B), +DiA = gen_digest(ChB,OCA) + (ChA, DiA) +send_challenge_reply --------------------------------> + recv_challenge_reply + ICB = in_cookie(A), + check: + DiA == gen_digest + (ChB, ICB) ? + - if OK: + OCB = out_cookie(A), + DiB = gen_digest + (DiB) (ChA, OCB) + <----------------------------------------- send_challenge_ack +recv_challenge_ack DONE +ICA = in_cookie(B), - else +check: CLOSE +DiB == gen_digest(ChA,ICA) ? +- if OK + DONE +- else + CLOSE +</pre> + </section> + <marker id="dflags"/> + <section> + <title>The Currently Defined Distribution Flags</title> + <p> + Currently (OTP-R16) the following capability flags are defined: + </p> +<pre> +%% The node should be published and part of the global namespace +-define(DFLAG_PUBLISHED,1). + +%% The node implements an atom cache (obsolete) +-define(DFLAG_ATOM_CACHE,2). + +%% The node implements extended (3 * 32 bits) references. This is +%% required today. If not present connection will be refused. +-define(DFLAG_EXTENDED_REFERENCES,4). + +%% The node implements distributed process monitoring. +-define(DFLAG_DIST_MONITOR,8). + +%% The node uses separate tag for fun's (lambdas) in the distribution protocol. +-define(DFLAG_FUN_TAGS,16#10). + +%% The node implements distributed named process monitoring. +-define(DFLAG_DIST_MONITOR_NAME,16#20). + +%% The (hidden) node implements atom cache (obsolete) +-define(DFLAG_HIDDEN_ATOM_CACHE,16#40). + +%% The node understand new fun-tags +-define(DFLAG_NEW_FUN_TAGS,16#80). + +%% The node is capable of handling extended pids and ports. This is +%% required today. If not present connection will be refused. +-define(DFLAG_EXTENDED_PIDS_PORTS,16#100). + +%% +-define(DFLAG_EXPORT_PTR_TAG,16#200). + +%% +-define(DFLAG_BIT_BINARIES,16#400). + +%% The node understands new float format +-define(DFLAG_NEW_FLOATS,16#800). + +%% +-define(DFLAG_UNICODE_IO,16#1000). + +%% The node implements atom cache in distribution header. +-define(DFLAG_DIST_HDR_ATOM_CACHE,16#2000). + +%% The node understand the SMALL_ATOM_EXT tag +-define(DFLAG_SMALL_ATOM_TAGS, 16#4000). + +%% The node understand UTF-8 encoded atoms +-define(DFLAG_UTF8_ATOMS, 16#10000). + +</pre> + </section> </section> <section> diff --git a/erts/doc/src/erl_ext_dist.xml b/erts/doc/src/erl_ext_dist.xml index fd2da2cfe3..28afea8b29 100644 --- a/erts/doc/src/erl_ext_dist.xml +++ b/erts/doc/src/erl_ext_dist.xml @@ -119,10 +119,39 @@ <cell align="center">Data</cell> </row> <tcaption></tcaption></table> + <marker id="utf8_atoms"/> + <note> + <p>As of ERTS version 5.10 (OTP-R16) support + for UTF-8 encoded atoms has been introduced in the external format. + However, only characters that can be encoded using Latin1 (ISO-8859-1) + are currently supported in atoms. The support for UTF-8 encoded atoms + in the external format has been implemented in order to be able to support + all Unicode characters in atoms in <em>some future release</em>. Full + support for Unicode atoms will not happen before OTP-R18, and might + be introduced even later than that. Until full Unicode support for + atoms has been introduced, it is an <em>error</em> to pass atoms containing + characters that cannot be encoded in Latin1, and <em>the behavior is + undefined</em>.</p> + <p>When the + <seealso marker="erl_dist_protocol#dflags"><c>DFLAG_UTF8_ATOMS</c></seealso> + distribution flag has been exchanged between both nodes in the + <seealso marker="erl_dist_protocol#distribution_handshake">distribution handshake</seealso>, + all atoms in the distribution header will be encoded in UTF-8; otherwise, + all atoms in the distribution header will be encoded in Latin1. The two + new tags <seealso marker="#ATOM_UTF8_EXT">ATOM_UTF8_EXT</seealso>, and + <seealso marker="#SMALL_ATOM_UTF8_EXT">SMALL_ATOM_UTF8_EXT</seealso> + will only be used if the <c>DFLAG_UTF8_ATOMS</c> distribution flag has + been exchanged between nodes, or if an atom containing characters + that cannot be encoded in Latin1 is encountered. + </p> + <p>The maximum number of allowed characters in an atom is 255. In the + UTF-8 case each character may need 4 bytes to be encoded. + </p> + </note> </section> - <section> - <marker id="distribution_header"/> + <marker id="distribution_header"/> + <section> <title>Distribution header</title> <p> As of erts version 5.7.2 the old atom cache protocol was @@ -219,8 +248,7 @@ <p> The least significant bit in that half byte is the <c>LongAtoms</c> flag. If it is set, 2 bytes are used for atom lengths instead of - 1 byte in the distribution header. However, the current emulator - cannot handle long atoms, so it will currently always be 0. + 1 byte in the distribution header. </p> <p> After the <c>Flags</c> field follow the <c>AtomCacheRefs</c>. The @@ -247,16 +275,26 @@ <p> <c>InternalSegmentIndex</c> together with the <c>SegmentIndex</c> completely identify the location of an atom cache entry in the - atom cache. <c>Length</c> is number of one byte characters that - the atom text consists of. Length is a two byte big endian integer + atom cache. <c>Length</c> is number of bytes that <c>AtomText</c> + consists of. Length is a two byte big endian integer if the <c>LongAtoms</c> flag has been set, otherwise a one byte - integer. Subsequent <c>CachedAtomRef</c>s with the same + integer. When the + <seealso marker="erl_dist_protocol#dflags"><c>DFLAG_UTF8_ATOMS</c></seealso> + distribution flag has been exchanged between both nodes in the + <seealso marker="erl_dist_protocol#distribution_handshake">distribution handshake</seealso>, + characters in <c>AtomText</c> is encoded in UTF-8; otherwise, + encoded in Latin1. Subsequent <c>CachedAtomRef</c>s with the same <c>SegmentIndex</c> and <c>InternalSegmentIndex</c> as this <c>NewAtomCacheRef</c> will refer to this atom until a new <c>NewAtomCacheRef</c> with the same <c>SegmentIndex</c> and <c>InternalSegmentIndex</c> appear. </p> <p> + For more information on encoding of atoms, see + <seealso marker="#utf8_atoms">note on UTF-8 encoded atoms</seealso> + in the beginning of this document. + </p> + <p> If the <c>NewCacheEntryFlag</c> for the next <c>AtomCacheRef</c> has not been set, a <c>CachedAtomRef</c> on the following format will follow: @@ -383,9 +421,9 @@ <tcaption></tcaption></table> <p> An atom is stored with a 2 byte unsigned length in big-endian order, - followed by <c>Len</c> numbers of 8 bit characters that forms the - <c>AtomName</c>. - Note: The maximum allowed value for <c>Len</c> is 255. + followed by <c>Len</c> numbers of 8 bit Latin1 characters that forms + the <c>AtomName</c>. + <em>Note</em>: The maximum allowed value for <c>Len</c> is 255. </p> </section> @@ -754,12 +792,14 @@ <tcaption></tcaption></table> <p> An atom is stored with a 1 byte unsigned length, - followed by <c>Len</c> numbers of 8 bit characters that + followed by <c>Len</c> numbers of 8 bit Latin1 characters that forms the <c>AtomName</c>. Longer atoms can be represented by <seealso marker="#ATOM_EXT">ATOM_EXT</seealso>. <em>Note</em> the <c>SMALL_ATOM_EXT</c> was introduced in erts version 5.7.2 and - require a small atom distribution flag exchanged in the distribution - handshake. + require an exchange of the + <seealso marker="erl_dist_protocol#dflags"><c>DFLAG_SMALL_ATOM_TAGS</c></seealso> + distribution flag in the + <seealso marker="erl_dist_protocol#distribution_handshake">distribution handshake</seealso>. </p> </section> @@ -1007,7 +1047,62 @@ This term is used in minor version 1 of the external format. </p> </section> + <section> + <marker id="ATOM_UTF8_EXT"/> + <title>ATOM_UTF8_EXT</title> + + <table align="left"> + <row> + <cell align="center">1</cell> + <cell align="center">2</cell> + <cell align="center">Len</cell> + </row> + <row> + <cell align="center"><c>118</c></cell> + <cell align="center"><c>Len</c></cell> + <cell align="center"><c>AtomName</c></cell> + </row> + <tcaption></tcaption></table> + <p> + An atom is stored with a 2 byte unsigned length in big-endian order, + followed by <c>Len</c> bytes containing the <c>AtomName</c> encoded + in UTF-8. + </p> + <p> + For more information on encoding of atoms, see + <seealso marker="#utf8_atoms">note on UTF-8 encoded atoms</seealso> + in the beginning of this document. + </p> + </section> + <section> + <marker id="SMALL_ATOM_UTF8_EXT"/> + <title>SMALL_ATOM_UTF8_EXT</title> + + <table align="left"> + <row> + <cell align="center">1</cell> + <cell align="center">1</cell> + <cell align="center">Len</cell> + </row> + <row> + <cell align="center"><c>119</c></cell> + <cell align="center"><c>Len</c></cell> + <cell align="center"><c>AtomName</c></cell> + </row> + <tcaption></tcaption></table> + <p> + An atom is stored with a 1 byte unsigned length, + followed by <c>Len</c> bytes containing the <c>AtomName</c> encoded + in UTF-8. Longer atoms encoded in UTF-8 can be represented using + <seealso marker="#ATOM_UTF8_EXT">ATOM_UTF8_EXT</seealso>. + </p> + <p> + For more information on encoding of atoms, see + <seealso marker="#utf8_atoms">note on UTF-8 encoded atoms</seealso> + in the beginning of this document. + </p> + </section> </chapter> diff --git a/erts/doc/src/erlang.xml b/erts/doc/src/erlang.xml index d09f286e36..315dc323ba 100644 --- a/erts/doc/src/erlang.xml +++ b/erts/doc/src/erlang.xml @@ -277,7 +277,9 @@ the binary contains Unicode characters greater than 16#FF. In a future release, such Unicode characters might be allowed and <c>binary_to_atom(<anno>Binary</anno>, utf8)</c> - will not fail in that case.</p></note> + will not fail in that case. For more information on Unicode support in atoms + see <seealso marker="erl_ext_dist#utf8_atoms">note on UTF-8 encoded atoms</seealso> + in the chapter about the external term format in the ERTS User's Guide.</p></note> <pre> > <input>binary_to_atom(<<"Erlang">>, latin1).</input> @@ -1712,9 +1714,11 @@ os_prompt% </pre> <desc> <p>Returns the atom whose text representation is <c><anno>String</anno></c>.</p> <p><c><anno>String</anno></c> may only contain ISO-latin-1 - characterns (i.e. numbers below 256) as the current + characters (i.e. numbers below 256) as the current implementation does not allow unicode characters >= 256 in - atoms.</p> + atoms. For more information on Unicode support in atoms + see <seealso marker="erl_ext_dist#utf8_atoms">note on UTF-8 encoded atoms</seealso> + in the chapter about the external term format in the ERTS User's Guide.</p> <pre> > <input>list_to_atom("Erlang").</input> 'Erlang'</pre> diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c index d7c7f117cf..82dd320ea9 100644 --- a/erts/emulator/beam/atom.c +++ b/erts/emulator/beam/atom.c @@ -111,7 +111,7 @@ atom_text_alloc(int bytes) { byte *res; - ASSERT(bytes <= MAX_ATOM_LENGTH); + ASSERT(bytes <= MAX_ATOM_SZ_LIMIT); if (atom_text_pos + bytes >= atom_text_end) { more_atom_space(); } @@ -162,6 +162,7 @@ atom_alloc(Atom* tmpl) obj->name = atom_text_alloc(tmpl->len); sys_memcpy(obj->name, tmpl->name, tmpl->len); obj->len = tmpl->len; + obj->latin1_chars = tmpl->latin1_chars; obj->slot.index = -1; /* @@ -192,44 +193,146 @@ atom_free(Atom* obj) erts_free(ERTS_ALC_T_ATOM, (void*) obj); } +static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp) +{ + byte* dst; + const byte* src = *srcp; + int i, len = *lenp; + + for (i=0 ; i < len; ++i) { + if (src[i] & 0x80) { + goto need_convertion; + } + } + return; + +need_convertion: + sys_memcpy(conv_buf, src, i); + dst = conv_buf + i; + for ( ; i < len; ++i) { + unsigned char chr = src[i]; + if (!(chr & 0x80)) { + *dst++ = chr; + } + else { + *dst++ = 0xC0 | (chr >> 6); + *dst++ = 0x80 | (chr & 0x3F); + } + } + *srcp = conv_buf; + *lenp = dst - conv_buf; +} + +/* + * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned! + */ Eterm -am_atom_put(const char* name, int len) +erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) { + byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1]; + const byte *text = name; + int tlen = len; + Sint no_latin1_chars; Atom a; - Eterm ret; int aix; - /* - * Silently truncate the atom if it is too long. Overlong atoms - * could occur in situations where we have no good way to return - * an error, such as in the I/O system. (Unfortunately, many - * drivers don't check for errors.) - * - * If an error should be produced for overlong atoms (such in - * list_to_atom/1), the caller should check the length before - * calling this function. - */ - if (len > MAX_ATOM_LENGTH) { - len = MAX_ATOM_LENGTH; - } #ifdef ERTS_ATOM_PUT_OPS_STAT erts_smp_atomic_inc_nob(&atom_put_ops); #endif - a.len = len; - a.name = (byte*)name; + + if (tlen < 0) { + if (trunc) + tlen = 0; + else + return THE_NON_VALUE; + } + + switch (enc) { + case ERTS_ATOM_ENC_7BIT_ASCII: + if (tlen > MAX_ATOM_CHARACTERS) { + if (trunc) + tlen = MAX_ATOM_CHARACTERS; + else + return THE_NON_VALUE; + } +#ifdef DEBUG + for (aix = 0; aix < len; aix++) { + ASSERT((name[aix] & 0x80) == 0); + } +#endif + no_latin1_chars = tlen; + break; + case ERTS_ATOM_ENC_LATIN1: + if (tlen > MAX_ATOM_CHARACTERS) { + if (trunc) + tlen = MAX_ATOM_CHARACTERS; + else + return THE_NON_VALUE; + } + no_latin1_chars = tlen; + latin1_to_utf8(utf8_copy, &text, &tlen); + break; + case ERTS_ATOM_ENC_UTF8: + /* First sanity check; need to verify later */ + if (tlen > MAX_ATOM_SZ_LIMIT && !trunc) + return THE_NON_VALUE; + break; + } + + a.len = tlen; + a.name = (byte *) text; atom_read_lock(); aix = index_get(&erts_atom_table, (void*) &a); atom_read_unlock(); - if (aix >= 0) - ret = make_atom(aix); - else { - atom_write_lock(); - ret = make_atom(index_put(&erts_atom_table, (void*) &a)); - atom_write_unlock(); + if (aix >= 0) { + /* Already in table no need to verify it */ + return make_atom(aix); } - return ret; + + if (enc == ERTS_ATOM_ENC_UTF8) { + /* Need to verify encoding and length */ + byte *err_pos; + Uint no_chars; + switch (erts_analyze_utf8_x((byte *) text, + (Uint) tlen, + &err_pos, + &no_chars, NULL, + &no_latin1_chars, + MAX_ATOM_CHARACTERS)) { + case ERTS_UTF8_OK: + ASSERT(no_chars <= MAX_ATOM_CHARACTERS); + break; + case ERTS_UTF8_OK_MAX_CHARS: + /* Truncated... */ + if (!trunc) + return THE_NON_VALUE; + ASSERT(no_chars == MAX_ATOM_CHARACTERS); + tlen = err_pos - text; + break; + default: + /* Bad utf8... */ + return THE_NON_VALUE; + } + } + + ASSERT(tlen <= MAX_ATOM_SZ_LIMIT); + ASSERT(-1 <= no_latin1_chars && no_latin1_chars <= MAX_ATOM_CHARACTERS); + + a.len = tlen; + a.latin1_chars = (Sint16) no_latin1_chars; + a.name = (byte *) text; + atom_write_lock(); + aix = index_put(&erts_atom_table, (void*) &a); + atom_write_unlock(); + return make_atom(aix); } +Eterm +am_atom_put(const char* name, int len) +{ + /* Assumes 7-bit ascii; use erts_atom_put() for other encodings... */ + return erts_atom_put((byte *) name, len, ERTS_ATOM_ENC_7BIT_ASCII, 1); +} int atom_table_size(void) { @@ -264,14 +367,19 @@ int atom_table_sz(void) } int -erts_atom_get(const char *name, int len, Eterm* ap) +erts_atom_get(const char *name, int len, Eterm* ap, int is_latin1) { + byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1]; Atom a; int i; int res; - a.len = len; + a.len = (Sint16) len; a.name = (byte *)name; + if (is_latin1) { + latin1_to_utf8(utf8_copy, (const byte**)&a.name, &len); + a.len = (Sint16) len; + } atom_read_lock(); i = index_get(&erts_atom_table, (void*) &a); res = i < 0 ? 0 : (*ap = make_atom(i), 1); @@ -333,8 +441,15 @@ init_atom_table(void) for (i = 0; erl_atom_names[i] != 0; i++) { int ix; a.len = strlen(erl_atom_names[i]); + a.latin1_chars = a.len; a.name = (byte*)erl_atom_names[i]; a.slot.index = i; +#ifdef DEBUG + /* Verify 7-bit ascii */ + for (ix = 0; ix < a.len; ix++) { + ASSERT((a.name[ix] & 0x80) == 0); + } +#endif ix = index_put(&erts_atom_table, (void*) &a); atom_text_pos -= a.len; atom_space -= a.len; diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h index fd9c04d3d0..f721999a4c 100644 --- a/erts/emulator/beam/atom.h +++ b/erts/emulator/beam/atom.h @@ -26,7 +26,9 @@ #include "erl_atom_table.h" -#define MAX_ATOM_LENGTH 255 +#define MAX_ATOM_CHARACTERS 255 +#define MAX_ATOM_SZ_FROM_LATIN1 (2*MAX_ATOM_CHARACTERS) +#define MAX_ATOM_SZ_LIMIT (4*MAX_ATOM_CHARACTERS) /* theoretical byte limit */ #define ATOM_LIMIT (1024*1024) #define MIN_ATOM_TABLE_SIZE 8192 @@ -45,7 +47,8 @@ */ typedef struct atom { IndexSlot slot; /* MUST BE LOCATED AT TOP OF STRUCT!!! */ - int len; /* length of atom name */ + Sint16 len; /* length of atom name (UTF-8 encoded) */ + Sint16 latin1_chars; /* 0-255 if atom can be encoded in latin1; otherwise, -1 */ int ord0; /* ordinal value of first 3 bytes + 7 bits */ byte* name; /* name of atom */ } Atom; @@ -53,8 +56,8 @@ typedef struct atom { extern IndexTable erts_atom_table; ERTS_GLB_INLINE Atom* atom_tab(Uint i); -ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term); -ERTS_GLB_INLINE int erts_is_atom_str(char *str, Eterm term); +ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term); +ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1); #if ERTS_GLB_INLINE_INCL_FUNC_DEF ERTS_GLB_INLINE Atom* @@ -63,7 +66,7 @@ atom_tab(Uint i) return (Atom *) erts_index_lookup(&erts_atom_table, i); } -ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term) +ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term) { Atom *a; if (!is_atom(term)) @@ -73,43 +76,70 @@ ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term) && sys_memcmp((void *) a->name, (void *) text, len) == 0); } -ERTS_GLB_INLINE int erts_is_atom_str(char *str, Eterm term) +ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1) { Atom *a; int i, len; - char *aname; + const byte* aname; + const byte* s = (const byte*) str; + if (!is_atom(term)) return 0; a = atom_tab(atom_val(term)); len = a->len; - aname = (char *) a->name; - for (i = 0; i < len; i++) - if (aname[i] != str[i] || str[i] == '\0') - return 0; - return str[len] == '\0'; + aname = a->name; + if (is_latin1) { + for (i = 0; i < len; s++) { + if (aname[i] < 0x80) { + if (aname[i] != *s || *s == '\0') + return 0; + i++; + } + else { + if (aname[i] != (0xC0 | (*s >> 6)) || + aname[i+1] != (0x80 | (*s & 0x3F))) { + return 0; + } + i += 2; + } + } + } + else { + for (i = 0; i < len; i++, s++) + if (aname[i] != *s || *s == '\0') + return 0; + } + return *s == '\0'; } #endif +typedef enum { + ERTS_ATOM_ENC_7BIT_ASCII, + ERTS_ATOM_ENC_LATIN1, + ERTS_ATOM_ENC_UTF8 +} ErtsAtomEncoding; + /* * Note, ERTS_IS_ATOM_STR() expects the first argument to be a - * string literal. + * 7-bit ASCII string literal. */ #define ERTS_IS_ATOM_STR(LSTR, TERM) \ - (erts_is_atom_bytes((byte *) LSTR, sizeof(LSTR) - 1, (TERM))) + (erts_is_atom_utf8_bytes((byte *) LSTR, sizeof(LSTR) - 1, (TERM))) #define ERTS_DECL_AM(S) Eterm AM_ ## S = am_atom_put(#S, sizeof(#S) - 1) #define ERTS_INIT_AM(S) AM_ ## S = am_atom_put(#S, sizeof(#S) - 1) int atom_table_size(void); /* number of elements */ int atom_table_sz(void); /* table size in bytes, excluding stored objects */ -Eterm am_atom_put(const char*, int); /* most callers pass plain char*'s */ +Eterm am_atom_put(const char*, int); /* ONLY 7-bit ascii! */ +Eterm erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc); int atom_erase(byte*, int); int atom_static_put(byte*, int); void init_atom_table(void); void atom_info(int, void *); void dump_atoms(int, void *); -int erts_atom_get(const char* name, int len, Eterm* ap); +int erts_atom_get(const char* name, int len, Eterm* ap, int is_latin1); void erts_atom_get_text_space_sizes(Uint *reserved, Uint *used); #endif diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names index b74e2785e5..590b2fc960 100644 --- a/erts/emulator/beam/atom.names +++ b/erts/emulator/beam/atom.names @@ -18,6 +18,8 @@ # # +# IMPORTANT! All atoms defined here *need* to be in 7-bit ascii! +# # File format: # # Lines starting with '#' are ignored. diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c index b51f076a5d..8b4135e21d 100644 --- a/erts/emulator/beam/beam_load.c +++ b/erts/emulator/beam/beam_load.c @@ -1230,7 +1230,7 @@ load_atom_table(LoaderState* stp) GetByte(stp, n); GetString(stp, atom, n); - stp->atom[i] = am_atom_put((char*)atom, n); + stp->atom[i] = erts_atom_put(atom, n, ERTS_ATOM_ENC_LATIN1, 1); } /* @@ -1240,7 +1240,7 @@ load_atom_table(LoaderState* stp) if (is_nil(stp->module)) { stp->module = stp->atom[1]; } else if (stp->atom[1] != stp->module) { - char sbuf[256]; + char sbuf[MAX_ATOM_SZ_FROM_LATIN1]; Atom* ap; ap = atom_tab(atom_val(stp->atom[1])); @@ -1620,7 +1620,7 @@ read_line_table(LoaderState* stp) GetInt(stp, 2, n); GetString(stp, fname, n); - stp->fname[i] = am_atom_put((char*)fname, n); + stp->fname[i] = erts_atom_put(fname, n, ERTS_ATOM_ENC_LATIN1, 1); } } diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c index 182129cd36..bde70911a2 100644 --- a/erts/emulator/beam/bif.c +++ b/erts/emulator/beam/bif.c @@ -2604,9 +2604,13 @@ BIF_RETTYPE delete_element_2(BIF_ALIST_3) BIF_RETTYPE atom_to_list_1(BIF_ALIST_1) { - Uint need; - Eterm* hp; Atom* ap; + Uint num_chars, num_built, num_eaten; + byte* err_pos; + Eterm res; +#ifdef DEBUG + int ares; +#endif if (is_not_atom(BIF_ARG_1)) BIF_ERROR(BIF_P, BADARG); @@ -2615,9 +2619,18 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1) ap = atom_tab(atom_val(BIF_ARG_1)); if (ap->len == 0) BIF_RET(NIL); /* the empty atom */ - need = ap->len*2; - hp = HAlloc(BIF_P, need); - BIF_RET(buf_to_intlist(&hp,(char*)ap->name,ap->len, NIL)); + +#ifdef DEBUG + ares = +#endif + erts_analyze_utf8(ap->name, ap->len, &err_pos, &num_chars, NULL); + ASSERT(ares == ERTS_UTF8_OK); + + res = erts_utf8_to_list(BIF_P, num_chars, ap->name, ap->len, ap->len, + &num_built, &num_eaten, NIL); + ASSERT(num_built == num_chars); + ASSERT(num_eaten == ap->len); + BIF_RET(res); } /**********************************************************************/ @@ -2627,18 +2640,19 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1) BIF_RETTYPE list_to_atom_1(BIF_ALIST_1) { Eterm res; - char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_LENGTH); - int i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_LENGTH); + char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS); + int i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS); if (i < 0) { erts_free(ERTS_ALC_T_TMP, (void *) buf); i = list_length(BIF_ARG_1); - if (i > MAX_ATOM_LENGTH) { + if (i > MAX_ATOM_CHARACTERS) { BIF_ERROR(BIF_P, SYSTEM_LIMIT); } BIF_ERROR(BIF_P, BADARG); } - res = am_atom_put(buf, i); + res = erts_atom_put((byte *) buf, i, ERTS_ATOM_ENC_LATIN1, 1); + ASSERT(is_atom(res)); erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_RET(res); } @@ -2648,16 +2662,16 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1) BIF_RETTYPE list_to_existing_atom_1(BIF_ALIST_1) { int i; - char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_LENGTH); + char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS); - if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_LENGTH)) < 0) { + if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS)) < 0) { error: erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_ERROR(BIF_P, BADARG); } else { Eterm a; - if (erts_atom_get(buf, i, &a)) { + if (erts_atom_get(buf, i, &a, 1)) { erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_RET(a); } else { diff --git a/erts/emulator/beam/dist.c b/erts/emulator/beam/dist.c index 64cd93a100..145e6861f6 100644 --- a/erts/emulator/beam/dist.c +++ b/erts/emulator/beam/dist.c @@ -1729,7 +1729,7 @@ dsig_send(ErtsDSigData *dsdp, Eterm ctl, Eterm msg, int force_busy) data_size += erts_encode_dist_ext_size(ctl, flags, acmp); if (is_value(msg)) data_size += erts_encode_dist_ext_size(msg, flags, acmp); - erts_finalize_atom_cache_map(acmp); + erts_finalize_atom_cache_map(acmp, flags); dhdr_ext_size = erts_encode_ext_dist_header_size(acmp); data_size += dhdr_ext_size; @@ -2073,7 +2073,8 @@ erts_dist_command(Port *prt, int reds_limit) ASSERT(ob); do { ob->extp = erts_encode_ext_dist_header_finalize(ob->extp, - dep->cache); + dep->cache, + flags); if (!(flags & DFLAG_DIST_HDR_ATOM_CACHE)) *--ob->extp = PASS_THROUGH; /* Old node; 'pass through' needed */ @@ -2117,7 +2118,8 @@ erts_dist_command(Port *prt, int reds_limit) Uint size; oq.first->extp = erts_encode_ext_dist_header_finalize(oq.first->extp, - dep->cache); + dep->cache, + flags); reds += ERTS_PORT_REDS_DIST_CMD_FINALIZE; if (!(flags & DFLAG_DIST_HDR_ATOM_CACHE)) *--oq.first->extp = PASS_THROUGH; /* Old node; 'pass through' diff --git a/erts/emulator/beam/dist.h b/erts/emulator/beam/dist.h index 2bc3d9c881..310d09768d 100644 --- a/erts/emulator/beam/dist.h +++ b/erts/emulator/beam/dist.h @@ -38,7 +38,8 @@ #define DFLAG_UNICODE_IO 0x1000 #define DFLAG_DIST_HDR_ATOM_CACHE 0x2000 #define DFLAG_SMALL_ATOM_TAGS 0x4000 -#define DFLAGS_INTERNAL_TAGS 0x8000 +#define DFLAG_INTERNAL_TAGS 0x8000 +#define DFLAG_UTF8_ATOMS 0x10000 /* All flags that should be enabled when term_to_binary/1 is used. */ #define TERM_TO_BINARY_DFLAGS (DFLAG_EXTENDED_REFERENCES \ diff --git a/erts/emulator/beam/erl_alloc.c b/erts/emulator/beam/erl_alloc.c index 5da8c69c03..007cdbdfa6 100644 --- a/erts/emulator/beam/erl_alloc.c +++ b/erts/emulator/beam/erl_alloc.c @@ -3062,13 +3062,13 @@ erts_request_alloc_info(struct process *c_p, Eterm alloc = CAR(consp); for (ai = ERTS_ALC_A_MIN; ai <= ERTS_ALC_A_MAX; ai++) - if (erts_is_atom_str((char *) erts_alc_a2ad[ai], alloc)) + if (erts_is_atom_str(erts_alc_a2ad[ai], alloc, 0)) goto save_alloc; - if (erts_is_atom_str("mseg_alloc", alloc)) { + if (erts_is_atom_str("mseg_alloc", alloc, 0)) { ai = ERTS_ALC_INFO_A_MSEG_ALLOC; goto save_alloc; } - if (erts_is_atom_str("alloc_util", alloc)) { + if (erts_is_atom_str("alloc_util", alloc, 0)) { ai = ERTS_ALC_INFO_A_ALLOC_UTIL; save_alloc: if (req_ai[ai]) diff --git a/erts/emulator/beam/erl_alloc.types b/erts/emulator/beam/erl_alloc.types index d4de0d076a..3d437652ce 100644 --- a/erts/emulator/beam/erl_alloc.types +++ b/erts/emulator/beam/erl_alloc.types @@ -49,6 +49,8 @@ # true after a "+enable X" statement or if it has been passed as a # command line argument to make_alloc_types. The variable X is false # after a "+disable X" statement or if it has never been mentioned. +# +# IMPORTANT! Only use 7-bit ascii text in this file! +if smp +disable threads_no_smp diff --git a/erts/emulator/beam/erl_alloc_util.c b/erts/emulator/beam/erl_alloc_util.c index 3d6761339b..6de0099636 100644 --- a/erts/emulator/beam/erl_alloc_util.c +++ b/erts/emulator/beam/erl_alloc_util.c @@ -2906,10 +2906,10 @@ make_name_atoms(Allctr_t *allctr) char alloc[] = "alloc"; char realloc[] = "realloc"; char free[] = "free"; - char buf[MAX_ATOM_LENGTH]; + char buf[MAX_ATOM_CHARACTERS]; size_t prefix_len = strlen(allctr->name_prefix); - if (prefix_len > MAX_ATOM_LENGTH + sizeof(realloc) - 1) + if (prefix_len > MAX_ATOM_CHARACTERS + sizeof(realloc) - 1) erl_exit(1,"Too long allocator name: %salloc\n",allctr->name_prefix); memcpy((void *) buf, (void *) allctr->name_prefix, prefix_len); diff --git a/erts/emulator/beam/erl_bif_ddll.c b/erts/emulator/beam/erl_bif_ddll.c index 7cbea55eac..889fefacfc 100644 --- a/erts/emulator/beam/erl_bif_ddll.c +++ b/erts/emulator/beam/erl_bif_ddll.c @@ -1835,7 +1835,10 @@ static Eterm build_load_error_hp(Eterm *hp, int code) static Eterm mkatom(char *str) { - return am_atom_put(str, sys_strlen(str)); + return erts_atom_put((byte *) str, + sys_strlen(str), + ERTS_ATOM_ENC_LATIN1, + 1); } static char *pick_list_or_atom(Eterm name_term) diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c index fabddffc68..b8026063e6 100755 --- a/erts/emulator/beam/erl_bif_info.c +++ b/erts/emulator/beam/erl_bif_info.c @@ -2298,8 +2298,10 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1) for (i = num_instructions-1; i >= 0; i--) { res = erts_bld_cons(hpp, hszp, erts_bld_tuple(hpp, hszp, 2, - am_atom_put(opc[i].name, - strlen(opc[i].name)), + erts_atom_put(opc[i].name, + strlen(opc[i].name), + ERTS_ATOM_ENC_LATIN1, + 1), erts_bld_uint(hpp, hszp, opc[i].count)), res); @@ -3873,7 +3875,7 @@ static Eterm lcnt_build_lock_stats_term(Eterm **hpp, Uint *szp, erts_lcnt_lock_s timer_ns = stats->timer.ns; timer_n = stats->timer_n; - af = am_atom_put(stats->file, strlen(stats->file)); + af = erts_atom_put(stats->file, strlen(stats->file), ERTS_ATOM_ENC_LATIN1, 1); uil = erts_bld_uint( hpp, szp, line); tloc = erts_bld_tuple(hpp, szp, 2, af, uil); @@ -3910,13 +3912,13 @@ static Eterm lcnt_build_lock_term(Eterm **hpp, Uint *szp, erts_lcnt_lock_t *lock ASSERT(ltype); - type = am_atom_put(ltype, strlen(ltype)); - name = am_atom_put(lock->name, strlen(lock->name)); + type = erts_atom_put(ltype, strlen(ltype), ERTS_ATOM_ENC_LATIN1, 1); + name = erts_atom_put(lock->name, strlen(lock->name), ERTS_ATOM_ENC_LATIN1, 1); if (lock->flag & ERTS_LCNT_LT_ALLOC) { /* use allocator types names as id's for allocator locks */ ltype = (char *) ERTS_ALC_A2AD(signed_val(lock->id)); - id = am_atom_put(ltype, strlen(ltype)); + id = erts_atom_put(ltype, strlen(ltype), ERTS_ATOM_ENC_LATIN1, 1); } else if (lock->flag & ERTS_LCNT_LT_PROCLOCK) { /* use registered names as id's for process locks if available */ proc = erts_proc_lookup(lock->id); @@ -3956,12 +3958,12 @@ static Eterm lcnt_build_result_term(Eterm **hpp, Uint *szp, erts_lcnt_data_t *da dtns = erts_bld_uint( hpp, szp, data->duration.ns); tdt = erts_bld_tuple(hpp, szp, 2, dts, dtns); - adur = am_atom_put(str_duration, strlen(str_duration)); + adur = erts_atom_put(str_duration, strlen(str_duration), ERTS_ATOM_ENC_LATIN1, 1); tdur = erts_bld_tuple(hpp, szp, 2, adur, tdt); /* lock tuple */ - aloc = am_atom_put(str_locks, strlen(str_locks)); + aloc = erts_atom_put(str_locks, strlen(str_locks), ERTS_ATOM_ENC_LATIN1, 1); for (lock = data->current_locks->head; lock != NULL ; lock = lock->next ) { lloc = lcnt_build_lock_term(hpp, szp, lock, lloc); @@ -4097,14 +4099,14 @@ BIF_RETTYPE erts_debug_lock_counters_1(BIF_ALIST_1) static void os_info_init(void) { - Eterm type = am_atom_put(os_type, strlen(os_type)); + Eterm type = erts_atom_put((byte *) os_type, strlen(os_type), ERTS_ATOM_ENC_LATIN1, 1); Eterm flav; int major, minor, build; char* buf = erts_alloc(ERTS_ALC_T_TMP, 1024); /* More than enough */ Eterm* hp; os_flavor(buf, 1024); - flav = am_atom_put(buf, strlen(buf)); + flav = erts_atom_put((byte *) buf, strlen(buf), ERTS_ATOM_ENC_LATIN1, 1); erts_free(ERTS_ALC_T_TMP, (void *) buf); hp = erts_alloc(ERTS_ALC_T_LL_TEMP_TERM, (3+4)*sizeof(Eterm)); os_type_tuple = TUPLE2(hp, type, flav); diff --git a/erts/emulator/beam/erl_bif_port.c b/erts/emulator/beam/erl_bif_port.c index 81146e38d7..a4b837541b 100644 --- a/erts/emulator/beam/erl_bif_port.c +++ b/erts/emulator/beam/erl_bif_port.c @@ -66,7 +66,7 @@ BIF_RETTYPE open_port_2(BIF_ALIST_2) } else { str = "einval"; } - BIF_P->fvalue = am_atom_put(str, strlen(str)); + BIF_P->fvalue = erts_atom_put((byte *) str, strlen(str), ERTS_ATOM_ENC_LATIN1, 1); BIF_ERROR(BIF_P, EXC_ERROR); } diff --git a/erts/emulator/beam/erl_db.c b/erts/emulator/beam/erl_db.c index 48a95cdf32..7932966539 100644 --- a/erts/emulator/beam/erl_db.c +++ b/erts/emulator/beam/erl_db.c @@ -3830,7 +3830,7 @@ erts_ets_colliding_names(Process* p, Eterm name, Uint cnt) while (index >= atom_table_size()) { char tmp[20]; erts_snprintf(tmp, sizeof(tmp), "am%x", atom_table_size()); - am_atom_put(tmp,strlen(tmp)); + erts_atom_put((byte *) tmp, strlen(tmp), ERTS_ATOM_ENC_LATIN1, 1); } list = CONS(hp, make_atom(index), list); hp += 2; diff --git a/erts/emulator/beam/erl_db_util.c b/erts/emulator/beam/erl_db_util.c index bb08762b26..58c4d75c31 100644 --- a/erts/emulator/beam/erl_db_util.c +++ b/erts/emulator/beam/erl_db_util.c @@ -4773,7 +4773,8 @@ static int match_compact(ErlHeapFragment *expr, DMCErrInfo *err_info) ASSERT(j < x); erts_snprintf(buff+1, sizeof(buff) - 1, "%u", (unsigned) j); /* Yes, writing directly into terms, they ARE off heap */ - *p = am_atom_put(buff, strlen(buff)); + *p = erts_atom_put((byte *) buff, strlen(buff), + ERTS_ATOM_ENC_LATIN1, 1); } ++p; } diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c index b518683730..61b3c09d16 100644 --- a/erts/emulator/beam/erl_init.c +++ b/erts/emulator/beam/erl_init.c @@ -358,7 +358,7 @@ erl_first_process_otp(char* modname, void* code, unsigned size, int argc, char** ErlSpawnOpts so; Eterm env; - start_mod = am_atom_put(modname, sys_strlen(modname)); + start_mod = erts_atom_put((byte *) modname, sys_strlen(modname), ERTS_ATOM_ENC_LATIN1, 1); if (erts_find_function(start_mod, am_start, 2, erts_active_code_ix()) == NULL) { erl_exit(5, "No function %s:start/2\n", modname); @@ -455,7 +455,7 @@ load_preloaded(void) i = 0; while ((name = preload_p[i].name) != NULL) { length = preload_p[i].size; - module_name = am_atom_put(name, sys_strlen(name)); + module_name = erts_atom_put((byte *) name, sys_strlen(name), ERTS_ATOM_ENC_LATIN1, 1); if ((code = sys_preload_begin(&preload_p[i])) == 0) erl_exit(1, "Failed to find preloaded code for module %s\n", name); diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c index 1bd2d933b2..fb295c9a8a 100644 --- a/erts/emulator/beam/erl_nif.c +++ b/erts/emulator/beam/erl_nif.c @@ -743,16 +743,23 @@ int enif_get_atom(ErlNifEnv* env, Eterm atom, char* buf, unsigned len, { Atom* ap; ASSERT(encoding == ERL_NIF_LATIN1); - if (is_not_atom(atom)) { + if (is_not_atom(atom) || len==0) { return 0; } ap = atom_tab(atom_val(atom)); - if (ap->len+1 > len) { + + if (ap->latin1_chars < 0 || ap->latin1_chars >= len) { return 0; } - sys_memcpy(buf, ap->name, ap->len); - buf[ap->len] = '\0'; - return ap->len + 1; + if (ap->latin1_chars == ap->len) { + sys_memcpy(buf, ap->name, ap->len); + } + else { + int dlen = erts_utf8_to_latin1((byte*)buf, ap->name, ap->len); + ASSERT(dlen == ap->latin1_chars); (void)dlen; + } + buf[ap->latin1_chars] = '\0'; + return ap->latin1_chars + 1; } int enif_get_int(ErlNifEnv* env, Eterm term, int* ip) @@ -854,7 +861,10 @@ int enif_get_atom_length(ErlNifEnv* env, Eterm atom, unsigned* len, ASSERT(enc == ERL_NIF_LATIN1); if (is_not_atom(atom)) return 0; ap = atom_tab(atom_val(atom)); - *len = ap->len; + if (ap->latin1_chars < 0) { + return 0; + } + *len = ap->latin1_chars; return 1; } @@ -961,7 +971,7 @@ ERL_NIF_TERM enif_make_atom(ErlNifEnv* env, const char* name) ERL_NIF_TERM enif_make_atom_len(ErlNifEnv* env, const char* name, size_t len) { - return am_atom_put(name, len); + return erts_atom_put((byte*)name, len, ERTS_ATOM_ENC_LATIN1, 1); } int enif_make_existing_atom(ErlNifEnv* env, const char* name, ERL_NIF_TERM* atom, @@ -974,7 +984,7 @@ int enif_make_existing_atom_len(ErlNifEnv* env, const char* name, size_t len, ERL_NIF_TERM* atom, ErlNifCharEncoding encoding) { ASSERT(encoding == ERL_NIF_LATIN1); - return erts_atom_get(name, len, atom); + return erts_atom_get(name, len, atom, 1); } ERL_NIF_TERM enif_make_tuple(ErlNifEnv* env, unsigned cnt, ...) @@ -1633,7 +1643,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2) "this vm variant (%s).", entry->vm_variant, ERL_NIF_VM_VARIANT); } - else if (!erts_is_atom_str((char*)entry->name, mod_atom)) { + else if (!erts_is_atom_str((char*)entry->name, mod_atom, 1)) { ret = load_nif_error(BIF_P, bad_lib, "Library module name '%s' does not" " match calling module '%T'", entry->name, mod_atom); } @@ -1643,7 +1653,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2) for (i=0; i < entry->num_of_funcs && ret==am_ok; i++) { BeamInstr** code_pp; ErlNifFunc* f = &entry->funcs[i]; - if (!erts_atom_get(f->name, sys_strlen(f->name), &f_atom) + if (!erts_atom_get(f->name, sys_strlen(f->name), &f_atom, 1) || (code_pp = get_func_pp(mod->curr.code, f_atom, f->arity))==NULL) { ret = load_nif_error(BIF_P,bad_lib,"Function not found %T:%s/%u", mod_atom, f->name, f->arity); @@ -1746,7 +1756,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2) for (i=0; i < entry->num_of_funcs; i++) { BeamInstr* code_ptr; - erts_atom_get(entry->funcs[i].name, sys_strlen(entry->funcs[i].name), &f_atom); + erts_atom_get(entry->funcs[i].name, sys_strlen(entry->funcs[i].name), &f_atom, 1); code_ptr = *get_func_pp(mod->curr.code, f_atom, entry->funcs[i].arity); if (code_ptr[1] == 0) { diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index 51559aea1c..883405d066 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -1154,15 +1154,24 @@ BIF_RETTYPE unicode_characters_to_list_2(BIF_ALIST_2) * When input to characters_to_list is a plain binary and the format is 'unicode', we do * a faster analyze and size count with this function. */ -int erts_analyze_utf8(byte *source, Uint size, - byte **err_pos, Uint *num_chars, int *left) +static ERTS_INLINE int +analyze_utf8(byte *source, Uint size, byte **err_pos, Uint *num_chars, int *left, + Sint *num_latin1_chars, Uint max_chars) { + Uint latin1_count; + int is_latin1; *err_pos = source; + if (num_latin1_chars) { + is_latin1 = 1; + latin1_count = 0; + } *num_chars = 0; while (size) { if (((*source) & ((byte) 0x80)) == 0) { source++; - --size; + --size; + if (num_latin1_chars) + latin1_count++; } else if (((*source) & ((byte) 0xE0)) == 0xC0) { if (size < 2) { return ERTS_UTF8_INCOMPLETE; @@ -1171,6 +1180,11 @@ int erts_analyze_utf8(byte *source, Uint size, ((*source) < 0xC2) /* overlong */) { return ERTS_UTF8_ERROR; } + if (num_latin1_chars) { + latin1_count++; + if ((source[0] & ((byte) 0xFC)) != ((byte) 0xC0)) + is_latin1 = 0; + } source += 2; size -= 2; } else if (((*source) & ((byte) 0xF0)) == 0xE0) { @@ -1188,6 +1202,8 @@ int erts_analyze_utf8(byte *source, Uint size, } source += 3; size -= 3; + if (num_latin1_chars) + is_latin1 = 0; } else if (((*source) & ((byte) 0xF8)) == 0xF0) { if (size < 4) { return ERTS_UTF8_INCOMPLETE; @@ -1205,21 +1221,40 @@ int erts_analyze_utf8(byte *source, Uint size, } source += 4; size -= 4; + if (num_latin1_chars) + is_latin1 = 0; } else { return ERTS_UTF8_ERROR; } ++(*num_chars); *err_pos = source; - if (left && --(*left) <= 0) { + if (max_chars && size > 0 && *num_chars == max_chars) + return ERTS_UTF8_OK_MAX_CHARS; + if (left && --(*left) <= 0 && size) { return ERTS_UTF8_ANALYZE_MORE; } } + if (num_latin1_chars) + *num_latin1_chars = is_latin1 ? latin1_count : -1; return ERTS_UTF8_OK; } +int erts_analyze_utf8(byte *source, Uint size, + byte **err_pos, Uint *num_chars, int *left) +{ + return analyze_utf8(source, size, err_pos, num_chars, left, NULL, 0); +} + +int erts_analyze_utf8_x(byte *source, Uint size, + byte **err_pos, Uint *num_chars, int *left, + Sint *num_latin1_chars, Uint max_chars) +{ + return analyze_utf8(source, size, err_pos, num_chars, left, num_latin1_chars, max_chars); +} + /* * No errors should be able to occur - no overlongs, no malformed, no nothing - */ + */ static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left, Uint *num_built, Uint *num_eaten, Eterm tail) @@ -1275,6 +1310,12 @@ static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, return ret; } +Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left, + Uint *num_built, Uint *num_eaten, Eterm tail) +{ + return do_utf8_to_list(p, num, bytes, sz, left, num_built, num_eaten, tail); +} + static int is_candidate(Uint cp) { int index,pos; @@ -1812,31 +1853,25 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2) ap = atom_tab(atom_val(BIF_ARG_1)); if (BIF_ARG_2 == am_latin1) { - BIF_RET(new_binary(BIF_P, ap->name, ap->len)); - } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) { - int bin_size = 0; - int i; Eterm bin_term; - byte* bin_p; - for (i = 0; i < ap->len; i++) { - bin_size += (ap->name[i] >= 0x80) ? 2 : 1; + if (ap->latin1_chars < 0) { + goto error; } - if (bin_size == ap->len) { - BIF_RET(new_binary(BIF_P, ap->name, ap->len)); + if (ap->latin1_chars == ap->len) { + bin_term = new_binary(BIF_P, ap->name, ap->len); } - bin_term = new_binary(BIF_P, 0, bin_size); - bin_p = binary_bytes(bin_term); - for (i = 0; i < ap->len; i++) { - byte b = ap->name[i]; - if (b < 0x80) { - *bin_p++ = b; - } else { - *bin_p++ = 0xC0 | (b >> 6); - *bin_p++ = 0x80 | (b & 0x3F); - } + else { + byte* bin_p; + int dbg_sz; + bin_term = new_binary(BIF_P, 0, ap->latin1_chars); + bin_p = binary_bytes(bin_term); + dbg_sz = erts_utf8_to_latin1(bin_p, ap->name, ap->len); + ASSERT(dbg_sz == ap->latin1_chars); (void)dbg_sz; } BIF_RET(bin_term); + } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) { + BIF_RET(new_binary(BIF_P, ap->name, ap->len)); } else { error: BIF_ERROR(BIF_P, BADARG); @@ -1844,118 +1879,78 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2) } static BIF_RETTYPE -binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist) +binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist) { byte* bytes; byte *temp_alloc = NULL; Uint bin_size; if ((bytes = erts_get_aligned_binary_bytes(bin, &temp_alloc)) == 0) { - BIF_ERROR(p, BADARG); + BIF_ERROR(proc, BADARG); } bin_size = binary_size(bin); if (enc == am_latin1) { Eterm a; - if (bin_size > MAX_ATOM_LENGTH) { + if (bin_size > MAX_ATOM_CHARACTERS) { system_limit: erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(p, SYSTEM_LIMIT); + BIF_ERROR(proc, SYSTEM_LIMIT); } if (!must_exist) { - a = am_atom_put((char *)bytes, bin_size); - erts_free_aligned_binary_bytes(temp_alloc); + a = erts_atom_put((byte *) bytes, + bin_size, + ERTS_ATOM_ENC_LATIN1, + 0); + erts_free_aligned_binary_bytes(temp_alloc); + if (is_non_value(a)) + goto badarg; BIF_RET(a); - } else if (erts_atom_get((char *)bytes, bin_size, &a)) { + } else if (erts_atom_get((char *)bytes, bin_size, &a, 1)) { erts_free_aligned_binary_bytes(temp_alloc); BIF_RET(a); } else { goto badarg; } } else if (enc == am_utf8 || enc == am_unicode) { - char *buf; - char *dst; - int i; - int num_chars; Eterm res; + Uint num_chars = 0; + const byte* p = bytes; + Uint left = bin_size; - if (bin_size > 2*MAX_ATOM_LENGTH) { - byte* err_pos; - Uint n; - int reds_left = bin_size+1; /* Number of reductions left. */ - - if (erts_analyze_utf8(bytes, bin_size, &err_pos, - &n, &reds_left) == ERTS_UTF8_OK) { - /* - * Correct UTF-8 encoding, but too many characters to - * fit in an atom. - */ + while (left) { + if (++num_chars > MAX_ATOM_CHARACTERS) { goto system_limit; - } else { - /* - * Something wrong in the UTF-8 encoding or Unicode code - * points > 255. - */ - goto badarg; } - } - - /* - * Allocate a temporary buffer the same size as the binary, - * so that we don't need an extra overflow test. - */ - buf = (char *) erts_alloc(ERTS_ALC_T_TMP, bin_size); - dst = buf; - for (i = 0; i < bin_size; i++) { - int c = bytes[i]; - if (c < 0x80) { - *dst++ = c; - } else if (i < bin_size-1) { - int c2; - if ((c & 0xE0) != 0xC0) { - goto free_badarg; - } - i++; - c = (c & 0x3F) << 6; - c2 = bytes[i]; - if ((c2 & 0xC0) != 0x80) { - goto free_badarg; - } - c = c | (c2 & 0x3F); - if (0x80 <= c && c < 256) { - *dst++ = c; - } else { - goto free_badarg; - } - } else { - free_badarg: - erts_free(ERTS_ALC_T_TMP, (void *) buf); - goto badarg; + if ((p[0] & 0x80) == 0) { + ++p; + --left; } + else if (left >= 2 + && (p[0] & 0xFE) == 0xC2 /* only allow latin1 subset */ + && (p[1] & 0xC0) == 0x80) { + p += 2; + left -= 2; + } + else goto badarg; } - num_chars = dst - buf; - if (num_chars > MAX_ATOM_LENGTH) { - erts_free(ERTS_ALC_T_TMP, (void *) buf); - goto system_limit; - } + if (!must_exist) { - res = am_atom_put(buf, num_chars); - erts_free(ERTS_ALC_T_TMP, (void *) buf); - erts_free_aligned_binary_bytes(temp_alloc); - BIF_RET(res); - } else { - int exists = erts_atom_get(buf, num_chars, &res); - erts_free(ERTS_ALC_T_TMP, (void *) buf); - if (exists) { - erts_free_aligned_binary_bytes(temp_alloc); - BIF_RET(res); - } else { - goto badarg; - } + res = erts_atom_put((byte *) bytes, + bin_size, + ERTS_ATOM_ENC_UTF8, + 0); + } + else if (!erts_atom_get((char*)bytes, bin_size, &res, 0)) { + goto badarg; } + erts_free_aligned_binary_bytes(temp_alloc); + if (is_non_value(res)) + goto badarg; + BIF_RET(res); } else { badarg: erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(p, BADARG); + BIF_ERROR(proc, BADARG); } } @@ -2670,3 +2665,28 @@ BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0) } } +int erts_utf8_to_latin1(byte* dest, const byte* source, int slen) +{ + /* + * Assumes source contains valid utf8 that can be encoded as latin1, + * and that dest has enough room. + */ + byte* dp = dest; + + while (slen > 0) { + if ((source[0] & 0x80) == 0) { + *dp++ = *source++; + --slen; + } + else { + ASSERT(slen > 1); + ASSERT((source[0] & 0xFE) == 0xC2); + ASSERT((source[1] & 0xC0) == 0x80); + *dp++ = (char) ((source[0] << 6) | (source[1] & 0x3F)); + source += 2; + slen -= 2; + } + } + return dp - dest; +} + diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c index ab1065aaa1..8c4d9108d4 100644 --- a/erts/emulator/beam/external.c +++ b/erts/emulator/beam/external.c @@ -142,6 +142,7 @@ erts_init_atom_cache_map(ErtsAtomCacheMap *acmp) { if (acmp) { int ix; + acmp->long_atoms = 0; for (ix = 0; ix < ERTS_ATOM_CACHE_SIZE; ix++) acmp->cache[ix].iix = -1; acmp->sz = 0; @@ -154,6 +155,7 @@ erts_reset_atom_cache_map(ErtsAtomCacheMap *acmp) { if (acmp) { int i; + acmp->long_atoms = 0; for (i = 0; i < acmp->sz; i++) { ASSERT(0 <= acmp->cix[i] && acmp->cix[i] < ERTS_ATOM_CACHE_SIZE); acmp->cache[acmp->cix[i]].iix = -1; @@ -175,9 +177,23 @@ erts_destroy_atom_cache_map(ErtsAtomCacheMap *acmp) } static ERTS_INLINE void -insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) +insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom, Uint32 dflags) { - if (acmp && acmp->sz < ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES) { + /* + * If the receiver do not understand utf8 atoms + * and this atom cannot be represented in latin1, + * we are not allowed to cache it. + * + * In this case all atoms are assumed to have + * latin1 encoding in the cache. By refusing it + * in the cache we will instead encode it using + * ATOM_UTF8_EXT/SMALL_ATOM_UTF8_EXT which the + * receiver do not recognize and tear down the + * connection. + */ + if (acmp && acmp->sz < ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES + && ((dflags & DFLAG_UTF8_ATOMS) + || atom_tab(atom_val(atom))->latin1_chars >= 0)) { int ix; ASSERT(acmp->hdr_sz < 0); ix = atom2cix(atom); @@ -190,7 +206,7 @@ insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) } static ERTS_INLINE int -get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) +get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom, Uint32 dflags) { if (!acmp) return -1; @@ -199,7 +215,9 @@ get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) ASSERT(is_atom(atom)); ix = atom2cix(atom); if (acmp->cache[ix].iix < 0) { - ASSERT(acmp->sz == ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES); + ASSERT(acmp->sz == ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES + || (!(dflags & DFLAG_UTF8_ATOMS) + && atom_tab(atom_val(atom))->latin1_chars < 0)); return -1; } else { @@ -210,18 +228,17 @@ get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) } void -erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp) +erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp, Uint32 dflags) { if (acmp) { -#if MAX_ATOM_LENGTH > 255 -#error "This code is not complete; long_atoms info need to be passed to the following stages." - int long_atoms = 0; /* !0 if one or more atoms are long than 255. */ -#endif + int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS); + int long_atoms = 0; /* !0 if one or more atoms are longer than 255. */ int i; int sz; int fix_sz = 1 /* VERSION_MAGIC */ + 1 /* DIST_HEADER */ + + 1 /* dist header flags */ + 1 /* number of internal cache entries */ ; int min_sz; @@ -230,22 +247,23 @@ erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp) min_sz = fix_sz+(2+4)*acmp->sz; sz = fix_sz; for (i = 0; i < acmp->sz; i++) { + Atom *a; Eterm atom; int len; atom = acmp->cache[acmp->cix[i]].atom; ASSERT(is_atom(atom)); - len = atom_tab(atom_val(atom))->len; -#if MAX_ATOM_LENGTH > 255 + a = atom_tab(atom_val(atom)); + len = (int) (utf8_atoms ? a->len : a->latin1_chars); + ASSERT(len >= 0); if (!long_atoms && len > 255) long_atoms = 1; -#endif /* Enough for a new atom cache value */ sz += 1 /* cix */ + 1 /* length */ + len /* text */; } -#if MAX_ATOM_LENGTH > 255 - if (long_atoms) + if (long_atoms) { + acmp->long_atoms = 1; sz += acmp->sz; /* we need 2 bytes per atom for length */ -#endif + } /* Dynamically sized flag field */ sz += ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTES(acmp->sz); if (sz < min_sz) @@ -274,6 +292,7 @@ byte *erts_encode_ext_dist_header_setup(byte *ctl_ext, ErtsAtomCacheMap *acmp) else { int i; byte *ep = ctl_ext; + byte dist_hdr_flags = acmp->long_atoms ? ERTS_DIST_HDR_LONG_ATOMS_FLG : 0; ASSERT(acmp->hdr_sz >= 0); /* * Write cache update instructions. Note that this is a purely @@ -296,28 +315,36 @@ byte *erts_encode_ext_dist_header_setup(byte *ctl_ext, ErtsAtomCacheMap *acmp) } --ep; put_int8(acmp->sz, ep); + --ep; + put_int8(dist_hdr_flags, ep); *--ep = DIST_HEADER; *--ep = VERSION_MAGIC; return ep; } } -byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache) +byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache, Uint32 dflags) { byte *ip; byte instr_buf[(2+4)*ERTS_ATOM_CACHE_SIZE]; int ci, sz; + byte dist_hdr_flags; + int long_atoms; + int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS); register byte *ep = ext; ASSERT(ep[0] == VERSION_MAGIC); if (ep[1] != DIST_HEADER) return ext; + dist_hdr_flags = ep[2]; + long_atoms = ERTS_DIST_HDR_LONG_ATOMS_FLG & ((int) dist_hdr_flags); + /* * Update output atom cache and write the external version of * the dist header. We write the header backwards just * before the actual term(s). */ - ep += 2; + ep += 3; ci = (int) get_int8(ep); ASSERT(0 <= ci && ci < ERTS_ATOM_CACHE_SIZE); ep += 1; @@ -342,12 +369,7 @@ byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache) flgs_bytes = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTES(ci); ASSERT(flgs_bytes <= sizeof(flgs_buf)); -#if MAX_ATOM_LENGTH > 255 - /* long_atoms info needs to be passed from previous stages */ - if (long_atoms) - flgs |= ERTS_DIST_HDR_LONG_ATOMS_FLG; -#endif - flgs = 0; + flgs = (Uint32) dist_hdr_flags; flgs_buf_ix = 0; if ((ci & 1) == 0) used_half_bytes = 2; @@ -382,17 +404,22 @@ byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache) Atom *a; cache->out_arr[cix] = atom; a = atom_tab(atom_val(atom)); - sz = a->len; - ep -= sz; - sys_memcpy((void *) ep, (void *) a->name, sz); -#if MAX_ATOM_LENGTH > 255 + if (utf8_atoms) { + sz = a->len; + ep -= sz; + sys_memcpy((void *) ep, (void *) a->name, sz); + } + else { + ASSERT(0 <= a->latin1_chars && a->latin1_chars <= MAX_ATOM_CHARACTERS); + ep -= a->latin1_chars; + sz = erts_utf8_to_latin1(ep, a->name, a->len); + ASSERT(a->latin1_chars == sz); + } if (long_atoms) { ep -= 2; put_int16(sz, ep); } - else -#endif - { + else { ASSERT(0 <= sz && sz <= 255); --ep; put_int8(sz, ep); @@ -467,7 +494,7 @@ Uint erts_encode_ext_size_2(Eterm term, unsigned dflags) Uint erts_encode_ext_size_ets(Eterm term) { - return encode_size_struct2(NULL, term, TERM_TO_BINARY_DFLAGS|DFLAGS_INTERNAL_TAGS); + return encode_size_struct2(NULL, term, TERM_TO_BINARY_DFLAGS|DFLAG_INTERNAL_TAGS); } @@ -500,7 +527,7 @@ void erts_encode_ext(Eterm term, byte **ext) byte* erts_encode_ext_ets(Eterm term, byte *ep, struct erl_off_heap_header** off_heap) { - return enc_term(NULL, term, ep, TERM_TO_BINARY_DFLAGS|DFLAGS_INTERNAL_TAGS, + return enc_term(NULL, term, ep, TERM_TO_BINARY_DFLAGS|DFLAG_INTERNAL_TAGS, off_heap); } @@ -553,6 +580,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, #endif register byte *ep = ext; + int utf8_atoms = (int) (dep->flags & DFLAG_UTF8_ATOMS); edep->heap_size = -1; edep->ext_endp = ext+size; @@ -611,9 +639,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, ERTS_EXT_HDR_FAIL; ep++; if (no_atoms) { -#if MAX_ATOM_LENGTH > 255 int long_atoms = 0; -#endif #ifdef DEBUG byte *flgs_buf = ep; #endif @@ -632,14 +658,8 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, */ byte_ix = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTE_IX(no_atoms); bit_ix = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BIT_IX(no_atoms); - if (flgsp[byte_ix] & (((byte) ERTS_DIST_HDR_LONG_ATOMS_FLG) - << bit_ix)) { -#if MAX_ATOM_LENGTH > 255 + if (flgsp[byte_ix] & (((byte) ERTS_DIST_HDR_LONG_ATOMS_FLG) << bit_ix)) long_atoms = 1; -#else - ERTS_EXT_HDR_FAIL; /* Long atoms not supported yet */ -#endif - } #ifdef DEBUG byte_ix = 0; @@ -707,23 +727,25 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, if (cix >= ERTS_ATOM_CACHE_SIZE) ERTS_EXT_HDR_FAIL; ep++; -#if MAX_ATOM_LENGTH > 255 if (long_atoms) { CHKSIZE(2); len = get_int16(ep); ep += 2; } - else -#endif - { + else { CHKSIZE(1); len = get_int8(ep); ep++; } - if (len > MAX_ATOM_LENGTH) - ERTS_EXT_HDR_FAIL; /* Too long atom */ CHKSIZE(len); - atom = am_atom_put((char *) ep, len); + atom = erts_atom_put((byte *) ep, + len, + (utf8_atoms + ? ERTS_ATOM_ENC_UTF8 + : ERTS_ATOM_ENC_LATIN1), + 0); + if (is_non_value(atom)) + ERTS_EXT_HDR_FAIL; ep += len; cache->in_arr[cix] = atom; edep->attab.atom[tix] = atom; @@ -1404,11 +1426,12 @@ static byte* enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags) { int iix; - int i, j; + int len; + int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS); ASSERT(is_atom(atom)); - if (dflags & DFLAGS_INTERNAL_TAGS) { + if (dflags & DFLAG_INTERNAL_TAGS) { Uint aval = atom_val(atom); ASSERT(aval < (1<<24)); if (aval >= (1 << 16)) { @@ -1423,27 +1446,46 @@ enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags) } return ep; } + /* * term_to_binary/1,2 and the initial distribution message * don't use the cache. */ - iix = get_iix_acache_map(acmp, atom); - if (iix < 0) { - i = atom_val(atom); - j = atom_tab(i)->len; - if ((MAX_ATOM_LENGTH <= 255 || j <= 255) - && (dflags & DFLAG_SMALL_ATOM_TAGS)) { - *ep++ = SMALL_ATOM_EXT; - put_int8(j, ep); - ep++; + + iix = get_iix_acache_map(acmp, atom, dflags); + if (iix < 0) { + Atom *a = atom_tab(atom_val(atom)); + if (utf8_atoms || a->latin1_chars < 0) { + len = a->len; + if (len > 255) { + *ep++ = ATOM_UTF8_EXT; + put_int16(len, ep); + ep += 2; + } + else { + *ep++ = SMALL_ATOM_UTF8_EXT; + put_int8(len, ep); + ep += 1; + } + sys_memcpy((char *) ep, (char *) a->name, len); } else { - *ep++ = ATOM_EXT; - put_int16(j, ep); - ep += 2; + if (a->latin1_chars <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) { + *ep++ = SMALL_ATOM_EXT; + len = erts_utf8_to_latin1(ep+1, a->name, a->len); + ASSERT(len == a->latin1_chars); + put_int8(len, ep); + ep++; + } + else { + *ep++ = ATOM_EXT; + len = erts_utf8_to_latin1(ep+2, a->name, a->len); + ASSERT(len == a->latin1_chars); + put_int16(len, ep); + ep += 2; + } } - sys_memcpy((char *) ep, (char*)atom_tab(i)->name, (int) j); - ep += j; + ep += len; return ep; } @@ -1472,7 +1514,7 @@ enc_pid(ErtsAtomCacheMap *acmp, Eterm pid, byte* ep, Uint32 dflags) ep += 4; put_int32(os, ep); ep += 4; - *ep++ = (is_internal_pid(pid) && (dflags & DFLAGS_INTERNAL_TAGS)) ? + *ep++ = (is_internal_pid(pid) && (dflags & DFLAG_INTERNAL_TAGS)) ? INTERNAL_CREATION : pid_creation(pid); return ep; } @@ -1482,7 +1524,7 @@ static byte* dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp) { Uint len; - int n; + int n, is_latin1; switch (*ep++) { case ATOM_CACHE_REF: @@ -1498,17 +1540,37 @@ dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp) case ATOM_EXT: len = get_int16(ep), ep += 2; + is_latin1 = 1; goto dec_atom_common; case SMALL_ATOM_EXT: len = get_int8(ep); ep++; + is_latin1 = 1; + goto dec_atom_common; + case ATOM_UTF8_EXT: + len = get_int16(ep), + ep += 2; + is_latin1 = 0; + goto dec_atom_common; + case SMALL_ATOM_UTF8_EXT: + len = get_int8(ep), + ep++; + is_latin1 = 0; dec_atom_common: if (edep && (edep->flags & ERTS_DIST_EXT_BTT_SAFE)) { - if (!erts_atom_get((char*)ep, len, objp)) { + if (!erts_atom_get((char*)ep, len, objp, is_latin1)) { goto error; } } else { - *objp = am_atom_put((char*)ep, len); + Eterm atom = erts_atom_put(ep, + len, + (is_latin1 + ? ERTS_ATOM_ENC_LATIN1 + : ERTS_ATOM_ENC_UTF8), + 0); + if (is_non_value(atom)) + goto error; + *objp = atom; } ep += len; break; @@ -1770,7 +1832,7 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags, put_int16(i, ep); ep += 2; ep = enc_atom(acmp,ref_node_name(obj),ep,dflags); - *ep++ = ((dflags & DFLAGS_INTERNAL_TAGS) && is_internal_ref(obj)) ? + *ep++ = ((dflags & DFLAG_INTERNAL_TAGS) && is_internal_ref(obj)) ? INTERNAL_CREATION : ref_creation(obj); ref_num = ref_numbers(obj); for (j = 0; j < i; j++) { @@ -1787,7 +1849,7 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags, j = port_number(obj); put_int32(j, ep); ep += 4; - *ep++ = ((dflags & DFLAGS_INTERNAL_TAGS) && is_internal_port(obj)) ? + *ep++ = ((dflags & DFLAG_INTERNAL_TAGS) && is_internal_port(obj)) ? INTERNAL_CREATION : port_creation(obj); break; @@ -1868,7 +1930,7 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags, byte* bytes; ERTS_GET_BINARY_BYTES(obj, bytes, bitoffs, bitsize); - if (dflags & DFLAGS_INTERNAL_TAGS) { + if (dflags & DFLAG_INTERNAL_TAGS) { ProcBin* pb = (ProcBin*) binary_val(obj); Uint bytesize = pb->size; if (pb->thing_word == HEADER_SUB_BIN) { @@ -2113,7 +2175,7 @@ static byte* dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Eterm* objp) { Eterm* hp_saved = *hpp; - int n; + int n, is_latin1; register Eterm* hp = *hpp; /* Please don't take the address of hp */ Eterm* next = objp; @@ -2199,17 +2261,37 @@ dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Et case ATOM_EXT: n = get_int16(ep); ep += 2; - goto dec_term_atom_common; + is_latin1 = 1; + goto dec_term_atom_common; case SMALL_ATOM_EXT: n = get_int8(ep); ep++; + is_latin1 = 1; + goto dec_term_atom_common; + case ATOM_UTF8_EXT: + n = get_int16(ep); + ep += 2; + is_latin1 = 0; + goto dec_term_atom_common; + case SMALL_ATOM_UTF8_EXT: + n = get_int8(ep); + ep++; + is_latin1 = 0; dec_term_atom_common: if (edep && (edep->flags & ERTS_DIST_EXT_BTT_SAFE)) { - if (!erts_atom_get((char*)ep, n, objp)) { + if (!erts_atom_get((char*)ep, n, objp, is_latin1)) { goto error; } } else { - *objp = am_atom_put((char*)ep, n); + Eterm atom = erts_atom_put(ep, + n, + (is_latin1 + ? ERTS_ATOM_ENC_LATIN1 + : ERTS_ATOM_ENC_UTF8), + 0); + if (is_non_value(atom)) + goto error; + *objp = atom; } ep += n; break; @@ -2869,7 +2951,7 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) result++; break; case ATOM_DEF: - if (dflags & DFLAGS_INTERNAL_TAGS) { + if (dflags & DFLAG_INTERNAL_TAGS) { if (atom_val(obj) >= (1<<16)) { result += 1 + 3; } @@ -2878,17 +2960,22 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) } } else { - int alen = atom_tab(atom_val(obj))->len; - if ((MAX_ATOM_LENGTH <= 255 || alen <= 255) - && (dflags & DFLAG_SMALL_ATOM_TAGS)) { - /* Make sure a SMALL_ATOM_EXT fits: SMALL_ATOM_EXT l t1 t2... */ - result += 1 + 1 + alen; + Atom *a = atom_tab(atom_val(obj)); + int alen; + if ((dflags & DFLAG_UTF8_ATOMS) || a->latin1_chars < 0) { + alen = a->len; + result += 1 + 1 + alen; + if (alen > 255) { + result++; /* ATOM_UTF8_EXT (not small) */ + } } else { - /* Make sure an ATOM_EXT fits: ATOM_EXT l1 l0 t1 t2... */ - result += 1 + 2 + alen; + alen = a->latin1_chars; + result += 1 + 1 + alen; + if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS)) + result++; /* ATOM_EXT (not small) */ } - insert_acache_map(acmp, obj); + insert_acache_map(acmp, obj, dflags); } break; case SMALL_DEF: @@ -2969,7 +3056,7 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) } break; case BINARY_DEF: - if (dflags & DFLAGS_INTERNAL_TAGS) { + if (dflags & DFLAG_INTERNAL_TAGS) { ProcBin* pb = (ProcBin*) binary_val(obj); Uint sub_extra = 0; Uint tot_bytes = pb->size; @@ -3058,6 +3145,17 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) return result; } +static int is_valid_utf8_atom(byte* bytes, Uint nbytes) +{ + byte* err_pos; + Uint num_chars; + + /*SVERK Do we really need to validate correct utf8? */ + return nbytes <= MAX_ATOM_SZ_LIMIT + && erts_analyze_utf8(bytes, nbytes, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK + && num_chars <= MAX_ATOM_CHARACTERS; +} + static Sint decoded_size(byte *ep, byte* endp, int internal_tags) { @@ -3125,21 +3223,41 @@ decoded_size(byte *ep, byte* endp, int internal_tags) case ATOM_EXT: CHKSIZE(2); n = get_int16(ep); - if (n > MAX_ATOM_LENGTH) { + if (n > MAX_ATOM_CHARACTERS) { return -1; } SKIP(n+2+atom_extra_skip); atom_extra_skip = 0; break; + case ATOM_UTF8_EXT: + CHKSIZE(2); + n = get_int16(ep); + ep += 2; + if (!is_valid_utf8_atom(ep, n)) { + return -1; + } + SKIP(n+atom_extra_skip); + atom_extra_skip = 0; + break; case SMALL_ATOM_EXT: CHKSIZE(1); n = get_int8(ep); - if (n > MAX_ATOM_LENGTH) { + if (n > MAX_ATOM_CHARACTERS) { return -1; } SKIP(n+1+atom_extra_skip); atom_extra_skip = 0; break; + case SMALL_ATOM_UTF8_EXT: + CHKSIZE(1); + n = get_int8(ep); + ep++; + if (!is_valid_utf8_atom(ep, n)) { + return -1; + } + SKIP(n+atom_extra_skip); + atom_extra_skip = 0; + break; case ATOM_CACHE_REF: SKIP(1+atom_extra_skip); atom_extra_skip = 0; diff --git a/erts/emulator/beam/external.h b/erts/emulator/beam/external.h index eddd4571dd..ad430117c8 100644 --- a/erts/emulator/beam/external.h +++ b/erts/emulator/beam/external.h @@ -51,6 +51,8 @@ #define NEW_FUN_EXT 'p' #define EXPORT_EXT 'q' #define FUN_EXT 'u' +#define ATOM_UTF8_EXT 'v' +#define SMALL_ATOM_UTF8_EXT 'w' #define DIST_HEADER 'D' #define ATOM_CACHE_REF 'R' @@ -90,6 +92,7 @@ typedef struct cache { typedef struct { int hdr_sz; int sz; + int long_atoms; int cix[ERTS_ATOM_CACHE_SIZE]; struct { Eterm atom; @@ -150,12 +153,12 @@ typedef struct { void erts_init_atom_cache_map(ErtsAtomCacheMap *); void erts_reset_atom_cache_map(ErtsAtomCacheMap *); void erts_destroy_atom_cache_map(ErtsAtomCacheMap *); -void erts_finalize_atom_cache_map(ErtsAtomCacheMap *); +void erts_finalize_atom_cache_map(ErtsAtomCacheMap *, Uint32); Uint erts_encode_ext_dist_header_size(ErtsAtomCacheMap *); Uint erts_encode_ext_dist_header_size(ErtsAtomCacheMap *); byte *erts_encode_ext_dist_header_setup(byte *, ErtsAtomCacheMap *); -byte *erts_encode_ext_dist_header_finalize(byte *, ErtsAtomCache *); +byte *erts_encode_ext_dist_header_finalize(byte *, ErtsAtomCache *, Uint32); Uint erts_encode_dist_ext_size(Eterm, Uint32, ErtsAtomCacheMap *); void erts_encode_dist_ext(Eterm, byte **, Uint32, ErtsAtomCacheMap *); diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h index 298241618f..41c2a0f2b9 100755 --- a/erts/emulator/beam/global.h +++ b/erts/emulator/beam/global.h @@ -786,16 +786,23 @@ Sint erts_native_filename_need(Eterm ioterm, int encoding); void erts_copy_utf8_to_utf16_little(byte *target, byte *bytes, int num_chars); int erts_analyze_utf8(byte *source, Uint size, byte **err_pos, Uint *num_chars, int *left); +int erts_analyze_utf8_x(byte *source, Uint size, + byte **err_pos, Uint *num_chars, int *left, + Sint *num_latin1_chars, Uint max_chars); char *erts_convert_filename_to_native(Eterm name, char *statbuf, size_t statbuf_size, ErtsAlcType_t alloc_type, int allow_empty, int allow_atom, Sint *used /* out */); Eterm erts_convert_native_to_filename(Process *p, byte *bytes); +Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left, + Uint *num_built, Uint *num_eaten, Eterm tail); +int erts_utf8_to_latin1(byte* dest, const byte* source, int slen); #define ERTS_UTF8_OK 0 #define ERTS_UTF8_INCOMPLETE 1 #define ERTS_UTF8_ERROR 2 #define ERTS_UTF8_ANALYZE_MORE 3 +#define ERTS_UTF8_OK_MAX_CHARS 4 void bin_write(int, void*, byte*, size_t); int intlist_to_buf(Eterm, char*, int); /* most callers pass plain char*'s */ diff --git a/erts/emulator/beam/io.c b/erts/emulator/beam/io.c index 04dc9bb236..536a3cc819 100644 --- a/erts/emulator/beam/io.c +++ b/erts/emulator/beam/io.c @@ -663,8 +663,11 @@ erts_open_driver(erts_driver_t* driver, /* Pointer to driver. */ if (IS_TRACED_FL(port, F_TRACE_PORTS)) { trace_port_open(port, - pid, - am_atom_put(port->name, strlen(port->name))); + pid, + erts_atom_put((byte *) port->name, + strlen(port->name), + ERTS_ATOM_ENC_LATIN1, + 1)); } error_number = error_type = 0; @@ -2677,8 +2680,11 @@ static ERTS_INLINE void lcnt_enable_drv_lock_count(erts_driver_t *dp, int enable erts_lcnt_init_lock_x(&dp->lock->lcnt, "driver_lock", ERTS_LCNT_LT_MUTEX, - am_atom_put(dp->name, - sys_strlen(dp->name))); + erts_atom_put((byte*)dp->name, + sys_strlen(dp->name), + ERTS_ATOM_ENC_LATIN1, + 1)); + else erts_lcnt_destroy_lock(&dp->lock->lcnt); @@ -7043,7 +7049,8 @@ int driver_exit(ErlDrvPort ix, int err) return driver_failure_term(ix, am_normal, 0); else { char* err_str = erl_errno_id(err); - Eterm am_err = am_atom_put(err_str, sys_strlen(err_str)); + Eterm am_err = erts_atom_put((byte *) err_str, sys_strlen(err_str), + ERTS_ATOM_ENC_LATIN1, 1); return driver_failure_term(ix, am_err, 0); } } @@ -7056,8 +7063,12 @@ int driver_failure(ErlDrvPort ix, int code) int driver_failure_atom(ErlDrvPort ix, char* string) { - Eterm am = am_atom_put(string, strlen(string)); - return driver_failure_term(ix, am, 0); + return driver_failure_term(ix, + erts_atom_put((byte *) string, + strlen(string), + ERTS_ATOM_ENC_LATIN1, + 1), + 0); } int driver_failure_posix(ErlDrvPort ix, int err) @@ -7074,7 +7085,10 @@ int driver_failure_eof(ErlDrvPort ix) ErlDrvTermData driver_mk_atom(char* string) { - Eterm am = am_atom_put(string, sys_strlen(string)); + Eterm am = erts_atom_put((byte *) string, + sys_strlen(string), + ERTS_ATOM_ENC_LATIN1, + 1); ERTS_SMP_CHK_NO_PROC_LOCKS; return (ErlDrvTermData) am; } @@ -7369,7 +7383,10 @@ init_driver(erts_driver_t *drv, ErlDrvEntry *de, DE_Handle *handle) erts_mtx_init_x(drv->lock, "driver_lock", #if defined(ERTS_ENABLE_LOCK_CHECK) || defined(ERTS_ENABLE_LOCK_COUNT) - am_atom_put(drv->name, sys_strlen(drv->name)) + erts_atom_put((byte *) drv->name, + sys_strlen(drv->name), + ERTS_ATOM_ENC_LATIN1, + 1) #else NIL #endif diff --git a/erts/emulator/beam/utils.c b/erts/emulator/beam/utils.c index 5261effef9..2a6a8efd4d 100644 --- a/erts/emulator/beam/utils.c +++ b/erts/emulator/beam/utils.c @@ -371,7 +371,7 @@ Eterm erts_bld_atom(Uint **hpp, Uint *szp, char *str) { if (hpp) - return am_atom_put(str, sys_strlen(str)); + return erts_atom_put((byte *) str, sys_strlen(str), ERTS_ATOM_ENC_LATIN1, 1); else return THE_NON_VALUE; } diff --git a/erts/emulator/test/bif_SUITE.erl b/erts/emulator/test/bif_SUITE.erl index e2442861c7..02c6de8cb1 100644 --- a/erts/emulator/test/bif_SUITE.erl +++ b/erts/emulator/test/bif_SUITE.erl @@ -481,8 +481,6 @@ binary_to_atom(Config) when is_list(Config) -> %% Bad UTF8 sequences. ?line ?BADARG(binary_to_atom(id(<<255>>), utf8)), ?line ?BADARG(binary_to_atom(id(<<255,0>>), utf8)), - ?line ?BADARG(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)), - ?line ?BADARG(binary_to_atom(id(<<0:512/unit:8,255,0>>), utf8)), ?line ?BADARG(binary_to_atom(id(<<16#C0,16#80>>), utf8)), %Overlong 0. ?line [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(256, 16#D7FF)], @@ -494,6 +492,8 @@ binary_to_atom(Config) when is_list(Config) -> C <- lists:seq(16#90000, 16#10FFFF)], %% system_limit failures. + ?line ?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)), + ?line ?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255,0>>), utf8)), ?line ?SYS_LIMIT(binary_to_atom(<<0:256/unit:8>>, latin1)), ?line ?SYS_LIMIT(binary_to_atom(<<0:257/unit:8>>, latin1)), ?line ?SYS_LIMIT(binary_to_atom(<<0:512/unit:8>>, latin1)), diff --git a/erts/emulator/test/distribution_SUITE.erl b/erts/emulator/test/distribution_SUITE.erl index f3a177faf2..101007c288 100644 --- a/erts/emulator/test/distribution_SUITE.erl +++ b/erts/emulator/test/distribution_SUITE.erl @@ -18,7 +18,17 @@ %% -module(distribution_SUITE). --compile(r13). +-compile(r15). + +-define(VERSION_MAGIC, 131). + +-define(ATOM_EXT, 100). +-define(REFERENCE_EXT, 101). +-define(PORT_EXT, 102). +-define(PID_EXT, 103). +-define(NEW_REFERENCE_EXT, 114). +-define(ATOM_UTF8_EXT, 118). +-define(SMALL_ATOM_UTF8_EXT, 119). %% Tests distribution and the tcp driver. @@ -37,8 +47,10 @@ dist_auto_connect_never/1, dist_auto_connect_once/1, dist_parallel_send/1, atom_roundtrip/1, - atom_roundtrip_r13b/1, + unicode_atom_roundtrip/1, + atom_roundtrip_r15b/1, contended_atom_cache_entry/1, + contended_unicode_atom_cache_entry/1, bad_dist_structure/1, bad_dist_ext_receive/1, bad_dist_ext_process_info/1, @@ -62,8 +74,9 @@ all() -> link_to_dead_new_node, applied_monitor_node, ref_port_roundtrip, nil_roundtrip, stop_dist, {group, trap_bif}, {group, dist_auto_connect}, - dist_parallel_send, atom_roundtrip, atom_roundtrip_r13b, - contended_atom_cache_entry, bad_dist_structure, {group, bad_dist_ext}]. + dist_parallel_send, atom_roundtrip, unicode_atom_roundtrip, atom_roundtrip_r15b, + contended_atom_cache_entry, contended_unicode_atom_cache_entry, + bad_dist_structure, {group, bad_dist_ext}]. groups() -> [{bulk_send, [], [bulk_send_small, bulk_send_big, bulk_send_bigbig]}, @@ -1085,19 +1098,27 @@ atom_roundtrip(Config) when is_list(Config) -> ?line stop_node(Node), ?line ok. -atom_roundtrip_r13b(Config) when is_list(Config) -> - case ?t:is_release_available("r13b") of +atom_roundtrip_r15b(Config) when is_list(Config) -> + case ?t:is_release_available("r15b") of true -> ?line AtomData = atom_data(), ?line verify_atom_data(AtomData), - ?line {ok, Node} = start_node(Config, [], "r13b"), + ?line {ok, Node} = start_node(Config, [], "r15b"), ?line do_atom_roundtrip(Node, AtomData), ?line stop_node(Node), ?line ok; false -> - ?line {skip,"No OTP R13B available"} + ?line {skip,"No OTP R15B available"} end. +unicode_atom_roundtrip(Config) when is_list(Config) -> + ?line AtomData = unicode_atom_data(), + ?line verify_atom_data(AtomData), + ?line {ok, Node} = start_node(Config), + ?line do_atom_roundtrip(Node, AtomData), + ?line stop_node(Node), + ?line ok. + do_atom_roundtrip(Node, AtomData) -> ?line Parent = self(), ?line Proc = spawn_link(Node, fun () -> verify_atom_data_loop(Parent) end), @@ -1128,12 +1149,76 @@ atom_data() -> lists:seq(1, 2000)). verify_atom_data(AtomData) -> - lists:foreach(fun ({Atom, AtomTxt}) -> - AtomTxt = atom_to_list(Atom) + lists:foreach(fun ({Atom, AtomTxt}) when is_atom(Atom) -> + AtomTxt = atom_to_list(Atom); + ({PPR, AtomTxt}) -> + % Pid, Port, or Ref + AtomTxt = atom_to_list(node(PPR)) end, AtomData). +uc_atom_tup(ATxt) -> + Atom = string_to_atom(ATxt), + ATxt = atom_to_list(Atom), + {Atom, ATxt}. + +uc_pid_tup(ATxt) -> + ATxtExt = string_to_atom_ext(ATxt), + Pid = mk_pid({ATxtExt, 1}, 4711,17), + true = is_pid(Pid), + Atom = node(Pid), + true = is_atom(Atom), + ATxt = atom_to_list(Atom), + {Pid, ATxt}. + +uc_port_tup(ATxt) -> + ATxtExt = string_to_atom_ext(ATxt), + Port = mk_port({ATxtExt, 2}, 4711), + true = is_port(Port), + Atom = node(Port), + true = is_atom(Atom), + ATxt = atom_to_list(Atom), + {Port, ATxt}. + +uc_ref_tup(ATxt) -> + ATxtExt = string_to_atom_ext(ATxt), + Ref = mk_ref({ATxtExt, 3}, [4711,17, 4711]), + true = is_reference(Ref), + Atom = node(Ref), + true = is_atom(Atom), + ATxt = atom_to_list(Atom), + {Ref, ATxt}. + + +unicode_atom_data() -> + [uc_pid_tup(lists:seq(16#1f600, 16#1f600+249) ++ "@host"), + uc_pid_tup(lists:seq(16#1f600, 16#1f600+30) ++ "@host"), + uc_port_tup(lists:seq(16#1f600, 16#1f600+249) ++ "@host"), + uc_port_tup(lists:seq(16#1f600, 16#1f600+30) ++ "@host"), + uc_ref_tup(lists:seq(16#1f600, 16#1f600+249) ++ "@host"), + uc_ref_tup(lists:seq(16#1f600, 16#1f600+30) ++ "@host"), + uc_atom_tup(lists:seq(16#1f600, 16#1f600+254)), + uc_atom_tup(lists:seq(16#1f600, 16#1f600+63)), + uc_atom_tup(lists:seq(0, 254)), + uc_atom_tup(lists:seq(100, 163)), + uc_atom_tup(lists:seq(200, 354)), + uc_atom_tup(lists:seq(200, 263)), + uc_atom_tup(lists:seq(2000, 2254)), + uc_atom_tup(lists:seq(2000, 2063)), + uc_atom_tup(lists:seq(65500, 65754)), + uc_atom_tup(lists:seq(65500, 65563)) + | lists:map(fun (N) -> + uc_atom_tup(lists:seq(64000+N, 64254+N)) + end, + lists:seq(1, 2000))]. + contended_atom_cache_entry(Config) when is_list(Config) -> + contended_atom_cache_entry_test(Config, latin1). + +contended_unicode_atom_cache_entry(Config) when is_list(Config) -> + contended_atom_cache_entry_test(Config, unicode). + +contended_atom_cache_entry_test(Config, Type) -> ?line TestServer = self(), ?line ProcessPairs = 10, ?line Msgs = 100000, @@ -1147,9 +1232,16 @@ contended_atom_cache_entry(Config) when is_list(Config) -> true), Master = self(), CIX = get_cix(), - TestAtoms = get_conflicting_atoms(CIX, ProcessPairs), + TestAtoms = case Type of + latin1 -> + get_conflicting_atoms(CIX, + ProcessPairs); + unicode -> + get_conflicting_unicode_atoms(CIX, + ProcessPairs) + end, io:format("Testing with the following atoms all using " - "cache index ~p:~n ~p~n", + "cache index ~p:~n ~w~n", [CIX, TestAtoms]), Ps = lists:map( fun (A) -> @@ -1159,8 +1251,12 @@ contended_atom_cache_entry(Config) when is_list(Config) -> fun () -> Atom = receive {Ref, txt, ATxt} -> - list_to_atom( - ATxt) + case Type of + latin1 -> + list_to_atom(ATxt); + unicode -> + string_to_atom(ATxt) + end end, receive_ref_atom(Ref, Atom, @@ -1252,6 +1348,20 @@ get_conflicting_atoms(CIX, N) -> get_conflicting_atoms(CIX, N) end. +get_conflicting_unicode_atoms(_CIX, 0) -> + []; +get_conflicting_unicode_atoms(CIX, N) -> + {A, B, C} = now(), + Atom = string_to_atom([16#1f608] ++ "atom" ++ integer_to_list(A*1000000000000 + + B*1000000 + + C)), + case erts_debug:get_internal_state({atom_out_cache_index, Atom}) of + CIX -> + [Atom|get_conflicting_unicode_atoms(CIX, N-1)]; + _ -> + get_conflicting_unicode_atoms(CIX, N) + end. + -define(COOKIE, ''). -define(DOP_LINK, 1). -define(DOP_SEND, 2). @@ -2131,3 +2241,190 @@ repeat(_Fun, 0) -> repeat(Fun, N) -> Fun(), repeat(Fun, N-1). + +string_to_atom_ext(String) -> + Utf8List = string_to_utf8_list(String), + Len = length(Utf8List), + case Len < 256 of + true -> + [?SMALL_ATOM_UTF8_EXT, Len | Utf8List]; + false -> + [?ATOM_UTF8_EXT, Len bsr 8, Len band 16#ff | Utf8List] + end. + +string_to_atom(String) -> + binary_to_term(list_to_binary([?VERSION_MAGIC + | string_to_atom_ext(String)])). + +string_to_utf8_list([]) -> + []; +string_to_utf8_list([CP|CPs]) when is_integer(CP), + 0 =< CP, + CP =< 16#7F -> + [CP | string_to_utf8_list(CPs)]; +string_to_utf8_list([CP|CPs]) when is_integer(CP), + 16#80 =< CP, + CP =< 16#7FF -> + [16#C0 bor (CP bsr 6), + 16#80 bor (16#3F band CP) + | string_to_utf8_list(CPs)]; +string_to_utf8_list([CP|CPs]) when is_integer(CP), + 16#800 =< CP, + CP =< 16#FFFF -> + [16#E0 bor (CP bsr 12), + 16#80 bor (16#3F band (CP bsr 6)), + 16#80 bor (16#3F band CP) + | string_to_utf8_list(CPs)]; +string_to_utf8_list([CP|CPs]) when is_integer(CP), + 16#10000 =< CP, + CP =< 16#10FFFF -> + [16#F0 bor (CP bsr 18), + 16#80 bor (16#3F band (CP bsr 12)), + 16#80 bor (16#3F band (CP bsr 6)), + 16#80 bor (16#3F band CP) + | string_to_utf8_list(CPs)]. + +utf8_list_to_string([]) -> + []; +utf8_list_to_string([B|Bs]) when is_integer(B), + 0 =< B, + B =< 16#7F -> + [B | utf8_list_to_string(Bs)]; +utf8_list_to_string([B0, B1 | Bs]) when is_integer(B0), + 16#C0 =< B0, + B0 =< 16#DF, + is_integer(B1), + 16#80 =< B1, + B1 =< 16#BF -> + [(((B0 band 16#1F) bsl 6) + bor (B1 band 16#3F)) + | utf8_list_to_string(Bs)]; +utf8_list_to_string([B0, B1, B2 | Bs]) when is_integer(B0), + 16#E0 =< B0, + B0 =< 16#EF, + is_integer(B1), + 16#80 =< B1, + B1 =< 16#BF, + is_integer(B2), + 16#80 =< B2, + B2 =< 16#BF -> + [(((B0 band 16#F) bsl 12) + bor ((B1 band 16#3F) bsl 6) + bor (B2 band 16#3F)) + | utf8_list_to_string(Bs)]; +utf8_list_to_string([B0, B1, B2, B3 | Bs]) when is_integer(B0), + 16#F0 =< B0, + B0 =< 16#F7, + is_integer(B1), + 16#80 =< B1, + B1 =< 16#BF, + is_integer(B2), + 16#80 =< B2, + B2 =< 16#BF, + is_integer(B3), + 16#80 =< B3, + B3 =< 16#BF -> + [(((B0 band 16#7) bsl 18) + bor ((B1 band 16#3F) bsl 12) + bor ((B2 band 16#3F) bsl 6) + bor (B3 band 16#3F)) + | utf8_list_to_string(Bs)]. + +mk_pid({NodeName, Creation}, Number, Serial) when is_atom(NodeName) -> + <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName), + mk_pid({NodeNameExt, Creation}, Number, Serial); +mk_pid({NodeNameExt, Creation}, Number, Serial) -> + case catch binary_to_term(list_to_binary([?VERSION_MAGIC, + ?PID_EXT, + NodeNameExt, + uint32_be(Number), + uint32_be(Serial), + uint8(Creation)])) of + Pid when is_pid(Pid) -> + Pid; + {'EXIT', {badarg, _}} -> + exit({badarg, mk_pid, [{NodeNameExt, Creation}, Number, Serial]}); + Other -> + exit({unexpected_binary_to_term_result, Other}) + end. + +mk_port({NodeName, Creation}, Number) when is_atom(NodeName) -> + <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName), + mk_port({NodeNameExt, Creation}, Number); +mk_port({NodeNameExt, Creation}, Number) -> + case catch binary_to_term(list_to_binary([?VERSION_MAGIC, + ?PORT_EXT, + NodeNameExt, + uint32_be(Number), + uint8(Creation)])) of + Port when is_port(Port) -> + Port; + {'EXIT', {badarg, _}} -> + exit({badarg, mk_port, [{NodeNameExt, Creation}, Number]}); + Other -> + exit({unexpected_binary_to_term_result, Other}) + end. + +mk_ref({NodeName, Creation}, [Number] = NL) when is_atom(NodeName), + is_integer(Creation), + is_integer(Number) -> + <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName), + mk_ref({NodeNameExt, Creation}, NL); +mk_ref({NodeNameExt, Creation}, [Number]) when is_integer(Creation), + is_integer(Number) -> + case catch binary_to_term(list_to_binary([?VERSION_MAGIC, + ?REFERENCE_EXT, + NodeNameExt, + uint32_be(Number), + uint8(Creation)])) of + Ref when is_reference(Ref) -> + Ref; + {'EXIT', {badarg, _}} -> + exit({badarg, mk_ref, [{NodeNameExt, Creation}, [Number]]}); + Other -> + exit({unexpected_binary_to_term_result, Other}) + end; +mk_ref({NodeName, Creation}, Numbers) when is_atom(NodeName), + is_integer(Creation), + is_list(Numbers) -> + <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName), + mk_ref({NodeNameExt, Creation}, Numbers); +mk_ref({NodeNameExt, Creation}, Numbers) when is_integer(Creation), + is_list(Numbers) -> + case catch binary_to_term(list_to_binary([?VERSION_MAGIC, + ?NEW_REFERENCE_EXT, + uint16_be(length(Numbers)), + NodeNameExt, + uint8(Creation), + lists:map(fun (N) -> + uint32_be(N) + end, + Numbers)])) of + Ref when is_reference(Ref) -> + Ref; + {'EXIT', {badarg, _}} -> + exit({badarg, mk_ref, [{NodeNameExt, Creation}, Numbers]}); + Other -> + exit({unexpected_binary_to_term_result, Other}) + end. + + +uint32_be(Uint) when is_integer(Uint), 0 =< Uint, Uint < 1 bsl 32 -> + [(Uint bsr 24) band 16#ff, + (Uint bsr 16) band 16#ff, + (Uint bsr 8) band 16#ff, + Uint band 16#ff]; +uint32_be(Uint) -> + exit({badarg, uint32_be, [Uint]}). + + +uint16_be(Uint) when is_integer(Uint), 0 =< Uint, Uint < 1 bsl 16 -> + [(Uint bsr 8) band 16#ff, + Uint band 16#ff]; +uint16_be(Uint) -> + exit({badarg, uint16_be, [Uint]}). + +uint8(Uint) when is_integer(Uint), 0 =< Uint, Uint < 1 bsl 8 -> + Uint band 16#ff; +uint8(Uint) -> + exit({badarg, uint8, [Uint]}). diff --git a/erts/epmd/src/epmd_cli.c b/erts/epmd/src/epmd_cli.c index 74408e3ebe..1d4de64b63 100644 --- a/erts/epmd/src/epmd_cli.c +++ b/erts/epmd/src/epmd_cli.c @@ -22,6 +22,7 @@ #endif #include "epmd.h" /* Renamed from 'epmd_r4.h' */ #include "epmd_int.h" +#include "erl_printf.h" /* erts_snprintf */ /* forward declarations */ @@ -114,16 +115,18 @@ void epmd_call(EpmdVars *g,int what) epmd_cleanup_exit(g,1); } j = ntohl(i); - if (!g->silent) - printf("epmd: up and running on port %d with data:\n", j); + if (!g->silent) { + rval = erts_snprintf(buf, OUTBUF_SIZE, + "epmd: up and running on port %d with data:\n", j); + write(1, buf, rval); + } while(1) { - if ((rval = read(fd,buf,1)) <= 0) { + if ((rval = read(fd,buf,OUTBUF_SIZE)) <= 0) { close(fd); epmd_cleanup_exit(g,0); } - buf[rval] = '\0'; if (!g->silent) - printf("%s",buf); + write(1, buf, rval); /* Potentially UTF-8 encoded */ } } diff --git a/erts/epmd/src/epmd_int.h b/erts/epmd/src/epmd_int.h index 14d05c3f19..b25412c905 100644 --- a/erts/epmd/src/epmd_int.h +++ b/erts/epmd/src/epmd_int.h @@ -226,13 +226,25 @@ #define MAX_UNREG_COUNT 1000 #define DEBUG_MAX_UNREG_COUNT 5 -/* Maximum length of a node name == atom name */ -#define MAXSYMLEN 255 +/* + * Maximum length of a node name == atom name + * 255 characters; UTF-8 encoded -> max 255*4 + */ +#define MAXSYMLEN (255*4) #define MAX_LISTEN_SOCKETS 16 -#define INBUF_SIZE 1024 -#define OUTBUF_SIZE 1024 +/* + * Largest request: ALIVE2_REQ + * 2 + 13 + 2*MAXSYMLEN + * Largest response: PORT2_RESP + * 2 + 14 + 2*MAXSYMLEN + * + * That is, 3*MAXSYMLEN should be large enough + */ + +#define INBUF_SIZE (3*MAXSYMLEN) +#define OUTBUF_SIZE (3*MAXSYMLEN) #define get_int16(s) ((((unsigned char*) (s))[0] << 8) | \ (((unsigned char*) (s))[1])) diff --git a/erts/epmd/src/epmd_srv.c b/erts/epmd/src/epmd_srv.c index 36565b7438..2a74c4955e 100644 --- a/erts/epmd/src/epmd_srv.c +++ b/erts/epmd/src/epmd_srv.c @@ -73,7 +73,7 @@ static int conn_open(EpmdVars*,int); static int conn_close_fd(EpmdVars*,int); static void node_init(EpmdVars*); -static Node *node_reg2(EpmdVars*,char*, int, int, unsigned char, unsigned char, int, int, int, char*); +static Node *node_reg2(EpmdVars*, int, char*, int, int, unsigned char, unsigned char, int, int, int, char*); static int node_unreg(EpmdVars*,char*); static int node_unreg_sock(EpmdVars*,int); @@ -81,6 +81,113 @@ static int reply(EpmdVars*,int,char *,int); static void dbg_print_buf(EpmdVars*,char *,int); static void print_names(EpmdVars*); +static int is_same_str(char *x, char *y) +{ + int i = 0; + /* + * Using strcmp() == 0 is probably ok, but just to be sure, + * since we got UTF-8 strings, we do it ourselves. + * + * We assume null-terminated correctly encoded UTF-8. + */ + while (x[i] == y[i]) { + if (x[i] == '\0') + return 1; + i++; + } + return 0; +} + +static int copy_str(char *x, char *y) +{ + int i = 0; + /* + * Using strcpy() is probably ok, but just to be sure, + * since we got UTF-8 strings, we do it ourselves. + * + * We assume null-terminated correctly encoded UTF-8. + */ + while (1) { + x[i] = y[i]; + if (y[i] == '\0') + return i; + i++; + } +} + +static int length_str(char *x) +{ + int i = 0; + /* + * Using strlen is probably ok, but just to be sure, + * since we got UTF-8 strings, we do it ourselves. + * + * We assume null-terminated correctly encoded UTF-8. + */ + while (x[i]) + i++; + return i; +} + +static int verify_utf8(const char *src, int sz, int null_term) +{ + unsigned char *source = (unsigned char *) src; + int size = sz; + int num_chars = 0; + while (size) { + if (null_term && (*source) == 0) + return num_chars; + if (((*source) & ((unsigned char) 0x80)) == 0) { + source++; + --size; + } else if (((*source) & ((unsigned char) 0xE0)) == 0xC0) { + if (size < 2) + return -1; + if (((source[1] & ((unsigned char) 0xC0)) != 0x80) || + ((*source) < 0xC2) /* overlong */) { + return -1; + } + source += 2; + size -= 2; + } else if (((*source) & ((unsigned char) 0xF0)) == 0xE0) { + if (size < 3) + return -1; + if (((source[1] & ((unsigned char) 0xC0)) != 0x80) || + ((source[2] & ((unsigned char) 0xC0)) != 0x80) || + (((*source) == 0xE0) && (source[1] < 0xA0)) /* overlong */ ) { + return -1; + } + if ((((*source) & ((unsigned char) 0xF)) == 0xD) && + ((source[1] & 0x20) != 0)) { + return -1; + } + source += 3; + size -= 3; + } else if (((*source) & ((unsigned char) 0xF8)) == 0xF0) { + if (size < 4) + return -1; + if (((source[1] & ((unsigned char) 0xC0)) != 0x80) || + ((source[2] & ((unsigned char) 0xC0)) != 0x80) || + ((source[3] & ((unsigned char) 0xC0)) != 0x80) || + (((*source) == 0xF0) && (source[1] < 0x90)) /* overlong */) { + return -1; + } + if ((((*source) & ((unsigned char)0x7)) > 0x4U) || + ((((*source) & ((unsigned char)0x7)) == 0x4U) && + ((source[1] & ((unsigned char)0x3F)) > 0xFU))) { + return -1; + } + source += 4; + size -= 4; + } else { + return -1; + } + ++num_chars; + } + return num_chars; +} + + static EPMD_INLINE void select_fd_set(EpmdVars* g, int fd) { FD_SET(fd, &g->orig_read_mask); @@ -525,10 +632,11 @@ static void do_request(g, fd, s, buf, bsize) } name = &buf[11]; name[namelen]='\000'; + extra = &buf[11+namelen+2]; extra[extralen]='\000'; wbuf[0] = EPMD_ALIVE2_RESP; - if ((node = node_reg2(g, name, fd, eport, nodetype, protocol, + if ((node = node_reg2(g, namelen, name, fd, eport, nodetype, protocol, highvsn, lowvsn, extralen, extra)) == NULL) { wbuf[1] = 1; /* error */ put_int16(99, wbuf+2); @@ -573,22 +681,28 @@ static void do_request(g, fd, s, buf, bsize) { char *name = &buf[1]; /* Points to node name */ + int nsz; Node *node; - + + nsz = verify_utf8(name, bsize, 0); + if (nsz < 1 || 255 < nsz) { + dbg_printf(g,0,"invalid node name in PORT2_REQ"); + return; + } + wbuf[0] = EPMD_PORT2_RESP; for (node = g->nodes.reg; node; node = node->next) { int offset; - if (strcmp(node->symname, name) == 0) { + if (is_same_str(node->symname, name)) { wbuf[1] = 0; /* ok */ put_int16(node->port,wbuf+2); wbuf[4] = node->nodetype; wbuf[5] = node->protocol; put_int16(node->highvsn,wbuf+6); put_int16(node->lowvsn,wbuf+8); - put_int16(strlen(node->symname),wbuf+10); + put_int16(length_str(node->symname),wbuf+10); offset = 12; - strcpy(wbuf + offset,node->symname); - offset += strlen(node->symname); + offset += copy_str(wbuf + offset,node->symname); put_int16(node->extralen,wbuf + offset); offset += 2; memcpy(wbuf + offset,node->extra,node->extralen); @@ -629,15 +743,22 @@ static void do_request(g, fd, s, buf, bsize) for (node = g->nodes.reg; node; node = node->next) { - int len; + int len = 0; + int r; /* CAREFUL!!! These are parsed by "erl_epmd.erl" so a slight change in syntax will break < OTP R3A */ - erts_snprintf(wbuf, sizeof(wbuf), "name %s at port %d\n",node->symname, node->port); - len = strlen(wbuf); + len += copy_str(&wbuf[len], "name "); + len += copy_str(&wbuf[len], node->symname); + r = erts_snprintf(&wbuf[len], sizeof(wbuf)-len, + " at port %d\n", node->port); + if (r < 0) + goto failed_names_resp; + len += r; if (reply(g, fd, wbuf, len) != len) { + failed_names_resp: dbg_tty_printf(g,1,"failed to send NAMES_RESP"); return; } @@ -665,16 +786,22 @@ static void do_request(g, fd, s, buf, bsize) for (node = g->nodes.reg; node; node = node->next) { - int len; + int len = 0, r; /* CAREFUL!!! These are parsed by "erl_epmd.erl" so a slight change in syntax will break < OTP R3A */ - erts_snprintf(wbuf, sizeof(wbuf), "active name <%s> at port %d, fd = %d\n", - node->symname, node->port, node->fd); - len = strlen(wbuf) + 1; - if (reply(g, fd,wbuf,len) != len) + len += copy_str(&wbuf[len], "active name <"); + len += copy_str(&wbuf[len], node->symname); + r = erts_snprintf(&wbuf[len], sizeof(wbuf)-len, + "> at port %d, fd = %d\n", + node->port, node->fd); + if (r < 0) + goto failed_dump_resp; + len += r + 1; + if (reply(g, fd,wbuf,len) != len) { + failed_dump_resp: dbg_tty_printf(g,1,"failed to send DUMP_RESP"); return; } @@ -682,16 +809,22 @@ static void do_request(g, fd, s, buf, bsize) for (node = g->nodes.unreg; node; node = node->next) { - int len; + int len = 0, r; /* CAREFUL!!! These are parsed by "erl_epmd.erl" so a slight change in syntax will break < OTP R3A */ - erts_snprintf(wbuf, sizeof(wbuf), "old/unused name <%s>, port = %d, fd = %d \n", - node->symname,node->port, node->fd); - len = strlen(wbuf) + 1; - if (reply(g, fd,wbuf,len) != len) + len += copy_str(&wbuf[len], "old/unused name <"); + len += copy_str(&wbuf[len], node->symname); + r = erts_snprintf(&wbuf[len], sizeof(wbuf)-len, + ">, port = %d, fd = %d \n", + node->port, node->fd); + if (r < 0) + goto failed_dump_resp2; + len += r + 1; + if (reply(g, fd,wbuf,len) != len) { + failed_dump_resp2: dbg_tty_printf(g,1,"failed to send DUMP_RESP"); return; } @@ -933,7 +1066,7 @@ static int node_unreg(EpmdVars *g,char *name) Node *node = g->nodes.reg; /* Point to first node */ for (; node; prev = &node->next, node = node->next) - if (strcmp(node->symname, name) == 0) + if (is_same_str(node->symname, name)) { dbg_tty_printf(g,1,"unregistering '%s:%d', port %d", node->symname, node->creation, node->port); @@ -1013,6 +1146,7 @@ static int node_unreg_sock(EpmdVars *g,int fd) */ static Node *node_reg2(EpmdVars *g, + int namelen, char* name, int fd, int port, @@ -1025,6 +1159,7 @@ static Node *node_reg2(EpmdVars *g, { Node *prev; /* Point to previous node or NULL */ Node *node; /* Point to first node */ + int sz; /* Can be NULL; means old style */ if (extra == NULL) @@ -1032,21 +1167,47 @@ static Node *node_reg2(EpmdVars *g, /* Fail if node name is too long */ - if (strlen(name) > MAXSYMLEN) + + if (namelen > MAXSYMLEN) { - dbg_printf(g,0,"node name is too long (%d) %s", strlen(name), name); + too_long_name: + dbg_printf(g,0,"node name is too long (%d) %s", namelen, name); return NULL; } + + sz = verify_utf8(name, namelen, 0); + if (sz > 255) + goto too_long_name; + + if (sz < 0) { + dbg_printf(g,0,"invalid node name encoding"); + return NULL; + } + if (extralen > MAXSYMLEN) { - dbg_printf(g,0,"extra data is too long (%d) %s", strlen(name), name); +#if 0 + too_long_extra: +#endif + dbg_printf(g,0,"extra data is too long (%d) %s", extralen, extra); return NULL; } +#if 0 /* Should we require valid utf8 here? */ + sz = verify_utf8(extra, extralen, 0); + if (sz > 255) + goto too_long_extra; + + if (sz < 0) { + dbg_printf(g,0,"invalid extra data encoding"); + return NULL; + } +#endif + /* Fail if it is already registered */ for (node = g->nodes.reg; node; node = node->next) - if (strcmp(node->symname, name) == 0) + if (is_same_str(node->symname, name)) { dbg_printf(g,0,"node name already occupied %s", name); return NULL; @@ -1058,7 +1219,7 @@ static Node *node_reg2(EpmdVars *g, prev = NULL; for (node = g->nodes.unreg; node; prev = node, node = node->next) - if (strcmp(node->symname, name) == 0) + if (is_same_str(node->symname, name)) { dbg_tty_printf(g,1,"reusing slot with same name '%s'", node->symname); @@ -1126,7 +1287,7 @@ static Node *node_reg2(EpmdVars *g, node->lowvsn = lowvsn; node->extralen = extralen; memcpy(node->extra,extra,extralen); - strcpy(node->symname,name); + copy_str(node->symname,name); select_fd_set(g, fd); if (highvsn == 0) { diff --git a/erts/epmd/test/epmd_SUITE.erl b/erts/epmd/test/epmd_SUITE.erl index fd9969ae2b..fc0abef400 100644 --- a/erts/epmd/test/epmd_SUITE.erl +++ b/erts/epmd/test/epmd_SUITE.erl @@ -45,6 +45,8 @@ register_names_1/1, register_names_2/1, register_duplicate_name/1, + unicode_name/1, + long_unicode_name/1, get_port_nr/1, slow_get_port_nr/1, unregister_others_name_1/1, @@ -107,7 +109,8 @@ suite() -> [{ct_hooks,[ts_install_cth]}]. all() -> [register_name, register_names_1, register_names_2, - register_duplicate_name, get_port_nr, slow_get_port_nr, + register_duplicate_name, unicode_name, long_unicode_name, + get_port_nr, slow_get_port_nr, unregister_others_name_1, unregister_others_name_2, register_overflow, name_with_null_inside, name_null_terminated, stupid_names_req, no_data, @@ -197,6 +200,37 @@ register_duplicate_name(Config) when is_list(Config) -> ?line ok = close(Sock), % Unregister ok. +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +unicode_name(doc) -> + ["Check that we can register and lookup a unicode name"]; +unicode_name(suite) -> + []; +unicode_name(Config) when is_list(Config) -> + ok = epmdrun(), + NodeName = [16#1f608], + {ok,Sock} = register_node_v2(4711, 72, 0, 5, 5, NodeName, []), + {ok,NodeInfo} = port_please_v2(NodeName), + NodeName = NodeInfo#node_info.node_name, + ok = close(Sock), + ok. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +long_unicode_name(doc) -> + ["Check that we can register and lookup a long unicode name"]; +long_unicode_name(suite) -> + []; +long_unicode_name(Config) when is_list(Config) -> + ok = epmdrun(), + BaseChar = 16#1f600, + NodeName = lists:seq(BaseChar, BaseChar+200), % will be 800 bytes long + {ok,Sock} = register_node_v2(4711, 72, 0, 5, 5, NodeName, []), + {ok,NodeInfo} = port_please_v2(NodeName), + NodeName = NodeInfo#node_info.node_name, + ok = close(Sock), + ok. + % Internal function to register a node name, no close, i.e. unregister register_node(Name) -> @@ -205,9 +239,10 @@ register_node(Name,Port) -> register_node_v2(Port,$M,0,5,5,Name,""). register_node_v2(Port, NodeType, Prot, HVsn, LVsn, Name, Extra) -> + Utf8Name = unicode:characters_to_binary(Name), Req = [?EPMD_ALIVE2_REQ, put16(Port), NodeType, Prot, put16(HVsn), put16(LVsn), - size16(Name), Name, + put16(size(Utf8Name)), binary_to_list(Utf8Name), size16(Extra), Extra], case send_req(Req) of {ok,Sock} -> @@ -226,7 +261,8 @@ register_node_v2(Port, NodeType, Prot, HVsn, LVsn, Name, Extra) -> % Internal function to fetch information about a node port_please_v2(Name) -> - case send_req([?EPMD_PORT_PLEASE2_REQ, Name]) of + case send_req([?EPMD_PORT_PLEASE2_REQ, + binary_to_list(unicode:characters_to_binary(Name))]) of {ok,Sock} -> case recv_until_sock_closes(Sock) of {ok, Resp} -> @@ -247,7 +283,7 @@ parse_port2_resp(Resp) -> ELen:16,Extra:ELen/binary>> when Res =:= 0 -> {ok, #node_info{port=Port,node_type=NodeType,prot=Prot, hvsn=HVsn,lvsn=LVsn, - node_name=binary_to_list(NodeName), + node_name=unicode:characters_to_list(NodeName), extra=binary_to_list(Extra)}}; _Other -> test_server:format("invalid port2 resp: ~p~n", @@ -737,7 +773,7 @@ buffer_overrun_2(doc) -> ["Test security vulnerability in fake extra lengths in alive2_req"]; buffer_overrun_2(Config) when is_list(Config) -> ?line ok = epmdrun(), - ?line [false | Rest] = [hostile2(N) || N <- lists:seq(255,10000)], + ?line [false | Rest] = [hostile2(N) || N <- lists:seq(255*4,10000)], ?line true = alltrue(Rest), ok. hostile(N) -> @@ -880,6 +916,7 @@ no_live_killing(Config) when is_list(Config) -> ?line close(Sock3), ok. + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Terminate all tests with killing epmd. |