aboutsummaryrefslogtreecommitdiffstats
path: root/erts
diff options
context:
space:
mode:
authorJosé Valim <[email protected]>2016-05-31 14:28:54 +0200
committerJosé Valim <[email protected]>2017-01-30 15:24:05 +0100
commit26b59dfe67ef551cd94765557cdd8c79794bcc38 (patch)
tree696adc07b3e7a4a3f1ed6c52311ff6e163b218b4 /erts
parent6c7539b0e39996f870385e5276e08c0dd98b6eb8 (diff)
downloadotp-26b59dfe67ef551cd94765557cdd8c79794bcc38.tar.gz
otp-26b59dfe67ef551cd94765557cdd8c79794bcc38.tar.bz2
otp-26b59dfe67ef551cd94765557cdd8c79794bcc38.zip
Add new AtU8 beam chunk
The new chunk stores atoms encoded in UTF-8. beam_lib has also been modified to handle the new 'utf8_atoms' attribute while the 'atoms' attribute may be a missing chunk from now on. The binary_to_atom/2 BIF can now encode any utf8 binary with up to 255 characters. The list_to_atom/1 BIF can now accept codepoints higher than 255 with up to 255 characters (thanks to Björn Gustavsson).
Diffstat (limited to 'erts')
-rw-r--r--erts/doc/src/erl_ext_dist.xml15
-rw-r--r--erts/doc/src/erlang.xml35
-rw-r--r--erts/emulator/beam/atom.c35
-rw-r--r--erts/emulator/beam/atom.h3
-rw-r--r--erts/emulator/beam/beam_load.c66
-rw-r--r--erts/emulator/beam/bif.c14
-rw-r--r--erts/emulator/beam/erl_unicode.c83
-rw-r--r--erts/emulator/beam/global.h1
-rw-r--r--erts/emulator/beam/utils.c62
-rw-r--r--erts/emulator/test/bif_SUITE.erl47
-rw-r--r--erts/emulator/test/code_SUITE.erl10
11 files changed, 242 insertions, 129 deletions
diff --git a/erts/doc/src/erl_ext_dist.xml b/erts/doc/src/erl_ext_dist.xml
index 4f799f8f34..a436a9ca74 100644
--- a/erts/doc/src/erl_ext_dist.xml
+++ b/erts/doc/src/erl_ext_dist.xml
@@ -119,16 +119,11 @@
<tcaption>Compressed Data Format when Expanded</tcaption></table>
<marker id="utf8_atoms"/>
<note>
- <p>As from ERTS 5.10 (OTP R16) support
- for UTF-8 encoded atoms has been introduced in the external format.
- However, only characters that can be encoded using Latin-1 (ISO-8859-1)
- are currently supported in atoms. The support for UTF-8 encoded atoms
- in the external format has been implemented to be able to support
- all Unicode characters in atoms in <em>some future release</em>.
- Until full Unicode support for atoms has been introduced,
- it is an <em>error</em> to pass atoms containing
- characters that cannot be encoded in Latin-1, and <em>the behavior is
- undefined</em>.</p>
+ <p>As from ERTS 9.0 (OTP 20), UTF-8 encoded atoms may contain any Unicode
+ character. Although the support for UTF-8 encoded atoms in the external
+ format is available since ERTS 5.10 (OTP R16), passing atoms that cannot
+ be encoded in Latin-1 is an <em>error</em> in versions earlier than
+ Erlang/OTP 20, and <em>the behavior is undefined</em>.</p>
<p>When distribution flag <seealso marker="erl_dist_protocol#dflags">
<c>DFLAG_UTF8_ATOMS</c></seealso> has been exchanged between both nodes
in the <seealso marker="erl_dist_protocol#distribution_handshake">
diff --git a/erts/doc/src/erlang.xml b/erts/doc/src/erlang.xml
index b3fab3874b..cf038c49f0 100644
--- a/erts/doc/src/erlang.xml
+++ b/erts/doc/src/erlang.xml
@@ -325,16 +325,11 @@ Z = erlang:adler32_combine(X,Y,iolist_size(Data2)).</code>
is <c>latin1</c>, one byte exists for each character
in the text representation. If <c><anno>Encoding</anno></c> is
<c>utf8</c> or
- <c>unicode</c>, the characters are encoded using UTF-8
- (that is, characters from 128 through 255 are
- encoded in two bytes).</p>
+ <c>unicode</c>, the characters are encoded using UTF-8 where
+ characters may require multiple bytes.</p>
<note>
- <p><c>atom_to_binary(<anno>Atom</anno>, latin1)</c> never
- fails, as the text representation of an atom can only
- contain characters from 0 through 255. In a future release,
- the text representation
- of atoms can be allowed to contain any Unicode character and
- <c>atom_to_binary(<anno>Atom</anno>, latin1)</c> then fails if the
+ <p>As from Erlang/OTP 20, atoms can contain any Unicode character
+ and <c>atom_to_binary(<anno>Atom</anno>, latin1)</c> may fail if the
text representation for <c><anno>Atom</anno></c> contains a Unicode
character &gt; 255.</p>
</note>
@@ -402,13 +397,11 @@ Z = erlang:adler32_combine(X,Y,iolist_size(Data2)).</code>
translation of bytes in the binary is done.
If <c><anno>Encoding</anno></c>
is <c>utf8</c> or <c>unicode</c>, the binary must contain
- valid UTF-8 sequences. Only Unicode characters up
- to 255 are allowed.</p>
+ valid UTF-8 sequences.</p>
<note>
- <p><c>binary_to_atom(<anno>Binary</anno>, utf8)</c> fails if
- the binary contains Unicode characters &gt; 255.
- In a future release, such Unicode characters can be allowed and
- <c>binary_to_atom(<anno>Binary</anno>, utf8)</c> does then not fail.
+ <p>As from Erlang/OTP 20, <c>binary_to_atom(<anno>Binary</anno>, utf8)</c>
+ is capable of encoding any Unicode character. Earlier versions would
+ fail if the binary contained Unicode characters &gt; 255.
For more information about Unicode support in atoms, see the
<seealso marker="erl_ext_dist#utf8_atoms">note on UTF-8
encoded atoms</seealso>
@@ -419,9 +412,7 @@ Z = erlang:adler32_combine(X,Y,iolist_size(Data2)).</code>
> <input>binary_to_atom(&lt;&lt;"Erlang"&gt;&gt;, latin1).</input>
'Erlang'
> <input>binary_to_atom(&lt;&lt;1024/utf8&gt;&gt;, utf8).</input>
-** exception error: bad argument
- in function binary_to_atom/2
- called as binary_to_atom(&lt;&lt;208,128&gt;&gt;,utf8)</pre>
+'Ѐ'</pre>
</desc>
</func>
@@ -2401,10 +2392,10 @@ os_prompt%</pre>
<desc>
<p>Returns the atom whose text representation is
<c><anno>String</anno></c>.</p>
- <p><c><anno>String</anno></c> can only contain ISO-latin-1
- characters (that is, numbers &lt; 256) as the implementation does not
- allow Unicode characters equal to or above 256 in atoms.
- For more information on Unicode support in atoms, see
+ <p>As from Erlang/OTP 20, <c><anno>String</anno></c> may contain
+ any Unicode character. Earlier versions allowed only ISO-latin-1
+ characters as the implementation did not allow Unicode characters
+ above 255. For more information on Unicode support in atoms, see
<seealso marker="erl_ext_dist#utf8_atoms">note on UTF-8
encoded atoms</seealso>
in section "External Term Format" in the User's Guide.</p>
diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c
index 2b5ad097a0..2055c29190 100644
--- a/erts/emulator/beam/atom.c
+++ b/erts/emulator/beam/atom.c
@@ -233,10 +233,10 @@ need_convertion:
}
/*
- * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned!
+ * erts_atom_put_index() may fail. Returns negative indexes for errors.
*/
-Eterm
-erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
+int
+erts_atom_put_index(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
{
byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
const byte *text = name;
@@ -253,7 +253,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
if (trunc)
tlen = 0;
else
- return THE_NON_VALUE;
+ return ATOM_MAX_CHARS_ERROR;
}
switch (enc) {
@@ -262,7 +262,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
if (trunc)
tlen = MAX_ATOM_CHARACTERS;
else
- return THE_NON_VALUE;
+ return ATOM_MAX_CHARS_ERROR;
}
#ifdef DEBUG
for (aix = 0; aix < len; aix++) {
@@ -276,7 +276,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
if (trunc)
tlen = MAX_ATOM_CHARACTERS;
else
- return THE_NON_VALUE;
+ return ATOM_MAX_CHARS_ERROR;
}
no_latin1_chars = tlen;
latin1_to_utf8(utf8_copy, &text, &tlen);
@@ -284,7 +284,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
case ERTS_ATOM_ENC_UTF8:
/* First sanity check; need to verify later */
if (tlen > MAX_ATOM_SZ_LIMIT && !trunc)
- return THE_NON_VALUE;
+ return ATOM_MAX_CHARS_ERROR;
break;
}
@@ -295,7 +295,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
atom_read_unlock();
if (aix >= 0) {
/* Already in table no need to verify it */
- return make_atom(aix);
+ return aix;
}
if (enc == ERTS_ATOM_ENC_UTF8) {
@@ -314,13 +314,13 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
case ERTS_UTF8_OK_MAX_CHARS:
/* Truncated... */
if (!trunc)
- return THE_NON_VALUE;
+ return ATOM_MAX_CHARS_ERROR;
ASSERT(no_chars == MAX_ATOM_CHARACTERS);
tlen = err_pos - text;
break;
default:
/* Bad utf8... */
- return THE_NON_VALUE;
+ return ATOM_BAD_ENCODING_ERROR;
}
}
@@ -333,7 +333,20 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
atom_write_lock();
aix = index_put(&erts_atom_table, (void*) &a);
atom_write_unlock();
- return make_atom(aix);
+ return aix;
+}
+
+/*
+ * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned!
+ */
+Eterm
+erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
+{
+ int aix = erts_atom_put_index(name, len, enc, trunc);
+ if (aix >= 0)
+ return make_atom(aix);
+ else
+ return THE_NON_VALUE;
}
Eterm
diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h
index abd3b44993..be998a46bd 100644
--- a/erts/emulator/beam/atom.h
+++ b/erts/emulator/beam/atom.h
@@ -29,6 +29,8 @@
#define MAX_ATOM_SZ_LIMIT (4*MAX_ATOM_CHARACTERS) /* theoretical byte limit */
#define ATOM_LIMIT (1024*1024)
#define MIN_ATOM_TABLE_SIZE 8192
+#define ATOM_BAD_ENCODING_ERROR -1
+#define ATOM_MAX_CHARS_ERROR -2
#ifndef ARCH_32
/* Internal atom cache needs MAX_ATOM_TABLE_SIZE to be less than an
@@ -133,6 +135,7 @@ int atom_table_sz(void); /* table size in bytes, excluding stored objects */
Eterm am_atom_put(const char*, int); /* ONLY 7-bit ascii! */
Eterm erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc);
+int erts_atom_put_index(const byte *name, int len, ErtsAtomEncoding enc, int trunc);
void init_atom_table(void);
void atom_info(fmtfn_t, void *);
void dump_atoms(fmtfn_t, void *);
diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c
index 8f1faa6719..1899ffb079 100644
--- a/erts/emulator/beam/beam_load.c
+++ b/erts/emulator/beam/beam_load.c
@@ -157,13 +157,15 @@ typedef struct {
#define STR_CHUNK 2
#define IMP_CHUNK 3
#define EXP_CHUNK 4
-#define NUM_MANDATORY 5
+#define MIN_MANDATORY 1
+#define MAX_MANDATORY 5
#define LAMBDA_CHUNK 5
#define LITERAL_CHUNK 6
#define ATTR_CHUNK 7
#define COMPILE_CHUNK 8
#define LINE_CHUNK 9
+#define UTF8_ATOM_CHUNK 10
#define NUM_CHUNK_TYPES (sizeof(chunk_types)/sizeof(chunk_types[0]))
@@ -173,9 +175,13 @@ typedef struct {
static Uint chunk_types[] = {
/*
- * Mandatory chunk types -- these MUST be present.
+ * Atom chunk types -- Atom or AtU8 MUST be present.
*/
MakeIffId('A', 't', 'o', 'm'), /* 0 */
+
+ /*
+ * Mandatory chunk types -- these MUST be present.
+ */
MakeIffId('C', 'o', 'd', 'e'), /* 1 */
MakeIffId('S', 't', 'r', 'T'), /* 2 */
MakeIffId('I', 'm', 'p', 'T'), /* 3 */
@@ -189,6 +195,7 @@ static Uint chunk_types[] = {
MakeIffId('A', 't', 't', 'r'), /* 7 */
MakeIffId('C', 'I', 'n', 'f'), /* 8 */
MakeIffId('L', 'i', 'n', 'e'), /* 9 */
+ MakeIffId('A', 't', 'U', '8'), /* 10 */
};
/*
@@ -490,9 +497,9 @@ static Eterm stub_insert_new_code(Process *c_p, ErtsProcLocks c_p_locks,
#endif
static int init_iff_file(LoaderState* stp, byte* code, Uint size);
static int scan_iff_file(LoaderState* stp, Uint* chunk_types,
- Uint num_types, Uint num_mandatory);
+ Uint num_types);
static int verify_chunks(LoaderState* stp);
-static int load_atom_table(LoaderState* stp);
+static int load_atom_table(LoaderState* stp, ErtsAtomEncoding enc);
static int load_import_table(LoaderState* stp);
static int read_export_table(LoaderState* stp);
static int is_bif(Eterm mod, Eterm func, unsigned arity);
@@ -629,7 +636,7 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader,
CHKALLOC();
CHKBLK(ERTS_ALC_T_CODE,stp->code);
if (!init_iff_file(stp, code, unloaded_size) ||
- !scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
+ !scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
!verify_chunks(stp)) {
goto load_error;
}
@@ -674,9 +681,16 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader,
*/
CHKBLK(ERTS_ALC_T_CODE,stp->code);
- define_file(stp, "atom table", ATOM_CHUNK);
- if (!load_atom_table(stp)) {
- goto load_error;
+ if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) {
+ define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK);
+ if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) {
+ goto load_error;
+ }
+ } else {
+ define_file(stp, "atom table", ATOM_CHUNK);
+ if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) {
+ goto load_error;
+ }
}
/*
@@ -1212,7 +1226,7 @@ init_iff_file(LoaderState* stp, byte* code, Uint size)
* Scan the IFF file. The header should have been verified by init_iff_file().
*/
static int
-scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types, Uint num_mandatory)
+scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types)
{
Uint count;
Uint id;
@@ -1291,7 +1305,16 @@ verify_chunks(LoaderState* stp)
MD5_CTX context;
MD5Init(&context);
- for (i = 0; i < NUM_MANDATORY; i++) {
+
+ if (stp->chunks[UTF8_ATOM_CHUNK].start != NULL) {
+ MD5Update(&context, stp->chunks[UTF8_ATOM_CHUNK].start, stp->chunks[UTF8_ATOM_CHUNK].size);
+ } else if (stp->chunks[ATOM_CHUNK].start != NULL) {
+ MD5Update(&context, stp->chunks[ATOM_CHUNK].start, stp->chunks[ATOM_CHUNK].size);
+ } else {
+ LoadError0(stp, "mandatory chunk of type 'Atom' or 'AtU8' not found\n");
+ }
+
+ for (i = MIN_MANDATORY; i < MAX_MANDATORY; i++) {
if (stp->chunks[i].start != NULL) {
MD5Update(&context, stp->chunks[i].start, stp->chunks[i].size);
} else {
@@ -1352,7 +1375,7 @@ verify_chunks(LoaderState* stp)
}
static int
-load_atom_table(LoaderState* stp)
+load_atom_table(LoaderState* stp, ErtsAtomEncoding enc)
{
unsigned int i;
@@ -1371,7 +1394,7 @@ load_atom_table(LoaderState* stp)
GetByte(stp, n);
GetString(stp, atom, n);
- stp->atom[i] = erts_atom_put(atom, n, ERTS_ATOM_ENC_LATIN1, 1);
+ stp->atom[i] = erts_atom_put(atom, n, enc, 1);
}
/*
@@ -5937,7 +5960,7 @@ code_get_chunk_2(BIF_ALIST_2)
goto error;
}
if (!init_iff_file(stp, start, binary_size(Bin)) ||
- !scan_iff_file(stp, &chunk, 1, 1) ||
+ !scan_iff_file(stp, &chunk, 1) ||
stp->chunks[0].start == NULL) {
res = am_undefined;
goto done;
@@ -5986,7 +6009,7 @@ code_module_md5_1(BIF_ALIST_1)
}
stp->module = THE_NON_VALUE; /* Suppress diagnostiscs */
if (!init_iff_file(stp, bytes, binary_size(Bin)) ||
- !scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
+ !scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
!verify_chunks(stp)) {
res = am_undefined;
goto done;
@@ -6335,7 +6358,7 @@ erts_make_stub_module(Process* p, Eterm hipe_magic_bin, Eterm Beam, Eterm Info)
if (!init_iff_file(stp, bytes, size)) {
goto error;
}
- if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
+ if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
!verify_chunks(stp)) {
goto error;
}
@@ -6343,9 +6366,16 @@ erts_make_stub_module(Process* p, Eterm hipe_magic_bin, Eterm Beam, Eterm Info)
if (!read_code_header(stp)) {
goto error;
}
- define_file(stp, "atom table", ATOM_CHUNK);
- if (!load_atom_table(stp)) {
- goto error;
+ if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) {
+ define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK);
+ if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) {
+ goto error;
+ }
+ } else {
+ define_file(stp, "atom table", ATOM_CHUNK);
+ if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) {
+ goto error;
+ }
}
define_file(stp, "export table", EXP_CHUNK);
if (!stub_read_export_table(stp)) {
diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c
index d886c2985e..95bf13c07c 100644
--- a/erts/emulator/beam/bif.c
+++ b/erts/emulator/beam/bif.c
@@ -3022,8 +3022,8 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1)
BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)
{
Eterm res;
- char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS);
- Sint i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS);
+ byte *buf = (byte *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_SZ_LIMIT);
+ Sint i = erts_unicode_list_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS);
if (i < 0) {
erts_free(ERTS_ALC_T_TMP, (void *) buf);
@@ -3033,7 +3033,7 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)
}
BIF_ERROR(BIF_P, BADARG);
}
- res = erts_atom_put((byte *) buf, i, ERTS_ATOM_ENC_LATIN1, 1);
+ res = erts_atom_put(buf, i, ERTS_ATOM_ENC_UTF8, 1);
ASSERT(is_atom(res));
erts_free(ERTS_ALC_T_TMP, (void *) buf);
BIF_RET(res);
@@ -3043,17 +3043,17 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)
BIF_RETTYPE list_to_existing_atom_1(BIF_ALIST_1)
{
- Sint i;
- char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS);
+ byte *buf = (byte *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_SZ_LIMIT);
+ Sint i = erts_unicode_list_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS);
- if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS)) < 0) {
+ if (i < 0) {
error:
erts_free(ERTS_ALC_T_TMP, (void *) buf);
BIF_ERROR(BIF_P, BADARG);
} else {
Eterm a;
- if (erts_atom_get(buf, i, &a, ERTS_ATOM_ENC_LATIN1)) {
+ if (erts_atom_get((char *) buf, i, &a, ERTS_ATOM_ENC_UTF8)) {
erts_free(ERTS_ALC_T_TMP, (void *) buf);
BIF_RET(a);
} else {
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
index bd5e1482fb..8919898181 100644
--- a/erts/emulator/beam/erl_unicode.c
+++ b/erts/emulator/beam/erl_unicode.c
@@ -1890,74 +1890,57 @@ binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist)
byte* bytes;
byte *temp_alloc = NULL;
Uint bin_size;
+ Eterm a;
if ((bytes = erts_get_aligned_binary_bytes(bin, &temp_alloc)) == 0) {
BIF_ERROR(proc, BADARG);
}
bin_size = binary_size(bin);
+
if (enc == am_latin1) {
- Eterm a;
- if (bin_size > MAX_ATOM_CHARACTERS) {
- system_limit:
- erts_free_aligned_binary_bytes(temp_alloc);
- BIF_ERROR(proc, SYSTEM_LIMIT);
- }
if (!must_exist) {
- a = erts_atom_put((byte *) bytes,
- bin_size,
- ERTS_ATOM_ENC_LATIN1,
- 0);
- erts_free_aligned_binary_bytes(temp_alloc);
- if (is_non_value(a))
- goto badarg;
- BIF_RET(a);
- } else if (erts_atom_get((char *)bytes, bin_size, &a, ERTS_ATOM_ENC_LATIN1)) {
- erts_free_aligned_binary_bytes(temp_alloc);
- BIF_RET(a);
- } else {
+ int lix = erts_atom_put_index((byte *) bytes,
+ bin_size,
+ ERTS_ATOM_ENC_LATIN1,
+ 0);
+ if (lix == ATOM_BAD_ENCODING_ERROR) {
+ badarg:
+ erts_free_aligned_binary_bytes(temp_alloc);
+ BIF_ERROR(proc, BADARG);
+ } else if (lix == ATOM_MAX_CHARS_ERROR) {
+ system_limit:
+ erts_free_aligned_binary_bytes(temp_alloc);
+ BIF_ERROR(proc, SYSTEM_LIMIT);
+ }
+
+ a = make_atom(lix);
+ } else if (!erts_atom_get((char *)bytes, bin_size, &a, ERTS_ATOM_ENC_LATIN1)) {
goto badarg;
}
- } else if (enc == am_utf8 || enc == am_unicode) {
- Eterm res;
- Uint num_chars = 0;
- const byte* p = bytes;
- Uint left = bin_size;
- while (left) {
- if (++num_chars > MAX_ATOM_CHARACTERS) {
+ } else if (enc == am_utf8 || enc == am_unicode) {
+ if (!must_exist) {
+ int uix = erts_atom_put_index((byte *) bytes,
+ bin_size,
+ ERTS_ATOM_ENC_UTF8,
+ 0);
+ if (uix == ATOM_BAD_ENCODING_ERROR) {
+ goto badarg;
+ } else if (uix == ATOM_MAX_CHARS_ERROR) {
goto system_limit;
}
- if ((p[0] & 0x80) == 0) {
- ++p;
- --left;
- }
- else if (left >= 2
- && (p[0] & 0xFE) == 0xC2 /* only allow latin1 subset */
- && (p[1] & 0xC0) == 0x80) {
- p += 2;
- left -= 2;
- }
- else goto badarg;
- }
- if (!must_exist) {
- res = erts_atom_put((byte *) bytes,
- bin_size,
- ERTS_ATOM_ENC_UTF8,
- 0);
+ a = make_atom(uix);
}
- else if (!erts_atom_get((char*)bytes, bin_size, &res, ERTS_ATOM_ENC_UTF8)) {
+ else if (!erts_atom_get((char*)bytes, bin_size, &a, ERTS_ATOM_ENC_UTF8)) {
goto badarg;
}
- erts_free_aligned_binary_bytes(temp_alloc);
- if (is_non_value(res))
- goto badarg;
- BIF_RET(res);
} else {
- badarg:
- erts_free_aligned_binary_bytes(temp_alloc);
- BIF_ERROR(proc, BADARG);
+ goto badarg;
}
+
+ erts_free_aligned_binary_bytes(temp_alloc);
+ BIF_RET(a);
}
BIF_RETTYPE binary_to_atom_2(BIF_ALIST_2)
diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h
index 2b2f3c5cdc..9f2b43d216 100644
--- a/erts/emulator/beam/global.h
+++ b/erts/emulator/beam/global.h
@@ -1373,6 +1373,7 @@ int erts_utf8_to_latin1(byte* dest, const byte* source, int slen);
void bin_write(fmtfn_t, void*, byte*, size_t);
Sint intlist_to_buf(Eterm, char*, Sint); /* most callers pass plain char*'s */
+Sint erts_unicode_list_to_buf(Eterm list, byte *buf, Sint len);
struct Sint_buf {
#if defined(ARCH_64)
diff --git a/erts/emulator/beam/utils.c b/erts/emulator/beam/utils.c
index ec502d5a78..36b818505c 100644
--- a/erts/emulator/beam/utils.c
+++ b/erts/emulator/beam/utils.c
@@ -3923,6 +3923,68 @@ intlist_to_buf(Eterm list, char *buf, Sint len)
return -2; /* not enough space */
}
+/* Fill buf with the contents of the unicode list.
+ * Return the number of bytes in the buffer,
+ * or -1 for type error,
+ * or -2 for not enough buffer space (buffer contains truncated result).
+ */
+Sint
+erts_unicode_list_to_buf(Eterm list, byte *buf, Sint len)
+{
+ Eterm* listptr;
+ Sint sz = 0;
+
+ if (is_nil(list)) {
+ return 0;
+ }
+ if (is_not_list(list)) {
+ return -1;
+ }
+ listptr = list_val(list);
+
+ while (len-- > 0) {
+ Sint val;
+
+ if (is_not_small(CAR(listptr))) {
+ return -1;
+ }
+ val = signed_val(CAR(listptr));
+ if (0 <= val && val < 0x80) {
+ buf[sz] = val;
+ sz++;
+ } else if (val < 0x800) {
+ buf[sz+0] = 0xC0 | (val >> 6);
+ buf[sz+1] = 0x80 | (val & 0x3F);
+ sz += 2;
+ } else if (val < 0x10000UL) {
+ if (0xD800 <= val && val <= 0xDFFF) {
+ return -1;
+ }
+ buf[sz+0] = 0xE0 | (val >> 12);
+ buf[sz+1] = 0x80 | ((val >> 6) & 0x3F);
+ buf[sz+2] = 0x80 | (val & 0x3F);
+ sz += 3;
+ } else if (val < 0x110000) {
+ buf[sz+0] = 0xF0 | (val >> 18);
+ buf[sz+1] = 0x80 | ((val >> 12) & 0x3F);
+ buf[sz+2] = 0x80 | ((val >> 6) & 0x3F);
+ buf[sz+3] = 0x80 | (val & 0x3F);
+ sz += 4;
+ } else {
+ return -1;
+ }
+ list = CDR(listptr);
+ if (is_nil(list)) {
+ return sz;
+ }
+ if (is_not_list(list)) {
+ return -1;
+ }
+ listptr = list_val(list);
+ }
+ return -2; /* not enough space */
+}
+
/*
** Convert an integer to a byte list
** return pointer to converted stuff (need not to be at start of buf!)
diff --git a/erts/emulator/test/bif_SUITE.erl b/erts/emulator/test/bif_SUITE.erl
index f70fb0e501..339c827602 100644
--- a/erts/emulator/test/bif_SUITE.erl
+++ b/erts/emulator/test/bif_SUITE.erl
@@ -26,7 +26,7 @@
-export([all/0, suite/0,
display/1, display_huge/0,
erl_bif_types/1,guard_bifs_in_erl_bif_types/1,
- shadow_comments/1,
+ shadow_comments/1,list_to_utf8_atom/1,
specs/1,improper_bif_stubs/1,auto_imports/1,
t_list_to_existing_atom/1,os_env/1,otp_7526/1,
binary_to_atom/1,binary_to_existing_atom/1,
@@ -43,7 +43,7 @@ all() ->
[erl_bif_types, guard_bifs_in_erl_bif_types, shadow_comments,
specs, improper_bif_stubs, auto_imports,
t_list_to_existing_atom, os_env, otp_7526,
- display,
+ display, list_to_utf8_atom,
atom_to_binary, binary_to_atom, binary_to_existing_atom,
erl_crash_dump_bytes, min_max, erlang_halt, is_builtin,
error_stacktrace, error_stacktrace_during_call_trace].
@@ -339,6 +339,38 @@ check_stub({_,F,A}, B) ->
ct:fail(invalid_body)
end.
+list_to_utf8_atom(Config) when is_list(Config) ->
+ 'hello' = atom_roundtrip("hello"),
+ 'こんにちは' = atom_roundtrip("こんにちは"),
+
+ %% Test all edge cases.
+ _ = atom_roundtrip([16#80]),
+ _ = atom_roundtrip([16#7F]),
+ _ = atom_roundtrip([16#FF]),
+ _ = atom_roundtrip([16#100]),
+ _ = atom_roundtrip([16#7FF]),
+ _ = atom_roundtrip([16#800]),
+ _ = atom_roundtrip([16#D7FF]),
+ atom_badarg([16#D800]),
+ atom_badarg([16#DFFF]),
+ _ = atom_roundtrip([16#E000]),
+ _ = atom_roundtrip([16#FFFF]),
+ _ = atom_roundtrip([16#1000]),
+ _ = atom_roundtrip([16#10FFFF]),
+ atom_badarg([16#110000]),
+ ok.
+
+atom_roundtrip(String) ->
+ Atom = list_to_atom(String),
+ Atom = list_to_existing_atom(String),
+ String = atom_to_list(Atom),
+ Atom.
+
+atom_badarg(String) ->
+ {'EXIT',{badarg,_}} = (catch list_to_atom(String)),
+ {'EXIT',{badarg,_}} = (catch list_to_existing_atom(String)),
+ ok.
+
t_list_to_existing_atom(Config) when is_list(Config) ->
all = list_to_existing_atom("all"),
?MODULE = list_to_existing_atom(?MODULE_STRING),
@@ -429,6 +461,8 @@ binary_to_atom(Config) when is_list(Config) ->
Long = lists:seq(0, 254),
LongAtom = list_to_atom(Long),
LongBin = list_to_binary(Long),
+ UnicodeLongAtom = list_to_atom([$é || _ <- lists:seq(0, 254)]),
+ UnicodeLongBin = << <<"é"/utf8>> || _ <- lists:seq(0, 254)>>,
%% latin1
'' = test_binary_to_atom(<<>>, latin1),
@@ -440,12 +474,17 @@ binary_to_atom(Config) when is_list(Config) ->
'' = test_binary_to_atom(<<>>, utf8),
HalfLongAtom = test_binary_to_atom(HalfLongBin, utf8),
HalfLongAtom = test_binary_to_atom(HalfLongBin, unicode),
+ UnicodeLongAtom = test_binary_to_atom(UnicodeLongBin, utf8),
+ UnicodeLongAtom = test_binary_to_atom(UnicodeLongBin, unicode),
[] = [C || C <- lists:seq(128, 255),
begin
list_to_atom([C]) =/=
test_binary_to_atom(<<C/utf8>>, utf8)
end],
+ <<"こんにちは"/utf8>> =
+ atom_to_binary(test_binary_to_atom(<<"こんにちは"/utf8>>, utf8), utf8),
+
%% badarg failures.
fail_binary_to_atom(atom),
fail_binary_to_atom(42),
@@ -464,10 +503,6 @@ binary_to_atom(Config) when is_list(Config) ->
?BADARG(binary_to_atom(id(<<255>>), utf8)),
?BADARG(binary_to_atom(id(<<255,0>>), utf8)),
?BADARG(binary_to_atom(id(<<16#C0,16#80>>), utf8)), %Overlong 0.
- [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(256, 16#D7FF)],
- [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(16#E000, 16#FFFD)],
- [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(16#10000, 16#8FFFF)],
- [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(16#90000, 16#10FFFF)],
%% system_limit failures.
?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)),
diff --git a/erts/emulator/test/code_SUITE.erl b/erts/emulator/test/code_SUITE.erl
index b29520ab9f..d07166ed98 100644
--- a/erts/emulator/test/code_SUITE.erl
+++ b/erts/emulator/test/code_SUITE.erl
@@ -296,16 +296,16 @@ get_chunk(Config) when is_list(Config) ->
{ok,my_code_test,Code} = compile:file(File, [binary]),
%% Should work.
- Chunk = get_chunk_ok("Atom", Code),
- Chunk = get_chunk_ok("Atom", make_sub_binary(Code)),
- Chunk = get_chunk_ok("Atom", make_unaligned_sub_binary(Code)),
+ Chunk = get_chunk_ok("AtU8", Code),
+ Chunk = get_chunk_ok("AtU8", make_sub_binary(Code)),
+ Chunk = get_chunk_ok("AtU8", make_unaligned_sub_binary(Code)),
%% Should fail.
- {'EXIT',{badarg,_}} = (catch code:get_chunk(bit_sized_binary(Code), "Atom")),
+ {'EXIT',{badarg,_}} = (catch code:get_chunk(bit_sized_binary(Code), "AtU8")),
{'EXIT',{badarg,_}} = (catch code:get_chunk(Code, "bad chunk id")),
%% Invalid beam code or missing chunk should return 'undefined'.
- undefined = code:get_chunk(<<"not a beam module">>, "Atom"),
+ undefined = code:get_chunk(<<"not a beam module">>, "AtU8"),
undefined = code:get_chunk(Code, "XXXX"),
ok.