From 26b59dfe67ef551cd94765557cdd8c79794bcc38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Valim?= <jose.valim@plataformatec.com.br>
Date: Tue, 31 May 2016 14:28:54 +0200
Subject: Add new AtU8 beam chunk
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The new chunk stores atoms encoded in UTF-8.

beam_lib has also been modified to handle the new
'utf8_atoms' attribute while the 'atoms' attribute
may be a missing chunk from now on.

The binary_to_atom/2 BIF can now encode any utf8
binary with up to 255 characters.

The list_to_atom/1 BIF can now accept codepoints
higher than 255 with up to 255 characters (thanks
to Björn Gustavsson).
---
 erts/doc/src/erl_ext_dist.xml                   | 15 ++---
 erts/doc/src/erlang.xml                         | 35 ++++-------
 erts/emulator/beam/atom.c                       | 35 +++++++----
 erts/emulator/beam/atom.h                       |  3 +
 erts/emulator/beam/beam_load.c                  | 66 ++++++++++++++------
 erts/emulator/beam/bif.c                        | 14 ++---
 erts/emulator/beam/erl_unicode.c                | 83 ++++++++++---------------
 erts/emulator/beam/global.h                     |  1 +
 erts/emulator/beam/utils.c                      | 62 ++++++++++++++++++
 erts/emulator/test/bif_SUITE.erl                | 47 ++++++++++++--
 erts/emulator/test/code_SUITE.erl               | 10 +--
 lib/compiler/src/beam_asm.erl                   | 29 ++++++---
 lib/compiler/src/beam_dict.erl                  | 12 ++--
 lib/compiler/src/compile.erl                    | 21 +++++--
 lib/compiler/test/compile_SUITE.erl             | 31 +++++++--
 lib/compiler/test/compile_SUITE_data/simple.erl |  5 +-
 lib/compiler/test/lc_SUITE.erl                  |  2 +-
 lib/kernel/test/code_SUITE.erl                  |  2 +-
 lib/stdlib/src/beam_lib.erl                     | 54 ++++++++++------
 lib/stdlib/test/beam_lib_SUITE.erl              | 45 +++++++++-----
 lib/stdlib/test/erl_scan_SUITE.erl              | 15 +++--
 21 files changed, 387 insertions(+), 200 deletions(-)
diff --git a/erts/doc/src/erl_ext_dist.xml b/erts/doc/src/erl_ext_dist.xml
index 4f799f8f34..a436a9ca74 100644
--- a/erts/doc/src/erl_ext_dist.xml
+++ b/erts/doc/src/erl_ext_dist.xml
@@ -119,16 +119,11 @@
     <tcaption>Compressed Data Format when Expanded</tcaption></table>
     <marker id="utf8_atoms"/>
     <note>
-      <p>As from ERTS 5.10 (OTP R16) support
-        for UTF-8 encoded atoms has been introduced in the external format.
-        However, only characters that can be encoded using Latin-1 (ISO-8859-1)
-        are currently supported in atoms. The support for UTF-8 encoded atoms
-        in the external format has been implemented to be able to support
-        all Unicode characters in atoms in <em>some future release</em>.
-        Until full Unicode support for atoms has been introduced,
-        it is an <em>error</em> to pass atoms containing
-        characters that cannot be encoded in Latin-1, and <em>the behavior is
-        undefined</em>.</p>
+      <p>As from ERTS 9.0 (OTP 20), UTF-8 encoded atoms may contain any Unicode
+        character. Although support for UTF-8 encoded atoms in the external
+        format has been available since ERTS 5.10 (OTP R16), passing atoms that
+        cannot be encoded in Latin-1 to versions earlier than Erlang/OTP 20 is
+        an <em>error</em>, and <em>the behavior is undefined</em>.</p>
       <p>When distribution flag <seealso marker="erl_dist_protocol#dflags">
         <c>DFLAG_UTF8_ATOMS</c></seealso> has been exchanged between both nodes
         in the <seealso marker="erl_dist_protocol#distribution_handshake">
diff --git a/erts/doc/src/erlang.xml b/erts/doc/src/erlang.xml
index b3fab3874b..cf038c49f0 100644
--- a/erts/doc/src/erlang.xml
+++ b/erts/doc/src/erlang.xml
@@ -325,16 +325,11 @@ Z = erlang:adler32_combine(X,Y,iolist_size(Data2)).</code>
           is <c>latin1</c>, one byte exists for each character
           in the text representation. If <c><anno>Encoding</anno></c> is
           <c>utf8</c> or
-          <c>unicode</c>, the characters are encoded using UTF-8
-          (that is, characters from 128 through 255 are
-          encoded in two bytes).</p>
+          <c>unicode</c>, the characters are encoded using UTF-8 where
+          characters may require multiple bytes.</p>
         <note>
-          <p><c>atom_to_binary(<anno>Atom</anno>, latin1)</c> never
-            fails, as the text representation of an atom can only
-            contain characters from 0 through 255. In a future release,
-            the text representation
-            of atoms can be allowed to contain any Unicode character and
-            <c>atom_to_binary(<anno>Atom</anno>, latin1)</c> then fails if the
+          <p>As from Erlang/OTP 20, atoms can contain any Unicode character
+            and <c>atom_to_binary(<anno>Atom</anno>, latin1)</c> may fail if the
             text representation for <c><anno>Atom</anno></c> contains a Unicode
             character &gt; 255.</p>
         </note>
@@ -402,13 +397,11 @@ Z = erlang:adler32_combine(X,Y,iolist_size(Data2)).</code>
           translation of bytes in the binary is done.
           If <c><anno>Encoding</anno></c>
           is <c>utf8</c> or <c>unicode</c>, the binary must contain
-          valid UTF-8 sequences. Only Unicode characters up
-          to 255 are allowed.</p>
+          valid UTF-8 sequences.</p>
         <note>
-          <p><c>binary_to_atom(<anno>Binary</anno>, utf8)</c> fails if
-            the binary contains Unicode characters &gt; 255.
-            In a future release, such Unicode characters can be allowed and
-            <c>binary_to_atom(<anno>Binary</anno>, utf8)</c> does then not fail.
+          <p>As from Erlang/OTP 20, <c>binary_to_atom(<anno>Binary</anno>, utf8)</c>
+            is capable of encoding any Unicode character. Earlier versions would
+            fail if the binary contained Unicode characters &gt; 255.
             For more information about Unicode support in atoms, see the
             <seealso marker="erl_ext_dist#utf8_atoms">note on UTF-8
             encoded atoms</seealso>
@@ -419,9 +412,7 @@ Z = erlang:adler32_combine(X,Y,iolist_size(Data2)).</code>
 > <input>binary_to_atom(&lt;&lt;"Erlang"&gt;&gt;, latin1).</input>
 'Erlang'
 > <input>binary_to_atom(&lt;&lt;1024/utf8&gt;&gt;, utf8).</input>
-** exception error: bad argument
-     in function  binary_to_atom/2
-        called as binary_to_atom(&lt;&lt;208,128&gt;&gt;,utf8)</pre>
+'Ѐ'</pre>
       </desc>
     </func>
 
@@ -2401,10 +2392,10 @@ os_prompt%</pre>
       <desc>
         <p>Returns the atom whose text representation is
           <c><anno>String</anno></c>.</p>
-        <p><c><anno>String</anno></c> can only contain ISO-latin-1
-          characters (that is, numbers &lt; 256) as the implementation does not
-          allow Unicode characters equal to or above 256 in atoms.
-          For more information on Unicode support in atoms, see
+        <p>As from Erlang/OTP 20, <c><anno>String</anno></c> may contain
+          any Unicode character. Earlier versions allowed only ISO-latin-1
+          characters as the implementation did not allow Unicode characters
+          above 255. For more information on Unicode support in atoms, see
           <seealso marker="erl_ext_dist#utf8_atoms">note on UTF-8
           encoded atoms</seealso>
           in section "External Term Format" in the User's Guide.</p>
diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c
index 2b5ad097a0..2055c29190 100644
--- a/erts/emulator/beam/atom.c
+++ b/erts/emulator/beam/atom.c
@@ -233,10 +233,10 @@ need_convertion:
 }
 
 /*
- * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned!
+ * erts_atom_put_index() may fail. Returns a negative ATOM_*_ERROR code on failure.
  */
-Eterm
-erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
+int
+erts_atom_put_index(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
 {
     byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
     const byte *text = name;
@@ -253,7 +253,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
 	if (trunc)
 	    tlen = 0;
 	else
-	    return THE_NON_VALUE;
+	    return ATOM_MAX_CHARS_ERROR;
     }
 
     switch (enc) {
@@ -262,7 +262,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
 	    if (trunc)
 		tlen = MAX_ATOM_CHARACTERS;
 	    else
-		return THE_NON_VALUE;
+		return ATOM_MAX_CHARS_ERROR;
 	}
 #ifdef DEBUG
 	for (aix = 0; aix < len; aix++) {
@@ -276,7 +276,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
 	    if (trunc)
 		tlen = MAX_ATOM_CHARACTERS;
 	    else
-		return THE_NON_VALUE;
+		return ATOM_MAX_CHARS_ERROR;
 	}
 	no_latin1_chars = tlen;
 	latin1_to_utf8(utf8_copy, &text, &tlen);
@@ -284,7 +284,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
     case ERTS_ATOM_ENC_UTF8:
 	/* First sanity check; need to verify later */
 	if (tlen > MAX_ATOM_SZ_LIMIT && !trunc)
-	    return THE_NON_VALUE;
+	    return ATOM_MAX_CHARS_ERROR;
 	break;
     }
 
@@ -295,7 +295,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
     atom_read_unlock();
     if (aix >= 0) {
 	/* Already in table no need to verify it */
-	return make_atom(aix);
+	return aix;
     }
 
     if (enc == ERTS_ATOM_ENC_UTF8) {
@@ -314,13 +314,13 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
 	case ERTS_UTF8_OK_MAX_CHARS:
 	    /* Truncated... */
 	    if (!trunc)
-		return THE_NON_VALUE;
+		return ATOM_MAX_CHARS_ERROR;
 	    ASSERT(no_chars == MAX_ATOM_CHARACTERS);
 	    tlen = err_pos - text;
 	    break;
 	default:
 	    /* Bad utf8... */
-	    return THE_NON_VALUE;
+	    return ATOM_BAD_ENCODING_ERROR;
 	}
     }
 
@@ -333,7 +333,20 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
     atom_write_lock();
     aix = index_put(&erts_atom_table, (void*) &a);
     atom_write_unlock();
-    return make_atom(aix);
+    return aix;
+}
+
+/*
+ * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned!
+ */
+Eterm
+erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
+{
+    int aix = erts_atom_put_index(name, len, enc, trunc);
+    if (aix >= 0)
+	return make_atom(aix);
+    else
+	return THE_NON_VALUE;
 }
 
 Eterm
diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h
index abd3b44993..be998a46bd 100644
--- a/erts/emulator/beam/atom.h
+++ b/erts/emulator/beam/atom.h
@@ -29,6 +29,8 @@
 #define MAX_ATOM_SZ_LIMIT (4*MAX_ATOM_CHARACTERS) /* theoretical byte limit */
 #define ATOM_LIMIT (1024*1024)
 #define MIN_ATOM_TABLE_SIZE 8192
+#define ATOM_BAD_ENCODING_ERROR -1
+#define ATOM_MAX_CHARS_ERROR -2
 
 #ifndef ARCH_32
 /* Internal atom cache needs MAX_ATOM_TABLE_SIZE to be less than an
@@ -133,6 +135,7 @@ int atom_table_sz(void);	/* table size in bytes, excluding stored objects */
 
 Eterm am_atom_put(const char*, int); /* ONLY 7-bit ascii! */
 Eterm erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc);
+int erts_atom_put_index(const byte *name, int len, ErtsAtomEncoding enc, int trunc);
 void init_atom_table(void);
 void atom_info(fmtfn_t, void *);
 void dump_atoms(fmtfn_t, void *);
diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c
index 8f1faa6719..1899ffb079 100644
--- a/erts/emulator/beam/beam_load.c
+++ b/erts/emulator/beam/beam_load.c
@@ -157,13 +157,15 @@ typedef struct {
 #define STR_CHUNK 2
 #define IMP_CHUNK 3
 #define EXP_CHUNK 4
-#define NUM_MANDATORY 5
+#define MIN_MANDATORY 1
+#define MAX_MANDATORY 5
 
 #define LAMBDA_CHUNK 5
 #define LITERAL_CHUNK 6
 #define ATTR_CHUNK 7
 #define COMPILE_CHUNK 8
 #define LINE_CHUNK 9
+#define UTF8_ATOM_CHUNK 10
 
 #define NUM_CHUNK_TYPES (sizeof(chunk_types)/sizeof(chunk_types[0]))
 
@@ -173,9 +175,13 @@ typedef struct {
 
 static Uint chunk_types[] = {
     /*
-     * Mandatory chunk types -- these MUST be present.
+     * Atom chunk types -- Atom or AtU8 MUST be present.
      */
     MakeIffId('A', 't', 'o', 'm'), /* 0 */
+
+    /*
+     * Mandatory chunk types -- these MUST be present.
+     */
     MakeIffId('C', 'o', 'd', 'e'), /* 1 */
     MakeIffId('S', 't', 'r', 'T'), /* 2 */
     MakeIffId('I', 'm', 'p', 'T'), /* 3 */
@@ -189,6 +195,7 @@ static Uint chunk_types[] = {
     MakeIffId('A', 't', 't', 'r'), /* 7 */
     MakeIffId('C', 'I', 'n', 'f'), /* 8 */
     MakeIffId('L', 'i', 'n', 'e'), /* 9 */
+    MakeIffId('A', 't', 'U', '8'), /* 10 */
 };
 
 /*
@@ -490,9 +497,9 @@ static Eterm stub_insert_new_code(Process *c_p, ErtsProcLocks c_p_locks,
 #endif
 static int init_iff_file(LoaderState* stp, byte* code, Uint size);
 static int scan_iff_file(LoaderState* stp, Uint* chunk_types,
-			 Uint num_types, Uint num_mandatory);
+			 Uint num_types);
 static int verify_chunks(LoaderState* stp);
-static int load_atom_table(LoaderState* stp);
+static int load_atom_table(LoaderState* stp, ErtsAtomEncoding enc);
 static int load_import_table(LoaderState* stp);
 static int read_export_table(LoaderState* stp);
 static int is_bif(Eterm mod, Eterm func, unsigned arity);
@@ -629,7 +636,7 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader,
     CHKALLOC();
     CHKBLK(ERTS_ALC_T_CODE,stp->code);
     if (!init_iff_file(stp, code, unloaded_size) ||
-	!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
+	!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
 	!verify_chunks(stp)) {
 	goto load_error;
     }
@@ -674,9 +681,16 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader,
      */
 
     CHKBLK(ERTS_ALC_T_CODE,stp->code);
-    define_file(stp, "atom table", ATOM_CHUNK);
-    if (!load_atom_table(stp)) {
-	goto load_error;
+    if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) {
+        define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK);
+        if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) {
+            goto load_error;
+        }
+    } else {
+        define_file(stp, "atom table", ATOM_CHUNK);
+        if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) {
+            goto load_error;
+        }
     }
 
     /*
@@ -1212,7 +1226,7 @@ init_iff_file(LoaderState* stp, byte* code, Uint size)
  * Scan the IFF file. The header should have been verified by init_iff_file().
  */
 static int
-scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types, Uint num_mandatory)
+scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types)
 {
     Uint count;
     Uint id;
@@ -1291,7 +1305,16 @@ verify_chunks(LoaderState* stp)
     MD5_CTX context;
 
     MD5Init(&context);
-    for (i = 0; i < NUM_MANDATORY; i++) {
+
+    if (stp->chunks[UTF8_ATOM_CHUNK].start != NULL) {
+	MD5Update(&context, stp->chunks[UTF8_ATOM_CHUNK].start, stp->chunks[UTF8_ATOM_CHUNK].size);
+    } else if (stp->chunks[ATOM_CHUNK].start != NULL) {
+	MD5Update(&context, stp->chunks[ATOM_CHUNK].start, stp->chunks[ATOM_CHUNK].size);
+    } else {
+        LoadError0(stp, "mandatory chunk of type 'Atom' or 'AtU8' not found\n");
+    }
+
+    for (i = MIN_MANDATORY; i < MAX_MANDATORY; i++) {
 	if (stp->chunks[i].start != NULL) {
 	    MD5Update(&context, stp->chunks[i].start, stp->chunks[i].size);
 	} else {
@@ -1352,7 +1375,7 @@ verify_chunks(LoaderState* stp)
 }
 
 static int
-load_atom_table(LoaderState* stp)
+load_atom_table(LoaderState* stp, ErtsAtomEncoding enc)
 {
     unsigned int i;
 
@@ -1371,7 +1394,7 @@ load_atom_table(LoaderState* stp)
 
 	GetByte(stp, n);
 	GetString(stp, atom, n);
-	stp->atom[i] = erts_atom_put(atom, n, ERTS_ATOM_ENC_LATIN1, 1);
+	stp->atom[i] = erts_atom_put(atom, n, enc, 1);
     }
 
     /*
@@ -5937,7 +5960,7 @@ code_get_chunk_2(BIF_ALIST_2)
 	goto error;
     }
     if (!init_iff_file(stp, start, binary_size(Bin)) ||
-	!scan_iff_file(stp, &chunk, 1, 1) ||
+	!scan_iff_file(stp, &chunk, 1) ||
 	stp->chunks[0].start == NULL) {
 	res = am_undefined;
 	goto done;
@@ -5986,7 +6009,7 @@ code_module_md5_1(BIF_ALIST_1)
     }
     stp->module = THE_NON_VALUE; /* Suppress diagnostiscs */
     if (!init_iff_file(stp, bytes, binary_size(Bin)) ||
-	!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
+	!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
 	!verify_chunks(stp)) {
 	res = am_undefined;
 	goto done;
@@ -6335,7 +6358,7 @@ erts_make_stub_module(Process* p, Eterm hipe_magic_bin, Eterm Beam, Eterm Info)
     if (!init_iff_file(stp, bytes, size)) {
 	goto error;
     }
-    if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) ||
+    if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) ||
 	!verify_chunks(stp)) {
 	goto error;
     }
@@ -6343,9 +6366,16 @@ erts_make_stub_module(Process* p, Eterm hipe_magic_bin, Eterm Beam, Eterm Info)
     if (!read_code_header(stp)) {
 	goto error;
     }
-    define_file(stp, "atom table", ATOM_CHUNK);
-    if (!load_atom_table(stp)) {
-	goto error;
+    if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) {
+        define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK);
+        if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) {
+            goto error;
+        }
+    } else {
+        define_file(stp, "atom table", ATOM_CHUNK);
+        if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) {
+            goto error;
+        }
     }
     define_file(stp, "export table", EXP_CHUNK);
     if (!stub_read_export_table(stp)) {
diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c
index d886c2985e..95bf13c07c 100644
--- a/erts/emulator/beam/bif.c
+++ b/erts/emulator/beam/bif.c
@@ -3022,8 +3022,8 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1)
 BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)
 {
     Eterm res;
-    char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS);
-    Sint i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS);
+    byte *buf = (byte *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_SZ_LIMIT);
+    Sint i = erts_unicode_list_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS);
 
     if (i < 0) {
 	erts_free(ERTS_ALC_T_TMP, (void *) buf);
@@ -3033,7 +3033,7 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)
 	}
 	BIF_ERROR(BIF_P, BADARG);
     }
-    res = erts_atom_put((byte *) buf, i, ERTS_ATOM_ENC_LATIN1, 1);
+    res = erts_atom_put(buf, i, ERTS_ATOM_ENC_UTF8, 1);
     ASSERT(is_atom(res));
     erts_free(ERTS_ALC_T_TMP, (void *) buf);
     BIF_RET(res);
@@ -3043,17 +3043,17 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)
  
 BIF_RETTYPE list_to_existing_atom_1(BIF_ALIST_1)
 {
-    Sint i;
-    char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS);
+    byte *buf = (byte *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_SZ_LIMIT);
+    Sint i = erts_unicode_list_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS);
 
-    if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS)) < 0) {
+    if (i < 0) {
     error:
 	erts_free(ERTS_ALC_T_TMP, (void *) buf);
 	BIF_ERROR(BIF_P, BADARG);
     } else {
 	Eterm a;
 	
-	if (erts_atom_get(buf, i, &a, ERTS_ATOM_ENC_LATIN1)) {
+	if (erts_atom_get((char *) buf, i, &a, ERTS_ATOM_ENC_UTF8)) {
 	    erts_free(ERTS_ALC_T_TMP, (void *) buf);
 	    BIF_RET(a);
 	} else {
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
index bd5e1482fb..8919898181 100644
--- a/erts/emulator/beam/erl_unicode.c
+++ b/erts/emulator/beam/erl_unicode.c
@@ -1890,74 +1890,57 @@ binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist)
     byte* bytes;
     byte *temp_alloc = NULL;
     Uint bin_size;
+    Eterm a;
 
     if ((bytes = erts_get_aligned_binary_bytes(bin, &temp_alloc)) == 0) {
 	BIF_ERROR(proc, BADARG);
     }
     bin_size = binary_size(bin);
+
     if (enc == am_latin1) {
-	Eterm a;
-	if (bin_size > MAX_ATOM_CHARACTERS) {
-	system_limit:
-	    erts_free_aligned_binary_bytes(temp_alloc);
-	    BIF_ERROR(proc, SYSTEM_LIMIT);
-	}
 	if (!must_exist) {
-	    a = erts_atom_put((byte *) bytes,
-			      bin_size,
-			      ERTS_ATOM_ENC_LATIN1,
-			      0);
-    	    erts_free_aligned_binary_bytes(temp_alloc);
-	    if (is_non_value(a))
-		goto badarg;
-	    BIF_RET(a);
-	} else if (erts_atom_get((char *)bytes, bin_size, &a, ERTS_ATOM_ENC_LATIN1)) {
-	    erts_free_aligned_binary_bytes(temp_alloc);
-	    BIF_RET(a);
-	} else {
+	    int lix = erts_atom_put_index((byte *) bytes,
+					  bin_size,
+					  ERTS_ATOM_ENC_LATIN1,
+					  0);
+	    if (lix == ATOM_BAD_ENCODING_ERROR) {
+	    badarg:
+		erts_free_aligned_binary_bytes(temp_alloc);
+		BIF_ERROR(proc, BADARG);
+	    } else if (lix == ATOM_MAX_CHARS_ERROR) {
+	    system_limit:
+		erts_free_aligned_binary_bytes(temp_alloc);
+		BIF_ERROR(proc, SYSTEM_LIMIT);
+	    }
+
+	    a = make_atom(lix);
+	} else if (!erts_atom_get((char *)bytes, bin_size, &a, ERTS_ATOM_ENC_LATIN1)) {
 	    goto badarg;
 	}
-    } else if (enc == am_utf8 || enc == am_unicode) {
-	Eterm res;
-	Uint num_chars = 0;
-	const byte* p = bytes;
-	Uint left = bin_size;
 
-	while (left) {
-	    if (++num_chars > MAX_ATOM_CHARACTERS) {
+    } else if (enc == am_utf8 || enc == am_unicode) {
+	if (!must_exist) {
+	    int uix = erts_atom_put_index((byte *) bytes,
+					  bin_size,
+					  ERTS_ATOM_ENC_UTF8,
+					  0);
+	    if (uix == ATOM_BAD_ENCODING_ERROR) {
+		goto badarg;
+	    } else if (uix == ATOM_MAX_CHARS_ERROR) {
 		goto system_limit;
 	    }
-	    if ((p[0] & 0x80) == 0) {
-		++p;
-		--left;
-	    }
-	    else if (left >= 2
-		     && (p[0] & 0xFE) == 0xC2 /* only allow latin1 subset */
-		     && (p[1] & 0xC0) == 0x80) {
-		p += 2;
-		left -= 2;
-	    }
-	    else goto badarg;
-	}
 
-	if (!must_exist) {
-	    res = erts_atom_put((byte *) bytes,
-				bin_size,
-				ERTS_ATOM_ENC_UTF8,
-				0);
+	    a = make_atom(uix);
 	}
-	else if (!erts_atom_get((char*)bytes, bin_size, &res, ERTS_ATOM_ENC_UTF8)) {
+	else if (!erts_atom_get((char*)bytes, bin_size, &a, ERTS_ATOM_ENC_UTF8)) {
 	    goto badarg;
 	}
-	erts_free_aligned_binary_bytes(temp_alloc);
-	if (is_non_value(res))
-	    goto badarg;
-	BIF_RET(res);
     } else {
-    badarg:
-	erts_free_aligned_binary_bytes(temp_alloc);
-	BIF_ERROR(proc, BADARG);
+	goto badarg;
     }
+
+    erts_free_aligned_binary_bytes(temp_alloc);
+    BIF_RET(a);
 }
 
 BIF_RETTYPE binary_to_atom_2(BIF_ALIST_2)
diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h
index 2b2f3c5cdc..9f2b43d216 100644
--- a/erts/emulator/beam/global.h
+++ b/erts/emulator/beam/global.h
@@ -1373,6 +1373,7 @@ int erts_utf8_to_latin1(byte* dest, const byte* source, int slen);
 
 void bin_write(fmtfn_t, void*, byte*, size_t);
 Sint intlist_to_buf(Eterm, char*, Sint); /* most callers pass plain char*'s */
+Sint erts_unicode_list_to_buf(Eterm list, byte *buf, Sint len);
 
 struct Sint_buf {
 #if defined(ARCH_64)
diff --git a/erts/emulator/beam/utils.c b/erts/emulator/beam/utils.c
index ec502d5a78..36b818505c 100644
--- a/erts/emulator/beam/utils.c
+++ b/erts/emulator/beam/utils.c
@@ -3923,6 +3923,68 @@ intlist_to_buf(Eterm list, char *buf, Sint len)
     return -2;			/* not enough space */
 }
 
+/* Encode a list of Unicode code points as UTF-8 into buf, which must have
+ * room for 4*len bytes. Return the number of bytes written,
+ * or -1 for a bad list or code point (negative, surrogate, > 0x10FFFF),
+ * or -2 for more than len code points (buffer contains truncated result).
+ */
+Sint
+erts_unicode_list_to_buf(Eterm list, byte *buf, Sint len)
+{
+    Eterm* listptr;
+    Sint sz = 0;
+
+    if (is_nil(list)) {
+	return 0;
+    }
+    if (is_not_list(list)) {
+	return -1;
+    }
+    listptr = list_val(list);
+
+    while (len-- > 0) {
+	Sint val;
+
+	if (is_not_small(CAR(listptr)) || signed_val(CAR(listptr)) < 0) {
+	    return -1;		/* not a small integer, or a negative one */
+	}
+	val = signed_val(CAR(listptr));
+	if (val < 0x80) {
+	    buf[sz] = val;
+	    sz++;
+	} else if (val < 0x800) {
+	    buf[sz+0] = 0xC0 | (val >> 6);
+	    buf[sz+1] = 0x80 | (val & 0x3F);
+	    sz += 2;
+	} else if (val < 0x10000UL) {
+	    if (0xD800 <= val && val <= 0xDFFF) {
+		return -1;	/* UTF-16 surrogates are not valid code points */
+	    }
+	    buf[sz+0] = 0xE0 | (val >> 12);
+	    buf[sz+1] = 0x80 | ((val >> 6) & 0x3F);
+	    buf[sz+2] = 0x80 | (val & 0x3F);
+	    sz += 3;
+	} else if (val < 0x110000) {
+	    buf[sz+0] = 0xF0 | (val >> 18);
+	    buf[sz+1] = 0x80 | ((val >> 12) & 0x3F);
+	    buf[sz+2] = 0x80 | ((val >> 6) & 0x3F);
+	    buf[sz+3] = 0x80 | (val & 0x3F);
+	    sz += 4;
+	} else {
+	    return -1;
+	}
+	list = CDR(listptr);
+	if (is_nil(list)) {
+	    return sz;
+	}
+	if (is_not_list(list)) {
+	    return -1;
+	}
+	listptr = list_val(list);
+    }
+    return -2;			/* not enough space */
+}
+
 /*
 ** Convert an integer to a byte list
 ** return pointer to converted stuff (need not to be at start of buf!)
diff --git a/erts/emulator/test/bif_SUITE.erl b/erts/emulator/test/bif_SUITE.erl
index f70fb0e501..339c827602 100644
--- a/erts/emulator/test/bif_SUITE.erl
+++ b/erts/emulator/test/bif_SUITE.erl
@@ -26,7 +26,7 @@
 -export([all/0, suite/0,
 	 display/1, display_huge/0,
 	 erl_bif_types/1,guard_bifs_in_erl_bif_types/1,
-	 shadow_comments/1,
+	 shadow_comments/1,list_to_utf8_atom/1,
 	 specs/1,improper_bif_stubs/1,auto_imports/1,
 	 t_list_to_existing_atom/1,os_env/1,otp_7526/1,
 	 binary_to_atom/1,binary_to_existing_atom/1,
@@ -43,7 +43,7 @@ all() ->
     [erl_bif_types, guard_bifs_in_erl_bif_types, shadow_comments,
      specs, improper_bif_stubs, auto_imports,
      t_list_to_existing_atom, os_env, otp_7526,
-     display,
+     display, list_to_utf8_atom,
      atom_to_binary, binary_to_atom, binary_to_existing_atom,
      erl_crash_dump_bytes, min_max, erlang_halt, is_builtin,
      error_stacktrace, error_stacktrace_during_call_trace].
@@ -339,6 +339,38 @@ check_stub({_,F,A}, B) ->
 	    ct:fail(invalid_body)
     end.
 
+list_to_utf8_atom(Config) when is_list(Config) ->
+    'hello' = atom_roundtrip("hello"),
+    'こんにちは' = atom_roundtrip("こんにちは"),
+
+    %% Boundaries of the 1-, 2-, 3- and 4-byte UTF-8 ranges and the surrogates.
+    _ = atom_roundtrip([16#80]),
+    _ = atom_roundtrip([16#7F]),
+    _ = atom_roundtrip([16#FF]),
+    _ = atom_roundtrip([16#100]),
+    _ = atom_roundtrip([16#7FF]),
+    _ = atom_roundtrip([16#800]),
+    _ = atom_roundtrip([16#D7FF]),
+    atom_badarg([16#D800]),
+    atom_badarg([16#DFFF]),
+    _ = atom_roundtrip([16#E000]),
+    _ = atom_roundtrip([16#FFFF]),
+    _ = atom_roundtrip([16#10000]),
+    _ = atom_roundtrip([16#10FFFF]),
+    atom_badarg([16#110000]),
+    ok.
+
+atom_roundtrip(String) ->
+    Atom = list_to_atom(String),
+    Atom = list_to_existing_atom(String),
+    String = atom_to_list(Atom),
+    Atom.
+
+atom_badarg(String) ->
+    {'EXIT',{badarg,_}} = (catch list_to_atom(String)),
+    {'EXIT',{badarg,_}} = (catch list_to_existing_atom(String)),
+    ok.
+
 t_list_to_existing_atom(Config) when is_list(Config) ->
     all = list_to_existing_atom("all"),
     ?MODULE = list_to_existing_atom(?MODULE_STRING),
@@ -429,6 +461,8 @@ binary_to_atom(Config) when is_list(Config) ->
     Long = lists:seq(0, 254),
     LongAtom = list_to_atom(Long),
     LongBin = list_to_binary(Long),
+    UnicodeLongAtom = list_to_atom([$é || _ <- lists:seq(0, 254)]),
+    UnicodeLongBin = << <<"é"/utf8>> || _ <- lists:seq(0, 254)>>,
 
     %% latin1
     '' = test_binary_to_atom(<<>>, latin1),
@@ -440,12 +474,17 @@ binary_to_atom(Config) when is_list(Config) ->
     '' = test_binary_to_atom(<<>>, utf8),
     HalfLongAtom = test_binary_to_atom(HalfLongBin, utf8),
     HalfLongAtom = test_binary_to_atom(HalfLongBin, unicode),
+    UnicodeLongAtom = test_binary_to_atom(UnicodeLongBin, utf8),
+    UnicodeLongAtom = test_binary_to_atom(UnicodeLongBin, unicode),
     [] = [C || C <- lists:seq(128, 255),
 		     begin
 			 list_to_atom([C]) =/=
 			     test_binary_to_atom(<<C/utf8>>, utf8)
 		     end],
 
+    <<"こんにちは"/utf8>> =
+	atom_to_binary(test_binary_to_atom(<<"こんにちは"/utf8>>, utf8), utf8),
+
     %% badarg failures.
     fail_binary_to_atom(atom),
     fail_binary_to_atom(42),
@@ -464,10 +503,6 @@ binary_to_atom(Config) when is_list(Config) ->
     ?BADARG(binary_to_atom(id(<<255>>), utf8)),
     ?BADARG(binary_to_atom(id(<<255,0>>), utf8)),
     ?BADARG(binary_to_atom(id(<<16#C0,16#80>>), utf8)), %Overlong 0.
-    [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(256, 16#D7FF)],
-    [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(16#E000, 16#FFFD)],
-    [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(16#10000, 16#8FFFF)],
-    [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(16#90000, 16#10FFFF)],
 
     %% system_limit failures.
     ?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)),
diff --git a/erts/emulator/test/code_SUITE.erl b/erts/emulator/test/code_SUITE.erl
index b29520ab9f..d07166ed98 100644
--- a/erts/emulator/test/code_SUITE.erl
+++ b/erts/emulator/test/code_SUITE.erl
@@ -296,16 +296,16 @@ get_chunk(Config) when is_list(Config) ->
     {ok,my_code_test,Code} = compile:file(File, [binary]),
 
     %% Should work.
-    Chunk = get_chunk_ok("Atom", Code),
-    Chunk = get_chunk_ok("Atom", make_sub_binary(Code)),
-    Chunk = get_chunk_ok("Atom", make_unaligned_sub_binary(Code)),
+    Chunk = get_chunk_ok("AtU8", Code),
+    Chunk = get_chunk_ok("AtU8", make_sub_binary(Code)),
+    Chunk = get_chunk_ok("AtU8", make_unaligned_sub_binary(Code)),
 
     %% Should fail.
-    {'EXIT',{badarg,_}} = (catch code:get_chunk(bit_sized_binary(Code), "Atom")),
+    {'EXIT',{badarg,_}} = (catch code:get_chunk(bit_sized_binary(Code), "AtU8")),
     {'EXIT',{badarg,_}} = (catch code:get_chunk(Code, "bad chunk id")),
 
     %% Invalid beam code or missing chunk should return 'undefined'.
-    undefined = code:get_chunk(<<"not a beam module">>, "Atom"),
+    undefined = code:get_chunk(<<"not a beam module">>, "AtU8"),
     undefined = code:get_chunk(Code, "XXXX"),
 
     ok.
diff --git a/lib/compiler/src/beam_asm.erl b/lib/compiler/src/beam_asm.erl
index a2f5dc674c..2fc2850591 100644
--- a/lib/compiler/src/beam_asm.erl
+++ b/lib/compiler/src/beam_asm.erl
@@ -21,7 +21,7 @@
 
 -module(beam_asm).
 
--export([module/4]).
+-export([module/5]).
 -export([encode/2]).
 
 -export_type([fail/0,label/0,reg/0,src/0,module_code/0,function_name/0]).
@@ -57,20 +57,20 @@
 -type module_code() ::
         {module(),[_],[_],[asm_function()],pos_integer()}.
 
--spec module(module_code(), exports(), [_], [compile:option()]) ->
+-spec module(module_code(), exports(), [_], [compile:option()], [compile:option()]) ->
                     {'ok',binary()}.
 
-module(Code, Abst, SourceFile, Opts) ->
-    {ok,assemble(Code, Abst, SourceFile, Opts)}.
+module(Code, Abst, SourceFile, Opts, CompilerOpts) ->
+    {ok,assemble(Code, Abst, SourceFile, Opts, CompilerOpts)}.
 
-assemble({Mod,Exp0,Attr0,Asm0,NumLabels}, Abst, SourceFile, Opts) ->
+assemble({Mod,Exp0,Attr0,Asm0,NumLabels}, Abst, SourceFile, Opts, CompilerOpts) ->
     {1,Dict0} = beam_dict:atom(Mod, beam_dict:new()),
     {0,Dict1} = beam_dict:fname(atom_to_list(Mod) ++ ".erl", Dict0),
     NumFuncs = length(Asm0),
     {Asm,Attr} = on_load(Asm0, Attr0),
     Exp = cerl_sets:from_list(Exp0),
     {Code,Dict2} = assemble_1(Asm, Exp, Dict1, []),
-    build_file(Code, Attr, Dict2, NumLabels, NumFuncs, Abst, SourceFile, Opts).
+    build_file(Code, Attr, Dict2, NumLabels, NumFuncs, Abst, SourceFile, Opts, CompilerOpts).
 
 on_load(Fs0, Attr0) ->
     case proplists:get_value(on_load, Attr0) of
@@ -113,7 +113,7 @@ assemble_function([H|T], Acc, Dict0) ->
 assemble_function([], Code, Dict) ->
     {Code, Dict}.
 
-build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts) ->
+build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts, CompilerOpts) ->
     %% Create the code chunk.
 
     CodeChunk = chunk(<<"Code">>,
@@ -125,9 +125,9 @@ build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts) ->
 		      Code),
 
     %% Create the atom table chunk.
-
-    {NumAtoms, AtomTab} = beam_dict:atom_table(Dict),
-    AtomChunk = chunk(<<"Atom">>, <<NumAtoms:32>>, AtomTab),
+    AtomEncoding = atom_encoding(CompilerOpts),
+    {NumAtoms, AtomTab} = beam_dict:atom_table(Dict, AtomEncoding),
+    AtomChunk = chunk(atom_chunk_name(AtomEncoding), <<NumAtoms:32>>, AtomTab),
 
     %% Create the import table chunk.
 
@@ -203,6 +203,15 @@ build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts) ->
 	     end,
     build_form(<<"BEAM">>, Chunks).
 
+%% Atom table encoding: utf8 unless 'no_utf8_atoms' is set in the options.
+atom_encoding(Opts) ->
+    UseLatin1 = proplists:get_bool(no_utf8_atoms, Opts),
+    if UseLatin1 -> latin1; true -> utf8 end.
+
+%% Name of the chunk that carries the atom table in the given encoding.
+atom_chunk_name(latin1) -> <<"Atom">>;
+atom_chunk_name(utf8) -> <<"AtU8">>.
+
 %% finalize_fun_table(Essentials, MD5) -> FinalizedEssentials
 %%  Update the 'old_uniq' field in the entry for each fun in the
 %%  'FunT' chunk. We'll use part of the MD5 for the module as a
diff --git a/lib/compiler/src/beam_dict.erl b/lib/compiler/src/beam_dict.erl
index 719d799fd7..990e86062a 100644
--- a/lib/compiler/src/beam_dict.erl
+++ b/lib/compiler/src/beam_dict.erl
@@ -24,7 +24,7 @@
 -export([new/0,opcode/2,highest_opcode/1,
 	 atom/2,local/4,export/4,import/4,
 	 string/2,lambda/3,literal/2,line/2,fname/2,
-	 atom_table/1,local_table/1,export_table/1,import_table/1,
+	 atom_table/2,local_table/1,export_table/1,import_table/1,
 	 string_table/1,lambda_table/1,literal_table/1,
 	 line_table/1]).
 
@@ -197,15 +197,15 @@ fname(Name, #asm{fnames=Fnames}=Dict) ->
     end.
 
 %% Returns the atom table.
-%%    atom_table(Dict) -> {LastIndex,[Length,AtomString...]}
--spec atom_table(bdict()) -> {non_neg_integer(), [[non_neg_integer(),...]]}.
+%%    atom_table(Dict, Encoding) -> {NumAtoms,[[ByteSize,AtomBinary]...]}
+-spec atom_table(bdict(), latin1 | utf8) -> {non_neg_integer(), [[non_neg_integer() | binary(),...]]}.
 
-atom_table(#asm{atoms=Atoms}) ->
+atom_table(#asm{atoms=Atoms}, Encoding) ->
     NumAtoms = maps:size(Atoms),
     Sorted = lists:keysort(2, maps:to_list(Atoms)),
     {NumAtoms,[begin
-                   L = atom_to_list(A),
-                   [length(L)|L]
+                   L = atom_to_binary(A, Encoding),
+                   [byte_size(L),L]
                end || {A,_} <- Sorted]}.
 
 %% Returns the table of local functions.
diff --git a/lib/compiler/src/compile.erl b/lib/compiler/src/compile.erl
index 069add7890..dcd962df66 100644
--- a/lib/compiler/src/compile.erl
+++ b/lib/compiler/src/compile.erl
@@ -214,11 +214,21 @@ expand_opt(report, Os) ->
 expand_opt(return, Os) ->
     [return_errors,return_warnings|Os];
 expand_opt(r12, Os) ->
-    [no_recv_opt,no_line_info|Os];
+    [no_recv_opt,no_line_info,no_utf8_atoms|Os];
 expand_opt(r13, Os) ->
-    [no_recv_opt,no_line_info|Os];
+    [no_recv_opt,no_line_info,no_utf8_atoms|Os];
 expand_opt(r14, Os) ->
-    [no_line_info|Os];
+    [no_line_info,no_utf8_atoms|Os];
+expand_opt(r15, Os) ->
+    [no_utf8_atoms|Os];
+expand_opt(r16, Os) ->
+    [no_utf8_atoms|Os];
+expand_opt(r17, Os) ->
+    [no_utf8_atoms|Os];
+expand_opt(r18, Os) ->
+    [no_utf8_atoms|Os];
+expand_opt(r19, Os) ->
+    [no_utf8_atoms|Os];
 expand_opt({debug_info_key,_}=O, Os) ->
     [encrypt_debug_info,O|Os];
 expand_opt(no_float_opt, Os) ->
@@ -1376,13 +1386,14 @@ encrypt({des3_cbc=Type,Key,IVec,BlockSize}, Bin0) ->
 save_core_code(Code, St) ->
     {ok,Code,St#compile{core_code=cerl:from_records(Code)}}.
 
-beam_asm(Code0, #compile{ifile=File,abstract_code=Abst,mod_options=Opts0}=St) ->
+beam_asm(Code0, #compile{ifile=File,abstract_code=Abst,
+			 options=CompilerOpts,mod_options=Opts0}=St) ->
     Source = paranoid_absname(File),
     Opts1 = lists:map(fun({debug_info_key,_}) -> {debug_info_key,'********'};
 			 (Other) -> Other
 		      end, Opts0),
     Opts2 = [O || O <- Opts1, effects_code_generation(O)],
-    case beam_asm:module(Code0, Abst, Source, Opts2) of
+    case beam_asm:module(Code0, Abst, Source, Opts2, CompilerOpts) of
 	{ok,Code} -> {ok,Code,St#compile{abstract_code=[]}}
     end.
 
diff --git a/lib/compiler/test/compile_SUITE.erl b/lib/compiler/test/compile_SUITE.erl
index 8c09414a52..8d7facd727 100644
--- a/lib/compiler/test/compile_SUITE.erl
+++ b/lib/compiler/test/compile_SUITE.erl
@@ -30,7 +30,7 @@
 	 file_1/1, forms_2/1, module_mismatch/1, big_file/1, outdir/1,
 	 binary/1, makedep/1, cond_and_ifdef/1, listings/1, listings_big/1,
 	 other_output/1, kernel_listing/1, encrypted_abstr/1,
-	 strict_record/1,
+	 strict_record/1, utf8_atoms/1,
 	 cover/1, env/1, core/1,
 	 core_roundtrip/1, asm/1, optimized_guards/1,
 	 sys_pre_attributes/1, dialyzer/1,
@@ -48,7 +48,7 @@ all() ->
     [app_test, appup_test, file_1, forms_2, module_mismatch, big_file, outdir,
      binary, makedep, cond_and_ifdef, listings, listings_big,
      other_output, kernel_listing, encrypted_abstr,
-     strict_record,
+     strict_record, utf8_atoms,
      cover, env, core, core_roundtrip, asm, optimized_guards,
      sys_pre_attributes, dialyzer, warnings, pre_load_check,
      env_compiler_options].
@@ -450,8 +450,10 @@ do_kernel_listing({M,A}) ->
     try
 	{ok,M,Kern} = compile:forms(A, [to_kernel]),
 	IoList = v3_kernel_pp:format(Kern),
-	_ = iolist_size(IoList),
-	ok
+	case unicode:characters_to_binary(IoList) of
+	    Bin when is_binary(Bin) ->
+		ok
+	end
     catch
 	throw:{error,Error} ->
 	    io:format("*** compilation failure '~p' for module ~s\n",
@@ -680,6 +682,23 @@ test_sloppy() ->
     {1,2} = record_access:test(Turtle),
     Turtle.
 
+utf8_atoms(Config) when is_list(Config) ->
+    Anno = erl_anno:new(1),
+    Atom = binary_to_atom(<<"こんにちは"/utf8>>, utf8),
+    Forms = [{attribute,Anno,compile,[export_all]},
+	     {function,Anno,atom,0,[{clause,Anno,[],[],[{atom,Anno,Atom}]}]}],
+    %% Default options: the module must compile, load and return the atom.
+    Utf8AtomForms = [{attribute,Anno,module,utf8_atom}|Forms],
+    {ok,utf8_atom,Utf8AtomBin} =
+	compile:forms(Utf8AtomForms, [binary]),
+    {ok,{utf8_atom,[{atoms,_}]}} =
+	beam_lib:chunks(Utf8AtomBin, [atoms]),
+    {module,utf8_atom} = code:load_binary(utf8_atom, "compile_SUITE", Utf8AtomBin),
+    Atom = utf8_atom:atom(),
+    %% r19 expands to no_utf8_atoms; this atom is not Latin-1, so compiling fails.
+    NoUtf8AtomForms = [{attribute,Anno,module,no_utf8_atom}|Forms],
+    error = compile:forms(NoUtf8AtomForms, [binary, r19]).
+
 env(Config) when is_list(Config) ->
     {Simple,Target} = get_files(Config, simple, env),
     {ok,Cwd} = file:get_cwd(),
@@ -751,7 +770,7 @@ do_core_1(M, A, Outdir) ->
     {ok,M,Core0} = compile:forms(A, [to_core]),
     CoreFile = filename:join(Outdir, atom_to_list(M)++".core"),
     CorePP = core_pp:format(Core0),
-    ok = file:write_file(CoreFile, CorePP),
+    ok = file:write_file(CoreFile, unicode:characters_to_binary(CorePP)),
 
     %% Parse the .core file and return the result as Core Erlang Terms.
     Core = case compile:file(CoreFile, [report_errors,from_core,no_copt,to_core,binary]) of
@@ -823,7 +842,7 @@ do_core_roundtrip_1(Mod, Abstr, Outdir) ->
 do_core_roundtrip_2(M, Core0, Outdir) ->
     CoreFile = filename:join(Outdir, atom_to_list(M)++".core"),
     CorePP = core_pp:format_all(Core0),
-    ok = file:write_file(CoreFile, CorePP),
+    ok = file:write_file(CoreFile, unicode:characters_to_binary(CorePP)),
 
     %% Parse the .core file and return the result as Core Erlang Terms.
     Core2 = case compile:file(CoreFile, [report_errors,from_core,
diff --git a/lib/compiler/test/compile_SUITE_data/simple.erl b/lib/compiler/test/compile_SUITE_data/simple.erl
index d8324dafaf..9385d101e0 100644
--- a/lib/compiler/test/compile_SUITE_data/simple.erl
+++ b/lib/compiler/test/compile_SUITE_data/simple.erl
@@ -19,7 +19,7 @@
 %%
 -module(simple).
 
--export([test/0]).
+-export([test/0,unicode/0]).
 
 -ifdef(need_foo).
 -export([foo/0]).
@@ -28,6 +28,9 @@
 test() ->
     passed.
 
+unicode() ->
+    {"это",'спутник'}.
+
 %% Conditional inclusion.
 %% Compile with [{d, need_foo}, {d, foo_value, 42}].
 
diff --git a/lib/compiler/test/lc_SUITE.erl b/lib/compiler/test/lc_SUITE.erl
index 3cb49433ce..6904ee7f52 100644
--- a/lib/compiler/test/lc_SUITE.erl
+++ b/lib/compiler/test/lc_SUITE.erl
@@ -226,7 +226,7 @@ effect(Config) when is_list(Config) ->
 	lc_SUITE ->
 	    _ = [{'EXIT',{badarg,_}} =
 		     (catch binary_to_atom(<<C/utf8>>, utf8)) ||
-		    C <- lists:seq(16#10000, 16#FFFFF)];
+		    C <- lists:seq(16#FF10000, 16#FFFFFFF)];
 	_ ->
 	    ok
     end,
diff --git a/lib/kernel/test/code_SUITE.erl b/lib/kernel/test/code_SUITE.erl
index 4914ce9e4c..f368232bfc 100644
--- a/lib/kernel/test/code_SUITE.erl
+++ b/lib/kernel/test/code_SUITE.erl
@@ -323,7 +323,7 @@ load_abs(Config) when is_list(Config) ->
     {error, nofile} = code:load_abs(TestDir ++ "/duuuumy_mod"),
     {error, badfile} = code:load_abs(TestDir ++ "/code_a_test"),
     {'EXIT', _} = (catch code:load_abs({})),
-    {'EXIT', _} = (catch code:load_abs("Non-latin-имя-файла")),
+    {error, nofile} = code:load_abs("Non-latin-имя-файла"),
     {module, code_b_test} = code:load_abs(TestDir ++ "/code_b_test"),
     code:stick_dir(TestDir),
     {error, sticky_directory} = code:load_abs(TestDir ++ "/code_b_test"),
diff --git a/lib/stdlib/src/beam_lib.erl b/lib/stdlib/src/beam_lib.erl
index d7ee5c1f5d..461acf03be 100644
--- a/lib/stdlib/src/beam_lib.erl
+++ b/lib/stdlib/src/beam_lib.erl
@@ -63,7 +63,7 @@
 -type label()     :: integer().
 
 -type chunkid()   :: nonempty_string(). % approximation of the strings below
-%% "Abst" | "Attr" | "CInf" | "ExpT" | "ImpT" | "LocT" | "Atom".
+%% "Abst" | "Attr" | "CInf" | "ExpT" | "ImpT" | "LocT" | "Atom" | "AtU8".
 -type chunkname() :: 'abstract_code' | 'attributes' | 'compile_info'
                    | 'exports' | 'labeled_exports'
                    | 'imports' | 'indexed_imports'
@@ -520,6 +520,8 @@ read_chunk_data(File0, ChunkNames0, Options)
     end.
 
 %% -> {ok, list()} | throw(Error)
+check_chunks([atoms | Ids], File, IL, L) ->
+    check_chunks(Ids, File, ["Atom", "AtU8" | IL], [{atom_chunk, atoms} | L]);
 check_chunks([ChunkName | Ids], File, IL, L) when is_atom(ChunkName) ->
     ChunkId = chunk_name_to_id(ChunkName, File),
     check_chunks(Ids, File, [ChunkId | IL], [{ChunkId, ChunkName} | L]);
@@ -537,6 +539,10 @@ scan_beam(File, What0, AllowMissingChunks) ->
     case scan_beam1(File, What0) of
 	{missing, _FD, Mod, Data, What} when AllowMissingChunks ->
 	    {ok, Mod, [{Id, missing_chunk} || Id <- What] ++ Data};
+	{missing, _FD, Mod, Data, ["Atom"]} ->
+	    {ok, Mod, Data};
+	{missing, _FD, Mod, Data, ["AtU8"]} ->
+	    {ok, Mod, Data};
 	{missing, FD, _Mod, _Data, What} ->
 	    error({missing_chunk, filename(FD), hd(What)});
 	R ->
@@ -581,18 +587,23 @@ scan_beam(FD, Pos, What, Mod, Data) ->
 	    error({invalid_beam_file, filename(FD), Pos})
     end.
 
-get_data(Cs, "Atom"=Id, FD, Size, Pos, Pos2, _Mod, Data) ->
+get_atom_data(Cs, Id, FD, Size, Pos, Pos2, Data, Encoding) ->
     NewCs = del_chunk(Id, Cs),
     {NFD, Chunk} = get_chunk(Id, Pos, Size, FD),
     <<_Num:32, Chunk2/binary>> = Chunk,
-    {Module, _} = extract_atom(Chunk2),
+    {Module, _} = extract_atom(Chunk2, Encoding),
     C = case Cs of
 	    info -> 
 		{Id, Pos, Size};
 	    _ -> 
 		{Id, Chunk}
 	end,
-    scan_beam(NFD, Pos2, NewCs, Module, [C | Data]);
+    scan_beam(NFD, Pos2, NewCs, Module, [C | Data]).
+
+get_data(Cs, "Atom" = Id, FD, Size, Pos, Pos2, _Mod, Data) ->
+    get_atom_data(Cs, Id, FD, Size, Pos, Pos2, Data, latin1);
+get_data(Cs, "AtU8" = Id, FD, Size, Pos, Pos2, _Mod, Data) ->
+    get_atom_data(Cs, Id, FD, Size, Pos, Pos2, Data, utf8);
 get_data(info, Id, FD, Size, Pos, Pos2, Mod, Data) ->
     scan_beam(FD, Pos2, info, Mod, [{Id, Pos, Size} | Data]);
 get_data(Chunks, Id, FD, Size, Pos, Pos2, Mod, Data) ->
@@ -624,6 +635,9 @@ get_chunk(Id, Pos, Size, FD) ->
 	    {NFD, Chunk}
     end.
 
+chunks_to_data([{atom_chunk, Name} | CNs], Chunks, File, Cs, Module, Atoms, L) ->
+    {NewAtoms, Ret} = chunk_to_data(Name, <<"">>, File, Cs, Atoms, Module),
+    chunks_to_data(CNs, Chunks, File, Cs, Module, NewAtoms, [Ret | L]);
 chunks_to_data([{Id, Name} | CNs], Chunks, File, Cs, Module, Atoms, L) ->
     {_Id, Chunk} = lists:keyfind(Id, 1, Chunks),
     {NewAtoms, Ret} = chunk_to_data(Name, Chunk, File, Cs, Atoms, Module),
@@ -651,7 +665,7 @@ chunk_to_data(abstract_code=Id, Chunk, File, _Cs, AtomTable, Mod) ->
 	<<>> ->
 	    {AtomTable, {Id, no_abstract_code}};
 	<<0:8,N:8,Mode0:N/binary,Rest/binary>> ->
-	    Mode = list_to_atom(binary_to_list(Mode0)),
+	    Mode = binary_to_atom(Mode0, utf8),
 	    decrypt_abst(Mode, Mod, File, Id, AtomTable, Rest);
 	_ ->
 	    case catch binary_to_term(Chunk) of
@@ -683,7 +697,6 @@ chunk_to_data(ChunkId, Chunk, _File,
 	      _Cs, AtomTable, _Module) when is_list(ChunkId) ->
     {AtomTable, {ChunkId, Chunk}}. % Chunk is a binary
 
-chunk_name_to_id(atoms, _)           -> "Atom";
 chunk_name_to_id(indexed_imports, _) -> "ImpT";
 chunk_name_to_id(imports, _)         -> "ImpT";
 chunk_name_to_id(exports, _)         -> "ExpT";
@@ -738,25 +751,30 @@ atm(AT, N) ->
 
 %% AT is updated.
 ensure_atoms({empty, AT}, Cs) ->
-    {_Id, AtomChunk} = lists:keyfind("Atom", 1, Cs),
-    extract_atoms(AtomChunk, AT),
+    case lists:keyfind("AtU8", 1, Cs) of
+	{_Id, AtomChunk} when is_binary(AtomChunk) ->
+	    extract_atoms(AtomChunk, AT, utf8);
+	_ ->
+	    {_Id, AtomChunk} = lists:keyfind("Atom", 1, Cs),
+	    extract_atoms(AtomChunk, AT, latin1)
+    end,
     AT;
 ensure_atoms(AT, _Cs) ->
     AT.
 
-extract_atoms(<<_Num:32, B/binary>>, AT) ->
-    extract_atoms(B, 1, AT).
+extract_atoms(<<_Num:32, B/binary>>, AT, Encoding) ->
+    extract_atoms(B, 1, AT, Encoding).
 
-extract_atoms(<<>>, _I, _AT) ->
+extract_atoms(<<>>, _I, _AT, _Encoding) ->
     true;
-extract_atoms(B, I, AT) ->
-    {Atom, B1} = extract_atom(B),
+extract_atoms(B, I, AT, Encoding) ->
+    {Atom, B1} = extract_atom(B, Encoding),
     true = ets:insert(AT, {I, Atom}),
-    extract_atoms(B1, I+1, AT).
+    extract_atoms(B1, I+1, AT, Encoding).
 
-extract_atom(<<Len, B/binary>>) ->
+extract_atom(<<Len, B/binary>>, Encoding) ->
     <<SB:Len/binary, Tail/binary>> = B,
-    {list_to_atom(binary_to_list(SB)), Tail}.
+    {binary_to_atom(SB, Encoding), Tail}.
 
 %%% Utils.
 
@@ -856,12 +874,12 @@ significant_chunks() ->
 %% for a module. They are listed in the order that they should be MD5:ed.
 
 md5_chunks() ->
-    ["Atom", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT"].
+    ["Atom", "AtU8", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT"].
 
 %% The following chunks are mandatory in every Beam file.
 
 mandatory_chunks() ->
-    ["Code", "ExpT", "ImpT", "StrT", "Atom"].
+    ["Code", "ExpT", "ImpT", "StrT"].
 
 %%% ====================================================================
 %%% The rest of the file handles encrypted debug info.
diff --git a/lib/stdlib/test/beam_lib_SUITE.erl b/lib/stdlib/test/beam_lib_SUITE.erl
index 4521ecc0ef..279e15f703 100644
--- a/lib/stdlib/test/beam_lib_SUITE.erl
+++ b/lib/stdlib/test/beam_lib_SUITE.erl
@@ -81,12 +81,8 @@ normal(Conf) when is_list(Conf) ->
     NoOfTables = length(ets:all()),
     P0 = pps(),
 
-    CompileFlags = [{outdir,PrivDir}, debug_info],
-    {ok,_} = compile:file(Source, CompileFlags),
-    {ok, Binary} = file:read_file(BeamFile),
-
-    do_normal(BeamFile),
-    do_normal(Binary),
+    do_normal(Source, PrivDir, BeamFile, []),
+    do_normal(Source, PrivDir, BeamFile, [no_utf8_atoms]),
 
     {ok,_} = compile:file(Source, [{outdir,PrivDir}, no_debug_info]),
     {ok, {simple, [{abstract_code, no_abstract_code}]}} =
@@ -101,7 +97,15 @@ normal(Conf) when is_list(Conf) ->
     true = (P0 == pps()),
     ok.
 
-do_normal(BeamFile) ->
+do_normal(Source, PrivDir, BeamFile, Opts) ->
+    %% Recompile Source with the extra Opts, then run the checks against
+    %% both the .beam file on disk and its contents as a binary.
+    {ok,_} = compile:file(Source, [{outdir,PrivDir},debug_info] ++ Opts),
+    {ok, Beam} = file:read_file(BeamFile),
+    do_normal(BeamFile, Opts),
+    do_normal(Beam, Opts).
+
+do_normal(BeamFile, Opts) ->
     Imports = {imports, [{erlang, get_module_info, 1},
 			 {erlang, get_module_info, 2},
 			 {lists, member, 2}]},
@@ -130,20 +134,31 @@ do_normal(BeamFile) ->
 	beam_lib:chunks(BeamFile, [abstract_code]),
 
     %% Test reading optional chunks.
-    All = ["Atom", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT"],
+    All = ["Atom", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT", "AtU8"],
     {ok,{simple,Chunks}} = beam_lib:chunks(BeamFile, All, [allow_missing_chunks]),
-    verify_simple(Chunks).
+    case {verify_simple(Chunks),Opts} of
+	{{missing_chunk, AtomBin}, []} when is_binary(AtomBin) -> ok;
+	{{AtomBin, missing_chunk}, [no_utf8_atoms]} when is_binary(AtomBin) -> ok
+    end,
 
-verify_simple([{"Atom", AtomBin},
+    %% Make sure that reading the atom chunk works when the 'allow_missing_chunks'
+    %% option is used.
+    Some = ["Code",atoms,"ExpT","LitT"],
+    {ok,{simple,SomeChunks}} = beam_lib:chunks(BeamFile, Some, [allow_missing_chunks]),
+    [{"Code",<<_/binary>>},{atoms,[_|_]},{"ExpT",<<_/binary>>},{"LitT",missing_chunk}] =
+	SomeChunks.
+
+verify_simple([{"Atom", PlainAtomChunk},
 	       {"Code", CodeBin},
 	       {"StrT", StrBin},
 	       {"ImpT", ImpBin},
 	       {"ExpT", ExpBin},
 	       {"FunT", missing_chunk},
-	       {"LitT", missing_chunk}])
-  when is_binary(AtomBin), is_binary(CodeBin), is_binary(StrBin),
+	       {"LitT", missing_chunk},
+	       {"AtU8", AtU8Chunk}])
+  when is_binary(CodeBin), is_binary(StrBin),
        is_binary(ImpBin), is_binary(ExpBin) ->
-    ok.
+    {PlainAtomChunk, AtU8Chunk}.
 
 %% Read invalid beam files.
 error(Conf) when is_list(Conf) ->
@@ -211,7 +226,7 @@ last_chunk(Bin) ->
 do_error(BeamFile, ACopy) ->
     %% evil tests
     Chunks = chunk_info(BeamFile),
-    {value, {_, AtomStart, _}} = lists:keysearch("Atom", 1, Chunks),
+    {value, {_, AtomStart, _}} = lists:keysearch("AtU8", 1, Chunks),
     {value, {_, ImportStart, _}} = lists:keysearch("ImpT", 1, Chunks),
     {value, {_, AbstractStart, _}} = lists:keysearch("Abst", 1, Chunks),
     {value, {_, AttributesStart, _}} =
@@ -234,7 +249,7 @@ do_error(BeamFile, ACopy) ->
     verify(not_a_beam_file, beam_lib:info(BF7)),
 
     BF8 = set_byte(ACopy, BeamFile, 13, 17),
-    verify(missing_chunk, beam_lib:chunks(BF8, ["Atom"])),
+    verify(missing_chunk, beam_lib:chunks(BF8, ["AtU8"])),
 
     BF9 = set_byte(ACopy, BeamFile, CompileInfoStart+10, 17),
     verify(invalid_chunk, beam_lib:chunks(BF9, [compile_info])).
diff --git a/lib/stdlib/test/erl_scan_SUITE.erl b/lib/stdlib/test/erl_scan_SUITE.erl
index 4ae734eb65..7d0ba967f9 100644
--- a/lib/stdlib/test/erl_scan_SUITE.erl
+++ b/lib/stdlib/test/erl_scan_SUITE.erl
@@ -772,10 +772,9 @@ unicode() ->
         erl_scan:string([1089]),
     {error,{{1,1},erl_scan,{illegal,character}},{1,2}} =
         erl_scan:string([1089], {1,1}),
-    {error,{1,erl_scan,{illegal,atom}},1} =
-        erl_scan:string("'a"++[1089]++"b'", 1),
-    {error,{{1,1},erl_scan,{illegal,atom}},{1,6}} =
-        erl_scan:string("'a"++[1089]++"b'", {1,1}),
+    {error,{{1,3},erl_scan,{illegal,character}},{1,4}} =
+        erl_scan:string("'a" ++ [999999999] ++ "c'", {1,1}),
+
     test("\"a"++[1089]++"b\""),
     {ok,[{char,1,1}],1} =
         erl_scan_string([$$,$\\,$^,1089], 1),
@@ -786,8 +785,8 @@ unicode() ->
         erl_scan:format_error(Error),
     {error,{{1,1},erl_scan,_},{1,11}} =
         erl_scan:string("\"qa\\x{aaa}",{1,1}),
-    {error,{{1,1},erl_scan,{illegal,atom}},{1,12}} =
-        erl_scan:string("'qa\\x{aaa}'",{1,1}),
+    {error,{{1,1},erl_scan,_},{1,11}} =
+        erl_scan:string("'qa\\x{aaa}",{1,1}),
 
     {ok,[{char,1,1089}],1} =
         erl_scan_string([$$,1089], 1),
@@ -904,9 +903,9 @@ more_chars() ->
 %% OTP-10302. Unicode characters scanner/parser.
 otp_10302(Config) when is_list(Config) ->
     %% From unicode():
-    {error,{1,erl_scan,{illegal,atom}},1} =
+    {ok,[{atom,1,'aсb'}],1} =
         erl_scan:string("'a"++[1089]++"b'", 1),
-    {error,{{1,1},erl_scan,{illegal,atom}},{1,12}} =
+    {ok,[{atom,{1,1},'qaપ'}],{1,12}} =
         erl_scan:string("'qa\\x{aaa}'",{1,1}),
 
     {ok,[{char,1,1089}],1} = erl_scan_string([$$,1089], 1),
-- 
cgit v1.2.3