From 81a5f0f91ce50416d44f48752cb2b7f4ae02d953 Mon Sep 17 00:00:00 2001
From: Sverker Eriksson <sverker@erlang.org>
Date: Tue, 8 Jan 2013 11:06:44 +0100
Subject: erts: Refactor rename DFLAG(S)_INTERNAL_TAGS for conformity

---
 erts/emulator/beam/dist.h     |  2 +-
 erts/emulator/beam/external.c | 18 +++++++++---------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/erts/emulator/beam/dist.h b/erts/emulator/beam/dist.h
index 845151c895..0db7d56cb2 100644
--- a/erts/emulator/beam/dist.h
+++ b/erts/emulator/beam/dist.h
@@ -38,7 +38,7 @@
 #define DFLAG_UNICODE_IO          0x1000
 #define DFLAG_DIST_HDR_ATOM_CACHE 0x2000
 #define DFLAG_SMALL_ATOM_TAGS     0x4000
-#define DFLAGS_INTERNAL_TAGS      0x8000
+#define DFLAG_INTERNAL_TAGS       0x8000
 
 /* All flags that should be enabled when term_to_binary/1 is used. */
 #define TERM_TO_BINARY_DFLAGS (DFLAG_EXTENDED_REFERENCES	\
diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c
index ab1065aaa1..263ffc4eb3 100644
--- a/erts/emulator/beam/external.c
+++ b/erts/emulator/beam/external.c
@@ -467,7 +467,7 @@ Uint erts_encode_ext_size_2(Eterm term, unsigned dflags)
 
 Uint erts_encode_ext_size_ets(Eterm term)
 {
-    return encode_size_struct2(NULL, term, TERM_TO_BINARY_DFLAGS|DFLAGS_INTERNAL_TAGS);
+    return encode_size_struct2(NULL, term, TERM_TO_BINARY_DFLAGS|DFLAG_INTERNAL_TAGS);
 }
 
 
@@ -500,7 +500,7 @@ void erts_encode_ext(Eterm term, byte **ext)
 
 byte* erts_encode_ext_ets(Eterm term, byte *ep, struct erl_off_heap_header** off_heap)
 {
-    return enc_term(NULL, term, ep, TERM_TO_BINARY_DFLAGS|DFLAGS_INTERNAL_TAGS,
+    return enc_term(NULL, term, ep, TERM_TO_BINARY_DFLAGS|DFLAG_INTERNAL_TAGS,
 		    off_heap);
 }
 
@@ -1408,7 +1408,7 @@ enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags)
 
     ASSERT(is_atom(atom));
 
-    if (dflags & DFLAGS_INTERNAL_TAGS) {
+    if (dflags & DFLAG_INTERNAL_TAGS) {
 	Uint aval = atom_val(atom);
 	ASSERT(aval < (1<<24));
 	if (aval >= (1 << 16)) {
@@ -1472,7 +1472,7 @@ enc_pid(ErtsAtomCacheMap *acmp, Eterm pid, byte* ep, Uint32 dflags)
     ep += 4;
     put_int32(os, ep);
     ep += 4;
-    *ep++ = (is_internal_pid(pid) && (dflags & DFLAGS_INTERNAL_TAGS)) ?
+    *ep++ = (is_internal_pid(pid) && (dflags & DFLAG_INTERNAL_TAGS)) ?
 	INTERNAL_CREATION : pid_creation(pid);
     return ep;
 }
@@ -1770,7 +1770,7 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags,
 	    put_int16(i, ep);
 	    ep += 2;
 	    ep = enc_atom(acmp,ref_node_name(obj),ep,dflags);
-	    *ep++ = ((dflags & DFLAGS_INTERNAL_TAGS) && is_internal_ref(obj)) ?
+	    *ep++ = ((dflags & DFLAG_INTERNAL_TAGS) && is_internal_ref(obj)) ?
 		INTERNAL_CREATION : ref_creation(obj);
 	    ref_num = ref_numbers(obj);
 	    for (j = 0; j < i; j++) {
@@ -1787,7 +1787,7 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags,
 	    j = port_number(obj);
 	    put_int32(j, ep);
 	    ep += 4;
-	    *ep++ = ((dflags & DFLAGS_INTERNAL_TAGS) && is_internal_port(obj)) ?
+	    *ep++ = ((dflags & DFLAG_INTERNAL_TAGS) && is_internal_port(obj)) ?
 		INTERNAL_CREATION : port_creation(obj);
 	    break;
 
@@ -1868,7 +1868,7 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags,
 		byte* bytes;
 
 		ERTS_GET_BINARY_BYTES(obj, bytes, bitoffs, bitsize);
-		if (dflags & DFLAGS_INTERNAL_TAGS) {
+		if (dflags & DFLAG_INTERNAL_TAGS) {
 		    ProcBin* pb = (ProcBin*) binary_val(obj);
 		    Uint bytesize = pb->size;
 		    if (pb->thing_word == HEADER_SUB_BIN) {
@@ -2869,7 +2869,7 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags)
 	    result++;
 	    break;
 	case ATOM_DEF:
-	    if (dflags & DFLAGS_INTERNAL_TAGS) {
+	    if (dflags & DFLAG_INTERNAL_TAGS) {
 		if (atom_val(obj) >= (1<<16)) {
 		    result += 1 + 3;
 		}
@@ -2969,7 +2969,7 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags)
 	    }
 	    break;
 	case BINARY_DEF:
-	    if (dflags & DFLAGS_INTERNAL_TAGS) {
+	    if (dflags & DFLAG_INTERNAL_TAGS) {
 		ProcBin* pb = (ProcBin*) binary_val(obj);
 		Uint sub_extra = 0;
 		Uint tot_bytes = pb->size;
-- 
cgit v1.2.3


From a9a57385f0a5648bbfeccc5e8ef0bfe2cdac80c7 Mon Sep 17 00:00:00 2001
From: Sverker Eriksson <sverker@erlang.org>
Date: Fri, 14 Dec 2012 19:52:59 +0100
Subject: erts: Change internal representation of atoms to utf8

---
 erts/emulator/beam/atom.c           |  61 +++++++++++--
 erts/emulator/beam/atom.h           |  51 ++++++++---
 erts/emulator/beam/beam_load.c      |   6 +-
 erts/emulator/beam/bif.c            |  36 +++++---
 erts/emulator/beam/dist.h           |   1 +
 erts/emulator/beam/erl_alloc.c      |   6 +-
 erts/emulator/beam/erl_alloc_util.c |   4 +-
 erts/emulator/beam/erl_nif.c        |   8 +-
 erts/emulator/beam/erl_unicode.c    | 174 ++++++++++++++++--------------------
 erts/emulator/beam/external.c       | 127 ++++++++++++++++++++------
 erts/emulator/beam/external.h       |   2 +
 erts/emulator/beam/global.h         |   1 +
 erts/emulator/test/bif_SUITE.erl    |   4 +-
 13 files changed, 312 insertions(+), 169 deletions(-)

diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c
index d7c7f117cf..b41a98f2a2 100644
--- a/erts/emulator/beam/atom.c
+++ b/erts/emulator/beam/atom.c
@@ -111,7 +111,7 @@ atom_text_alloc(int bytes)
 {
     byte *res;
 
-    ASSERT(bytes <= MAX_ATOM_LENGTH);
+    ASSERT(bytes <= MAX_ATOM_SZ_LIMIT);
     if (atom_text_pos + bytes >= atom_text_end) {
 	more_atom_space();
     }
@@ -198,7 +198,11 @@ am_atom_put(const char* name, int len)
     Atom a;
     Eterm ret;
     int aix;
-
+#ifdef DEBUG
+    byte* err_pos;
+    Uint num_chars;
+    ASSERT(erts_analyze_utf8(name, len, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK);
+#endif
     /*
      * Silently truncate the atom if it is too long. Overlong atoms
      * could occur in situations where we have no good way to return
@@ -209,8 +213,8 @@ am_atom_put(const char* name, int len)
      * list_to_atom/1), the caller should check the length before
      * calling this function.
      */
-    if (len > MAX_ATOM_LENGTH) {
-	len = MAX_ATOM_LENGTH;
+    if (len > MAX_ATOM_SZ_LIMIT) {
+	len = MAX_ATOM_SZ_LIMIT;  /*SVERK Urk... */
     }
 #ifdef ERTS_ATOM_PUT_OPS_STAT
     erts_smp_atomic_inc_nob(&atom_put_ops);
@@ -230,6 +234,49 @@ am_atom_put(const char* name, int len)
     return ret;
 }
 
+static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp)
+{
+    byte* dst;
+    const byte* src = *srcp;
+    int i, len = *lenp;
+
+    for (i=0 ; i < len; ++i) {
+	if (src[i] & 0x80) {
+	    goto need_convertion;
+	}
+    }
+    return;
+
+need_convertion:
+    sys_memcpy(conv_buf, src, i);
+    dst = conv_buf + i;
+    for ( ; i < len; ++i) {
+	unsigned char chr = src[i];
+	if (!(chr & 0x80)) {
+	    *dst++ = chr;
+	}
+	else {
+	    *dst++ = 0xC0 | (chr >> 6);
+	    *dst++ = 0x80 | (chr & 0x3F);
+	}
+    }
+    *srcp = conv_buf;	
+    *lenp = dst - conv_buf;
+}
+
+
+Eterm
+am_atom_put2(const byte* name, int len, int is_latin1)
+{
+    byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
+
+    if (is_latin1) {
+	latin1_to_utf8(utf8_copy, &name, &len);
+    }
+    return am_atom_put((const char*)name, len);
+}
+
+
 
 int atom_table_size(void)
 {
@@ -264,14 +311,18 @@ int atom_table_sz(void)
 }
 
 int
-erts_atom_get(const char *name, int len, Eterm* ap)
+erts_atom_get(const char *name, int len, Eterm* ap, int is_latin1)
 {
+    byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
     Atom a;
     int i;
     int res;
 
     a.len = len;
     a.name = (byte *)name;
+    if (is_latin1) {
+	latin1_to_utf8(utf8_copy, (const byte**)&a.name, &a.len);
+    }
     atom_read_lock();
     i = index_get(&erts_atom_table, (void*) &a);
     res = i < 0 ? 0 : (*ap = make_atom(i), 1);
diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h
index fd9c04d3d0..84dd6d8901 100644
--- a/erts/emulator/beam/atom.h
+++ b/erts/emulator/beam/atom.h
@@ -26,7 +26,9 @@
 
 #include "erl_atom_table.h"
 
-#define MAX_ATOM_LENGTH 255
+#define MAX_ATOM_CHARACTERS 255
+#define MAX_ATOM_SZ_FROM_LATIN1 (2*MAX_ATOM_CHARACTERS)
+#define MAX_ATOM_SZ_LIMIT (4*MAX_ATOM_CHARACTERS) /* theoretical byte limit */
 #define ATOM_LIMIT (1024*1024)
 #define MIN_ATOM_TABLE_SIZE 8192
 
@@ -53,8 +55,8 @@ typedef struct atom {
 extern IndexTable erts_atom_table;
 
 ERTS_GLB_INLINE Atom* atom_tab(Uint i);
-ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term);
-ERTS_GLB_INLINE int erts_is_atom_str(char *str, Eterm term);
+ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term);
+ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1);
 
 #if ERTS_GLB_INLINE_INCL_FUNC_DEF
 ERTS_GLB_INLINE Atom*
@@ -63,7 +65,7 @@ atom_tab(Uint i)
     return (Atom *) erts_index_lookup(&erts_atom_table, i);
 }
 
-ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term)
+ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term)
 {
     Atom *a;
     if (!is_atom(term))
@@ -73,30 +75,50 @@ ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term)
 	    && sys_memcmp((void *) a->name, (void *) text, len) == 0);
 }
 
-ERTS_GLB_INLINE int erts_is_atom_str(char *str, Eterm term)
+ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1)
 {
     Atom *a;
     int i, len;
-    char *aname;
+    const byte* aname;
+    const byte* s = (const byte*) str;
+
     if (!is_atom(term))
 	return 0;
     a = atom_tab(atom_val(term));
     len = a->len;
-    aname = (char *) a->name;
-    for (i = 0; i < len; i++)
-	if (aname[i] != str[i] || str[i] == '\0')
-	    return 0;
-    return str[len] == '\0';
+    aname = a->name;
+    if (is_latin1) {
+	for (i = 0; i < len; s++) {
+	    if (aname[i] < 0x80) {
+		if (aname[i] != *s || *s == '\0')
+		    return 0;
+		i++;
+	    }
+	    else {
+		if (aname[i]   != (0xC0 | (*s >> 6)) || 
+		    aname[i+1] != (0x80 | (*s & 0x3F))) {
+		    return 0;
+		}
+		i += 2;
+	    }
+	}
+    }
+    else {
+	for (i = 0; i < len; i++, s++)
+	    if (aname[i] != *s || *s == '\0')
+		return 0;
+    }
+    return *s == '\0';
 }
 
 #endif
 
 /*
  * Note, ERTS_IS_ATOM_STR() expects the first argument to be a
- * string literal.
+ * 7-bit ASCII string literal.
  */
 #define ERTS_IS_ATOM_STR(LSTR, TERM) \
-  (erts_is_atom_bytes((byte *) LSTR, sizeof(LSTR) - 1, (TERM)))
+  (erts_is_atom_utf8_bytes((byte *) LSTR, sizeof(LSTR) - 1, (TERM)))
 #define ERTS_DECL_AM(S) Eterm AM_ ## S = am_atom_put(#S, sizeof(#S) - 1)
 #define ERTS_INIT_AM(S) AM_ ## S = am_atom_put(#S, sizeof(#S) - 1)
 
@@ -104,12 +126,13 @@ int atom_table_size(void);	/* number of elements */
 int atom_table_sz(void);	/* table size in bytes, excluding stored objects */
 
 Eterm am_atom_put(const char*, int); /* most callers pass plain char*'s */
+Eterm am_atom_put2(const byte*, int, int is_latin1);
 int atom_erase(byte*, int);
 int atom_static_put(byte*, int);
 void init_atom_table(void);
 void atom_info(int, void *);
 void dump_atoms(int, void *);
-int erts_atom_get(const char* name, int len, Eterm* ap);
+int erts_atom_get(const char* name, int len, Eterm* ap, int is_latin1);
 void erts_atom_get_text_space_sizes(Uint *reserved, Uint *used);
 #endif
 
diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c
index b51f076a5d..7bb964d5aa 100644
--- a/erts/emulator/beam/beam_load.c
+++ b/erts/emulator/beam/beam_load.c
@@ -1230,7 +1230,7 @@ load_atom_table(LoaderState* stp)
 
 	GetByte(stp, n);
 	GetString(stp, atom, n);
-	stp->atom[i] = am_atom_put((char*)atom, n);
+	stp->atom[i] = am_atom_put2(atom, n, 1);
     }
 
     /*
@@ -1240,7 +1240,7 @@ load_atom_table(LoaderState* stp)
     if (is_nil(stp->module)) {
 	stp->module = stp->atom[1];
     } else if (stp->atom[1] != stp->module) {
-	char sbuf[256];
+	char sbuf[MAX_ATOM_SZ_FROM_LATIN1];
 	Atom* ap;
 
 	ap = atom_tab(atom_val(stp->atom[1]));
@@ -1620,7 +1620,7 @@ read_line_table(LoaderState* stp)
 
 	    GetInt(stp, 2, n);
 	    GetString(stp, fname, n);
-	    stp->fname[i] = am_atom_put((char*)fname, n);
+	    stp->fname[i] = am_atom_put((char*)fname, n); /*SVERK ? */
 	}
     }
 
diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c
index 1cdce49eef..89157068c0 100644
--- a/erts/emulator/beam/bif.c
+++ b/erts/emulator/beam/bif.c
@@ -2536,9 +2536,11 @@ BIF_RETTYPE append_element_2(BIF_ALIST_2)
 
 BIF_RETTYPE atom_to_list_1(BIF_ALIST_1)
 {
-    Uint need;
-    Eterm* hp;
+    Eterm do_utf8_to_list(Process*, Uint num, byte *bytes, Uint sz, Uint left,
+			  Uint *num_built, Uint *num_eaten, Eterm tail); /*SVERK */
     Atom* ap;
+    Uint num_chars, num_built, num_eaten;
+    Eterm res;
 
     if (is_not_atom(BIF_ARG_1))
 	BIF_ERROR(BIF_P, BADARG);
@@ -2547,9 +2549,19 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1)
     ap = atom_tab(atom_val(BIF_ARG_1));
     if (ap->len == 0)
 	BIF_RET(NIL);	/* the empty atom */
-    need = ap->len*2;
-    hp = HAlloc(BIF_P, need);
-    BIF_RET(buf_to_intlist(&hp,(char*)ap->name,ap->len, NIL));
+    {
+	byte* err_pos;
+	if (erts_analyze_utf8(ap->name, ap->len, &err_pos, &num_chars, NULL)
+	    != ERTS_UTF8_OK) {
+	    BIF_ERROR(BIF_P, BADARG);
+	}
+    }
+    
+    res = do_utf8_to_list(BIF_P, num_chars, ap->name, ap->len, ap->len,
+			  &num_built, &num_eaten, NIL);
+    ASSERT(num_built == num_chars);
+    ASSERT(num_eaten == ap->len);
+    BIF_RET(res);
 }
 
 /**********************************************************************/
@@ -2559,18 +2571,18 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1)
 BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)
 {
     Eterm res;
-    char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_LENGTH);
-    int i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_LENGTH);
+    char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS);
+    int i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS);
 
     if (i < 0) {
 	erts_free(ERTS_ALC_T_TMP, (void *) buf);
 	i = list_length(BIF_ARG_1);
-	if (i > MAX_ATOM_LENGTH) {
+	if (i > MAX_ATOM_CHARACTERS) {
 	    BIF_ERROR(BIF_P, SYSTEM_LIMIT);
 	}
 	BIF_ERROR(BIF_P, BADARG);
     }
-    res = am_atom_put(buf, i);
+    res = am_atom_put2((byte*)buf, i, 1);
     erts_free(ERTS_ALC_T_TMP, (void *) buf);
     BIF_RET(res);
 }
@@ -2580,16 +2592,16 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)
 BIF_RETTYPE list_to_existing_atom_1(BIF_ALIST_1)
 {
     int i;
-    char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_LENGTH);
+    char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS);
 
-    if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_LENGTH)) < 0) {
+    if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS)) < 0) {
     error:
 	erts_free(ERTS_ALC_T_TMP, (void *) buf);
 	BIF_ERROR(BIF_P, BADARG);
     } else {
 	Eterm a;
 	
-	if (erts_atom_get(buf, i, &a)) {
+	if (erts_atom_get(buf, i, &a, 1)) {
 	    erts_free(ERTS_ALC_T_TMP, (void *) buf);
 	    BIF_RET(a);
 	} else {
diff --git a/erts/emulator/beam/dist.h b/erts/emulator/beam/dist.h
index 0db7d56cb2..129f637dba 100644
--- a/erts/emulator/beam/dist.h
+++ b/erts/emulator/beam/dist.h
@@ -39,6 +39,7 @@
 #define DFLAG_DIST_HDR_ATOM_CACHE 0x2000
 #define DFLAG_SMALL_ATOM_TAGS     0x4000
 #define DFLAG_INTERNAL_TAGS       0x8000
+#define DFLAG_UTF8_ATOMS          0x10000
 
 /* All flags that should be enabled when term_to_binary/1 is used. */
 #define TERM_TO_BINARY_DFLAGS (DFLAG_EXTENDED_REFERENCES	\
diff --git a/erts/emulator/beam/erl_alloc.c b/erts/emulator/beam/erl_alloc.c
index 3eee53eba3..061f229f59 100644
--- a/erts/emulator/beam/erl_alloc.c
+++ b/erts/emulator/beam/erl_alloc.c
@@ -3045,13 +3045,13 @@ erts_request_alloc_info(struct process *c_p,
 	Eterm alloc = CAR(consp);
 
 	for (ai = ERTS_ALC_A_MIN; ai <= ERTS_ALC_A_MAX; ai++)
-	    if (erts_is_atom_str((char *) erts_alc_a2ad[ai], alloc))
+	    if (erts_is_atom_str(erts_alc_a2ad[ai], alloc, 0))
 		goto save_alloc;
-	if (erts_is_atom_str("mseg_alloc", alloc)) {
+	if (erts_is_atom_str("mseg_alloc", alloc, 0)) {
 	    ai = ERTS_ALC_INFO_A_MSEG_ALLOC;
 	    goto save_alloc;
 	}
-	if (erts_is_atom_str("alloc_util", alloc)) {
+	if (erts_is_atom_str("alloc_util", alloc, 0)) {
 	    ai = ERTS_ALC_INFO_A_ALLOC_UTIL;
 	save_alloc:
 	    if (req_ai[ai])
diff --git a/erts/emulator/beam/erl_alloc_util.c b/erts/emulator/beam/erl_alloc_util.c
index 97ba306a79..f8a8c00715 100644
--- a/erts/emulator/beam/erl_alloc_util.c
+++ b/erts/emulator/beam/erl_alloc_util.c
@@ -2815,10 +2815,10 @@ make_name_atoms(Allctr_t *allctr)
     char alloc[] = "alloc";
     char realloc[] = "realloc";
     char free[] = "free";
-    char buf[MAX_ATOM_LENGTH];
+    char buf[MAX_ATOM_CHARACTERS];
     size_t prefix_len = strlen(allctr->name_prefix);
 
-    if (prefix_len > MAX_ATOM_LENGTH + sizeof(realloc) - 1)
+    if (prefix_len > MAX_ATOM_CHARACTERS + sizeof(realloc) - 1)
 	erl_exit(1,"Too long allocator name: %salloc\n",allctr->name_prefix);
 
     memcpy((void *) buf, (void *) allctr->name_prefix, prefix_len);
diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c
index 632d756481..185ac75d73 100644
--- a/erts/emulator/beam/erl_nif.c
+++ b/erts/emulator/beam/erl_nif.c
@@ -974,7 +974,7 @@ int enif_make_existing_atom_len(ErlNifEnv* env, const char* name, size_t len,
 				ERL_NIF_TERM* atom, ErlNifCharEncoding encoding)
 {
     ASSERT(encoding == ERL_NIF_LATIN1);
-    return erts_atom_get(name, len, atom);
+    return erts_atom_get(name, len, atom, 1);
 }
 
 ERL_NIF_TERM enif_make_tuple(ErlNifEnv* env, unsigned cnt, ...)
@@ -1633,7 +1633,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
 			     "this vm variant (%s).",
 			     entry->vm_variant, ERL_NIF_VM_VARIANT);
     }
-    else if (!erts_is_atom_str((char*)entry->name, mod_atom)) {
+    else if (!erts_is_atom_str((char*)entry->name, mod_atom, 1)) {
 	ret = load_nif_error(BIF_P, bad_lib, "Library module name '%s' does not"
 			     " match calling module '%T'", entry->name, mod_atom);
     }
@@ -1643,7 +1643,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
 	for (i=0; i < entry->num_of_funcs && ret==am_ok; i++) {
 	    BeamInstr** code_pp;
 	    ErlNifFunc* f = &entry->funcs[i];
-	    if (!erts_atom_get(f->name, sys_strlen(f->name), &f_atom)
+	    if (!erts_atom_get(f->name, sys_strlen(f->name), &f_atom, 1)
 		|| (code_pp = get_func_pp(mod->curr.code, f_atom, f->arity))==NULL) {
 		ret = load_nif_error(BIF_P,bad_lib,"Function not found %T:%s/%u",
 				     mod_atom, f->name, f->arity);
@@ -1746,7 +1746,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
 	for (i=0; i < entry->num_of_funcs; i++)
 	{
 	    BeamInstr* code_ptr;
-	    erts_atom_get(entry->funcs[i].name, sys_strlen(entry->funcs[i].name), &f_atom); 
+	    erts_atom_get(entry->funcs[i].name, sys_strlen(entry->funcs[i].name), &f_atom, 1); 
 	    code_ptr = *get_func_pp(mod->curr.code, f_atom, entry->funcs[i].arity);
 	    
 	    if (code_ptr[1] == 0) {
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
index 51559aea1c..e24b6f1458 100644
--- a/erts/emulator/beam/erl_unicode.c
+++ b/erts/emulator/beam/erl_unicode.c
@@ -1155,7 +1155,7 @@ BIF_RETTYPE unicode_characters_to_list_2(BIF_ALIST_2)
  * a faster analyze and size count with this function.
  */
 int erts_analyze_utf8(byte *source, Uint size, 
-			byte **err_pos, Uint *num_chars, int *left)
+		      byte **err_pos, Uint *num_chars, int *left)
 {
     *err_pos = source;
     *num_chars = 0;
@@ -1210,7 +1210,7 @@ int erts_analyze_utf8(byte *source, Uint size,
 	}
 	++(*num_chars);
 	*err_pos = source;
-	if (left && --(*left) <= 0) {
+	if (left && --(*left) <= 0 && size) {
 	    return ERTS_UTF8_ANALYZE_MORE;
 	}
     }
@@ -1220,7 +1220,7 @@ int erts_analyze_utf8(byte *source, Uint size,
 /*
  * No errors should be able to occur - no overlongs, no malformed, no nothing
  */    
-static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, 
+Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, 
 			     Uint left,
 			     Uint *num_built, Uint *num_eaten, Eterm tail)
 {
@@ -1812,31 +1812,36 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2)
     ap = atom_tab(atom_val(BIF_ARG_1));
 
     if (BIF_ARG_2 == am_latin1) {
-	BIF_RET(new_binary(BIF_P, ap->name, ap->len));
-    } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) {
-	int bin_size = 0;
 	int i;
 	Eterm bin_term;
-	byte* bin_p;
-
-	for (i = 0; i < ap->len; i++) {
-	    bin_size += (ap->name[i] >= 0x80) ? 2 : 1;
+	int bin_size = ap->len;
+
+	for (i = 0; i < ap->len; ) {  
+	    if (ap->name[i] < 0x80) i++;
+	    else {
+		ASSERT(ap->name[i] >= 0xC0);
+		if (ap->name[i] < 0xE0) {
+		    ASSERT(i+1 < ap->len && (ap->name[i+1] & 0xC0) == 0x80);
+		    i += 2;
+		    bin_size -= 1;
+		}
+		else goto error;
+	    }
 	}
 	if (bin_size == ap->len) {
-	    BIF_RET(new_binary(BIF_P, ap->name, ap->len));
+	    bin_term = new_binary(BIF_P, ap->name, ap->len);
 	}
-	bin_term = new_binary(BIF_P, 0, bin_size);
-	bin_p = binary_bytes(bin_term);
-	for (i = 0; i < ap->len; i++) {
-	    byte b = ap->name[i];
-	    if (b < 0x80) {
-		*bin_p++ = b;
-	    } else {
-		*bin_p++ = 0xC0 | (b >> 6);
-		*bin_p++ = 0x80 | (b & 0x3F);
-	    }
+	else {
+	    byte* bin_p;
+	    int dbg_sz;
+	    bin_term = new_binary(BIF_P, 0, bin_size);
+	    bin_p = binary_bytes(bin_term);
+	    dbg_sz = erts_utf8_to_latin1(bin_p, ap->name, ap->len);
+	    ASSERT(dbg_sz == bin_size); (void)dbg_sz; 
 	}
 	BIF_RET(bin_term);
+    } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) {
+	BIF_RET(new_binary(BIF_P, ap->name, ap->len));
     } else {
     error:
 	BIF_ERROR(BIF_P, BADARG);
@@ -1856,102 +1861,52 @@ binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist)
     bin_size = binary_size(bin);
     if (enc == am_latin1) {
 	Eterm a;
-	if (bin_size > MAX_ATOM_LENGTH) {
+	if (bin_size > MAX_ATOM_CHARACTERS) {
 	system_limit:
 	    erts_free_aligned_binary_bytes(temp_alloc);
 	    BIF_ERROR(p, SYSTEM_LIMIT);
 	}
 	if (!must_exist) {
-	    a = am_atom_put((char *)bytes, bin_size);
+	    a = am_atom_put2(bytes, bin_size, 1);
 	    erts_free_aligned_binary_bytes(temp_alloc);
 	    BIF_RET(a);
-	} else if (erts_atom_get((char *)bytes, bin_size, &a)) {
+	} else if (erts_atom_get((char *)bytes, bin_size, &a, 1)) {
 	    erts_free_aligned_binary_bytes(temp_alloc);
 	    BIF_RET(a);
 	} else {
 	    goto badarg;
 	}
     } else if (enc == am_utf8 || enc == am_unicode) {
-	char *buf;
-	char *dst;
-	int i;
-	int num_chars;
 	Eterm res;
+	Uint num_chars = 0;
+	const byte* p = bytes;
+	Uint left = bin_size;
 
-	if (bin_size > 2*MAX_ATOM_LENGTH) {
-	    byte* err_pos;
-	    Uint n;
-	    int reds_left = bin_size+1; /* Number of reductions left. */
-
-	    if (erts_analyze_utf8(bytes, bin_size, &err_pos,
-			     &n, &reds_left) == ERTS_UTF8_OK) {
-		/* 
-		 * Correct UTF-8 encoding, but too many characters to
-		 * fit in an atom.
-		 */
+	while (left) {
+	    if (++num_chars > MAX_ATOM_CHARACTERS) {
 		goto system_limit;
-	    } else {
-		/*
-		 * Something wrong in the UTF-8 encoding or Unicode code
-		 * points > 255.
-		 */
-		goto badarg;
 	    }
-	}
-
-	/*
-	 * Allocate a temporary buffer the same size as the binary,
-	 * so that we don't need an extra overflow test.
-	 */
-	buf = (char *) erts_alloc(ERTS_ALC_T_TMP, bin_size);
-	dst = buf;
-	for (i = 0; i < bin_size; i++) {
-	    int c = bytes[i];
-	    if (c < 0x80) {
-		*dst++ = c;
-	    } else if (i < bin_size-1) {
-		int c2;
-		if ((c & 0xE0) != 0xC0) {
-		    goto free_badarg;
-		}
-		i++;
-		c = (c & 0x3F) << 6;
-		c2 = bytes[i];
-		if ((c2 & 0xC0) != 0x80) {
-		    goto free_badarg;
-		}
-		c = c | (c2 & 0x3F);
-		if (0x80 <= c && c < 256) {
-		    *dst++ = c;
-		} else {
-		    goto free_badarg;
-		}
-	    } else {
-	    free_badarg:
-		erts_free(ERTS_ALC_T_TMP, (void *) buf);
-		goto badarg;
+	    if ((p[0] & 0x80) == 0) {
+		++p;
+		--left;
 	    }
+	    else if (left >= 2
+		     && (p[0] & 0xFE) == 0xC2 /* only allow latin1 subset */
+		     && (p[1] & 0xC0) == 0x80) {
+		p += 2;
+		left -= 2;
+	    }
+	    else goto badarg;
 	}
-	num_chars = dst - buf;
-	if (num_chars > MAX_ATOM_LENGTH) {
-	    erts_free(ERTS_ALC_T_TMP, (void *) buf);
-	    goto system_limit;
-	}
+
 	if (!must_exist) {
-	    res = am_atom_put(buf, num_chars);
-	    erts_free(ERTS_ALC_T_TMP, (void *) buf);
-	    erts_free_aligned_binary_bytes(temp_alloc);
-	    BIF_RET(res);
-	} else {
-	    int exists = erts_atom_get(buf, num_chars, &res);
-	    erts_free(ERTS_ALC_T_TMP, (void *) buf);
-	    if (exists) {
-		erts_free_aligned_binary_bytes(temp_alloc);
-		BIF_RET(res);
-	    } else {
-		goto badarg;
-	    }
+	    res = am_atom_put((char*)bytes, bin_size);
+	}
+	else if (!erts_atom_get((char*)bytes, bin_size, &res, 0)) {
+	    goto badarg;
 	}
+	erts_free_aligned_binary_bytes(temp_alloc);
+	BIF_RET(res);
     } else {
     badarg:
 	erts_free_aligned_binary_bytes(temp_alloc);
@@ -2670,3 +2625,30 @@ BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0)
     }
 }
 
+/* Assumes 'dest' has enough room.
+ */
+int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen)
+{
+    byte* dp = dest;
+    while (slen > 0) {
+	if ((source[0] & 0x80) == 0) {
+	    *dp++ = *source++;
+	    --slen;
+	}
+	else if (slen > 1 &&
+		 (source[0] & 0xFE) == 0xC2 &&
+		 (source[1] & 0xC0) == 0x80) {
+	    *dp++ = (char) ((source[0] << 6) | (source[1] & 0x3F));
+	    source += 2;
+	    slen -= 2;
+	}
+	else {
+	    /* Just let unconvertable octets through. This should not happen
+	       in a correctly upgraded system */
+	    *dp++ = *source++;
+	    --slen;
+	}
+    }
+    return dp - dest;
+}
+
diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c
index 263ffc4eb3..68edcd0fa6 100644
--- a/erts/emulator/beam/external.c
+++ b/erts/emulator/beam/external.c
@@ -707,7 +707,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
 		    if (cix >= ERTS_ATOM_CACHE_SIZE)
 			ERTS_EXT_HDR_FAIL;
 		    ep++;
-#if MAX_ATOM_LENGTH > 255
+#if MAX_ATOM_CHARACTERS > 255
 		    if (long_atoms) {
 			CHKSIZE(2);
 			len = get_int16(ep);
@@ -720,7 +720,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
 			len = get_int8(ep);
 			ep++;
 		    }
-		    if (len > MAX_ATOM_LENGTH)
+		    if (len > MAX_ATOM_CHARACTERS)
 			ERTS_EXT_HDR_FAIL; /* Too long atom */
 		    CHKSIZE(len);
 		    atom = am_atom_put((char *) ep, len);
@@ -1431,18 +1431,33 @@ enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags)
     if (iix < 0) { 
 	i = atom_val(atom);
 	j = atom_tab(i)->len;
-	if ((MAX_ATOM_LENGTH <= 255 || j <= 255)
-	    && (dflags & DFLAG_SMALL_ATOM_TAGS)) {
-	    *ep++ = SMALL_ATOM_EXT;
-	    put_int8(j, ep);
-	    ep++;
+	if (dflags & DFLAG_UTF8_ATOMS) {
+	    if (j <= 255) {
+		*ep++ = ATOM_UTF8_EXT;
+		put_int16(j, ep);
+		ep += 2;
+	    }
+	    else {
+		*ep++ = SMALL_ATOM_UTF8_EXT;
+		put_int8(j, ep);
+		ep += 2;
+	    }
+	    sys_memcpy((char *) ep, (char*)atom_tab(i)->name, (int) j);
 	}
 	else {
-	    *ep++ = ATOM_EXT;
-	    put_int16(j, ep);
-	    ep += 2;
+	    if (j <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) {
+		*ep++ = SMALL_ATOM_EXT;
+		j = erts_utf8_to_latin1(ep+1, atom_tab(i)->name, j);
+		put_int8(j, ep);
+		ep++;
+	    }
+	    else {
+		*ep++ = ATOM_EXT;
+		j = erts_utf8_to_latin1(ep+2, atom_tab(i)->name, j);
+		put_int16(j, ep);
+		ep += 2;
+	    }	    
 	}
-	sys_memcpy((char *) ep, (char*)atom_tab(i)->name, (int) j);
 	ep += j;
 	return ep;
     }
@@ -1482,7 +1497,7 @@ static byte*
 dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp)
 {
     Uint len;
-    int n;
+    int n, is_latin1;
 
     switch (*ep++) {
     case ATOM_CACHE_REF:
@@ -1498,17 +1513,29 @@ dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp)
     case ATOM_EXT:
 	len = get_int16(ep),
 	ep += 2;
+	is_latin1 = 1;
         goto dec_atom_common;
     case SMALL_ATOM_EXT:
 	len = get_int8(ep);
 	ep++;
+	is_latin1 = 1;
+	goto dec_atom_common;
+    case ATOM_UTF8_EXT:
+	len = get_int16(ep),
+	ep += 2;
+	is_latin1 = 0;
+	goto dec_atom_common;
+    case SMALL_ATOM_UTF8_EXT:
+	len = get_int8(ep),
+	ep++;
+	is_latin1 = 0;
     dec_atom_common:
         if (edep && (edep->flags & ERTS_DIST_EXT_BTT_SAFE)) {
-	    if (!erts_atom_get((char*)ep, len, objp)) {
+	    if (!erts_atom_get((char*)ep, len, objp, is_latin1)) {
                 goto error;
 	    }
         } else {
-            *objp = am_atom_put((char*)ep, len);
+            *objp = am_atom_put2(ep, len, is_latin1);
         }
 	ep += len;
 	break;
@@ -2113,7 +2140,7 @@ static byte*
 dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Eterm* objp)
 {
     Eterm* hp_saved = *hpp;
-    int n;
+    int n, is_latin1;
     register Eterm* hp = *hpp;	/* Please don't take the address of hp */
     Eterm* next = objp;
 
@@ -2199,17 +2226,29 @@ dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Et
 	case ATOM_EXT:
 	    n = get_int16(ep);
 	    ep += 2;
-            goto dec_term_atom_common;
+	    is_latin1 = 1;
+	    goto dec_term_atom_common;
 	case SMALL_ATOM_EXT:
 	    n = get_int8(ep);
 	    ep++;
+	    is_latin1 = 1;
+	    goto dec_term_atom_common;
+	case ATOM_UTF8_EXT:
+	    n = get_int16(ep);
+	    ep += 2;
+	    is_latin1 = 0;
+	    goto dec_term_atom_common;
+	case SMALL_ATOM_UTF8_EXT:
+	    n = get_int8(ep);
+	    ep++;
+	    is_latin1 = 0;
 dec_term_atom_common:
 	    if (edep && (edep->flags & ERTS_DIST_EXT_BTT_SAFE)) {
-		if (!erts_atom_get((char*)ep, n, objp)) {
+		if (!erts_atom_get((char*)ep, n, objp, is_latin1)) {
 		    goto error;
 		}
 	    } else {
-	        *objp = am_atom_put((char*)ep, n);
+	        *objp = am_atom_put2(ep, n, is_latin1);
 	    }
 	    ep += n;
 	    break;
@@ -2879,14 +2918,15 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags)
 	    }
 	    else {
 		int alen = atom_tab(atom_val(obj))->len;
-		if ((MAX_ATOM_LENGTH <= 255 || alen <= 255)
-		    && (dflags & DFLAG_SMALL_ATOM_TAGS)) {
-		    /* Make sure a SMALL_ATOM_EXT fits: SMALL_ATOM_EXT l t1 t2... */
-			result += 1 + 1 + alen;
-		}
-		else {
-		    /* Make sure an ATOM_EXT fits: ATOM_EXT l1 l0 t1 t2... */
-			result += 1 + 2 + alen;
+		result += 1 + 1 + alen;
+		if (dflags & DFLAG_UTF8_ATOMS) {
+		    if (alen > 255) {
+			result++; /* ATOM_UTF8_EXT (not small) */
+		    }
+		    /*SVERK we use utf8 length which is an over estimation */
+		}		    
+		else if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS)) {
+		    result++; /* ATOM_EXT (not small) */
 		}
 		insert_acache_map(acmp, obj);
 	    }
@@ -3058,6 +3098,17 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags)
     return result;
 }
 
+static int is_valid_utf8_atom(byte* bytes, Uint nbytes)
+{
+    byte* err_pos;
+    Uint num_chars;
+
+    /*SVERK Do we really need to validate correct utf8? */
+    return nbytes <= MAX_ATOM_SZ_LIMIT
+	&& erts_analyze_utf8(bytes, nbytes, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK
+	&& num_chars <= MAX_ATOM_CHARACTERS; 
+}
+
 static Sint
 decoded_size(byte *ep, byte* endp, int internal_tags)
 {
@@ -3125,21 +3176,41 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
 	case ATOM_EXT:
 	    CHKSIZE(2);
 	    n = get_int16(ep);
-	    if (n > MAX_ATOM_LENGTH) {
+	    if (n > MAX_ATOM_CHARACTERS) {
 		return -1;
 	    }
 	    SKIP(n+2+atom_extra_skip);
 	    atom_extra_skip = 0;
 	    break;
+	case ATOM_UTF8_EXT:
+	    CHKSIZE(2);
+	    n = get_int16(ep);
+	    ep += 2;
+	    if (!is_valid_utf8_atom(ep, n)) {
+		return -1;
+	    }
+	    SKIP(n+atom_extra_skip);
+	    atom_extra_skip = 0;
+	    break;
 	case SMALL_ATOM_EXT:
 	    CHKSIZE(1);
 	    n = get_int8(ep);
-	    if (n > MAX_ATOM_LENGTH) {
+	    if (n > MAX_ATOM_CHARACTERS) {
 		return -1;
 	    }
 	    SKIP(n+1+atom_extra_skip);
 	    atom_extra_skip = 0;
 	    break;
+	case SMALL_ATOM_UTF8_EXT:
+	    CHKSIZE(1);
+	    n = get_int8(ep);
+	    ep++;
+	    if (!is_valid_utf8_atom(ep, n)) {
+		return -1;
+	    }
+	    SKIP(n+atom_extra_skip);
+	    atom_extra_skip = 0;
+	    break;
 	case ATOM_CACHE_REF:
 	    SKIP(1+atom_extra_skip);
 	    atom_extra_skip = 0;
diff --git a/erts/emulator/beam/external.h b/erts/emulator/beam/external.h
index eddd4571dd..50eea62225 100644
--- a/erts/emulator/beam/external.h
+++ b/erts/emulator/beam/external.h
@@ -51,6 +51,8 @@
 #define NEW_FUN_EXT       'p'
 #define EXPORT_EXT        'q'
 #define FUN_EXT           'u'
+#define ATOM_UTF8_EXT     'v'
+#define SMALL_ATOM_UTF8_EXT 'w'
 
 #define DIST_HEADER       'D'
 #define ATOM_CACHE_REF    'R'
diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h
index 4c0d3421c8..1500424d3e 100755
--- a/erts/emulator/beam/global.h
+++ b/erts/emulator/beam/global.h
@@ -1525,6 +1525,7 @@ char *erts_convert_filename_to_native(Eterm name, char *statbuf,
 				      int allow_empty, int allow_atom,
 				      Sint *used /* out */);
 Eterm erts_convert_native_to_filename(Process *p, byte *bytes);
+int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen);
 #define ERTS_UTF8_OK 0
 #define ERTS_UTF8_INCOMPLETE 1
 #define ERTS_UTF8_ERROR 2
diff --git a/erts/emulator/test/bif_SUITE.erl b/erts/emulator/test/bif_SUITE.erl
index e2442861c7..02c6de8cb1 100644
--- a/erts/emulator/test/bif_SUITE.erl
+++ b/erts/emulator/test/bif_SUITE.erl
@@ -481,8 +481,6 @@ binary_to_atom(Config) when is_list(Config) ->
     %% Bad UTF8 sequences.
     ?line ?BADARG(binary_to_atom(id(<<255>>), utf8)),
     ?line ?BADARG(binary_to_atom(id(<<255,0>>), utf8)),
-    ?line ?BADARG(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)),
-    ?line ?BADARG(binary_to_atom(id(<<0:512/unit:8,255,0>>), utf8)),
     ?line ?BADARG(binary_to_atom(id(<<16#C0,16#80>>), utf8)), %Overlong 0.
     ?line [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) ||
 	      C <- lists:seq(256, 16#D7FF)],
@@ -494,6 +492,8 @@ binary_to_atom(Config) when is_list(Config) ->
 	      C <- lists:seq(16#90000, 16#10FFFF)],
 
     %% system_limit failures.
+    ?line ?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)),
+    ?line ?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255,0>>), utf8)),
     ?line ?SYS_LIMIT(binary_to_atom(<<0:256/unit:8>>, latin1)),
     ?line ?SYS_LIMIT(binary_to_atom(<<0:257/unit:8>>, latin1)),
     ?line ?SYS_LIMIT(binary_to_atom(<<0:512/unit:8>>, latin1)),
-- 
cgit v1.2.3


From e4e007afd032f7aca359d2665f91ddb12727521a Mon Sep 17 00:00:00 2001
From: Sverker Eriksson <sverker@erlang.org>
Date: Tue, 18 Dec 2012 18:31:46 +0100
Subject: stdlib: Fix printing of unicode atoms

---
 lib/stdlib/src/io_lib.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/stdlib/src/io_lib.erl b/lib/stdlib/src/io_lib.erl
index 513d904c39..0f1f417d01 100644
--- a/lib/stdlib/src/io_lib.erl
+++ b/lib/stdlib/src/io_lib.erl
@@ -276,7 +276,7 @@ write_atom(Atom) ->
     Chars = atom_to_list(Atom),
     case quote_atom(Atom, Chars) of
 	true ->
-	    write_string(Chars, $');   %'
+	    write_unicode_string(Chars, $');   %'
 	false ->
 	    Chars
     end.
-- 
cgit v1.2.3


From 685d009efcfd7521e9c918a14b58eac19755299d Mon Sep 17 00:00:00 2001
From: Sverker Eriksson <sverker@erlang.org>
Date: Fri, 21 Dec 2012 15:50:21 +0100
Subject: erl_interface: Enable decode of unicode atoms

No API changes or additions. Just the ability for erl_interface to decode
unicode atoms and convert them into latin1 strings to preserve backward
compatibility for the existing API.
---
 lib/erl_interface/include/ei.h                |   2 +
 lib/erl_interface/src/decode/decode_atom.c    |  68 +++++++--
 lib/erl_interface/src/decode/decode_boolean.c |  32 ++---
 lib/erl_interface/src/decode/decode_pid.c     |  13 +-
 lib/erl_interface/src/decode/decode_port.c    |  12 +-
 lib/erl_interface/src/decode/decode_ref.c     |  26 +---
 lib/erl_interface/src/legacy/erl_marshal.c    | 200 +++++++++++++-------------
 lib/erl_interface/src/misc/ei_decode_term.c   |  37 +----
 lib/erl_interface/src/misc/ei_printterm.c     |   2 +
 lib/erl_interface/src/misc/get_type.c         |   8 +-
 lib/erl_interface/src/misc/putget.h           |   3 +
 11 files changed, 202 insertions(+), 201 deletions(-)

diff --git a/lib/erl_interface/include/ei.h b/lib/erl_interface/include/ei.h
index ae815b414a..8f07b24852 100644
--- a/lib/erl_interface/include/ei.h
+++ b/lib/erl_interface/include/ei.h
@@ -115,6 +115,8 @@
 #define ERL_FLOAT_EXT         'c'
 #define NEW_FLOAT_EXT         'F'
 #define ERL_ATOM_EXT          'd'
+#define ERL_SMALL_ATOM_EXT    's'
+#define ERL_UNICODE_ATOM_EXT  'v'
 #define ERL_REFERENCE_EXT     'e'
 #define ERL_NEW_REFERENCE_EXT 'r'
 #define ERL_PORT_EXT          'f'
diff --git a/lib/erl_interface/src/decode/decode_atom.c b/lib/erl_interface/src/decode/decode_atom.c
index c2e6a0426e..84edf1766a 100644
--- a/lib/erl_interface/src/decode/decode_atom.c
+++ b/lib/erl_interface/src/decode/decode_atom.c
@@ -21,24 +21,76 @@
 #include "eiext.h"
 #include "putget.h"
 
+static int utf8_to_latin1(char* dest, const char* source, unsigned len);
+
 int ei_decode_atom(const char *buf, int *index, char *p)
 {
   const char *s = buf + *index;
   const char *s0 = s;
   int len;
 
-  if (get8(s) != ERL_ATOM_EXT) return -1;
+  switch (get8(s)) {
+  case ERL_ATOM_EXT:
+      len = get16be(s);
+      if (len > MAXATOMLEN) return -1;
+      if (p) {
+	  memmove(p,s,len); 
+	  p[len] = (char)0;
+      }
+      break;
+
+  case ERL_SMALL_ATOM_EXT:
+      len = get8(s);
+      if (p) {
+	  memmove(p,s,len); 
+	  p[len] = (char)0;
+      }
+      break;
+
+  case ERL_UNICODE_ATOM_EXT:
+      len = get16be(s);
 
-  len = get16be(s);
+      if (len > 2*MAXATOMLEN) return -1;
 
-  if (len > MAXATOMLEN) return -1;
+      if (p && utf8_to_latin1(p, s, len) < 0) return -1;
+      break;
 
-  if (p) {
-    memmove(p,s,len); 
-    p[len] = (char)0;
+  default:
+      return -1;
   }
+
   s += len;
   *index += s-s0;
-  
-  return 0; 
+  return 0;
+}
+
+int ei_internal_get_atom(const char** bufp, char* p)
+{
+    int ix = 0;
+    if (ei_decode_atom(*bufp, &ix, p) < 0) return -1;
+    *bufp += ix;
+    return 0;
+}
+
+static int utf8_to_latin1(char* dest, const char* source, unsigned slen)
+{
+    const char* dest_end = dest + MAXATOMLEN - 1;
+
+    while (slen > 0 && dest < dest_end) {
+	if ((source[0] & 0x80) == 0) {
+	    *dest++ = *source++;
+	    --slen;
+	}
+	else if (slen > 1 &&
+		 (source[0] & 0xFE) == 0xC2 &&
+		 (source[1] & 0xC0) == 0x80) {
+	    *dest++ = (char) ((source[0] << 6) | (source[1] & 0x3F));
+	    source += 2;
+	    slen -= 2;
+	}
+	else return -1;
+    }
+    *dest = 0;
+    return 0;
 }
+
diff --git a/lib/erl_interface/src/decode/decode_boolean.c b/lib/erl_interface/src/decode/decode_boolean.c
index 9fd09c63f1..0a7a06f1d4 100644
--- a/lib/erl_interface/src/decode/decode_boolean.c
+++ b/lib/erl_interface/src/decode/decode_boolean.c
@@ -26,32 +26,20 @@ int ei_decode_boolean(const char *buf, int *index, int *p)
 {
   const char *s = buf + *index;
   const char *s0 = s;
-  int len;
+  char tbuf[MAXATOMLEN+1];
   int t;
 
-  if (get8(s) != ERL_ATOM_EXT) return -1;
+  if (get_atom(&s, tbuf) < 0) return -1;
 
-  len = get16be(s);
-
-  switch (len) {
-  case 4:
-    /* typecast makes ansi happy */
-    if (strncmp((char*)s,"true",4)) return -1;
-    t = 1;
-    break;
-
-  case 5:
-    if (strncmp((char*)s,"false",5)) return -1;
-    t = 0;
-    break;
-    
-  default:
-    return -1;
-  }
-  
-  s += len;
+  if (memcmp(tbuf, "true", 5) == 0)
+      t = 1;
+  else if (memcmp(tbuf, "false", 6) == 0)
+      t = 0;
+  else
+      return -1;
+      
   if (p) *p = t;
   *index += s-s0;
-
   return 0; 
 }
+
diff --git a/lib/erl_interface/src/decode/decode_pid.c b/lib/erl_interface/src/decode/decode_pid.c
index 9ed1c36db6..a762ae499e 100644
--- a/lib/erl_interface/src/decode/decode_pid.c
+++ b/lib/erl_interface/src/decode/decode_pid.c
@@ -21,6 +21,7 @@
 #include "eiext.h"
 #include "putget.h"
 
+
 int ei_decode_pid(const char *buf, int *index, erlang_pid *p)
 {
   const char *s = buf + *index;
@@ -30,17 +31,7 @@ int ei_decode_pid(const char *buf, int *index, erlang_pid *p)
   if (get8(s) != ERL_PID_EXT) return -1;
 
   /* first the nodename */
-  if (get8(s) != ERL_ATOM_EXT) return -1;
-
-  len = get16be(s);
-
-  if (len > MAXATOMLEN) return -1;
-  
-  if (p) {
-    memmove(p->node, s, len);
-    p->node[len] = (char)0;
-  }
-  s += len;
+  if (get_atom(&s, p->node) < 0) return -1;
   
   /* now the numbers: num (4), serial (4), creation (1) */
   if (p) {
diff --git a/lib/erl_interface/src/decode/decode_port.c b/lib/erl_interface/src/decode/decode_port.c
index 28abed801a..6eb2bc9197 100644
--- a/lib/erl_interface/src/decode/decode_port.c
+++ b/lib/erl_interface/src/decode/decode_port.c
@@ -30,17 +30,7 @@ int ei_decode_port(const char *buf, int *index, erlang_port *p)
   if (get8(s) != ERL_PORT_EXT) return -1;
 
   /* first the nodename */
-  if (get8(s) != ERL_ATOM_EXT) return -1;
-
-  len = get16be(s);
-
-  if (len > MAXATOMLEN) return -1;
-
-  if (p) {
-    memmove(p->node, s, len);
-    p->node[len] = (char)0;
-  }
-  s += len;
+  if (get_atom(&s, p->node) < 0) return -1;
   
   /* now the numbers: num (4), creation (1) */
   if (p) {
diff --git a/lib/erl_interface/src/decode/decode_ref.c b/lib/erl_interface/src/decode/decode_ref.c
index 7b15808bc5..df3c30777b 100644
--- a/lib/erl_interface/src/decode/decode_ref.c
+++ b/lib/erl_interface/src/decode/decode_ref.c
@@ -21,6 +21,7 @@
 #include "eiext.h"
 #include "putget.h"
 
+
 int ei_decode_ref(const char *buf, int *index, erlang_ref *p)
 {
   const char *s = buf + *index;
@@ -30,18 +31,8 @@ int ei_decode_ref(const char *buf, int *index, erlang_ref *p)
   switch (get8(s)) {
     case ERL_REFERENCE_EXT:
 
-      /* first the nodename */
-      if (get8(s) != ERL_ATOM_EXT) return -1;
-
-      len = get16be(s);
-
-      if (len > MAXATOMLEN) return -1;
-
-      if (p) {
-	  memmove(p->node, s, len);
-	  p->node[len] = (char)0;
-      }
-      s += len;
+      /* nodename */
+      if (get_atom(&s, p->node) < 0) return -1;
   
       /* now the numbers: num (4), creation (1) */
       if (p) {
@@ -62,15 +53,7 @@ int ei_decode_ref(const char *buf, int *index, erlang_ref *p)
       if (p) p->len = count;
 
       /* then the nodename */
-      if (get8(s) != ERL_ATOM_EXT) return -1;
-      len = get16be(s);
-      if (len > MAXATOMLEN) return -1;
-
-      if (p) {
-	  memmove(p->node, s, len);
-	  p->node[len] = (char)0;
-      }
-      s += len;
+      if (get_atom(&s, p->node) < 0) return -1;
 
       /* creation */
       if (p) {
@@ -95,3 +78,4 @@ int ei_decode_ref(const char *buf, int *index, erlang_ref *p)
       return -1;
   }
 }
+
diff --git a/lib/erl_interface/src/legacy/erl_marshal.c b/lib/erl_interface/src/legacy/erl_marshal.c
index dad715c762..775d7e82ca 100644
--- a/lib/erl_interface/src/legacy/erl_marshal.c
+++ b/lib/erl_interface/src/legacy/erl_marshal.c
@@ -51,7 +51,13 @@ static void erl_long_to_fp(long l, unsigned *d);
 
 #define CMP_ARRAY_SIZE 256
 /* FIXME problem for threaded ? */
-static char cmp_array[CMP_ARRAY_SIZE]; 
+
+static enum
+{
+    ERL_NUM_CMP=1, ERL_ATOM_CMP, ERL_REF_CMP, ERL_FUN_CMP, ERL_PORT_CMP,
+    ERL_PID_CMP, ERL_TUPLE_CMP, ERL_NIL_CMP, ERL_LIST_CMP, ERL_BIN_CMP
+}cmp_array[CMP_ARRAY_SIZE];
+
 static int init_cmp_array_p=1; /* initialize array, the first time */
 
 #if defined(VXWORKS) && CPU == PPC860
@@ -69,10 +75,8 @@ static int init_cmp_array_p=1; /* initialize array, the first time */
 static int cmp_floats(double f1, double f2);
 static INLINE double to_float(long l);
 
-#define ERL_NUM_CMP 1
-#define ERL_REF_CMP 3
-
 #define IS_ERL_NUM(t) (cmp_array[t]==ERL_NUM_CMP)
+#define IS_ERL_ATOM(t) (cmp_array[t]==ERL_ATOM_CMP)
 
 #define CMP_NUM_CLASS_SIZE 256
 static unsigned char cmp_num_class[CMP_NUM_CLASS_SIZE]; 
@@ -100,25 +104,27 @@ void erl_init_marshal(void)
 {
   if (init_cmp_array_p) {
     memset(cmp_array, 0, CMP_ARRAY_SIZE);
-    cmp_array[ERL_SMALL_INTEGER_EXT] = 1;
-    cmp_array[ERL_INTEGER_EXT]       = 1;
-    cmp_array[ERL_FLOAT_EXT]         = 1;
-    cmp_array[NEW_FLOAT_EXT]         = 1;
-    cmp_array[ERL_SMALL_BIG_EXT]     = 1;
-    cmp_array[ERL_LARGE_BIG_EXT]     = 1;
-    cmp_array[ERL_ATOM_EXT]          = 2;
-    cmp_array[ERL_REFERENCE_EXT]     = 3;
-    cmp_array[ERL_NEW_REFERENCE_EXT] = 3;
-    cmp_array[ERL_FUN_EXT]           = 4;
-    cmp_array[ERL_NEW_FUN_EXT]       = 4;
-    cmp_array[ERL_PORT_EXT]          = 5;
-    cmp_array[ERL_PID_EXT]           = 6;
-    cmp_array[ERL_SMALL_TUPLE_EXT]   = 7;
-    cmp_array[ERL_LARGE_TUPLE_EXT]   = 7;
-    cmp_array[ERL_NIL_EXT]           = 8;
-    cmp_array[ERL_STRING_EXT]        = 9;
-    cmp_array[ERL_LIST_EXT]          = 9;
-    cmp_array[ERL_BINARY_EXT]        = 10;
+    cmp_array[ERL_SMALL_INTEGER_EXT] = ERL_NUM_CMP;
+    cmp_array[ERL_INTEGER_EXT]       = ERL_NUM_CMP;
+    cmp_array[ERL_FLOAT_EXT]         = ERL_NUM_CMP;
+    cmp_array[NEW_FLOAT_EXT]         = ERL_NUM_CMP;
+    cmp_array[ERL_SMALL_BIG_EXT]     = ERL_NUM_CMP;
+    cmp_array[ERL_LARGE_BIG_EXT]     = ERL_NUM_CMP;
+    cmp_array[ERL_ATOM_EXT]          = ERL_ATOM_CMP;
+    cmp_array[ERL_SMALL_ATOM_EXT]    = ERL_ATOM_CMP;
+    cmp_array[ERL_UNICODE_ATOM_EXT]  = ERL_ATOM_CMP;
+    cmp_array[ERL_REFERENCE_EXT]     = ERL_REF_CMP;
+    cmp_array[ERL_NEW_REFERENCE_EXT] = ERL_REF_CMP;
+    cmp_array[ERL_FUN_EXT]           = ERL_FUN_CMP;
+    cmp_array[ERL_NEW_FUN_EXT]       = ERL_FUN_CMP;
+    cmp_array[ERL_PORT_EXT]          = ERL_PORT_CMP;
+    cmp_array[ERL_PID_EXT]           = ERL_PID_CMP;
+    cmp_array[ERL_SMALL_TUPLE_EXT]   = ERL_TUPLE_CMP;
+    cmp_array[ERL_LARGE_TUPLE_EXT]   = ERL_TUPLE_CMP;
+    cmp_array[ERL_NIL_EXT]           = ERL_NIL_CMP;
+    cmp_array[ERL_STRING_EXT]        = ERL_LIST_CMP;
+    cmp_array[ERL_LIST_EXT]          = ERL_LIST_CMP;
+    cmp_array[ERL_BINARY_EXT]        = ERL_BIN_CMP;
     init_cmp_array_p = 0;
   }
   if (init_cmp_num_class_p) {
@@ -644,31 +650,14 @@ int erl_encode_buf(ETERM *ep, unsigned char **ext)
 
 } /* erl_encode_buf */
 
-/*
- * A nice macro to make it look cleaner in the 
- * cases of PID's,PORT's and REF's below. 
- * It reads the NODE name from a buffer.
- */
-#define READ_THE_NODE(ext,cp,len,i) \
-/* eat first atom, repr. the node */ \
-if (**ext != ERL_ATOM_EXT) \
-  return (ETERM *) NULL; \
-*ext += 1; \
-i = (**ext << 8) | (*ext)[1]; \
-cp = (char *) *(ext) + 2; \
-*ext += (i + 2); \
-len = i
-
-#define STATIC_NODE_BUF_SZ 30
-
-#define SET_NODE(node,node_buf,cp,len) \
-if (len >= STATIC_NODE_BUF_SZ) node = erl_malloc(len+1); \
-else node = node_buf; \
-memcpy(node, cp, len); \
-node[len] = '\0'
-
-#define RESET_NODE(node,len) \
-if (len >= STATIC_NODE_BUF_SZ) free(node)
+
+static int read_atom(unsigned char** ext, char* dst)
+{
+    int offs = 0;
+    int ret = ei_decode_atom((char*)*ext, &offs, dst);
+    *ext += offs;
+    return ret;
+}
 
 /*
  * The actual DECODE engine.
@@ -676,16 +665,17 @@ if (len >= STATIC_NODE_BUF_SZ) free(node)
  */
 static ETERM *erl_decode_it(unsigned char **ext)
 {
+    char atom_buf[MAXATOMLEN+1];
     char *cp;
     ETERM *ep,*tp,*np;
     unsigned int u,sign;
-    int i,j,len,arity;
+    int i,j,arity;
     double ff;
     
     /* Assume we are going to decode an integer */
     ep = erl_alloc_eterm(ERL_INTEGER);
     ERL_COUNT(ep) = 1;
-    
+
     switch (*(*ext)++) 
     {
     case ERL_INTEGER_EXT:
@@ -774,27 +764,27 @@ static ETERM *erl_decode_it(unsigned char **ext)
 	return ep;
       
     case ERL_ATOM_EXT:
+    case ERL_SMALL_ATOM_EXT:
+    case ERL_UNICODE_ATOM_EXT:
 	ERL_TYPE(ep) = ERL_ATOM;
-	i = (**ext << 8) | (*ext)[1];
-	cp = (char *) *(ext) + 2;
-	*ext += (i + 2);
+	--(*ext);
+	if (read_atom(ext, atom_buf) < 0) return NULL;
+
+	i = strlen(atom_buf);
 	ep->uval.aval.len = i;
 	ep->uval.aval.a = (char *) erl_malloc(i+1);
-	memcpy(ep->uval.aval.a, cp, i);
-	ep->uval.aval.a[i]='\0';
+	memcpy(ep->uval.aval.a, atom_buf, i+1);
 	return ep;
       
     case ERL_PID_EXT:
 	erl_free_term(ep);
 	{			/* Why not use the constructors? */
-	    char *node;
-	    char node_buf[STATIC_NODE_BUF_SZ];
+	    char* node = atom_buf;
 	    unsigned int number, serial;
 	    unsigned char creation;
 	    ETERM *eterm_p;
 
-	    READ_THE_NODE(ext,cp,len,i);
-	    SET_NODE(node,node_buf,cp,len);
+	    if (read_atom(ext, node) < 0) return NULL;
 
 	    /* get the integers */
 #if 0
@@ -816,20 +806,17 @@ static ETERM *erl_decode_it(unsigned char **ext)
 #endif
 	    creation =  *(*ext)++; 
 	    eterm_p = erl_mk_pid(node, number, serial, creation);
-	    RESET_NODE(node,len);
 	    return eterm_p;
 	}
     case ERL_REFERENCE_EXT:
 	erl_free_term(ep);
 	{
-	    char *node;
-	    char node_buf[STATIC_NODE_BUF_SZ];
+	    char* node = atom_buf;
 	    unsigned int number;
 	    unsigned char creation;
 	    ETERM *eterm_p;
 
-	    READ_THE_NODE(ext,cp,len,i);
-	    SET_NODE(node,node_buf,cp,len);
+	    if (read_atom(ext, node) < 0) return NULL;
 
 	    /* get the integers */
 #if 0
@@ -841,15 +828,13 @@ static ETERM *erl_decode_it(unsigned char **ext)
 #endif
 	    creation =  *(*ext)++; 
 	    eterm_p = erl_mk_ref(node, number, creation);
-	    RESET_NODE(node,len);
 	    return eterm_p;
 	}
 
     case ERL_NEW_REFERENCE_EXT: 
 	erl_free_term(ep);
 	{
-	    char *node;
-	    char node_buf[STATIC_NODE_BUF_SZ];
+	    char* node = atom_buf;
 	    size_t cnt, i;
 	    unsigned int n[3];
 	    unsigned char creation;
@@ -862,8 +847,7 @@ static ETERM *erl_decode_it(unsigned char **ext)
 	    *ext += 2;
 #endif
 
-	    READ_THE_NODE(ext,cp,len,i);
-	    SET_NODE(node,node_buf,cp,len);
+	    if (read_atom(ext, node) < 0) return NULL;
 
 	    /* get the integers */
 	    creation =  *(*ext)++; 
@@ -878,21 +862,18 @@ static ETERM *erl_decode_it(unsigned char **ext)
 #endif
 	    }
 	    eterm_p = __erl_mk_reference(node, cnt, n, creation);
-	    RESET_NODE(node,len);
 	    return eterm_p;
 	}
 
     case ERL_PORT_EXT:
 	erl_free_term(ep);
 	{
-	    char *node;
-	    char node_buf[STATIC_NODE_BUF_SZ];
+	    char* node = atom_buf;
 	    unsigned int number;
 	    unsigned char creation;
 	    ETERM *eterm_p;
 
-	    READ_THE_NODE(ext,cp,len,i);
-	    SET_NODE(node,node_buf,cp,len);
+	    if (read_atom(ext, node) < 0) return NULL;
 
 	    /* get the integers */
 #if 0
@@ -904,7 +885,6 @@ static ETERM *erl_decode_it(unsigned char **ext)
 #endif
 	    creation =  *(*ext)++; 
 	    eterm_p = erl_mk_port(node, number, creation);
-	    RESET_NODE(node,len);
 	    return eterm_p;
 	}
 
@@ -1140,6 +1120,8 @@ unsigned char erl_ext_type(unsigned char *ext)
     case ERL_INTEGER_EXT:
 	return ERL_INTEGER;
     case ERL_ATOM_EXT:
+    case ERL_SMALL_ATOM_EXT:
+    case ERL_UNICODE_ATOM_EXT:
 	return ERL_ATOM;
     case ERL_PID_EXT:
 	return ERL_PID;
@@ -1191,6 +1173,8 @@ int erl_ext_size(unsigned char *t)
     case ERL_SMALL_INTEGER_EXT:
     case ERL_INTEGER_EXT:
     case ERL_ATOM_EXT:
+    case ERL_SMALL_ATOM_EXT:
+    case ERL_UNICODE_ATOM_EXT:
     case ERL_PID_EXT:
     case ERL_PORT_EXT:
     case ERL_REFERENCE_EXT:
@@ -1229,15 +1213,31 @@ int erl_ext_size(unsigned char *t)
 
 } /* ext_size */
 
-/*
- * A nice macro that eats up the atom pointed to.
- */
-#define JUMP_ATOM(ext,i) \
-if (**ext != ERL_ATOM_EXT) \
-  return 0; \
-*ext += 1; \
-i = (**ext << 8) | (*ext)[1]; \
-*ext += (i + 2)
+
+static int jump_atom(unsigned char** ext)
+{
+    unsigned char* e = *ext;
+    int len;
+
+    switch (*e++) {
+    case ERL_ATOM_EXT:
+    case ERL_UNICODE_ATOM_EXT:
+	len = (e[0] << 8) | e[1];
+	e += (len + 2);
+	break;
+
+    case ERL_SMALL_ATOM_EXT:
+	len = e[0];
+	e += (len + 1);
+	break;
+
+    default:
+	return 0;
+    }
+    *ext = e;
+    return 1;
+}
+
 
 /*
  * MOVE the POINTER PAST the ENCODED ETERM we
@@ -1259,25 +1259,26 @@ static int jump(unsigned char **ext)
 	*ext += 1;
 	break;
     case ERL_ATOM_EXT:
-	i = (**ext << 8) | (*ext)[1];
-	*ext += (i + 2);
+    case ERL_SMALL_ATOM_EXT:
+    case ERL_UNICODE_ATOM_EXT:
+	jump_atom(ext);
 	break;
     case ERL_PID_EXT:
 	/* eat first atom */
-	JUMP_ATOM(ext,i);
+	if (!jump_atom(ext)) return 0;
 	*ext += 9;		/* Two int's and the creation field */
 	break;
     case ERL_REFERENCE_EXT:
     case ERL_PORT_EXT:
 	/* first field is an atom */
-	JUMP_ATOM(ext,i);
+	if (!jump_atom(ext)) return 0;
 	*ext += 5;		/* One int and the creation field */
 	break;
     case ERL_NEW_REFERENCE_EXT:
 	n = (**ext << 8) | (*ext)[1];
 	*ext += 2;
 	/* first field is an atom */
-	JUMP_ATOM(ext,i);
+	if (!jump_atom(ext)) return 0;
 	*ext += 4*n+1;
 	break;
     case ERL_NIL_EXT:
@@ -1437,9 +1438,8 @@ do {								\
 
 #define CMP_EXT_SKIP_ATOM(EP)					\
 do {								\
-    if ((EP)[0] != ERL_ATOM_EXT)				\
+    if (!jump_atom(&(EP)))					\
 	return CMP_EXT_ERROR_CODE;				\
-    (EP) += 3 + ((EP)[1] << 8 | (EP)[2]);			\
 } while (0)
 
 /* 
@@ -1569,6 +1569,7 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2)
   }
 
   *e2 += 1;
+  i = j = 0;
   switch (*(*e1)++) 
     {
     case ERL_SMALL_INTEGER_EXT:
@@ -1589,11 +1590,16 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2)
       *e1 += 4; *e2 += 4;
       return ret;
     case ERL_ATOM_EXT:
-      i = (**e1 << 8) | (*e1)[1];
-      j = (**e2 << 8) | (*e2)[1];
-      ret = cmpbytes(*e1 +2, i, *e2 +2, j);
-      *e1 += (i + 2);
-      *e2 += (j + 2);
+    case ERL_UNICODE_ATOM_EXT:
+      i = (**e1) << 8; (*e1)++;
+      j = (**e2) << 8; (*e2)++;
+      /*fall through*/
+    case ERL_SMALL_ATOM_EXT:
+      i |= (**e1); (*e1)++;
+      j |= (**e2); (*e2)++;
+      ret = cmpbytes(*e1, i, *e2, j);
+      *e1 += i;
+      *e2 += j;
       return ret;
     case ERL_PID_EXT: {
       unsigned char *n1 = *e1;
@@ -1622,7 +1628,7 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2)
     }
     case ERL_PORT_EXT:
       /* First compare node names ... */
-      if (**e1 != ERL_ATOM_EXT || **e2 != ERL_ATOM_EXT)
+      if (!IS_ERL_ATOM(**e1) || !IS_ERL_ATOM(**e2))
 	  return CMP_EXT_ERROR_CODE;
       ret = cmp_exe2(e1, e2);
       *e1 += 5; *e2 += 5;
diff --git a/lib/erl_interface/src/misc/ei_decode_term.c b/lib/erl_interface/src/misc/ei_decode_term.c
index 0b82ef0e35..6773f90bfc 100644
--- a/lib/erl_interface/src/misc/ei_decode_term.c
+++ b/lib/erl_interface/src/misc/ei_decode_term.c
@@ -48,20 +48,12 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term)
     case NEW_FLOAT_EXT:
         return ei_decode_double(buf, index, &term->value.d_val);
     case ERL_ATOM_EXT:
-	len = get16be(s);
-	if (len > MAXATOMLEN) return -1;
-	memcpy(term->value.atom_name, s, len); 
-	term->value.atom_name[len] = '\0';
-	s += len;
-	break;
+    case ERL_SMALL_ATOM_EXT:
+    case ERL_UNICODE_ATOM_EXT:
+	return ei_decode_atom(buf, index, term->value.atom_name);
     case ERL_REFERENCE_EXT:
 	/* first the nodename */
-	if (get8(s) != ERL_ATOM_EXT) return -1;
-	len = get16be(s);
-	if (len > MAXATOMLEN) return -1;
-	memcpy(term->value.ref.node, s, len);
-	term->value.ref.node[len] = '\0';
-	s += len;
+	if (get_atom(&s, term->value.ref.node) < 0) return -1;
         /* now the numbers: num (4), creation (1) */
 	term->value.ref.n[0] = get32be(s);
 	term->value.ref.len = 1;
@@ -71,12 +63,7 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term)
 	/* first the integer count */
 	term->value.ref.len = get16be(s);
 	/* then the nodename */
-	if (get8(s) != ERL_ATOM_EXT) return -1;
-	len = get16be(s);
-	if (len > MAXATOMLEN) return -1;
-	memcpy(term->value.ref.node, s, len);
-	term->value.ref.node[len] = '\0';
-	s += len;
+	if (get_atom(&s, term->value.ref.node) < 0) return -1;
 	/* creation */
 	term->value.ref.creation = get8(s) & 0x03;
 	/* finally the id integers */
@@ -88,22 +75,12 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term)
 	}
 	break;
     case ERL_PORT_EXT:
-	if (get8(s) != ERL_ATOM_EXT) return -1;
-	len = get16be(s);
-	if (len > MAXATOMLEN) return -1;
-	memcpy(term->value.port.node, s, len);
-	term->value.port.node[len] = '\0';
+	if (get_atom(&s, term->value.port.node) < 0) return -1;
 	term->value.port.id = get32be(s) & 0x0fffffff; /* 28 bits */;
 	term->value.port.creation = get8(s) & 0x03;
 	break;
     case ERL_PID_EXT:
-	if (get8(s) != ERL_ATOM_EXT) return -1;
-	/* name first */
-	len = get16be(s); 
-	if (len > MAXATOMLEN) return -1;
-	memcpy(term->value.pid.node, s, len);
-	term->value.pid.node[len] = '\0';
-	s += len;
+	if (get_atom(&s, term->value.pid.node) < 0) return -1;
 	/* now the numbers: num (4), serial (4), creation (1) */
 	term->value.pid.num = get32be(s) & 0x7fff; /* 15 bits */
 	term->value.pid.serial = get32be(s) & 0x1fff; /* 13 bits */
diff --git a/lib/erl_interface/src/misc/ei_printterm.c b/lib/erl_interface/src/misc/ei_printterm.c
index 5fc6b3542c..620c6e72e2 100644
--- a/lib/erl_interface/src/misc/ei_printterm.c
+++ b/lib/erl_interface/src/misc/ei_printterm.c
@@ -133,6 +133,8 @@ static int print_term(FILE* fp, ei_x_buff* x,
     ei_get_type_internal(buf, index, &ty, &n);
     switch (ty) {
     case ERL_ATOM_EXT:
+    case ERL_SMALL_ATOM_EXT:
+    case ERL_UNICODE_ATOM_EXT:
 	if (ei_decode_atom(buf, index, a) < 0)
 	   goto err;
 	doquote = !islower((int)a[0]);
diff --git a/lib/erl_interface/src/misc/get_type.c b/lib/erl_interface/src/misc/get_type.c
index 2a680d0f94..c9a040fbbd 100644
--- a/lib/erl_interface/src/misc/get_type.c
+++ b/lib/erl_interface/src/misc/get_type.c
@@ -55,10 +55,12 @@ int ei_get_type(const char *buf, const int *index, int *type, int *len)
 	break;
 
     case ERL_SMALL_TUPLE_EXT:
+    case ERL_SMALL_ATOM_EXT:
 	*len  = get8(s);
 	break;
     
     case ERL_ATOM_EXT:
+    case ERL_UNICODE_ATOM_EXT:
     case ERL_STRING_EXT:
 	*len  = get16be(s);
 	break;
@@ -114,10 +116,14 @@ int ei_get_type_internal(const char *buf, const int *index,
   *type = get8(s);
   
   switch (*type) {
+  case ERL_SMALL_ATOM_EXT:
+    *type = ERL_ATOM_EXT;
   case ERL_SMALL_TUPLE_EXT:
     *len = get8(s);
     break;
-    
+
+  case ERL_UNICODE_ATOM_EXT:
+    *type = ERL_ATOM_EXT;
   case ERL_ATOM_EXT:
   case ERL_STRING_EXT:
     *len = get16be(s);
diff --git a/lib/erl_interface/src/misc/putget.h b/lib/erl_interface/src/misc/putget.h
index 7a43de324b..8b0d4d3404 100644
--- a/lib/erl_interface/src/misc/putget.h
+++ b/lib/erl_interface/src/misc/putget.h
@@ -105,6 +105,9 @@
        ((EI_ULONGLONG)((unsigned char *)(s))[-2] << 8)  | \
         (EI_ULONGLONG)((unsigned char *)(s))[-1]))
 
+int ei_internal_get_atom(const char** bufp, char* p);
+#define get_atom ei_internal_get_atom
+
 typedef union float_ext {
     double d;
     EI_ULONGLONG val;
-- 
cgit v1.2.3


From 5d79f55ca441727578d34b78ee0d6d8aa80976ee Mon Sep 17 00:00:00 2001
From: Rickard Green <rickard@erlang.org>
Date: Sat, 5 Jan 2013 03:07:14 +0100
Subject: Implement UTF-8 atom support for jinterface

---
 .../com/ericsson/otp/erlang/AbstractNode.java      |   4 +-
 .../com/ericsson/otp/erlang/OtpErlangAtom.java     |   2 +-
 .../com/ericsson/otp/erlang/OtpExternal.java       |   6 +
 .../com/ericsson/otp/erlang/OtpInputStream.java    |  64 +++++++--
 .../com/ericsson/otp/erlang/OtpOutputStream.java   |  60 ++++++++-
 lib/jinterface/test/nc_SUITE.erl                   | 146 +++++++++++++++------
 6 files changed, 226 insertions(+), 56 deletions(-)

diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/AbstractNode.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/AbstractNode.java
index 16cb544a16..c76fad5e45 100644
--- a/lib/jinterface/java_src/com/ericsson/otp/erlang/AbstractNode.java
+++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/AbstractNode.java
@@ -90,6 +90,8 @@ public class AbstractNode {
     static final int dFlagExportPtrTag = 0x200; // NOT SUPPORTED
     static final int dFlagBitBinaries = 0x400;
     static final int dFlagNewFloats = 0x800;
+    static final int dFlagUnicodeIo = 0x1000;
+    static final int dFlagUtf8Atoms = 0x10000;
 
     int ntype = NTYPE_R6;
     int proto = 0; // tcp/ip
@@ -98,7 +100,7 @@ public class AbstractNode {
     int creation = 0;
     int flags = dFlagExtendedReferences | dFlagExtendedPidsPorts
 	    | dFlagBitBinaries | dFlagNewFloats | dFlagFunTags
-	    | dflagNewFunTags;
+	    | dflagNewFunTags | dFlagUtf8Atoms;
 
     /* initialize hostname and default cookie */
     static {
diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangAtom.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangAtom.java
index ced4dbb8c2..2768edc6fa 100644
--- a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangAtom.java
+++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangAtom.java
@@ -51,7 +51,7 @@ public class OtpErlangAtom extends OtpErlangObject implements Serializable,
 		    "null string value");
 	}
 
-	if (atom.length() > maxAtomLength) {
+	if (atom.codePointCount(0, atom.length()) > maxAtomLength) {
 	    throw new java.lang.IllegalArgumentException("Atom may not exceed "
 		    + maxAtomLength + " characters: " + atom);
 	}
diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpExternal.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpExternal.java
index e70b9a786b..2a4cd4fa2d 100644
--- a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpExternal.java
+++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpExternal.java
@@ -88,6 +88,12 @@ public class OtpExternal {
     /** The tag used for old Funs */
     public static final int funTag = 117;
 
+    /** The tag used for unicode atoms */
+    public static final int atomUtf8Tag = 118;
+
+    /** The tag used for small unicode atoms */
+    public static final int smallAtomUtf8Tag = 119;
+
     /** The tag used for compressed terms */
     public static final int compressedTag = 80;
 
diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpInputStream.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpInputStream.java
index ae5f4ee072..c2a79af841 100644
--- a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpInputStream.java
+++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpInputStream.java
@@ -351,26 +351,64 @@ public class OtpInputStream extends ByteArrayInputStream {
      */
     public String read_atom() throws OtpErlangDecodeException {
 	int tag;
-	int len;
+	int len = -1;
 	byte[] strbuf;
 	String atom;
 
 	tag = read1skip_version();
 
-	if (tag != OtpExternal.atomTag) {
-	    throw new OtpErlangDecodeException(
-		    "wrong tag encountered, expected " + OtpExternal.atomTag
-			    + ", got " + tag);
-	}
+	switch (tag) {
 
-	len = read2BE();
+	case OtpExternal.atomTag:
+	    len = read2BE();
+	    strbuf = new byte[len];
+	    this.readN(strbuf);
+	    try {
+		atom = new String(strbuf, "ISO-8859-1");
+	    } catch (final java.io.UnsupportedEncodingException e) {
+		throw new OtpErlangDecodeException(
+		    "Failed to decode ISO-8859-1 atom");
+	    }
+	    if (atom.length() > OtpExternal.maxAtomLength) {
+		/*
+		 * Throwing an exception would be better I think,
+		 * but truncation seems to be the way it has
+		 * been done in other parts of OTP...
+		 */
+		atom = atom.substring(0, OtpExternal.maxAtomLength);
+	    }
+	    break;
 
-	strbuf = new byte[len];
-	this.readN(strbuf);
-	atom = OtpErlangString.newString(strbuf);
+	case OtpExternal.smallAtomUtf8Tag:
+	    len = read1();
+	    /* fall through */
+	case OtpExternal.atomUtf8Tag:
+	    if (len < 0) {
+		len = read2BE();
+	    }
+	    strbuf = new byte[len];
+	    this.readN(strbuf);
+	    try {
+		atom = new String(strbuf, "UTF-8");
+	    } catch (final java.io.UnsupportedEncodingException e) {
+		throw new OtpErlangDecodeException(
+		    "Failed to decode UTF-8 atom");
+	    }
+	    if (atom.codePointCount(0, atom.length()) > OtpExternal.maxAtomLength) {
+		/*
+		 * Throwing an exception would be better I think,
+		 * but truncation seems to be the way it has
+		 * been done in other parts of OTP...
+		 */
+		final int[] cps = OtpErlangString.stringToCodePoints(atom);
+		atom = new String(cps, 0, OtpExternal.maxAtomLength);
+	    }
+	    break;
 
-	if (atom.length() > OtpExternal.maxAtomLength) {
-	    atom = atom.substring(0, OtpExternal.maxAtomLength);
+	default:
+	    throw new OtpErlangDecodeException(
+		"wrong tag encountered, expected " + OtpExternal.atomTag
+		+ ", or "  + OtpExternal.atomUtf8Tag + ", got " + tag);
 	}
 
 	return atom;
@@ -1152,6 +1190,8 @@ public class OtpInputStream extends ByteArrayInputStream {
 	    return new OtpErlangLong(this);
 
 	case OtpExternal.atomTag:
+	case OtpExternal.smallAtomUtf8Tag:
+	case OtpExternal.atomUtf8Tag:
 	    return new OtpErlangAtom(this);
 
 	case OtpExternal.floatTag:
diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpOutputStream.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpOutputStream.java
index 22ebb4688a..10bdf389cd 100644
--- a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpOutputStream.java
+++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpOutputStream.java
@@ -343,9 +343,63 @@ public class OtpOutputStream extends ByteArrayOutputStream {
      *            the string to write.
      */
     public void write_atom(final String atom) {
-	write1(OtpExternal.atomTag);
-	write2BE(atom.length());
-	writeN(atom.getBytes());
+	String enc_atom;
+	byte[] bytes;
+	boolean isLatin1 = true;
+
+	if (atom.codePointCount(0, atom.length()) <= OtpExternal.maxAtomLength) {
+	    enc_atom = atom;
+	}
+	else {
+	    /*
+	     * Throwing an exception would be better I think,
+	     * but truncation seems to be the way it has
+	     * been done in other parts of OTP...
+	     */
+	    enc_atom = new String(OtpErlangString.stringToCodePoints(atom),
+				  0, OtpExternal.maxAtomLength);
+	}
+
+	for (int offset = 0; offset < enc_atom.length();) {
+	    final int cp = enc_atom.codePointAt(offset);
+	    if ((cp & ~0xFF) != 0) {
+		isLatin1 = false;
+		break;
+	    }
+	    offset += Character.charCount(cp);
+	}
+	try {
+	    if (isLatin1) {
+		bytes = enc_atom.getBytes("ISO-8859-1");
+		write1(OtpExternal.atomTag);
+		write2BE(bytes.length);
+	    }
+	    else {
+		bytes = enc_atom.getBytes("UTF-8");
+		final int length = bytes.length;
+		if (length < 256) {
+		    write1(OtpExternal.smallAtomUtf8Tag);
+		    write1(length);
+		}
+		else {
+		    write1(OtpExternal.atomUtf8Tag);
+		    write2BE(length);
+		}
+	    }
+	    writeN(bytes);
+	} catch (final java.io.UnsupportedEncodingException e) {
+	    /*
+	     * Sigh, why didn't the API designer add an
+	     * OtpErlangEncodeException to these encoding
+	     * functions?!? Instead of changing the API we
+	     * write an invalid atom and let it fail for
+	     * whoever trying to decode this... Sigh,
+	     * again...
+	     */
+	    write1(OtpExternal.smallAtomUtf8Tag);
+	    write1(2);
+	    write2BE(0xffff); /* Invalid UTF-8 */
+	}
     }
 
     /**
diff --git a/lib/jinterface/test/nc_SUITE.erl b/lib/jinterface/test/nc_SUITE.erl
index d5388e54f4..ccc87a6761 100644
--- a/lib/jinterface/test/nc_SUITE.erl
+++ b/lib/jinterface/test/nc_SUITE.erl
@@ -22,6 +22,15 @@
 -include_lib("common_test/include/ct.hrl").
 -include("test_server_line.hrl").
 
+-define(VERSION_MAGIC,       131).
+
+-define(ATOM_EXT,            100).
+-define(REFERENCE_EXT,       101).
+-define(PORT_EXT,            102).
+-define(PID_EXT,             103).
+-define(NEW_REFERENCE_EXT,   114).
+-define(ATOM_UTF8_EXT,       118).
+-define(SMALL_ATOM_UTF8_EXT, 119).
 
 -export([all/0, suite/0,groups/0,init_per_group/2,end_per_group/2,
 	 init_per_suite/1,
@@ -45,6 +54,10 @@
 	 unicode/1,
 	 unicode_list_to_string/1,
 	 unicode_string_to_list/1,
+	 utf8_atom/1,
+	 utf8_pid/1,
+	 utf8_port/1,
+	 utf8_ref/1,
 	 connect/1]).
 
 
@@ -58,7 +71,9 @@ all() ->
      decompress_roundtrip, compress_roundtrip,
      integer_roundtrip, fun_roundtrip, lists_roundtrip,
      lists_roundtrip_2, lists_iterator, unicode,
-     unicode_list_to_string, unicode_string_to_list, connect].
+     unicode_list_to_string, unicode_string_to_list,
+     utf8_atom, utf8_pid, utf8_port, utf8_ref,
+     connect].
 
 groups() -> 
     [].
@@ -448,6 +463,71 @@ unicode_string_to_list(Config) when is_list(Config) ->
 	    end, ["unicode"]).
 
 
+evil_smiley() ->
+    <<240,159,152,136>>.
+
+evil_smileys(0) ->
+    [];
+evil_smileys(N) ->
+    [evil_smiley() | evil_smileys(N-1)].
+
+utf8_atom(Config) when is_list(Config) ->
+    ES = evil_smiley(),
+    SmallUA = binary_to_term(list_to_binary([?VERSION_MAGIC,
+					     ?SMALL_ATOM_UTF8_EXT,
+					     size(ES),
+					     ES])),
+    true = is_atom(SmallUA),
+    NoESs = 300 div size(ES),
+    ESs = evil_smileys(NoESs),
+    LargeUA = binary_to_term(list_to_binary([?VERSION_MAGIC,
+					     ?ATOM_UTF8_EXT,
+					     uint16_be(NoESs*size(ES)),
+					     ESs])),
+    true = is_atom(LargeUA),
+    erlang:display({atom, SmallUA, LargeUA}),
+    do_echo([SmallUA, LargeUA], Config).
+
+utf8_nodenames_ext() ->
+    H = "@host",
+    ES = evil_smiley(),
+    SmallUANodeExt = list_to_binary([?SMALL_ATOM_UTF8_EXT,
+				     size(ES)+length(H),
+				     ES,
+				     H]),
+    NoESs = 300 div size(ES),
+    ESs = evil_smileys(NoESs),
+    LargeUANodeExt = list_to_binary([?ATOM_UTF8_EXT,
+				     uint16_be(NoESs*size(ES)+length(H)),
+				     ESs,
+				     H]),
+    {SmallUANodeExt, LargeUANodeExt}.
+
+utf8_pid(Config) when is_list(Config) ->
+    {SmallUANodeExt, LargeUANodeExt} = utf8_nodenames_ext(),
+    SmallPid = mk_pid({SmallUANodeExt, 2}, 4711, 4711),
+    LargePid = mk_pid({LargeUANodeExt, 2}, 4711, 4711),
+    erlang:display({pid, SmallPid, node(SmallPid)}),
+    erlang:display({pid, LargePid, node(LargePid)}),
+    do_echo([SmallPid, LargePid], Config).
+
+utf8_port(Config) when is_list(Config) ->
+    {SmallUANodeExt, LargeUANodeExt} = utf8_nodenames_ext(),
+    SmallPort = mk_port({SmallUANodeExt, 2}, 4711),
+    erlang:display({port, SmallPort, node(SmallPort)}),
+    LargePort = mk_port({LargeUANodeExt, 2}, 4711),
+    erlang:display({port, LargePort, node(LargePort)}),
+    do_echo([SmallPort, LargePort], Config).
+
+utf8_ref(Config) when is_list(Config) ->
+    {SmallUANodeExt, LargeUANodeExt} = utf8_nodenames_ext(),
+    SmallRef = mk_ref({SmallUANodeExt, 2}, [4711, 4711, 4711]),
+    erlang:display({ref, SmallRef, node(SmallRef)}),
+    LargeRef = mk_ref({LargeUANodeExt, 2}, [4711, 4711, 4711]),
+    erlang:display({ref, LargeRef, node(LargeRef)}),
+    do_echo([SmallRef, LargeRef], Config).
+    
+
 %% Lazy list
 cp_gen(N) ->
     cp_gen(N, -1, 16#110000).
@@ -646,16 +726,6 @@ make_name() ->
 		 ++ "-" ++ integer_to_list(B)
 		 ++ "-" ++ integer_to_list(C)).
 
-
-
--define(VERSION_MAGIC,       131).
-
--define(ATOM_EXT,            100).
--define(REFERENCE_EXT,       101).
--define(PORT_EXT,            102).
--define(PID_EXT,             103).
--define(NEW_REFERENCE_EXT,   114).
-
 uint32_be(Uint) when is_integer(Uint), 0 =< Uint, Uint < 1 bsl 32 ->
     [(Uint bsr 24) band 16#ff,
      (Uint bsr 16) band 16#ff,
@@ -679,72 +749,70 @@ uint8(Uint) ->
 
 
 mk_pid({NodeName, Creation}, Number, Serial) when is_atom(NodeName) ->
-    mk_pid({atom_to_list(NodeName), Creation}, Number, Serial);
-mk_pid({NodeName, Creation}, Number, Serial) ->
+    <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+    mk_pid({NodeNameExt, Creation}, Number, Serial);
+mk_pid({NodeNameExt, Creation}, Number, Serial) ->
     case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
 					      ?PID_EXT,
-					      ?ATOM_EXT,
-					      uint16_be(length(NodeName)),
-					      NodeName,
+					      NodeNameExt,
 					      uint32_be(Number),
 					      uint32_be(Serial),
 					      uint8(Creation)])) of
 	Pid when is_pid(Pid) ->
 	    Pid;
 	{'EXIT', {badarg, _}} ->
-	    exit({badarg, mk_pid, [{NodeName, Creation}, Number, Serial]});
+	    exit({badarg, mk_pid, [{NodeNameExt, Creation}, Number, Serial]});
 	Other ->
 	    exit({unexpected_binary_to_term_result, Other})
     end.
 
 mk_port({NodeName, Creation}, Number) when is_atom(NodeName) ->
-    mk_port({atom_to_list(NodeName), Creation}, Number);
-mk_port({NodeName, Creation}, Number) ->
+    <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+    mk_port({NodeNameExt, Creation}, Number);
+mk_port({NodeNameExt, Creation}, Number) ->
     case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
 					      ?PORT_EXT,
-					      ?ATOM_EXT,
-					      uint16_be(length(NodeName)),
-					      NodeName,
+					      NodeNameExt,
 					      uint32_be(Number),
 					      uint8(Creation)])) of
 	Port when is_port(Port) ->
 	    Port;
 	{'EXIT', {badarg, _}} ->
-	    exit({badarg, mk_port, [{NodeName, Creation}, Number]});
+	    exit({badarg, mk_port, [{NodeNameExt, Creation}, Number]});
 	Other ->
 	    exit({unexpected_binary_to_term_result, Other})
     end.
 
-mk_ref({NodeName, Creation}, Numbers) when is_atom(NodeName),
-					   is_integer(Creation),
-					   is_list(Numbers) ->
-    mk_ref({atom_to_list(NodeName), Creation}, Numbers);
-mk_ref({NodeName, Creation}, [Number]) when is_list(NodeName),
-					    is_integer(Creation),
-					    is_integer(Number) ->
+mk_ref({NodeName, Creation}, [Number] = NL) when is_atom(NodeName),
+						 is_integer(Creation),
+						 is_integer(Number) ->
+    <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+    mk_ref({NodeNameExt, Creation}, NL);
+mk_ref({NodeNameExt, Creation}, [Number]) when is_integer(Creation),
+					       is_integer(Number) ->
     case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
 					      ?REFERENCE_EXT,
-					      ?ATOM_EXT,
-					      uint16_be(length(NodeName)),
-					      NodeName,
+					      NodeNameExt,
 					      uint32_be(Number),
 					      uint8(Creation)])) of
 	Ref when is_reference(Ref) ->
 	    Ref;
 	{'EXIT', {badarg, _}} ->
-	    exit({badarg, mk_ref, [{NodeName, Creation}, [Number]]});
+	    exit({badarg, mk_ref, [{NodeNameExt, Creation}, [Number]]});
 	Other ->
 	    exit({unexpected_binary_to_term_result, Other})
     end;
-mk_ref({NodeName, Creation}, Numbers) when is_list(NodeName),
+mk_ref({NodeName, Creation}, Numbers) when is_atom(NodeName),
 					   is_integer(Creation),
 					   is_list(Numbers) ->
+    <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+    mk_ref({NodeNameExt, Creation}, Numbers);
+mk_ref({NodeNameExt, Creation}, Numbers) when is_integer(Creation),
+					      is_list(Numbers) ->
     case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
 					      ?NEW_REFERENCE_EXT,
 					      uint16_be(length(Numbers)),
-					      ?ATOM_EXT,
-					      uint16_be(length(NodeName)),
-					      NodeName,
+					      NodeNameExt,
 					      uint8(Creation),
 					      lists:map(fun (N) ->
 								uint32_be(N)
@@ -753,7 +821,7 @@ mk_ref({NodeName, Creation}, Numbers) when is_list(NodeName),
 	Ref when is_reference(Ref) ->
 	    Ref;
 	{'EXIT', {badarg, _}} ->
-	    exit({badarg, mk_ref, [{NodeName, Creation}, Numbers]});
+	    exit({badarg, mk_ref, [{NodeNameExt, Creation}, Numbers]});
 	Other ->
 	    exit({unexpected_binary_to_term_result, Other})
     end.
-- 
cgit v1.2.3


From 0dd3b88cdf90283d9c276ee415f985cb764e522f Mon Sep 17 00:00:00 2001
From: Rickard Green <rickard@erlang.org>
Date: Thu, 10 Jan 2013 12:47:46 +0100
Subject: UTF-8 support for distribution

---
 erts/emulator/beam/atom.c          | 164 +++++++++++++++++++++----------
 erts/emulator/beam/atom.h          |  13 ++-
 erts/emulator/beam/atom.names      |   2 +
 erts/emulator/beam/beam_load.c     |   4 +-
 erts/emulator/beam/bif.c           |  26 ++---
 erts/emulator/beam/dist.c          |   8 +-
 erts/emulator/beam/erl_alloc.types |   2 +
 erts/emulator/beam/erl_bif_ddll.c  |   5 +-
 erts/emulator/beam/erl_bif_info.c  |  22 +++--
 erts/emulator/beam/erl_bif_port.c  |   2 +-
 erts/emulator/beam/erl_db.c        |   2 +-
 erts/emulator/beam/erl_db_util.c   |   3 +-
 erts/emulator/beam/erl_init.c      |   4 +-
 erts/emulator/beam/erl_unicode.c   | 139 +++++++++++++++++++++-----
 erts/emulator/beam/external.c      | 193 +++++++++++++++++++++++--------------
 erts/emulator/beam/external.h      |   5 +-
 erts/emulator/beam/global.h        |  10 +-
 erts/emulator/beam/io.c            |  28 ++++--
 erts/emulator/beam/utils.c         |   2 +-
 lib/kernel/include/dist.hrl        |   1 +
 lib/kernel/src/dist_util.erl       |   3 +-
 21 files changed, 443 insertions(+), 195 deletions(-)

diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c
index b41a98f2a2..82dd320ea9 100644
--- a/erts/emulator/beam/atom.c
+++ b/erts/emulator/beam/atom.c
@@ -162,6 +162,7 @@ atom_alloc(Atom* tmpl)
     obj->name = atom_text_alloc(tmpl->len);
     sys_memcpy(obj->name, tmpl->name, tmpl->len);
     obj->len = tmpl->len;
+    obj->latin1_chars = tmpl->latin1_chars;
     obj->slot.index = -1;
 
     /*
@@ -192,48 +193,6 @@ atom_free(Atom* obj)
     erts_free(ERTS_ALC_T_ATOM, (void*) obj);
 }
 
-Eterm
-am_atom_put(const char* name, int len)
-{
-    Atom a;
-    Eterm ret;
-    int aix;
-#ifdef DEBUG
-    byte* err_pos;
-    Uint num_chars;
-    ASSERT(erts_analyze_utf8(name, len, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK);
-#endif
-    /*
-     * Silently truncate the atom if it is too long. Overlong atoms
-     * could occur in situations where we have no good way to return
-     * an error, such as in the I/O system. (Unfortunately, many
-     * drivers don't check for errors.)
-     *
-     * If an error should be produced for overlong atoms (such in
-     * list_to_atom/1), the caller should check the length before
-     * calling this function.
-     */
-    if (len > MAX_ATOM_SZ_LIMIT) {
-	len = MAX_ATOM_SZ_LIMIT;  /*SVERK Urk... */
-    }
-#ifdef ERTS_ATOM_PUT_OPS_STAT
-    erts_smp_atomic_inc_nob(&atom_put_ops);
-#endif
-    a.len = len;
-    a.name = (byte*)name;
-    atom_read_lock();
-    aix = index_get(&erts_atom_table, (void*) &a);
-    atom_read_unlock();
-    if (aix >= 0)
-	ret = make_atom(aix);
-    else {
-	atom_write_lock();
-	ret = make_atom(index_put(&erts_atom_table, (void*) &a));
-	atom_write_unlock();
-    }
-    return ret;
-}
-
 static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp)
 {
     byte* dst;
@@ -264,19 +223,116 @@ need_convertion:
     *lenp = dst - conv_buf;
 }
 
-
+/*
+ * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned!
+ */
 Eterm
-am_atom_put2(const byte* name, int len, int is_latin1)
+erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
 {
     byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
+    const byte *text = name;
+    int tlen = len;
+    Sint no_latin1_chars;
+    Atom a;
+    int aix;
 
-    if (is_latin1) {
-	latin1_to_utf8(utf8_copy, &name, &len);
+#ifdef ERTS_ATOM_PUT_OPS_STAT
+    erts_smp_atomic_inc_nob(&atom_put_ops);
+#endif
+
+    if (tlen < 0) {
+	if (trunc)
+	    tlen = 0;
+	else
+	    return THE_NON_VALUE;
     }
-    return am_atom_put((const char*)name, len);
-}
 
+    switch (enc) {
+    case ERTS_ATOM_ENC_7BIT_ASCII:
+	if (tlen > MAX_ATOM_CHARACTERS) {
+	    if (trunc)
+		tlen = MAX_ATOM_CHARACTERS;
+	    else
+		return THE_NON_VALUE;
+	}
+#ifdef DEBUG
+	for (aix = 0; aix < len; aix++) {
+	    ASSERT((name[aix] & 0x80) == 0);
+	}
+#endif
+	no_latin1_chars = tlen;
+	break;
+    case ERTS_ATOM_ENC_LATIN1:
+	if (tlen > MAX_ATOM_CHARACTERS) {
+	    if (trunc)
+		tlen = MAX_ATOM_CHARACTERS;
+	    else
+		return THE_NON_VALUE;
+	}
+	no_latin1_chars = tlen;
+	latin1_to_utf8(utf8_copy, &text, &tlen);
+	break;
+    case ERTS_ATOM_ENC_UTF8:
+	/* First sanity check; need to verify later */
+	if (tlen > MAX_ATOM_SZ_LIMIT && !trunc)
+	    return THE_NON_VALUE;
+	break;
+    }
 
+    a.len = tlen;
+    a.name = (byte *) text;
+    atom_read_lock();
+    aix = index_get(&erts_atom_table, (void*) &a);
+    atom_read_unlock();
+    if (aix >= 0) {
+	/* Already in table no need to verify it */
+	return make_atom(aix);
+    }
+
+    if (enc == ERTS_ATOM_ENC_UTF8) {
+	/* Need to verify encoding and length */
+	byte *err_pos;
+	Uint no_chars;
+	switch (erts_analyze_utf8_x((byte *) text,
+				    (Uint) tlen,
+				    &err_pos,
+				    &no_chars, NULL,
+				    &no_latin1_chars,
+				    MAX_ATOM_CHARACTERS)) {
+	case ERTS_UTF8_OK:
+	    ASSERT(no_chars <= MAX_ATOM_CHARACTERS);
+	    break;
+	case ERTS_UTF8_OK_MAX_CHARS:
+	    /* Truncated... */
+	    if (!trunc)
+		return THE_NON_VALUE;
+	    ASSERT(no_chars == MAX_ATOM_CHARACTERS);
+	    tlen = err_pos - text;
+	    break;
+	default:
+	    /* Bad utf8... */
+	    return THE_NON_VALUE;
+	}
+    }
+
+    ASSERT(tlen <= MAX_ATOM_SZ_LIMIT);
+    ASSERT(-1 <= no_latin1_chars && no_latin1_chars <= MAX_ATOM_CHARACTERS);
+
+    a.len = tlen;
+    a.latin1_chars = (Sint16) no_latin1_chars;
+    a.name = (byte *) text;
+    atom_write_lock();
+    aix = index_put(&erts_atom_table, (void*) &a);
+    atom_write_unlock();
+    return make_atom(aix);
+}
+
+Eterm
+am_atom_put(const char* name, int len)
+{
+    /* Assumes 7-bit ascii; use erts_atom_put() for other encodings... */
+    return erts_atom_put((byte *) name, len, ERTS_ATOM_ENC_7BIT_ASCII, 1);
+}
 
 int atom_table_size(void)
 {
@@ -318,10 +374,11 @@ erts_atom_get(const char *name, int len, Eterm* ap, int is_latin1)
     int i;
     int res;
 
-    a.len = len;
+    a.len = (Sint16) len;
     a.name = (byte *)name;
     if (is_latin1) {
-	latin1_to_utf8(utf8_copy, (const byte**)&a.name, &a.len);
+	latin1_to_utf8(utf8_copy, (const byte**)&a.name, &len);
+	a.len = (Sint16) len;
     }
     atom_read_lock();
     i = index_get(&erts_atom_table, (void*) &a);
@@ -384,8 +441,15 @@ init_atom_table(void)
     for (i = 0; erl_atom_names[i] != 0; i++) {
 	int ix;
 	a.len = strlen(erl_atom_names[i]);
+	a.latin1_chars = a.len;
 	a.name = (byte*)erl_atom_names[i];
 	a.slot.index = i;
+#ifdef DEBUG
+	/* Verify 7-bit ascii */
+	for (ix = 0; ix < a.len; ix++) {
+	    ASSERT((a.name[ix] & 0x80) == 0);
+	}
+#endif
 	ix = index_put(&erts_atom_table, (void*) &a);
 	atom_text_pos -= a.len;
 	atom_space -= a.len;
diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h
index 84dd6d8901..f721999a4c 100644
--- a/erts/emulator/beam/atom.h
+++ b/erts/emulator/beam/atom.h
@@ -47,7 +47,8 @@
  */
 typedef struct atom {
     IndexSlot slot;  /* MUST BE LOCATED AT TOP OF STRUCT!!! */
-    int len;         /* length of atom name */
+    Sint16 len;      /* length of atom name (UTF-8 encoded) */
+    Sint16 latin1_chars; /* 0-255 if atom can be encoded in latin1; otherwise, -1 */
     int ord0;        /* ordinal value of first 3 bytes + 7 bits */
     byte* name;      /* name of atom */
 } Atom;
@@ -113,6 +114,12 @@ ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1)
 
 #endif
 
+typedef enum {
+    ERTS_ATOM_ENC_7BIT_ASCII,
+    ERTS_ATOM_ENC_LATIN1,
+    ERTS_ATOM_ENC_UTF8
+} ErtsAtomEncoding;
+
 /*
  * Note, ERTS_IS_ATOM_STR() expects the first argument to be a
  * 7-bit ASCII string literal.
@@ -125,8 +132,8 @@ ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1)
 int atom_table_size(void);	/* number of elements */
 int atom_table_sz(void);	/* table size in bytes, excluding stored objects */
 
-Eterm am_atom_put(const char*, int); /* most callers pass plain char*'s */
-Eterm am_atom_put2(const byte*, int, int is_latin1);
+Eterm am_atom_put(const char*, int); /* ONLY 7-bit ascii! */
+Eterm erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc);
 int atom_erase(byte*, int);
 int atom_static_put(byte*, int);
 void init_atom_table(void);
diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names
index afcbd732df..59c9f39e7b 100644
--- a/erts/emulator/beam/atom.names
+++ b/erts/emulator/beam/atom.names
@@ -17,6 +17,8 @@
 # %CopyrightEnd%
 #
 
+#
+# IMPORTANT! All atoms defined here *need* to be in 7-bit ascii!
 #
 # File format:
 #
diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c
index 7bb964d5aa..8b4135e21d 100644
--- a/erts/emulator/beam/beam_load.c
+++ b/erts/emulator/beam/beam_load.c
@@ -1230,7 +1230,7 @@ load_atom_table(LoaderState* stp)
 
 	GetByte(stp, n);
 	GetString(stp, atom, n);
-	stp->atom[i] = am_atom_put2(atom, n, 1);
+	stp->atom[i] = erts_atom_put(atom, n, ERTS_ATOM_ENC_LATIN1, 1);
     }
 
     /*
@@ -1620,7 +1620,7 @@ read_line_table(LoaderState* stp)
 
 	    GetInt(stp, 2, n);
 	    GetString(stp, fname, n);
-	    stp->fname[i] = am_atom_put((char*)fname, n); /*SVERK ? */
+	    stp->fname[i] = erts_atom_put(fname, n, ERTS_ATOM_ENC_LATIN1, 1);
 	}
     }
 
diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c
index 89157068c0..a0b4a8c049 100644
--- a/erts/emulator/beam/bif.c
+++ b/erts/emulator/beam/bif.c
@@ -2536,11 +2536,13 @@ BIF_RETTYPE append_element_2(BIF_ALIST_2)
 
 BIF_RETTYPE atom_to_list_1(BIF_ALIST_1)
 {
-    Eterm do_utf8_to_list(Process*, Uint num, byte *bytes, Uint sz, Uint left,
-			  Uint *num_built, Uint *num_eaten, Eterm tail); /*SVERK */
     Atom* ap;
     Uint num_chars, num_built, num_eaten;
+    byte* err_pos;
     Eterm res;
+#ifdef DEBUG
+    int ares;
+#endif
 
     if (is_not_atom(BIF_ARG_1))
 	BIF_ERROR(BIF_P, BADARG);
@@ -2549,16 +2551,15 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1)
     ap = atom_tab(atom_val(BIF_ARG_1));
     if (ap->len == 0)
 	BIF_RET(NIL);	/* the empty atom */
-    {
-	byte* err_pos;
-	if (erts_analyze_utf8(ap->name, ap->len, &err_pos, &num_chars, NULL)
-	    != ERTS_UTF8_OK) {
-	    BIF_ERROR(BIF_P, BADARG);
-	}
-    }
+
+#ifdef DEBUG
+    ares =
+#endif
+	erts_analyze_utf8(ap->name, ap->len, &err_pos, &num_chars, NULL);
+    ASSERT(ares == ERTS_UTF8_OK);
     
-    res = do_utf8_to_list(BIF_P, num_chars, ap->name, ap->len, ap->len,
-			  &num_built, &num_eaten, NIL);
+    res = erts_utf8_to_list(BIF_P, num_chars, ap->name, ap->len, ap->len,
+			    &num_built, &num_eaten, NIL);
     ASSERT(num_built == num_chars);
     ASSERT(num_eaten == ap->len);
     BIF_RET(res);
@@ -2582,7 +2583,8 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)
 	}
 	BIF_ERROR(BIF_P, BADARG);
     }
-    res = am_atom_put2((byte*)buf, i, 1);
+    res = erts_atom_put((byte *) buf, i, ERTS_ATOM_ENC_LATIN1, 1);
+    ASSERT(is_atom(res));
     erts_free(ERTS_ALC_T_TMP, (void *) buf);
     BIF_RET(res);
 }
diff --git a/erts/emulator/beam/dist.c b/erts/emulator/beam/dist.c
index 28c4621ff2..8c3bcd1de4 100644
--- a/erts/emulator/beam/dist.c
+++ b/erts/emulator/beam/dist.c
@@ -1646,7 +1646,7 @@ dsig_send(ErtsDSigData *dsdp, Eterm ctl, Eterm msg, int force_busy)
     data_size += erts_encode_dist_ext_size(ctl, flags, acmp);
     if (is_value(msg))
 	data_size += erts_encode_dist_ext_size(msg, flags, acmp);
-    erts_finalize_atom_cache_map(acmp);
+    erts_finalize_atom_cache_map(acmp, flags);
 
     dhdr_ext_size = erts_encode_ext_dist_header_size(acmp);
     data_size += dhdr_ext_size;
@@ -1996,7 +1996,8 @@ erts_dist_command(Port *prt, int reds_limit)
 	    ASSERT(ob);
 	    do {
 		ob->extp = erts_encode_ext_dist_header_finalize(ob->extp,
-								dep->cache);
+								dep->cache,
+								flags);
 		if (!(flags & DFLAG_DIST_HDR_ATOM_CACHE))
 		    *--ob->extp = PASS_THROUGH; /* Old node; 'pass through'
 						   needed */
@@ -2040,7 +2041,8 @@ erts_dist_command(Port *prt, int reds_limit)
 	    Uint size;
 	    oq.first->extp
 		= erts_encode_ext_dist_header_finalize(oq.first->extp,
-						       dep->cache);
+						       dep->cache,
+						       flags);
 	    reds += ERTS_PORT_REDS_DIST_CMD_FINALIZE;
 	    if (!(flags & DFLAG_DIST_HDR_ATOM_CACHE))
 		*--oq.first->extp = PASS_THROUGH; /* Old node; 'pass through'
diff --git a/erts/emulator/beam/erl_alloc.types b/erts/emulator/beam/erl_alloc.types
index 0a4407f009..2b649b589b 100644
--- a/erts/emulator/beam/erl_alloc.types
+++ b/erts/emulator/beam/erl_alloc.types
@@ -49,6 +49,8 @@
 #   true after a "+enable X" statement or if it has been passed as a
 #   command line argument to make_alloc_types. The variable X is false
 #   after a "+disable X" statement or if it has never been mentioned.
+#
+# IMPORTANT! Only use 7-bit ascii text in this file!
 
 +if smp
 +disable threads_no_smp
diff --git a/erts/emulator/beam/erl_bif_ddll.c b/erts/emulator/beam/erl_bif_ddll.c
index 7f7c975e78..59a53870f3 100644
--- a/erts/emulator/beam/erl_bif_ddll.c
+++ b/erts/emulator/beam/erl_bif_ddll.c
@@ -1869,7 +1869,10 @@ static Eterm build_load_error_hp(Eterm *hp, int code)
 
 static Eterm mkatom(char *str)
 {
-    return am_atom_put(str, sys_strlen(str));
+    return erts_atom_put((byte *) str,
+			 sys_strlen(str),
+			 ERTS_ATOM_ENC_LATIN1,
+			 1);
 }
 
 static char *pick_list_or_atom(Eterm name_term)
diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c
index a3811ccdb0..c910bd0cb6 100755
--- a/erts/emulator/beam/erl_bif_info.c
+++ b/erts/emulator/beam/erl_bif_info.c
@@ -2296,8 +2296,10 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1)
 	for (i = num_instructions-1; i >= 0; i--) {
 	    res = erts_bld_cons(hpp, hszp,
 				erts_bld_tuple(hpp, hszp, 2,
-					       am_atom_put(opc[i].name,
-							   strlen(opc[i].name)),
+					       erts_atom_put(opc[i].name,
+							     strlen(opc[i].name),
+							     ERTS_ATOM_ENC_LATIN1,
+							     1),
 					       erts_bld_uint(hpp, hszp,
 							     opc[i].count)),
 				res);
@@ -3901,7 +3903,7 @@ static Eterm lcnt_build_lock_stats_term(Eterm **hpp, Uint *szp, erts_lcnt_lock_s
     timer_ns = stats->timer.ns;
     timer_n  = stats->timer_n;
    
-    af    = am_atom_put(stats->file, strlen(stats->file)); 
+    af    = erts_atom_put(stats->file, strlen(stats->file), ERTS_ATOM_ENC_LATIN1, 1); 
     uil   = erts_bld_uint( hpp, szp, line);
     tloc  = erts_bld_tuple(hpp, szp, 2, af, uil);
     
@@ -3938,13 +3940,13 @@ static Eterm lcnt_build_lock_term(Eterm **hpp, Uint *szp, erts_lcnt_lock_t *lock
     
     ASSERT(ltype);
     
-    type  = am_atom_put(ltype, strlen(ltype));           
-    name  = am_atom_put(lock->name, strlen(lock->name)); 
+    type  = erts_atom_put(ltype, strlen(ltype), ERTS_ATOM_ENC_LATIN1, 1);           
+    name  = erts_atom_put(lock->name, strlen(lock->name), ERTS_ATOM_ENC_LATIN1, 1); 
 
     if (lock->flag & ERTS_LCNT_LT_ALLOC) {
 	/* use allocator types names as id's for allocator locks */
 	ltype = (char *) ERTS_ALC_A2AD(signed_val(lock->id));
-	id    = am_atom_put(ltype, strlen(ltype));
+	id    = erts_atom_put(ltype, strlen(ltype), ERTS_ATOM_ENC_LATIN1, 1);
     } else if (lock->flag & ERTS_LCNT_LT_PROCLOCK) {
 	/* use registered names as id's for process locks if available */
 	proc  = erts_proc_lookup(lock->id);
@@ -3984,12 +3986,12 @@ static Eterm lcnt_build_result_term(Eterm **hpp, Uint *szp, erts_lcnt_data_t *da
     dtns = erts_bld_uint( hpp, szp, data->duration.ns);
     tdt  = erts_bld_tuple(hpp, szp, 2, dts, dtns);
     
-    adur = am_atom_put(str_duration, strlen(str_duration));
+    adur = erts_atom_put(str_duration, strlen(str_duration), ERTS_ATOM_ENC_LATIN1, 1);
     tdur = erts_bld_tuple(hpp, szp, 2, adur, tdt);
    
     /* lock tuple */
     
-    aloc = am_atom_put(str_locks, strlen(str_locks));
+    aloc = erts_atom_put(str_locks, strlen(str_locks), ERTS_ATOM_ENC_LATIN1, 1);
     	
     for (lock = data->current_locks->head; lock != NULL ; lock = lock->next ) {
 	lloc = lcnt_build_lock_term(hpp, szp, lock, lloc);
@@ -4125,14 +4127,14 @@ BIF_RETTYPE erts_debug_lock_counters_1(BIF_ALIST_1)
 
 static void os_info_init(void)
 {
-    Eterm type = am_atom_put(os_type, strlen(os_type));
+    Eterm type = erts_atom_put((byte *) os_type, strlen(os_type), ERTS_ATOM_ENC_LATIN1, 1);
     Eterm flav;
     int major, minor, build;
     char* buf = erts_alloc(ERTS_ALC_T_TMP, 1024); /* More than enough */
     Eterm* hp;
 
     os_flavor(buf, 1024);
-    flav = am_atom_put(buf, strlen(buf));
+    flav = erts_atom_put((byte *) buf, strlen(buf), ERTS_ATOM_ENC_LATIN1, 1);
     erts_free(ERTS_ALC_T_TMP, (void *) buf);
     hp = erts_alloc(ERTS_ALC_T_LL_TEMP_TERM, (3+4)*sizeof(Eterm));
     os_type_tuple = TUPLE2(hp, type, flav);
diff --git a/erts/emulator/beam/erl_bif_port.c b/erts/emulator/beam/erl_bif_port.c
index f9009166c0..4b270414cb 100644
--- a/erts/emulator/beam/erl_bif_port.c
+++ b/erts/emulator/beam/erl_bif_port.c
@@ -68,7 +68,7 @@ BIF_RETTYPE open_port_2(BIF_ALIST_2)
 	} else {
 	    str = "einval";
 	}
-	BIF_P->fvalue = am_atom_put(str, strlen(str));
+	BIF_P->fvalue = erts_atom_put((byte *) str, strlen(str), ERTS_ATOM_ENC_LATIN1, 1);
 	BIF_ERROR(BIF_P, EXC_ERROR);
     }
 
diff --git a/erts/emulator/beam/erl_db.c b/erts/emulator/beam/erl_db.c
index 1ba1048afa..f8a4882ec0 100644
--- a/erts/emulator/beam/erl_db.c
+++ b/erts/emulator/beam/erl_db.c
@@ -3815,7 +3815,7 @@ erts_ets_colliding_names(Process* p, Eterm name, Uint cnt)
             while (index >= atom_table_size()) {
                 char tmp[20];
                 erts_snprintf(tmp, sizeof(tmp), "am%x", atom_table_size());
-                am_atom_put(tmp,strlen(tmp));
+                erts_atom_put((byte *) tmp, strlen(tmp), ERTS_ATOM_ENC_LATIN1, 1);
             }
             list = CONS(hp, make_atom(index), list);
             hp += 2;
diff --git a/erts/emulator/beam/erl_db_util.c b/erts/emulator/beam/erl_db_util.c
index 0c9ca83ce4..407a55a3d7 100644
--- a/erts/emulator/beam/erl_db_util.c
+++ b/erts/emulator/beam/erl_db_util.c
@@ -4768,7 +4768,8 @@ static int match_compact(ErlHeapFragment *expr, DMCErrInfo *err_info)
 	    ASSERT(j < x);
 	    erts_snprintf(buff+1, sizeof(buff) - 1, "%u", (unsigned) j);
 	    /* Yes, writing directly into terms, they ARE off heap */
-	    *p = am_atom_put(buff, strlen(buff));
+	    *p = erts_atom_put((byte *) buff, strlen(buff),
+			       ERTS_ATOM_ENC_LATIN1, 1);
 	}
 	++p;
     }
diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c
index 4b90e5394a..369eab5980 100644
--- a/erts/emulator/beam/erl_init.c
+++ b/erts/emulator/beam/erl_init.c
@@ -344,7 +344,7 @@ erl_first_process_otp(char* modname, void* code, unsigned size, int argc, char**
     ErlSpawnOpts so;
     Eterm env;
     
-    start_mod = am_atom_put(modname, sys_strlen(modname));
+    start_mod = erts_atom_put((byte *) modname, sys_strlen(modname), ERTS_ATOM_ENC_LATIN1, 1);
     if (erts_find_function(start_mod, am_start, 2,
 			   erts_active_code_ix()) == NULL) {
 	erl_exit(5, "No function %s:start/2\n", modname);
@@ -441,7 +441,7 @@ load_preloaded(void)
     i = 0;
     while ((name = preload_p[i].name) != NULL) {
 	length = preload_p[i].size;
-	module_name = am_atom_put(name, sys_strlen(name));
+	module_name = erts_atom_put((byte *) name, sys_strlen(name), ERTS_ATOM_ENC_LATIN1, 1);
 	if ((code = sys_preload_begin(&preload_p[i])) == 0)
 	    erl_exit(1, "Failed to find preloaded code for module %s\n", 
 		     name);
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
index e24b6f1458..6600ce4a4a 100644
--- a/erts/emulator/beam/erl_unicode.c
+++ b/erts/emulator/beam/erl_unicode.c
@@ -1154,15 +1154,24 @@ BIF_RETTYPE unicode_characters_to_list_2(BIF_ALIST_2)
  * When input to characters_to_list is a plain binary and the format is 'unicode', we do
  * a faster analyze and size count with this function.
  */
-int erts_analyze_utf8(byte *source, Uint size, 
-		      byte **err_pos, Uint *num_chars, int *left)
+static ERTS_INLINE int
+analyze_utf8(byte *source, Uint size, byte **err_pos, Uint *num_chars, int *left,
+	     Sint *num_latin1_chars, Uint max_chars)
 {
+    Uint latin1_count;
+    int is_latin1;
     *err_pos = source;
+    if (num_latin1_chars) {
+	is_latin1 = 1;
+	latin1_count = 0;
+    }
     *num_chars = 0;
     while (size) {
 	if (((*source) & ((byte) 0x80)) == 0) {
 	    source++;
-	    --size; 
+	    --size;
+	    if (num_latin1_chars)
+		latin1_count++;
 	} else if (((*source) & ((byte) 0xE0)) == 0xC0) {
 	    if (size < 2) {
 		return ERTS_UTF8_INCOMPLETE;
@@ -1173,6 +1182,11 @@ int erts_analyze_utf8(byte *source, Uint size,
 	    }
 	    source += 2;
 	    size -= 2;
+	    if (num_latin1_chars) {
+		latin1_count++;
+		if ((source[0] & ((byte) 0xFC)) != ((byte) 0xC0))
+		    is_latin1 = 0;
+	    }
 	} else if (((*source) & ((byte) 0xF0)) == 0xE0) {
 	    if (size < 3) {
 		return ERTS_UTF8_INCOMPLETE;
@@ -1188,6 +1202,8 @@ int erts_analyze_utf8(byte *source, Uint size,
 	    }
 	    source += 3;
 	    size -= 3;
+	    if (num_latin1_chars)
+		is_latin1 = 0;
 	} else if (((*source) & ((byte) 0xF8)) == 0xF0) {
 	    if (size < 4) {
 		return ERTS_UTF8_INCOMPLETE;
@@ -1205,22 +1221,41 @@ int erts_analyze_utf8(byte *source, Uint size,
 	    }
 	    source += 4;
 	    size -= 4; 
+	    if (num_latin1_chars)
+		is_latin1 = 0;
 	} else {
 	    return ERTS_UTF8_ERROR;
 	}
 	++(*num_chars);
 	*err_pos = source;
+	if (max_chars && size > 0 && *num_chars == max_chars)
+	    return ERTS_UTF8_OK_MAX_CHARS;
 	if (left && --(*left) <= 0 && size) {
 	    return ERTS_UTF8_ANALYZE_MORE;
 	}
     }
+    if (num_latin1_chars)
+	*num_latin1_chars = is_latin1 ? latin1_count : -1;
     return ERTS_UTF8_OK;
 }
 
+int erts_analyze_utf8(byte *source, Uint size, 
+		      byte **err_pos, Uint *num_chars, int *left)
+{
+    return analyze_utf8(source, size, err_pos, num_chars, left, NULL, 0);
+}
+
+int erts_analyze_utf8_x(byte *source, Uint size, 
+			byte **err_pos, Uint *num_chars, int *left,
+			Sint *num_latin1_chars, Uint max_chars)
+{
+    return analyze_utf8(source, size, err_pos, num_chars, left, num_latin1_chars, max_chars);
+}
+
 /*
  * No errors should be able to occur - no overlongs, no malformed, no nothing
- */    
-Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, 
+ */
+static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, 
 			     Uint left,
 			     Uint *num_built, Uint *num_eaten, Eterm tail)
 {
@@ -1275,6 +1310,12 @@ Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz,
     return ret;
 }
 
+Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left,
+			Uint *num_built, Uint *num_eaten, Eterm tail)
+{
+    return do_utf8_to_list(p, num, bytes, sz, left, num_built, num_eaten, tail);
+}
+
 static int is_candidate(Uint cp)
 {
     int index,pos;
@@ -1849,14 +1890,14 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2)
 }
 
 static BIF_RETTYPE
-binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist)
+binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist)
 {
     byte* bytes;
     byte *temp_alloc = NULL;
     Uint bin_size;
 
     if ((bytes = erts_get_aligned_binary_bytes(bin, &temp_alloc)) == 0) {
-	BIF_ERROR(p, BADARG);
+	BIF_ERROR(proc, BADARG);
     }
     bin_size = binary_size(bin);
     if (enc == am_latin1) {
@@ -1864,11 +1905,16 @@ binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist)
 	if (bin_size > MAX_ATOM_CHARACTERS) {
 	system_limit:
 	    erts_free_aligned_binary_bytes(temp_alloc);
-	    BIF_ERROR(p, SYSTEM_LIMIT);
+	    BIF_ERROR(proc, SYSTEM_LIMIT);
 	}
 	if (!must_exist) {
-	    a = am_atom_put2(bytes, bin_size, 1);
-	    erts_free_aligned_binary_bytes(temp_alloc);
+	    a = erts_atom_put((byte *) bytes,
+			      bin_size,
+			      ERTS_ATOM_ENC_LATIN1,
+			      0);
+    	    erts_free_aligned_binary_bytes(temp_alloc);
+	    if (is_non_value(a))
+		goto badarg;
 	    BIF_RET(a);
 	} else if (erts_atom_get((char *)bytes, bin_size, &a, 1)) {
 	    erts_free_aligned_binary_bytes(temp_alloc);
@@ -1900,17 +1946,22 @@ binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist)
 	}
 
 	if (!must_exist) {
-	    res = am_atom_put((char*)bytes, bin_size);
+	    res = erts_atom_put((byte *) bytes,
+				bin_size,
+				ERTS_ATOM_ENC_UTF8,
+				0);
 	}
 	else if (!erts_atom_get((char*)bytes, bin_size, &res, 0)) {
 	    goto badarg;
 	}
 	erts_free_aligned_binary_bytes(temp_alloc);
+	if (is_non_value(res))
+	    goto badarg;
 	BIF_RET(res);
     } else {
     badarg:
 	erts_free_aligned_binary_bytes(temp_alloc);
-	BIF_ERROR(p, BADARG);
+	BIF_ERROR(proc, BADARG);
     }
 }
 
@@ -2625,30 +2676,70 @@ BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0)
     }
 }
 
-/* Assumes 'dest' has enough room.
- */
-int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen)
+int erts_utf8_is_latin1_string(const byte *string, int len)
+{
+    /* Assumes string is encoded in valid UTF-8 */
+    int i;
+    while (i < len) {
+	if ((string[i] & 0x80) == 0)
+	    i++;
+	else if (i+1 < len
+		 && (string[i] & 0xFE) == 0xC2
+		 && (string[i+1] & 0xC0) == 0x80)
+	    i +=2;
+	else
+	    return 0;
+    }
+    return 1;
+}
+
+int erts_utf8_to_latin1(byte* dest, const byte* source, int slen)
 {
+    /*
+     * Assumes source contains valid utf8 that can be encoded as latin1,
+     * and that dest has enough room.
+     */
     byte* dp = dest;
     while (slen > 0) {
 	if ((source[0] & 0x80) == 0) {
 	    *dp++ = *source++;
 	    --slen;
 	}
-	else if (slen > 1 &&
-		 (source[0] & 0xFE) == 0xC2 &&
-		 (source[1] & 0xC0) == 0x80) {
+	else {
+	    ASSERT(slen > 1);
+	    ASSERT((source[0] & 0xFE) == 0xC2);
+	    ASSERT((source[1] & 0xC0) == 0x80);
 	    *dp++ = (char) ((source[0] << 6) | (source[1] & 0x3F));
 	    source += 2;
 	    slen -= 2;
 	}
-	else {
-	    /* Just let unconvertable octets through. This should not happen
-	       in a correctly upgraded system */
-	    *dp++ = *source++;
-	    --slen;
-	}
     }
     return dp - dest;
 }
 
+int erts_utf8_to_latin1_backwards(byte *dest, const byte *source, int slen)
+{
+    /*
+     * Assumes source contains valid utf8 that can be encoded as latin1,
+     * and that dest has enough room.
+     */
+    int dix = 0;
+    int six = slen;
+    while (six > 0) {
+	six--;
+	dix--;
+	if ((source[six] & 0x80) == 0)
+	    dest[dix] = source[six];
+	else {
+	    byte c;
+	    ASSERT(six > 0);
+	    ASSERT((source[six] & 0xC0) == 0x80);
+	    ASSERT((source[six-1] & 0xFE) == 0xC2);
+ 	    c = source[six] & 0x3F;
+	    six--;
+	    c |= source[six] << 6;
+	    dest[dix] = c;
+	}
+    }
+    return -dix;
+}
diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c
index 68edcd0fa6..8c4d9108d4 100644
--- a/erts/emulator/beam/external.c
+++ b/erts/emulator/beam/external.c
@@ -142,6 +142,7 @@ erts_init_atom_cache_map(ErtsAtomCacheMap *acmp)
 {
     if (acmp) {
 	int ix;
+	acmp->long_atoms = 0;
 	for (ix = 0; ix < ERTS_ATOM_CACHE_SIZE; ix++)
 	    acmp->cache[ix].iix = -1;
 	acmp->sz = 0;
@@ -154,6 +155,7 @@ erts_reset_atom_cache_map(ErtsAtomCacheMap *acmp)
 {
     if (acmp) {
 	int i;
+	acmp->long_atoms = 0;
 	for (i = 0; i < acmp->sz; i++) {
 	    ASSERT(0 <= acmp->cix[i] && acmp->cix[i] < ERTS_ATOM_CACHE_SIZE);
 	    acmp->cache[acmp->cix[i]].iix = -1;
@@ -175,9 +177,23 @@ erts_destroy_atom_cache_map(ErtsAtomCacheMap *acmp)
 }
 
 static ERTS_INLINE void
-insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
+insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom, Uint32 dflags)
 {
-    if (acmp && acmp->sz < ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES) {
+    /*
+     * If the receiver do not understand utf8 atoms
+     * and this atom cannot be represented in latin1,
+     * we are not allowed to cache it.
+     *
+     * In this case all atoms are assumed to have
+     * latin1 encoding in the cache. By refusing it
+     * in the cache we will instead encode it using
+     * ATOM_UTF8_EXT/SMALL_ATOM_UTF8_EXT which the
+     * receiver do not recognize and tear down the
+     * connection.
+     */
+    if (acmp && acmp->sz < ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES
+	&& ((dflags & DFLAG_UTF8_ATOMS)
+	    || atom_tab(atom_val(atom))->latin1_chars >= 0)) {
 	int ix;
 	ASSERT(acmp->hdr_sz < 0);
 	ix = atom2cix(atom);
@@ -190,7 +206,7 @@ insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
 }
 
 static ERTS_INLINE int
-get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
+get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom, Uint32 dflags)
 {
     if (!acmp)
 	return -1;
@@ -199,7 +215,9 @@ get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
 	ASSERT(is_atom(atom));
 	ix = atom2cix(atom);
 	if (acmp->cache[ix].iix < 0) {
-	    ASSERT(acmp->sz == ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES);
+	    ASSERT(acmp->sz == ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES
+		   || (!(dflags & DFLAG_UTF8_ATOMS)
+		       && atom_tab(atom_val(atom))->latin1_chars < 0));
 	    return -1;
 	}
 	else {
@@ -210,18 +228,17 @@ get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
 }
 
 void
-erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp)
+erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp, Uint32 dflags)
 {
     if (acmp) {
-#if MAX_ATOM_LENGTH > 255
-#error "This code is not complete; long_atoms info need to be passed to the following stages."
-	int long_atoms = 0; /* !0 if one or more atoms are long than 255. */
-#endif
+	int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS);
+	int long_atoms = 0; /* !0 if one or more atoms are longer than 255. */
 	int i;
 	int sz;
 	int fix_sz
 	    = 1 /* VERSION_MAGIC */
 	    + 1 /* DIST_HEADER */
+	    + 1 /* dist header flags */
 	    + 1 /* number of internal cache entries */
 	    ;
 	int min_sz;
@@ -230,22 +247,23 @@ erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp)
 	min_sz = fix_sz+(2+4)*acmp->sz;
 	sz = fix_sz;
 	for (i = 0; i < acmp->sz; i++) {
+	    Atom *a;
 	    Eterm atom;
 	    int len;
 	    atom = acmp->cache[acmp->cix[i]].atom;
 	    ASSERT(is_atom(atom));
-	    len = atom_tab(atom_val(atom))->len;
-#if MAX_ATOM_LENGTH > 255
+	    a = atom_tab(atom_val(atom));
+	    len = (int) (utf8_atoms ? a->len : a->latin1_chars);
+	    ASSERT(len >= 0);
 	    if (!long_atoms && len > 255)
 		long_atoms = 1;
-#endif
 	    /* Enough for a new atom cache value */
 	    sz += 1 /* cix */ + 1 /* length */ + len /* text */;
 	}
-#if MAX_ATOM_LENGTH > 255
-	if (long_atoms)
+	if (long_atoms) {
+	    acmp->long_atoms = 1;
 	    sz += acmp->sz; /* we need 2 bytes per atom for length */
-#endif
+	}
 	/* Dynamically sized flag field */
 	sz += ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTES(acmp->sz);
 	if (sz < min_sz)
@@ -274,6 +292,7 @@ byte *erts_encode_ext_dist_header_setup(byte *ctl_ext, ErtsAtomCacheMap *acmp)
     else {
 	int i;
 	byte *ep = ctl_ext;
+	byte dist_hdr_flags = acmp->long_atoms ? ERTS_DIST_HDR_LONG_ATOMS_FLG : 0;
 	ASSERT(acmp->hdr_sz >= 0);
 	/*
 	 * Write cache update instructions. Note that this is a purely
@@ -296,28 +315,36 @@ byte *erts_encode_ext_dist_header_setup(byte *ctl_ext, ErtsAtomCacheMap *acmp)
 	}
 	--ep;
 	put_int8(acmp->sz, ep);
+	--ep;
+	put_int8(dist_hdr_flags, ep);
 	*--ep = DIST_HEADER;
 	*--ep = VERSION_MAGIC;
 	return ep;
     }
 }
 
-byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache)
+byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache, Uint32 dflags)
 {
     byte *ip;
     byte instr_buf[(2+4)*ERTS_ATOM_CACHE_SIZE];
     int ci, sz;
+    byte dist_hdr_flags;
+    int long_atoms;
+    int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS);
     register byte *ep = ext;
     ASSERT(ep[0] == VERSION_MAGIC);
     if (ep[1] != DIST_HEADER)
 	return ext;
 
+    dist_hdr_flags = ep[2];
+    long_atoms = ERTS_DIST_HDR_LONG_ATOMS_FLG & ((int) dist_hdr_flags);
+
     /*
      * Update output atom cache and write the external version of
      * the dist header. We write the header backwards just
      * before the actual term(s).
      */
-    ep += 2;
+    ep += 3;
     ci = (int) get_int8(ep);
     ASSERT(0 <= ci && ci < ERTS_ATOM_CACHE_SIZE);
     ep += 1;
@@ -342,12 +369,7 @@ byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache)
 	flgs_bytes = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTES(ci);
 
 	ASSERT(flgs_bytes <= sizeof(flgs_buf));
-#if MAX_ATOM_LENGTH > 255
-	/* long_atoms info needs to be passed from previous stages */
-	if (long_atoms)
-	    flgs |= ERTS_DIST_HDR_LONG_ATOMS_FLG;
-#endif
-	flgs = 0;
+	flgs = (Uint32) dist_hdr_flags;
 	flgs_buf_ix = 0;
 	if ((ci & 1) == 0)
 	    used_half_bytes = 2;
@@ -382,17 +404,22 @@ byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache)
 		Atom *a;
 		cache->out_arr[cix] = atom;
 		a = atom_tab(atom_val(atom));
-		sz = a->len;
-		ep -= sz;
-		sys_memcpy((void *) ep, (void *) a->name, sz);
-#if MAX_ATOM_LENGTH > 255
+		if (utf8_atoms) {
+		    sz = a->len;
+		    ep -= sz;
+		    sys_memcpy((void *) ep, (void *) a->name, sz);
+		}
+		else {
+		    ASSERT(0 <= a->latin1_chars && a->latin1_chars <= MAX_ATOM_CHARACTERS);
+		    ep -= a->latin1_chars;
+		    sz = erts_utf8_to_latin1(ep, a->name, a->len);
+		    ASSERT(a->latin1_chars == sz);
+		}
 		if (long_atoms) {
 		    ep -= 2;
 		    put_int16(sz, ep);
 		}
-		else
-#endif
-		{
+		else {
 		    ASSERT(0 <= sz && sz <= 255);
 		    --ep;
 		    put_int8(sz, ep);
@@ -553,6 +580,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
 #endif
 
     register byte *ep = ext;
+    int utf8_atoms = (int) (dep->flags & DFLAG_UTF8_ATOMS);
 
     edep->heap_size = -1;
     edep->ext_endp = ext+size;
@@ -611,9 +639,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
 	    ERTS_EXT_HDR_FAIL;
 	ep++;
 	if (no_atoms) {
-#if MAX_ATOM_LENGTH > 255
 	    int long_atoms = 0;
-#endif
 #ifdef DEBUG
 	    byte *flgs_buf = ep;
 #endif
@@ -632,14 +658,8 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
 	     */
 	    byte_ix = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTE_IX(no_atoms);
 	    bit_ix = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BIT_IX(no_atoms);
-	    if (flgsp[byte_ix] & (((byte) ERTS_DIST_HDR_LONG_ATOMS_FLG)
-				  << bit_ix)) {
-#if MAX_ATOM_LENGTH > 255
+	    if (flgsp[byte_ix] & (((byte) ERTS_DIST_HDR_LONG_ATOMS_FLG) << bit_ix))
 		long_atoms = 1;
-#else
-		ERTS_EXT_HDR_FAIL; /* Long atoms not supported yet */
-#endif
-	    }
 
 #ifdef DEBUG
 	    byte_ix = 0;
@@ -707,23 +727,25 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
 		    if (cix >= ERTS_ATOM_CACHE_SIZE)
 			ERTS_EXT_HDR_FAIL;
 		    ep++;
-#if MAX_ATOM_CHARACTERS > 255
 		    if (long_atoms) {
 			CHKSIZE(2);
 			len = get_int16(ep);
 			ep += 2;
 		    }
-		    else
-#endif
-		    {
+		    else {
 			CHKSIZE(1);
 			len = get_int8(ep);
 			ep++;
 		    }
-		    if (len > MAX_ATOM_CHARACTERS)
-			ERTS_EXT_HDR_FAIL; /* Too long atom */
 		    CHKSIZE(len);
-		    atom = am_atom_put((char *) ep, len);
+		    atom = erts_atom_put((byte *) ep,
+					 len,
+					 (utf8_atoms
+					  ? ERTS_ATOM_ENC_UTF8
+					  : ERTS_ATOM_ENC_LATIN1),
+					 0);
+		    if (is_non_value(atom))
+			ERTS_EXT_HDR_FAIL;
 		    ep += len;
 		    cache->in_arr[cix] = atom;
 		    edep->attab.atom[tix] = atom;
@@ -1404,7 +1426,8 @@ static byte*
 enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags)
 {
     int iix;
-    int i, j;
+    int len;
+    int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS);
 
     ASSERT(is_atom(atom));
 
@@ -1423,42 +1446,46 @@ enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags)
 	}
 	return ep;
     }
+
     /*
      * term_to_binary/1,2 and the initial distribution message
      * don't use the cache.
      */
-    iix = get_iix_acache_map(acmp, atom);
-    if (iix < 0) { 
-	i = atom_val(atom);
-	j = atom_tab(i)->len;
-	if (dflags & DFLAG_UTF8_ATOMS) {
-	    if (j <= 255) {
+
+    iix = get_iix_acache_map(acmp, atom, dflags);
+    if (iix < 0) {
+	Atom *a = atom_tab(atom_val(atom));
+	if (utf8_atoms || a->latin1_chars < 0) {
+	    len = a->len;
+	    if (len > 255) {
 		*ep++ = ATOM_UTF8_EXT;
-		put_int16(j, ep);
+		put_int16(len, ep);
 		ep += 2;
 	    }
 	    else {
 		*ep++ = SMALL_ATOM_UTF8_EXT;
-		put_int8(j, ep);
-		ep += 2;
+		put_int8(len, ep);
+		ep += 1;
 	    }
-	    sys_memcpy((char *) ep, (char*)atom_tab(i)->name, (int) j);
+	    sys_memcpy((char *) ep, (char *) a->name, len);
 	}
 	else {
-	    if (j <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) {
+	    if (a->latin1_chars <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) {
 		*ep++ = SMALL_ATOM_EXT;
-		j = erts_utf8_to_latin1(ep+1, atom_tab(i)->name, j);
-		put_int8(j, ep);
+		len = erts_utf8_to_latin1(ep+1, a->name, a->len);
+		ASSERT(len == a->latin1_chars);
+		put_int8(len, ep);
 		ep++;
 	    }
 	    else {
 		*ep++ = ATOM_EXT;
-		j = erts_utf8_to_latin1(ep+2, atom_tab(i)->name, j);
-		put_int16(j, ep);
+		len = erts_utf8_to_latin1(ep+2, a->name, a->len);
+		ASSERT(len == a->latin1_chars);
+		put_int16(len, ep);
 		ep += 2;
 	    }	    
 	}
-	ep += j;
+	ep += len;
 	return ep;
     }
 
@@ -1535,7 +1562,15 @@ dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp)
                 goto error;
 	    }
         } else {
-            *objp = am_atom_put2(ep, len, is_latin1);
+	    Eterm atom = erts_atom_put(ep,
+				       len,
+				       (is_latin1
+					? ERTS_ATOM_ENC_LATIN1
+					: ERTS_ATOM_ENC_UTF8),
+				       0);
+	    if (is_non_value(atom))
+		goto error;
+            *objp = atom;
         }
 	ep += len;
 	break;
@@ -2248,7 +2283,15 @@ dec_term_atom_common:
 		    goto error;
 		}
 	    } else {
-	        *objp = am_atom_put2(ep, n, is_latin1);
+		Eterm atom = erts_atom_put(ep,
+					   n,
+					   (is_latin1
+					    ? ERTS_ATOM_ENC_LATIN1
+					    : ERTS_ATOM_ENC_UTF8),
+					   0);
+		if (is_non_value(atom))
+		    goto error;
+	        *objp = atom;
 	    }
 	    ep += n;
 	    break;
@@ -2917,18 +2960,22 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags)
 		}
 	    }
 	    else {
-		int alen = atom_tab(atom_val(obj))->len;
-		result += 1 + 1 + alen;
-		if (dflags & DFLAG_UTF8_ATOMS) {
+		Atom *a = atom_tab(atom_val(obj));
+		int alen;
+		if ((dflags & DFLAG_UTF8_ATOMS) || a->latin1_chars < 0) {
+		    alen = a->len;
+		    result += 1 + 1 + alen;
 		    if (alen > 255) {
 			result++; /* ATOM_UTF8_EXT (not small) */
 		    }
-		    /*SVERK we use utf8 length which is an over estimation */
-		}		    
-		else if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS)) {
-		    result++; /* ATOM_EXT (not small) */
 		}
-		insert_acache_map(acmp, obj);
+		else {
+		    alen = a->latin1_chars;
+		    result += 1 + 1 + alen;
+		    if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS))
+			result++; /* ATOM_EXT (not small) */
+		}
+		insert_acache_map(acmp, obj, dflags);
 	    }
 	    break;
 	case SMALL_DEF:
diff --git a/erts/emulator/beam/external.h b/erts/emulator/beam/external.h
index 50eea62225..ad430117c8 100644
--- a/erts/emulator/beam/external.h
+++ b/erts/emulator/beam/external.h
@@ -92,6 +92,7 @@ typedef struct cache {
 typedef struct {
     int hdr_sz;
     int sz;
+    int long_atoms;
     int cix[ERTS_ATOM_CACHE_SIZE];
     struct {
 	Eterm atom;
@@ -152,12 +153,12 @@ typedef struct {
 void erts_init_atom_cache_map(ErtsAtomCacheMap *);
 void erts_reset_atom_cache_map(ErtsAtomCacheMap *);
 void erts_destroy_atom_cache_map(ErtsAtomCacheMap *);
-void erts_finalize_atom_cache_map(ErtsAtomCacheMap *);
+void erts_finalize_atom_cache_map(ErtsAtomCacheMap *, Uint32);
 Uint erts_encode_ext_dist_header_size(ErtsAtomCacheMap *);
 
 Uint erts_encode_ext_dist_header_size(ErtsAtomCacheMap *);
 byte *erts_encode_ext_dist_header_setup(byte *, ErtsAtomCacheMap *);
-byte *erts_encode_ext_dist_header_finalize(byte *, ErtsAtomCache *);
+byte *erts_encode_ext_dist_header_finalize(byte *, ErtsAtomCache *, Uint32);
 Uint erts_encode_dist_ext_size(Eterm, Uint32, ErtsAtomCacheMap *);
 void erts_encode_dist_ext(Eterm, byte **, Uint32, ErtsAtomCacheMap *);
 
diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h
index 1500424d3e..eccdf10c75 100755
--- a/erts/emulator/beam/global.h
+++ b/erts/emulator/beam/global.h
@@ -1519,17 +1519,25 @@ Sint erts_native_filename_need(Eterm ioterm, int encoding);
 void erts_copy_utf8_to_utf16_little(byte *target, byte *bytes, int num_chars);
 int erts_analyze_utf8(byte *source, Uint size, 
 			byte **err_pos, Uint *num_chars, int *left);
+int erts_analyze_utf8_x(byte *source, Uint size, 
+			byte **err_pos, Uint *num_chars, int *left,
+			Sint *num_latin1_chars, Uint max_chars);
 char *erts_convert_filename_to_native(Eterm name, char *statbuf, 
 				      size_t statbuf_size, 
 				      ErtsAlcType_t alloc_type, 
 				      int allow_empty, int allow_atom,
 				      Sint *used /* out */);
 Eterm erts_convert_native_to_filename(Process *p, byte *bytes);
-int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen);
+Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left,
+			Uint *num_built, Uint *num_eaten, Eterm tail);
+int erts_utf8_is_latin1_string(const byte *string, int len);
+int erts_utf8_to_latin1(byte* dest, const byte* source, int slen);
+int erts_utf8_to_latin1_backwards(byte* dest, const byte* source, int slen);
 #define ERTS_UTF8_OK 0
 #define ERTS_UTF8_INCOMPLETE 1
 #define ERTS_UTF8_ERROR 2
 #define ERTS_UTF8_ANALYZE_MORE 3
+#define ERTS_UTF8_OK_MAX_CHARS 4
 
 /* erl_trace.c */
 void erts_init_trace(void);
diff --git a/erts/emulator/beam/io.c b/erts/emulator/beam/io.c
index 60b9238d38..b1eb75bede 100644
--- a/erts/emulator/beam/io.c
+++ b/erts/emulator/beam/io.c
@@ -646,8 +646,11 @@ erts_open_driver(erts_driver_t* driver,	/* Pointer to driver. */
     
     if (IS_TRACED_FL(port, F_TRACE_PORTS)) {
 	trace_port_open(port,
-		pid,		
-		am_atom_put(port->name, strlen(port->name)));
+			pid,
+			erts_atom_put((byte *) port->name,
+				      strlen(port->name),
+				      ERTS_ATOM_ENC_LATIN1,
+				      1));
     }
     
     if (driver->start) {
@@ -4765,7 +4768,8 @@ int driver_exit(ErlDrvPort ix, int err)
         return driver_failure_term(ix, am_normal, 0);
     else {
         char* err_str = erl_errno_id(err);
-        Eterm am_err = am_atom_put(err_str, sys_strlen(err_str));
+        Eterm am_err = erts_atom_put((byte *) err_str, sys_strlen(err_str),
+				     ERTS_ATOM_ENC_LATIN1, 1);
         return driver_failure_term(ix, am_err, 0);
     }
 }
@@ -4778,8 +4782,12 @@ int driver_failure(ErlDrvPort ix, int code)
 
 int driver_failure_atom(ErlDrvPort ix, char* string)
 {
-    Eterm am = am_atom_put(string, strlen(string));
-    return driver_failure_term(ix, am, 0);
+    return driver_failure_term(ix,
+			       erts_atom_put((byte *) string,
+					     strlen(string),
+					     ERTS_ATOM_ENC_LATIN1,
+					     1),
+			       0);
 }
 
 int driver_failure_posix(ErlDrvPort ix, int err)
@@ -4796,7 +4804,10 @@ int driver_failure_eof(ErlDrvPort ix)
 
 ErlDrvTermData driver_mk_atom(char* string)
 {
-    Eterm am = am_atom_put(string, sys_strlen(string));
+    Eterm am = erts_atom_put((byte *) string,
+			     sys_strlen(string),
+			     ERTS_ATOM_ENC_LATIN1,
+			     1);
     ERTS_SMP_CHK_NO_PROC_LOCKS;
     return (ErlDrvTermData) am;
 }
@@ -5091,7 +5102,10 @@ init_driver(erts_driver_t *drv, ErlDrvEntry *de, DE_Handle *handle)
 	erts_smp_mtx_init_x(drv->lock,
 			    "driver_lock",
 #if defined(ERTS_ENABLE_LOCK_CHECK) || defined(ERTS_ENABLE_LOCK_COUNT)
-			    am_atom_put(drv->name, sys_strlen(drv->name))
+			    erts_atom_put((byte *) drv->name,
+					  sys_strlen(drv->name),
+					  ERTS_ATOM_ENC_LATIN1,
+					  1)
 #else
 			    NIL
 #endif
diff --git a/erts/emulator/beam/utils.c b/erts/emulator/beam/utils.c
index 1969fc762c..97b6d01207 100644
--- a/erts/emulator/beam/utils.c
+++ b/erts/emulator/beam/utils.c
@@ -370,7 +370,7 @@ Eterm
 erts_bld_atom(Uint **hpp, Uint *szp, char *str)
 {
     if (hpp)
-	return am_atom_put(str, sys_strlen(str));
+	return erts_atom_put((byte *) str, sys_strlen(str), ERTS_ATOM_ENC_LATIN1, 1);
     else
 	return THE_NON_VALUE;
 }
diff --git a/lib/kernel/include/dist.hrl b/lib/kernel/include/dist.hrl
index 5b52f6f294..91e13d99a9 100644
--- a/lib/kernel/include/dist.hrl
+++ b/lib/kernel/include/dist.hrl
@@ -36,3 +36,4 @@
 -define(DFLAG_UNICODE_IO,16#1000).
 -define(DFLAG_DIST_HDR_ATOM_CACHE,16#2000).
 -define(DFLAG_SMALL_ATOM_TAGS, 16#4000).
+-define(DFLAG_UTF8_ATOMS, 16#10000).
diff --git a/lib/kernel/src/dist_util.erl b/lib/kernel/src/dist_util.erl
index f0d54a2f3e..b1d32322e3 100644
--- a/lib/kernel/src/dist_util.erl
+++ b/lib/kernel/src/dist_util.erl
@@ -115,7 +115,8 @@ make_this_flags(RequestType, OtherNode) ->
 	 ?DFLAG_NEW_FLOATS bor
 	 ?DFLAG_UNICODE_IO bor
 	 ?DFLAG_DIST_HDR_ATOM_CACHE bor
-	 ?DFLAG_SMALL_ATOM_TAGS).
+	 ?DFLAG_SMALL_ATOM_TAGS bor
+	 ?DFLAG_UTF8_ATOMS).
 
 handshake_other_started(#hs_data{request_type=ReqType}=HSData0) ->
     {PreOtherFlags,Node,Version} = recv_name(HSData0),
-- 
cgit v1.2.3


From e81e9e367eb0f6bae1c818f5351a7118f832c31c Mon Sep 17 00:00:00 2001
From: Sverker Eriksson <sverker@erlang.org>
Date: Wed, 16 Jan 2013 20:44:09 +0100
Subject: atom fixes for NIFs and atom_to_binary

---
 erts/emulator/beam/erl_nif.c     | 24 +++++++++-----
 erts/emulator/beam/erl_unicode.c | 67 +++++-----------------------------------
 erts/emulator/beam/global.h      |  2 --
 3 files changed, 24 insertions(+), 69 deletions(-)

diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c
index 185ac75d73..f00d5f86ce 100644
--- a/erts/emulator/beam/erl_nif.c
+++ b/erts/emulator/beam/erl_nif.c
@@ -743,16 +743,23 @@ int enif_get_atom(ErlNifEnv* env, Eterm atom, char* buf, unsigned len,
 {
     Atom* ap;
     ASSERT(encoding == ERL_NIF_LATIN1);
-    if (is_not_atom(atom)) {
+    if (is_not_atom(atom) || len==0) {
 	return 0;
     }
     ap = atom_tab(atom_val(atom));
-    if (ap->len+1 > len) {
+
+    if (ap->latin1_chars < 0 || ap->latin1_chars >= len) {
 	return 0;
     }
-    sys_memcpy(buf, ap->name, ap->len);
-    buf[ap->len] = '\0';
-    return ap->len + 1;
+    if (ap->latin1_chars == ap->len) {
+	sys_memcpy(buf, ap->name, ap->len);
+    }
+    else {
+	int dlen = erts_utf8_to_latin1((byte*)buf, ap->name, ap->len);
+	ASSERT(dlen == ap->latin1_chars); (void)dlen;
+    }
+    buf[ap->latin1_chars] = '\0';
+    return ap->latin1_chars + 1;
 }
 
 int enif_get_int(ErlNifEnv* env, Eterm term, int* ip)
@@ -854,7 +861,10 @@ int enif_get_atom_length(ErlNifEnv* env, Eterm atom, unsigned* len,
     ASSERT(enc == ERL_NIF_LATIN1);
     if (is_not_atom(atom)) return 0;
     ap = atom_tab(atom_val(atom));
-    *len = ap->len;
+    if (ap->latin1_chars < 0) {
+	return 0;
+    }
+    *len = ap->latin1_chars;
     return 1;
 }
 
@@ -961,7 +971,7 @@ ERL_NIF_TERM enif_make_atom(ErlNifEnv* env, const char* name)
 
 ERL_NIF_TERM enif_make_atom_len(ErlNifEnv* env, const char* name, size_t len)
 {
-    return am_atom_put(name, len);
+    return erts_atom_put((byte*)name, len, ERTS_ATOM_ENC_LATIN1, 1);
 }
 
 int enif_make_existing_atom(ErlNifEnv* env, const char* name, ERL_NIF_TERM* atom,
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
index 6600ce4a4a..c00293de89 100644
--- a/erts/emulator/beam/erl_unicode.c
+++ b/erts/emulator/beam/erl_unicode.c
@@ -1853,32 +1853,21 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2)
     ap = atom_tab(atom_val(BIF_ARG_1));
 
     if (BIF_ARG_2 == am_latin1) {
-	int i;
 	Eterm bin_term;
-	int bin_size = ap->len;
-
-	for (i = 0; i < ap->len; ) {  
-	    if (ap->name[i] < 0x80) i++;
-	    else {
-		ASSERT(ap->name[i] >= 0xC0);
-		if (ap->name[i] < 0xE0) {
-		    ASSERT(i+1 < ap->len && (ap->name[i+1] & 0xC0) == 0x80);
-		    i += 2;
-		    bin_size -= 1;
-		}
-		else goto error;
-	    }
+
+	if (ap->latin1_chars < 0) {
+	    goto error;
 	}
-	if (bin_size == ap->len) {
+	if (ap->latin1_chars == ap->len) {
 	    bin_term = new_binary(BIF_P, ap->name, ap->len);
 	}
 	else {
 	    byte* bin_p;
 	    int dbg_sz;
-	    bin_term = new_binary(BIF_P, 0, bin_size);
+	    bin_term = new_binary(BIF_P, 0, ap->latin1_chars);
 	    bin_p = binary_bytes(bin_term);
 	    dbg_sz = erts_utf8_to_latin1(bin_p, ap->name, ap->len);
-	    ASSERT(dbg_sz == bin_size); (void)dbg_sz; 
+	    ASSERT(dbg_sz == ap->latin1_chars); (void)dbg_sz; 
 	}
 	BIF_RET(bin_term);
     } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) {
@@ -2676,23 +2665,6 @@ BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0)
     }
 }
 
-int erts_utf8_is_latin1_string(const byte *string, int len)
-{
-    /* Assumes string is encoded in valid UTF-8 */
-    int i;
-    while (i < len) {
-	if ((string[i] & 0x80) == 0)
-	    i++;
-	else if (i+1 < len
-		 && (string[i] & 0xFE) == 0xC2
-		 && (string[i+1] & 0xC0) == 0x80)
-	    i +=2;
-	else
-	    return 0;
-    }
-    return 1;
-}
-
 int erts_utf8_to_latin1(byte* dest, const byte* source, int slen)
 {
     /*
@@ -2700,6 +2672,7 @@ int erts_utf8_to_latin1(byte* dest, const byte* source, int slen)
      * and that dest has enough room.
      */
     byte* dp = dest;
+
     while (slen > 0) {
 	if ((source[0] & 0x80) == 0) {
 	    *dp++ = *source++;
@@ -2717,29 +2690,3 @@ int erts_utf8_to_latin1(byte* dest, const byte* source, int slen)
     return dp - dest;
 }
 
-int erts_utf8_to_latin1_backwards(byte *dest, const byte *source, int slen)
-{
-    /*
-     * Assumes source contains valid utf8 that can be encoded as latin1,
-     * and that dest has enough room.
-     */
-    int dix = 0;
-    int six = slen;
-    while (six > 0) {
-	six--;
-	dix--;
-	if ((source[six] & 0x80) == 0)
-	    dest[dix] = source[six];
-	else {
-	    byte c;
-	    ASSERT(six > 0);
-	    ASSERT((source[six] & 0xC0) == 0x80);
-	    ASSERT((source[six-1] & 0xFE) == 0xC2);
- 	    c = source[six] & 0x3F;
-	    six--;
-	    c |= source[six] << 6;
-	    dest[dix] = c;
-	}
-    }
-    return -dix;
-}
diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h
index eccdf10c75..649352ca91 100755
--- a/erts/emulator/beam/global.h
+++ b/erts/emulator/beam/global.h
@@ -1530,9 +1530,7 @@ char *erts_convert_filename_to_native(Eterm name, char *statbuf,
 Eterm erts_convert_native_to_filename(Process *p, byte *bytes);
 Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left,
 			Uint *num_built, Uint *num_eaten, Eterm tail);
-int erts_utf8_is_latin1_string(const byte *string, int len);
 int erts_utf8_to_latin1(byte* dest, const byte* source, int slen);
-int erts_utf8_to_latin1_backwards(byte* dest, const byte* source, int slen);
 #define ERTS_UTF8_OK 0
 #define ERTS_UTF8_INCOMPLETE 1
 #define ERTS_UTF8_ERROR 2
-- 
cgit v1.2.3


From 97abb095cd2182d5c3fafd525da4943ef74dc8e5 Mon Sep 17 00:00:00 2001
From: Rickard Green <rickard@erlang.org>
Date: Wed, 16 Jan 2013 23:35:13 +0100
Subject: Add utf8 atom distribution test cases

---
 erts/emulator/test/distribution_SUITE.erl | 164 ++++++++++++++++++++++++++++--
 1 file changed, 154 insertions(+), 10 deletions(-)

diff --git a/erts/emulator/test/distribution_SUITE.erl b/erts/emulator/test/distribution_SUITE.erl
index f3a177faf2..aaf6420ccd 100644
--- a/erts/emulator/test/distribution_SUITE.erl
+++ b/erts/emulator/test/distribution_SUITE.erl
@@ -37,8 +37,10 @@
 	 dist_auto_connect_never/1, dist_auto_connect_once/1,
 	 dist_parallel_send/1,
 	 atom_roundtrip/1,
-	 atom_roundtrip_r13b/1,
+	 unicode_atom_roundtrip/1,
+	 atom_roundtrip_r15b/1,
 	 contended_atom_cache_entry/1,
+	 contended_unicode_atom_cache_entry/1,
 	 bad_dist_structure/1,
 	 bad_dist_ext_receive/1,
 	 bad_dist_ext_process_info/1,
@@ -62,8 +64,9 @@ all() ->
      link_to_dead_new_node, applied_monitor_node,
      ref_port_roundtrip, nil_roundtrip, stop_dist,
      {group, trap_bif}, {group, dist_auto_connect},
-     dist_parallel_send, atom_roundtrip, atom_roundtrip_r13b,
-     contended_atom_cache_entry, bad_dist_structure, {group, bad_dist_ext}].
+     dist_parallel_send, atom_roundtrip, unicode_atom_roundtrip, atom_roundtrip_r15b,
+     contended_atom_cache_entry, contended_unicode_atom_cache_entry,
+     bad_dist_structure, {group, bad_dist_ext}].
 
 groups() -> 
     [{bulk_send, [], [bulk_send_small, bulk_send_big, bulk_send_bigbig]},
@@ -1085,19 +1088,27 @@ atom_roundtrip(Config) when is_list(Config) ->
     ?line stop_node(Node),
     ?line ok.
 
-atom_roundtrip_r13b(Config) when is_list(Config) ->
-    case ?t:is_release_available("r13b") of
+atom_roundtrip_r15b(Config) when is_list(Config) ->
+    case ?t:is_release_available("r15b") of
 	true ->
 	    ?line AtomData = atom_data(),
 	    ?line verify_atom_data(AtomData),
-	    ?line {ok, Node} = start_node(Config, [], "r13b"),
+	    ?line {ok, Node} = start_node(Config, [], "r15b"),
 	    ?line do_atom_roundtrip(Node, AtomData),
 	    ?line stop_node(Node),
 	    ?line ok;
 	false ->
-	    ?line {skip,"No OTP R13B available"}
+	    ?line {skip,"No OTP R15B available"}
     end.
 
+unicode_atom_roundtrip(Config) when is_list(Config) ->
+    ?line AtomData = unicode_atom_data(),
+    ?line verify_atom_data(AtomData),
+    ?line {ok, Node} = start_node(Config),
+    ?line do_atom_roundtrip(Node, AtomData),
+    ?line stop_node(Node),
+    ?line ok.
+
 do_atom_roundtrip(Node, AtomData) ->
     ?line Parent = self(),
     ?line Proc = spawn_link(Node, fun () -> verify_atom_data_loop(Parent) end),
@@ -1133,7 +1144,32 @@ verify_atom_data(AtomData) ->
 		  end,
 		  AtomData).
 
+uc_atup(ATxt) ->
+    {string_to_atom(ATxt), ATxt}.
+
+unicode_atom_data() ->
+    [uc_atup(lists:seq(16#1f600, 16#1f600+254)),
+     uc_atup(lists:seq(16#1f600, 16#1f600+63)),
+     uc_atup(lists:seq(0, 254)),
+     uc_atup(lists:seq(100, 163)),
+     uc_atup(lists:seq(200, 354)),
+     uc_atup(lists:seq(200, 263)),
+     uc_atup(lists:seq(2000, 2254)),
+     uc_atup(lists:seq(2000, 2063)),
+     uc_atup(lists:seq(65500, 65754)),
+     uc_atup(lists:seq(65500, 65563))
+     | lists:map(fun (N) ->
+			 uc_atup(lists:seq(64000+N, 64254+N))
+		 end,
+		 lists:seq(1, 2000))].
+
 contended_atom_cache_entry(Config) when is_list(Config) ->
+    contended_atom_cache_entry_test(Config, latin1).
+
+contended_unicode_atom_cache_entry(Config) when is_list(Config) ->
+    contended_atom_cache_entry_test(Config, unicode).
+
+contended_atom_cache_entry_test(Config, Type) ->
     ?line TestServer = self(),
     ?line ProcessPairs = 10,
     ?line Msgs = 100000,
@@ -1147,7 +1183,14 @@ contended_atom_cache_entry(Config) when is_list(Config) ->
 						  true),
 		    Master = self(),
 		    CIX = get_cix(),
-		    TestAtoms = get_conflicting_atoms(CIX, ProcessPairs),
+		    TestAtoms = case Type of
+				    latin1 ->
+					get_conflicting_atoms(CIX,
+							      ProcessPairs);
+				    unicode ->
+					get_conflicting_unicode_atoms(CIX,
+								      ProcessPairs)
+				end,
 		    io:format("Testing with the following atoms all using "
 			      "cache index ~p:~n ~p~n",
 			      [CIX, TestAtoms]),
@@ -1159,8 +1202,12 @@ contended_atom_cache_entry(Config) when is_list(Config) ->
 					 fun () ->
 						 Atom = receive
 							    {Ref, txt, ATxt} ->
-								list_to_atom(
-								  ATxt)
+								case Type of
+								    latin1 ->
+									list_to_atom(ATxt);
+								    unicode ->
+									string_to_atom(ATxt)
+								end
 							end,
 						 receive_ref_atom(Ref,
 								  Atom,
@@ -1252,6 +1299,20 @@ get_conflicting_atoms(CIX, N) ->
 	    get_conflicting_atoms(CIX, N)
     end.
 
+get_conflicting_unicode_atoms(_CIX, 0) ->
+    [];
+get_conflicting_unicode_atoms(CIX, N) ->
+    {A, B, C} = now(),
+    Atom = string_to_atom([16#1f608] ++ "atom" ++ integer_to_list(A*1000000000000
+								  + B*1000000
+								  + C)),
+    case erts_debug:get_internal_state({atom_out_cache_index, Atom}) of
+	CIX ->
+	    [Atom|get_conflicting_unicode_atoms(CIX, N-1)];
+	_ ->
+	    get_conflicting_unicode_atoms(CIX, N)
+    end.
+
 -define(COOKIE, '').
 -define(DOP_LINK,		1).
 -define(DOP_SEND,		2).
@@ -2131,3 +2192,86 @@ repeat(_Fun, 0) ->
 repeat(Fun, N) ->
     Fun(),
     repeat(Fun, N-1).
+
+string_to_atom(String) ->
+    Utf8List = string_to_utf8_list(String),
+    Len = length(Utf8List),
+    TagLen = case Len < 256 of
+		 true -> [119, Len];
+		 false -> [118, Len bsr 8, Len band 16#ff]
+	     end,
+    binary_to_term(list_to_binary([131, TagLen, Utf8List])).
+
+string_to_utf8_list([]) ->
+    [];
+string_to_utf8_list([CP|CPs]) when is_integer(CP),
+				   0 =< CP,
+				   CP =< 16#7F ->
+    [CP | string_to_utf8_list(CPs)];
+string_to_utf8_list([CP|CPs]) when is_integer(CP),
+				   16#80 =< CP,
+				   CP =< 16#7FF ->
+    [16#C0 bor (CP bsr 6),
+     16#80 bor (16#3F band CP)
+     | string_to_utf8_list(CPs)];
+string_to_utf8_list([CP|CPs]) when is_integer(CP),
+				   16#800 =< CP,
+				   CP =< 16#FFFF ->
+    [16#E0 bor (CP bsr 12),
+     16#80 bor (16#3F band (CP bsr 6)),
+     16#80 bor (16#3F band CP)
+     | string_to_utf8_list(CPs)];
+string_to_utf8_list([CP|CPs]) when is_integer(CP),
+				   16#10000 =< CP,
+				   CP =< 16#10FFFF ->
+    [16#F0 bor (CP bsr 18),
+     16#80 bor (16#3F band (CP bsr 12)),
+     16#80 bor (16#3F band (CP bsr 6)),
+     16#80 bor (16#3F band CP)
+     | string_to_utf8_list(CPs)].
+
+utf8_list_to_string([]) ->
+    [];
+utf8_list_to_string([B|Bs]) when is_integer(B),
+				 0 =< B,
+				 B =< 16#7F ->
+    [B | utf8_list_to_string(Bs)];
+utf8_list_to_string([B0, B1 | Bs]) when is_integer(B0),
+					16#C0 =< B0,
+					B0 =< 16#DF,
+					is_integer(B1),
+					16#80 =< B1,
+					B1 =< 16#BF ->
+    [(((B0 band 16#1F) bsl 6)
+	  bor (B1 band 16#3F))
+     | utf8_list_to_string(Bs)];
+utf8_list_to_string([B0, B1, B2 | Bs]) when is_integer(B0),
+					    16#E0 =< B0,
+					    B0 =< 16#EF,
+					    is_integer(B1),
+					    16#80 =< B1,
+					    B1 =< 16#BF,
+					    is_integer(B2),
+					    16#80 =< B2,
+					    B2 =< 16#BF ->
+    [(((B0 band 16#F) bsl 12)
+	  bor ((B1 band 16#3F) bsl 6)
+	  bor (B2 band 16#3F))
+     | utf8_list_to_string(Bs)];
+utf8_list_to_string([B0, B1, B2, B3 | Bs]) when is_integer(B0),
+						16#F0 =< B0,
+						B0 =< 16#F7,
+						is_integer(B1),
+						16#80 =< B1,
+						B1 =< 16#BF,
+						is_integer(B2),
+						16#80 =< B2,
+						B2 =< 16#BF,
+						is_integer(B3),
+						16#80 =< B3,
+						B3 =< 16#BF ->
+    [(((B0 band 16#7) bsl 18)
+	  bor ((B1 band 16#3F) bsl 12)
+	  bor ((B2 band 16#3F) bsl 6)
+	  bor (B3 band 16#3F))
+     | utf8_list_to_string(Bs)].
-- 
cgit v1.2.3


From b553664f54034e8c04ae6f9cc44f16b7f516518b Mon Sep 17 00:00:00 2001
From: Sverker Eriksson <sverker@erlang.org>
Date: Fri, 11 Jan 2013 17:27:29 +0100
Subject: erl_interface: utf8 atoms continued

---
 lib/erl_interface/doc/src/ei.xml                   |  59 +++-
 lib/erl_interface/doc/src/erl_eterm.xml            |  18 +-
 lib/erl_interface/include/ei.h                     |  35 +-
 lib/erl_interface/include/erl_interface.h          |  40 ++-
 lib/erl_interface/src/connect/ei_connect.c         |   5 +-
 lib/erl_interface/src/connect/ei_connect_int.h     |   2 +
 lib/erl_interface/src/connect/eirecv.c             |  12 +-
 lib/erl_interface/src/decode/decode_atom.c         | 185 +++++++---
 lib/erl_interface/src/decode/decode_boolean.c      |   8 +-
 lib/erl_interface/src/decode/decode_fun.c          |   6 +-
 lib/erl_interface/src/decode/decode_pid.c          |   3 +-
 lib/erl_interface/src/decode/decode_port.c         |   3 +-
 lib/erl_interface/src/decode/decode_ref.c          |   6 +-
 lib/erl_interface/src/encode/encode_atom.c         | 100 +++++-
 lib/erl_interface/src/encode/encode_fun.c          |   4 +-
 lib/erl_interface/src/encode/encode_pid.c          |  24 +-
 lib/erl_interface/src/encode/encode_port.c         |  23 +-
 lib/erl_interface/src/encode/encode_ref.c          |  24 +-
 lib/erl_interface/src/legacy/erl_connect.c         |  16 +-
 lib/erl_interface/src/legacy/erl_eterm.c           | 158 +++++++--
 lib/erl_interface/src/legacy/erl_eterm.h           |   4 +-
 lib/erl_interface/src/legacy/erl_format.c          |  20 +-
 lib/erl_interface/src/legacy/erl_malloc.c          |  20 +-
 lib/erl_interface/src/legacy/erl_marshal.c         | 259 ++++++++------
 lib/erl_interface/src/legacy/global_whereis.c      |  11 +-
 lib/erl_interface/src/misc/ei_decode_term.c        |  13 +-
 lib/erl_interface/src/misc/ei_printterm.c          |   5 +-
 lib/erl_interface/src/misc/ei_x_encode.c           |  21 +-
 lib/erl_interface/src/misc/get_type.c              |  77 +---
 lib/erl_interface/src/misc/putget.h                |   6 +-
 lib/erl_interface/src/misc/show_msg.c              |  12 +-
 lib/erl_interface/test/ei_decode_encode_SUITE.erl  |  66 +++-
 .../ei_decode_encode_test.c                        | 386 ++++++++++++---------
 lib/ic/examples/all-against-all/client.c           |   1 +
 lib/ic/examples/c-client/client.c                  |   1 +
 lib/ic/examples/c-server/client.c                  |   1 +
 36 files changed, 1062 insertions(+), 572 deletions(-)

diff --git a/lib/erl_interface/doc/src/ei.xml b/lib/erl_interface/doc/src/ei.xml
index 539e16d837..0b0b1eeb79 100644
--- a/lib/erl_interface/doc/src/ei.xml
+++ b/lib/erl_interface/doc/src/ei.xml
@@ -82,6 +82,22 @@
       function returns the size required (note that for strings an
       extra byte is needed for the 0 string terminator).</p>
   </description>
+  <section>
+    <title>DATA TYPES</title>
+
+    <taglist>
+      <tag><marker id="erlang_char_encoding"/>enum erlang_char_encoding</tag>
+       <item>
+	 <p/>
+	 <code type="none">
+enum erlang_char_encoding {
+    ERLANG_ASCII, ERLANG_LATIN1, ERLANG_UTF8, ERLANG_WHATEVER
+};
+</code>
+         <p>The character encoding used for atoms.</p>
+      </item>
+    </taglist>
+  </section>
   <funcs>
     <func>
       <name><ret>void</ret><nametext>ei_set_compat_rel(release_number)</nametext></name>
@@ -225,11 +241,31 @@
       <fsummary>Encode an atom</fsummary>
       <desc>
         <p>Encodes an atom in the binary format. The <c><![CDATA[p]]></c> parameter
-          is the name of the atom. Only upto <c><![CDATA[MAXATOMLEN]]></c> bytes
+          is the name of the atom in latin1 encoding. Only upto <c>MAXATOMLEN-1</c> bytes
           are encoded. The name should be zero-terminated, except for
           the <c><![CDATA[ei_x_encode_atom_len()]]></c> function.</p>
       </desc>
     </func>
+    <func>
+      <name><ret>int</ret><nametext>ei_encode_atom_as(char *buf, int *index, const char *p, enum erlang_char_encoding from_enc, enum erlang_char_encoding to_enc)</nametext></name>
+      <name><ret>int</ret><nametext>ei_encode_atom_len_as(char *buf, int *index, const char *p, int len, enum erlang_char_encoding from_enc, enum erlang_char_encoding to_enc)</nametext></name>
+      <name><ret>int</ret><nametext>ei_x_encode_atom_as(ei_x_buff* x, const char *p, enum erlang_char_encoding from_enc, enum erlang_char_encoding to_enc)</nametext></name>
+      <name><ret>int</ret><nametext>ei_x_encode_atom_len_as(ei_x_buff* x, const char *p, int len, enum erlang_char_encoding from_enc, enum erlang_char_encoding to_enc)</nametext></name>
+      <fsummary>Encode an atom</fsummary>
+      <desc>
+        <p>Encodes an atom in the binary format with character encoding
+	<c><seealso marker="#erlang_char_encoding">to_enc</seealso></c> (latin1 or utf8).
+	The <c>p</c> parameter is the name of the atom with character encoding
+	<c><seealso marker="#erlang_char_encoding">from_enc</seealso></c>.
+	The name must either be zero-terminated or a function variant with a <c>len</c>
+	parameter must be used.</p>
+	<p>The encoding will fail if the atom is too long or if it can not be represented
+	with character encoding <c>to_enc</c>.</p>
+	<p>These functions were introduced in R16 release of Erlang/OTP as part of a first step
+	to support UTF8 atoms. Atoms encoded with <c>ERLANG_UTF8</c>
+	can not be decoded by earlier releases than R16.</p>
+      </desc>
+    </func>
     <func>
       <name><ret>int</ret><nametext>ei_encode_binary(char *buf, int *index, const void *p, long len)</nametext></name>
       <name><ret>int</ret><nametext>ei_x_encode_binary(ei_x_buff* x, const void *p, long len)</nametext></name>
@@ -490,10 +526,29 @@ ei_x_encode_empty_list(&amp;x);
       <fsummary>Decode an atom</fsummary>
       <desc>
         <p>This function decodes an atom from the binary format.  The
-          name of the atom is placed at <c><![CDATA[p]]></c>. There can be at most
+	null terminated name of the atom is placed at <c><![CDATA[p]]></c>. There can be at most
           <c><![CDATA[MAXATOMLEN]]></c> bytes placed in the buffer.</p>
       </desc>
     </func>
+    <func>
+      <name><ret>int</ret><nametext>ei_decode_atom_as(const char *buf, int *index, char *p, int plen, enum erlang_char_encoding want, enum erlang_char_encoding* was, enum erlang_char_encoding* result)</nametext></name>
+      <fsummary>Decode an atom</fsummary>
+      <desc>
+        <p>This function decodes an atom from the binary format. The
+	null terminated name of the atom is placed in buffer at <c>p</c> of length
+	<c>plen</c> bytes.</p>
+	<p>The wanted string encoding is specified by <c><seealso marker="#erlang_char_encoding">
+	want</seealso></c>. The original encoding used in the
+	binary format (latin1 or utf8) can be obtained from <c>*was</c>. The actual encoding of the resulting string
+	(7-bit ascii, latin1 or utf8) can be obtained from <c>*result</c>. Both <c>was</c> and <c>result</c> can be <c>NULL</c>.
+	<c>*result</c> may differ from <c>want</c> if <c>want</c> is <c>ERLANG_WHATEVER</c> or if
+	<c>*result</c> turn out to be pure 7-bit ascii (compatible with both latin1 and utf8).</p>
+	<p>This function fails if the atom is too long for the buffer
+	or if it can not be represented with encoding <c>want</c>.</p>
+	<p>This functions was introduced in R16 release of Erlang/OTP as part of a first step
+	to support UTF8 atoms.</p>
+      </desc>
+    </func>
     <func>
       <name><ret>int</ret><nametext>ei_decode_binary(const char *buf, int *index, void *p, long *len)</nametext></name>
       <fsummary>Decode a binary</fsummary>
diff --git a/lib/erl_interface/doc/src/erl_eterm.xml b/lib/erl_interface/doc/src/erl_eterm.xml
index f403618c59..c7840d7813 100644
--- a/lib/erl_interface/doc/src/erl_eterm.xml
+++ b/lib/erl_interface/doc/src/erl_eterm.xml
@@ -77,10 +77,12 @@
       </p>
     <taglist>
       <tag><c><![CDATA[char *ERL_ATOM_PTR(t)]]></c></tag>
+      <tag><c><![CDATA[char *ERL_ATOM_PTR_UTF8(t)]]></c></tag>
       <item>A string representing atom <c><![CDATA[t]]></c>.
       </item>
       <tag><c><![CDATA[int ERL_ATOM_SIZE(t)]]></c></tag>
-      <item>The length (in characters) of atom t.</item>
+      <tag><c><![CDATA[int ERL_ATOM_SIZE_UTF8(t)]]></c></tag>
+      <item>The length (in bytes) of atom t.</item>
       <tag><c><![CDATA[void *ERL_BIN_PTR(t)]]></c></tag>
       <item>A pointer to the contents of <c><![CDATA[t]]></c></item>
       <tag><c><![CDATA[int ERL_BIN_SIZE(t)]]></c></tag>
@@ -92,6 +94,7 @@
       <tag><c><![CDATA[double ERL_FLOAT_VALUE(t)]]></c></tag>
       <item>The floating point value of <c><![CDATA[t]]></c>.</item>
       <tag><c><![CDATA[ETERM *ERL_PID_NODE(t)]]></c></tag>
+      <tag><c><![CDATA[ETERM *ERL_PID_NODE_UTF8(t)]]></c></tag>
       <item>The Node in pid <c><![CDATA[t]]></c>.</item>
       <tag><c><![CDATA[int ERL_PID_NUMBER(t)]]></c></tag>
       <item>The sequence number in pid <c><![CDATA[t]]></c>.</item>
@@ -104,6 +107,7 @@
       <tag><c><![CDATA[int ERL_PORT_CREATION(t)]]></c></tag>
       <item>The creation number in port <c><![CDATA[t]]></c>.</item>
       <tag><c><![CDATA[ETERM *ERL_PORT_NODE(t)]]></c></tag>
+      <tag><c><![CDATA[ETERM *ERL_PORT_NODE_UTF8(t)]]></c></tag>
       <item>The node in port <c><![CDATA[t]]></c>.</item>
       <tag><c><![CDATA[int ERL_REF_NUMBER(t)]]></c></tag>
       <item>The first part of the reference number in ref <c><![CDATA[t]]></c>. Use
@@ -296,7 +300,7 @@ iohead ::= Binary
       <name><ret>ETERM *</ret><nametext>erl_mk_atom(string)</nametext></name>
       <fsummary>Creates an atom</fsummary>
       <type>
-        <v>char *string;</v>
+        <v>const char *string;</v>
       </type>
       <desc>
         <p>Creates an atom.</p>
@@ -305,10 +309,12 @@ iohead ::= Binary
         <p>Returns an Erlang term containing an atom. Note that it is
           the callers responsibility to make sure that <c><![CDATA[string]]></c>
           contains a valid name for an atom.</p>
-        <p><c><![CDATA[ERL_ATOM_PTR(atom)]]></c> can be used to retrieve the 
-          atom name (as a string). Note that the string is not
-          0-terminated in the atom. <c><![CDATA[ERL_ATOM_SIZE(atom)]]></c>returns
-          the length of the atom name.</p>
+        <p><c><![CDATA[ERL_ATOM_PTR(atom)]]></c> and <c><![CDATA[ERL_ATOM_PTR_UTF8(atom)]]></c>
+	can be used to retrieve the atom name (as a null terminated string). <c><![CDATA[ERL_ATOM_SIZE(atom)]]></c>
+	and <c><![CDATA[ERL_ATOM_SIZE_UTF8(atom)]]></c> returns the length of the atom name.</p>
+	<note><p>Note that the UTF8 variants were introduced in Erlang/OTP releases R16
+	and the string returned by <c>ERL_ATOM_PTR(atom)</c> was not null terminated on older releases.</p>
+	</note>
       </desc>
     </func>
     <func>
diff --git a/lib/erl_interface/include/ei.h b/lib/erl_interface/include/ei.h
index 8f07b24852..20e575f64d 100644
--- a/lib/erl_interface/include/ei.h
+++ b/lib/erl_interface/include/ei.h
@@ -116,7 +116,8 @@
 #define NEW_FLOAT_EXT         'F'
 #define ERL_ATOM_EXT          'd'
 #define ERL_SMALL_ATOM_EXT    's'
-#define ERL_UNICODE_ATOM_EXT  'v'
+#define ERL_ATOM_UTF8_EXT     'v'
+#define ERL_SMALL_ATOM_UTF8_EXT 'w'
 #define ERL_REFERENCE_EXT     'e'
 #define ERL_NEW_REFERENCE_EXT 'r'
 #define ERL_PORT_EXT          'f'
@@ -185,12 +186,16 @@ extern volatile int __erl_errno;
 #define EI_MAXHOSTNAMELEN 64
 #define EI_MAXALIVELEN 63
 #define EI_MAX_COOKIE_SIZE 512
-#define MAXATOMLEN 255
+#define MAXATOMLEN (255 + 1)
+#define MAXATOMLEN_UTF8 (255*4 + 1)
 #define MAXNODELEN EI_MAXALIVELEN+1+EI_MAXHOSTNAMELEN
 
+enum erlang_char_encoding { ERLANG_ASCII, ERLANG_LATIN1, ERLANG_UTF8, ERLANG_WHATEVER };
+
 /* a pid */
 typedef struct {
-  char node[MAXATOMLEN+1];
+  char node[MAXATOMLEN_UTF8];
+  enum erlang_char_encoding node_org_enc;
   unsigned int num;
   unsigned int serial;
   unsigned int creation;
@@ -198,14 +203,16 @@ typedef struct {
 
 /* a port */
 typedef struct {
-  char node[MAXATOMLEN+1];
+  char node[MAXATOMLEN_UTF8];
+  enum erlang_char_encoding node_org_enc;
   unsigned int id;
   unsigned int creation;
 } erlang_port;
 
 /* a ref */
 typedef struct {
-  char node[MAXATOMLEN+1];
+  char node[MAXATOMLEN_UTF8];
+  enum erlang_char_encoding node_org_enc;  
   int len;
   unsigned int n[3];
   unsigned int creation;
@@ -225,15 +232,16 @@ typedef struct {
   long msgtype;
   erlang_pid from;
   erlang_pid to;
-  char toname[MAXATOMLEN+1];
-  char cookie[MAXATOMLEN+1];
+  char toname[MAXATOMLEN_UTF8];
+  char cookie[MAXATOMLEN_UTF8];
   erlang_trace token;
 } erlang_msg;
 
 /* a fun */
 typedef struct {
     long arity;
-    char module[MAXATOMLEN+1];
+    char module[MAXATOMLEN_UTF8];
+    enum erlang_char_encoding module_org_enc;
     char md5[16];
     long index;
     long old_index;
@@ -258,7 +266,7 @@ typedef struct {
     union {
 	long i_val;
 	double d_val;
-	char atom_name[MAXATOMLEN+1];
+	char atom_name[MAXATOMLEN_UTF8];
 	erlang_pid pid;
 	erlang_port port;
 	erlang_ref ref;
@@ -427,9 +435,17 @@ int ei_encode_string_len(char *buf, int *index, const char *p, int len);
 int ei_x_encode_string(ei_x_buff* x, const char* s);
 int ei_x_encode_string_len(ei_x_buff* x, const char* s, int len);
 int ei_encode_atom(char *buf, int *index, const char *p);
+int ei_encode_atom_as(char *buf, int *index, const char *p,
+		    enum erlang_char_encoding from, enum erlang_char_encoding to);
 int ei_encode_atom_len(char *buf, int *index, const char *p, int len);
+int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len,
+			enum erlang_char_encoding from, enum erlang_char_encoding to);
 int ei_x_encode_atom(ei_x_buff* x, const char* s);
+int ei_x_encode_atom_as(ei_x_buff* x, const char* s,
+		      enum erlang_char_encoding from, enum erlang_char_encoding to);
 int ei_x_encode_atom_len(ei_x_buff* x, const char* s, int len);
+int ei_x_encode_atom_len_as(ei_x_buff* x, const char* s, int len,
+			  enum erlang_char_encoding from, enum erlang_char_encoding to);
 int ei_encode_binary(char *buf, int *index, const void *p, long len);
 int ei_x_encode_binary(ei_x_buff* x, const void* s, int len);
 int ei_encode_pid(char *buf, int *index, const erlang_pid *p);
@@ -479,6 +495,7 @@ int ei_decode_boolean(const char *buf, int *index, int *p);
 int ei_decode_char(const char *buf, int *index, char *p);
 int ei_decode_string(const char *buf, int *index, char *p);
 int ei_decode_atom(const char *buf, int *index, char *p);
+int ei_decode_atom_as(const char *buf, int *index, char *p, int destlen, enum erlang_char_encoding want, enum erlang_char_encoding* was, enum erlang_char_encoding* result);
 int ei_decode_binary(const char *buf, int *index, void *p, long *len);
 int ei_decode_fun(const char* buf, int* index, erlang_fun* p);
 void free_fun(erlang_fun* f);
diff --git a/lib/erl_interface/include/erl_interface.h b/lib/erl_interface/include/erl_interface.h
index 1c4a94700d..98acc0d71d 100644
--- a/lib/erl_interface/include/erl_interface.h
+++ b/lib/erl_interface/include/erl_interface.h
@@ -95,19 +95,24 @@
 
 #define ERL_FLOAT_VALUE(x) ((x)->uval.fval.f)
 
-#define ERL_ATOM_PTR(x) ((x)->uval.aval.a)
-#define ERL_ATOM_SIZE(x) ((x)->uval.aval.len)
+#define ERL_ATOM_PTR(x) erl_atom_ptr_latin1((Erl_Atom_data*) &(x)->uval.aval.d)
+#define ERL_ATOM_PTR_UTF8(x) erl_atom_ptr_utf8((Erl_Atom_data*) &(x)->uval.aval.d)
+#define ERL_ATOM_SIZE(x) erl_atom_size_latin1((Erl_Atom_data*) &(x)->uval.aval.d)
+#define ERL_ATOM_SIZE_UTF8(x) erl_atom_size_utf8((Erl_Atom_data*) &(x)->uval.aval.d)
 
-#define ERL_PID_NODE(x) ((x)->uval.pidval.node)
+#define ERL_PID_NODE(x) erl_atom_ptr_latin1((Erl_Atom_data*) &(x)->uval.pidval.node)
+#define ERL_PID_NODE_UTF8(x) erl_atom_ptr_utf8((Erl_Atom_data*) &(x)->uval.pidval.node)
 #define ERL_PID_NUMBER(x) ((x)->uval.pidval.number)
 #define ERL_PID_SERIAL(x) ((x)->uval.pidval.serial)
 #define ERL_PID_CREATION(x) ((x)->uval.pidval.creation)
 
-#define ERL_PORT_NODE(x) ((x)->uval.portval.node)
+#define ERL_PORT_NODE(x) erl_atom_ptr_latin1((Erl_Atom_data*) &(x)->uval.portval.node)
+#define ERL_PORT_NODE_UTF8(x) erl_atom_ptr_utf8((Erl_Atom_data*) &(x)->uval.portval.node)
 #define ERL_PORT_NUMBER(x) ((x)->uval.portval.number)
 #define ERL_PORT_CREATION(x) ((x)->uval.portval.creation)
 
-#define ERL_REF_NODE(x) ((x)->uval.refval.node)
+#define ERL_REF_NODE(x) erl_atom_ptr_latin1((Erl_Atom_data*) &(x)->uval.refval.node)
+#define ERL_REF_NODE_UTF8(x) erl_atom_ptr_utf8((Erl_Atom_data*) &(x)->uval.refval.node)
 #define ERL_REF_NUMBER(x) ((x)->uval.refval.n[0])
 #define ERL_REF_NUMBERS(x) ((x)->uval.refval.n)
 #define ERL_REF_LEN(x) ((x)->uval.refval.len)
@@ -182,15 +187,27 @@ typedef struct {
   double f;
 } Erl_Float;
 
+typedef struct {
+  char *utf8;
+  int lenU;
+  char *latin1;
+  int lenL;
+} Erl_Atom_data;
+
+char* erl_atom_ptr_latin1(Erl_Atom_data*);
+char* erl_atom_ptr_utf8(Erl_Atom_data*);
+int erl_atom_size_latin1(Erl_Atom_data*);
+int erl_atom_size_utf8(Erl_Atom_data*);
+char* erl_atom_init_latin1(Erl_Atom_data*, const char*);
+
 typedef struct {
   Erl_Header h;
-  int len;
-  char *a;
+  Erl_Atom_data d;
 } Erl_Atom;
 
 typedef struct {
   Erl_Header h;
-  char * node;
+  Erl_Atom_data node;
   unsigned int number;
   unsigned int serial;
   unsigned char creation;
@@ -198,14 +215,14 @@ typedef struct {
 
 typedef struct {    
   Erl_Header h;
-  char * node;
+  Erl_Atom_data node;
   unsigned int number;
   unsigned char creation;
 } Erl_Port;
 
 typedef struct {
   Erl_Header h;
-  char * node;
+  Erl_Atom_data node;
   int len;
   unsigned int n[3];
   unsigned char creation;
@@ -289,7 +306,7 @@ typedef struct _eterm {
 } ETERM;
 
 
-#define MAXREGLEN 255  /* max length of registered (atom) name */
+#define MAXREGLEN (255*4)  /* max length of registered (atom) name */
 
 typedef struct {
   int type;   /* one of the message type constants in eiext.h */
@@ -409,6 +426,7 @@ unsigned char erl_ext_type(unsigned char*); /* Note: returned 'char' before R9C
 unsigned char *erl_peek_ext(unsigned char*,int);
 int    erl_term_len(ETERM*);
 
+int cmp_latin1_vs_utf8(const char* sL, int lenL, const char* sU, int lenU);
 
 /* -------------------------------------------------------------------- */
 /*                      Wrappers around ei functions                    */
diff --git a/lib/erl_interface/src/connect/ei_connect.c b/lib/erl_interface/src/connect/ei_connect.c
index 34362b4b9f..a17257795e 100644
--- a/lib/erl_interface/src/connect/ei_connect.c
+++ b/lib/erl_interface/src/connect/ei_connect.c
@@ -459,6 +459,7 @@ int ei_connect_xinit(ei_cnode* ec, const char *thishostname,
 /*    memmove(&ec->this_ipaddr, thisipaddr, sizeof(ec->this_ipaddr)); */
     
     strcpy(ec->self.node,thisnodename);
+    ec->self.node_org_enc = ERLANG_LATIN1;
     ec->self.num = 0;
     ec->self.serial = 0;
     ec->self.creation = creation;
@@ -1332,7 +1333,9 @@ static int send_name_or_challenge(int fd, char *nodename,
 		| DFLAG_EXTENDED_PIDS_PORTS
 		| DFLAG_FUN_TAGS
 		| DFLAG_NEW_FUN_TAGS
-                | DFLAG_NEW_FLOATS));
+                | DFLAG_NEW_FLOATS
+		| DFLAG_SMALL_ATOM_TAGS
+		| DFLAG_UTF8_ATOMS));
     if (f_chall)
 	put32be(s, challenge);
     memcpy(s, nodename, strlen(nodename));
diff --git a/lib/erl_interface/src/connect/ei_connect_int.h b/lib/erl_interface/src/connect/ei_connect_int.h
index 3c42b49b82..81c384e38d 100644
--- a/lib/erl_interface/src/connect/ei_connect_int.h
+++ b/lib/erl_interface/src/connect/ei_connect_int.h
@@ -102,6 +102,8 @@ extern int h_errno;
 #define DFLAG_NEW_FUN_TAGS        0x80
 #define DFLAG_EXTENDED_PIDS_PORTS 0x100
 #define DFLAG_NEW_FLOATS          0x800
+#define DFLAG_SMALL_ATOM_TAGS     0x4000
+#define DFLAG_UTF8_ATOMS          0x10000
 
 ei_cnode   *ei_fd_to_cnode(int fd);
 int         ei_distversion(int fd);
diff --git a/lib/erl_interface/src/connect/eirecv.c b/lib/erl_interface/src/connect/eirecv.c
index 86852f947d..075f78e3d2 100644
--- a/lib/erl_interface/src/connect/eirecv.c
+++ b/lib/erl_interface/src/connect/eirecv.c
@@ -108,7 +108,7 @@ ei_recv_internal (int fd,
   switch (msg->msgtype) {
   case ERL_SEND:          /* { SEND, Cookie, ToPid } */
     if (ei_tracelevel >= 4) show_this_msg = 1;
-    if (ei_decode_atom(header,&index,msg->cookie) 
+    if (ei_decode_atom_as(header,&index,msg->cookie,sizeof(msg->cookie),ERLANG_UTF8,NULL,NULL) 
 	|| ei_decode_pid(header,&index,&msg->to))
     {
 	erl_errno = EIO;
@@ -120,8 +120,8 @@ ei_recv_internal (int fd,
   case ERL_REG_SEND:     /* { REG_SEND, From, Cookie, ToName } */
     if (ei_tracelevel >= 4) show_this_msg = 1;
     if (ei_decode_pid(header,&index,&msg->from) 
-	|| ei_decode_atom(header,&index,msg->cookie) 
-	|| ei_decode_atom(header,&index,msg->toname))
+	|| ei_decode_atom_as(header,&index,msg->cookie,sizeof(msg->cookie),ERLANG_UTF8,NULL,NULL) 
+	|| ei_decode_atom_as(header,&index,msg->toname,sizeof(msg->toname),ERLANG_UTF8,NULL,NULL))
     {
 	erl_errno = EIO;
 	return -1;
@@ -157,7 +157,7 @@ ei_recv_internal (int fd,
     
   case ERL_SEND_TT:      /* { SEND_TT, Cookie, ToPid, TraceToken } */
     if (ei_tracelevel >= 4) show_this_msg = 1;
-    if (ei_decode_atom(header,&index,msg->cookie) 
+    if (ei_decode_atom_as(header,&index,msg->cookie,sizeof(msg->cookie),ERLANG_UTF8,NULL,NULL) 
 	|| ei_decode_pid(header,&index,&msg->to)
 	|| ei_decode_trace(header,&index,&msg->token))
     {
@@ -171,8 +171,8 @@ ei_recv_internal (int fd,
   case ERL_REG_SEND_TT: /* { REG_SEND_TT, From, Cookie, ToName, TraceToken } */
     if (ei_tracelevel >= 4) show_this_msg = 1;
     if (ei_decode_pid(header,&index,&msg->from) 
-	|| ei_decode_atom(header,&index,msg->cookie) 
-	|| ei_decode_atom(header,&index,msg->toname)
+	|| ei_decode_atom_as(header,&index,msg->cookie,sizeof(msg->cookie),ERLANG_UTF8,NULL,NULL) 
+	|| ei_decode_atom_as(header,&index,msg->toname,sizeof(msg->toname),ERLANG_UTF8,NULL,NULL)
 	|| ei_decode_trace(header,&index,&msg->token))
     {
 	erl_errno = EIO;
diff --git a/lib/erl_interface/src/decode/decode_atom.c b/lib/erl_interface/src/decode/decode_atom.c
index 84edf1766a..2ada418243 100644
--- a/lib/erl_interface/src/decode/decode_atom.c
+++ b/lib/erl_interface/src/decode/decode_atom.c
@@ -21,76 +21,155 @@
 #include "eiext.h"
 #include "putget.h"
 
-static int utf8_to_latin1(char* dest, const char* source, unsigned len);
 
 int ei_decode_atom(const char *buf, int *index, char *p)
 {
-  const char *s = buf + *index;
-  const char *s0 = s;
-  int len;
-
-  switch (get8(s)) {
-  case ERL_ATOM_EXT:
-      len = get16be(s);
-      if (len > MAXATOMLEN) return -1;
-      if (p) {
-	  memmove(p,s,len); 
-	  p[len] = (char)0;
-      }
-      break;
-
-  case ERL_SMALL_ATOM_EXT:
-      len = get8(s);
-      if (p) {
-	  memmove(p,s,len); 
-	  p[len] = (char)0;
-      }
-      break;
-
-  case ERL_UNICODE_ATOM_EXT:
-      len = get16be(s);
-
-      if (len > 2*MAXATOMLEN) return -1;
-
-      if (p && utf8_to_latin1(p, s, len) < 0) return -1;
-      break;
-
-  default:
-      return -1;
-  }
-
-  s += len;
-  *index += s-s0;
-  return 0;
+    return ei_decode_atom_as(buf, index, p, MAXATOMLEN, ERLANG_LATIN1, NULL, NULL);
 }
 
-int ei_internal_get_atom(const char** bufp, char* p)
+int ei_decode_atom_as(const char *buf, int *index, char* p, int destlen,
+		      enum erlang_char_encoding want_enc,
+		      enum erlang_char_encoding* was_encp,
+		      enum erlang_char_encoding* res_encp)
 {
-    int ix = 0;
-    if (ei_decode_atom(*bufp, &ix, p) < 0) return -1;
-    *bufp += ix;
+    const char *s = buf + *index;
+    const char *s0 = s;
+    int len;
+    enum erlang_char_encoding got_enc;
+    
+    switch (get8(s)) {
+    case ERL_ATOM_EXT:
+	len = get16be(s);
+	got_enc = ERLANG_LATIN1;
+	break;
+    case ERL_SMALL_ATOM_EXT:
+	len = get8(s);
+	got_enc = ERLANG_LATIN1;
+	break;
+    case ERL_ATOM_UTF8_EXT:
+	len = get16be(s);
+	got_enc = ERLANG_UTF8;
+	break;
+    case ERL_SMALL_ATOM_UTF8_EXT:
+	len = get8(s);
+	got_enc = ERLANG_UTF8;
+	break;
+    default:
+	return -1;
+    }
+    
+    if (want_enc == got_enc || want_enc == ERLANG_WHATEVER || want_enc == ERLANG_ASCII) {
+	int i, found_non_ascii = 0;
+	if (len >= destlen)
+	    return -1;
+	for (i=0; i<len; i++) {
+	    if (s[i] & 0x80) found_non_ascii = 1;
+	    if (p) p[i] = s[i];
+	}
+	if (p) p[len] = 0;
+	if (want_enc == ERLANG_ASCII && found_non_ascii) {
+	    return -1;
+	}
+	if (res_encp) {
+	    *res_encp = found_non_ascii ? got_enc : ERLANG_ASCII;
+	}
+    }
+    else {
+	int plen = (got_enc == ERLANG_LATIN1) ?
+	    utf8_to_latin1(p, s, len, destlen-1, res_encp) :
+	    latin1_to_utf8(p, s, len, destlen-1, res_encp);
+	if (plen < 0) return -1;
+	if (p) p[plen] = 0;
+    }
+    if (was_encp) {
+	*was_encp = got_enc;
+    }
+    
+    s += len;
+    *index += s-s0;
     return 0;
-}
+}	
+
 
-static int utf8_to_latin1(char* dest, const char* source, unsigned slen)
+int utf8_to_latin1(char* dst, const char* src, int slen, int destlen,
+		   enum erlang_char_encoding* res_encp)
 {
-    const char* dest_end = dest + MAXATOMLEN - 1;
+    const char* const dst_start = dst;
+    const char* const dst_end = dst + destlen;
+    int found_non_ascii = 0;
 
-    while (slen > 0 && dest < dest_end) {
-	if ((source[0] & 0x80) == 0) {
-	    *dest++ = *source++;
+    while (slen > 0) {
+	if (dst >= dst_end) return -1;
+	if ((src[0] & 0x80) == 0) {
+	    if (dst_start) {
+		*dst = *src;
+	    }
+	    ++dst;
+	    ++src;
 	    --slen;
 	}
 	else if (slen > 1 &&
-		 (source[0] & 0xFE) == 0xC2 &&
-		 (source[1] & 0xC0) == 0x80) {
-	    *dest++ = (char) ((source[0] << 6) | (source[1] & 0x3F));
-	    source += 2;
+		 (src[0] & 0xFE) == 0xC2 &&
+		 (src[1] & 0xC0) == 0x80) {
+	    if (dst_start) {
+		*dst = (char) ((src[0] << 6) | (src[1] & 0x3F));
+	    }
+	    ++dst;
+	    src += 2;
 	    slen -= 2;
+	    found_non_ascii = 1;
 	}
 	else return -1;
     }
-    *dest = 0;
+    if (res_encp) {
+	*res_encp = found_non_ascii ? ERLANG_LATIN1 : ERLANG_ASCII;
+    }
+    return dst - dst_start;
+}
+
+int latin1_to_utf8(char* dst, const char* src, int slen, int destlen,
+		   enum erlang_char_encoding* res_encp)
+{
+    const char* const src_end = src + slen;
+    const char* const dst_start = dst;
+    const char* const dst_end = dst + destlen;
+    int found_non_ascii = 0;
+
+    while (src < src_end) {
+	if (dst >= dst_end) return -1;
+	if ((src[0] & 0x80) == 0) {
+	    if (dst_start) {
+		*dst = *src;
+	    }
+	    ++dst;
+	}
+	else {
+	    if (dst_start) {
+		unsigned char ch = *src;
+		dst[0] = 0xC0 | (ch >> 6);
+		dst[1] = 0x80 | (ch & 0x3F);
+	    }
+	    dst += 2;
+	    found_non_ascii = 1;
+	}
+	++src;
+    }
+    if (res_encp) {
+	*res_encp = found_non_ascii ? ERLANG_UTF8 : ERLANG_ASCII;
+    }
+    return dst - dst_start;
+}
+
+
+
+int ei_internal_get_atom(const char** bufp, char* p,
+			 enum erlang_char_encoding* was_encp)
+{
+    int ix = 0;
+    if (ei_decode_atom_as(*bufp, &ix, p, MAXATOMLEN_UTF8, ERLANG_UTF8, was_encp, NULL) < 0)
+	return -1;
+    *bufp += ix;
     return 0;
 }
 
+
diff --git a/lib/erl_interface/src/decode/decode_boolean.c b/lib/erl_interface/src/decode/decode_boolean.c
index 0a7a06f1d4..f20690249b 100644
--- a/lib/erl_interface/src/decode/decode_boolean.c
+++ b/lib/erl_interface/src/decode/decode_boolean.c
@@ -24,12 +24,11 @@
 /* c non-zero -> erlang "true" atom, otherwise "false" */
 int ei_decode_boolean(const char *buf, int *index, int *p)
 {
-  const char *s = buf + *index;
-  const char *s0 = s;
-  char tbuf[MAXATOMLEN+1];
+  char tbuf[6];
   int t;
 
-  if (get_atom(&s, tbuf) < 0) return -1;
+  if (ei_decode_atom_as(buf, index, tbuf, sizeof(tbuf), ERLANG_ASCII, NULL, NULL) < 0)
+      return -1;
 
   if (memcmp(tbuf, "true", 5) == 0)
       t = 1;
@@ -39,7 +38,6 @@ int ei_decode_boolean(const char *buf, int *index, int *p)
       return -1;
       
   if (p) *p = t;
-  *index += s-s0;
   return 0; 
 }
 
diff --git a/lib/erl_interface/src/decode/decode_fun.c b/lib/erl_interface/src/decode/decode_fun.c
index 64fb9e86d8..7bbef5db44 100644
--- a/lib/erl_interface/src/decode/decode_fun.c
+++ b/lib/erl_interface/src/decode/decode_fun.c
@@ -42,7 +42,8 @@ int ei_decode_fun(const char *buf, int *index, erlang_fun *p)
 	if (ei_decode_pid(s, &ix, (p == NULL ? (erlang_pid*)NULL : &p->pid)) < 0)
 	    return -1;
 	/* then the module (atom) */
-	if (ei_decode_atom(s, &ix, (p == NULL ? (char*)NULL : p->module)) < 0)
+	if (ei_decode_atom_as(s, &ix, (p == NULL ? (char*)NULL : p->module),
+			      MAXATOMLEN_UTF8, ERLANG_UTF8, &p->module_org_enc, NULL) < 0)
 	    return -1;
 	/* then the index */
 	if (ei_decode_long(s, &ix, (p == NULL ? (long*)NULL : &p->index)) < 0)
@@ -84,7 +85,8 @@ int ei_decode_fun(const char *buf, int *index, erlang_fun *p)
 	if (p != NULL) p->n_free_vars = i;
 	/* then the module (atom) */
 	ix = 0;
-	if (ei_decode_atom(s, &ix, (p == NULL ? (char*)NULL : p->module)) < 0)
+	if (ei_decode_atom_as(s, &ix, (p == NULL ? (char*)NULL : p->module),
+			      MAXATOMLEN_UTF8, ERLANG_UTF8, &p->module_org_enc, NULL) < 0)
 	    return -1;
 	/* then the old_index */
 	if (ei_decode_long(s, &ix, (p == NULL ? (long*)NULL : &p->old_index)) < 0)
diff --git a/lib/erl_interface/src/decode/decode_pid.c b/lib/erl_interface/src/decode/decode_pid.c
index a762ae499e..e79952195d 100644
--- a/lib/erl_interface/src/decode/decode_pid.c
+++ b/lib/erl_interface/src/decode/decode_pid.c
@@ -26,12 +26,11 @@ int ei_decode_pid(const char *buf, int *index, erlang_pid *p)
 {
   const char *s = buf + *index;
   const char *s0 = s;
-  int len;
   
   if (get8(s) != ERL_PID_EXT) return -1;
 
   /* first the nodename */
-  if (get_atom(&s, p->node) < 0) return -1;
+  if (get_atom(&s, p->node, &p->node_org_enc) < 0) return -1;
   
   /* now the numbers: num (4), serial (4), creation (1) */
   if (p) {
diff --git a/lib/erl_interface/src/decode/decode_port.c b/lib/erl_interface/src/decode/decode_port.c
index 6eb2bc9197..5fd96b51a4 100644
--- a/lib/erl_interface/src/decode/decode_port.c
+++ b/lib/erl_interface/src/decode/decode_port.c
@@ -25,12 +25,11 @@ int ei_decode_port(const char *buf, int *index, erlang_port *p)
 {
   const char *s = buf + *index;
   const char *s0 = s;
-  int len;
   
   if (get8(s) != ERL_PORT_EXT) return -1;
 
   /* first the nodename */
-  if (get_atom(&s, p->node) < 0) return -1;
+  if (get_atom(&s, p->node, &p->node_org_enc) < 0) return -1;
   
   /* now the numbers: num (4), creation (1) */
   if (p) {
diff --git a/lib/erl_interface/src/decode/decode_ref.c b/lib/erl_interface/src/decode/decode_ref.c
index df3c30777b..7294e5d239 100644
--- a/lib/erl_interface/src/decode/decode_ref.c
+++ b/lib/erl_interface/src/decode/decode_ref.c
@@ -26,13 +26,13 @@ int ei_decode_ref(const char *buf, int *index, erlang_ref *p)
 {
   const char *s = buf + *index;
   const char *s0 = s;
-  int count, len, i;
+  int count, i;
   
   switch (get8(s)) {
     case ERL_REFERENCE_EXT:
 
       /* nodename */
-      if (get_atom(&s, p->node) < 0) return -1;
+      if (get_atom(&s, p->node, &p->node_org_enc) < 0) return -1;
   
       /* now the numbers: num (4), creation (1) */
       if (p) {
@@ -53,7 +53,7 @@ int ei_decode_ref(const char *buf, int *index, erlang_ref *p)
       if (p) p->len = count;
 
       /* then the nodename */
-      if (get_atom(&s, p->node) < 0) return -1;
+      if (get_atom(&s, p->node, &p->node_org_enc) < 0) return -1;
 
       /* creation */
       if (p) {
diff --git a/lib/erl_interface/src/encode/encode_atom.c b/lib/erl_interface/src/encode/encode_atom.c
index 6f41f045e0..a3d7c4c759 100644
--- a/lib/erl_interface/src/encode/encode_atom.c
+++ b/lib/erl_interface/src/encode/encode_atom.c
@@ -26,25 +26,95 @@ int ei_encode_atom(char *buf, int *index, const char *p)
 {
     size_t len = strlen(p);
 
-    if (len >= INT_MAX) return -1;
-    return ei_encode_atom_len(buf, index, p, len);
+    if (len >= MAXATOMLEN)
+	len = MAXATOMLEN - 1;
+    return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, ERLANG_LATIN1);
 }
 
 int ei_encode_atom_len(char *buf, int *index, const char *p, int len)
+{
+    /* This function is documented to truncate at MAXATOMLEN (256) */ 
+    if (len >= MAXATOMLEN)
+	len = MAXATOMLEN - 1;
+    return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, ERLANG_LATIN1);
+}
+
+int ei_encode_atom_as(char *buf, int *index, const char *p,
+		      enum erlang_char_encoding from_enc,
+		      enum erlang_char_encoding to_enc)
+{	
+    return ei_encode_atom_len_as(buf, index, p, strlen(p), from_enc, to_enc);
+}
+
+int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len,
+			  enum erlang_char_encoding from_enc,
+			  enum erlang_char_encoding to_enc)
 {
   char *s = buf + *index;
   char *s0 = s;
+  int offs;
 
-  /* This function is documented to truncate at MAXATOMLEN (256) */ 
-  if (len > MAXATOMLEN)
-    len = MAXATOMLEN;
+  if (from_enc == ERLANG_LATIN1 && len >= MAXATOMLEN) {
+      return -1;
+  }
 
-  if (!buf) s += 3;
-  else {
-    put8(s,ERL_ATOM_EXT);
-    put16be(s,len);
+  switch(to_enc) {
+  case ERLANG_LATIN1:
+      if (buf) {
+	  put8(s,ERL_ATOM_EXT);
+	  switch (from_enc) {
+	  case ERLANG_UTF8:
+	      len = utf8_to_latin1(s+2, p, len, MAXATOMLEN-1, NULL);
+	      if (len < 0) return -1;
+	      break;
+	  case ERLANG_ASCII:
+	  case ERLANG_LATIN1:
+	      memcpy(s+2, p, len);
+	      break;
+	  default:
+	      return -1;
+	  }
+	  put16be(s,len);
+      }
+      else {
+	  s += 3;
+	  if (from_enc == ERLANG_UTF8) {
+	      len = utf8_to_latin1(NULL, p, len, MAXATOMLEN-1, NULL);
+	      if (len < 0) return -1;
+	  }
+      }
+      break;
+      
+  case ERLANG_UTF8:
+      offs =  1 + 1;
+      switch (from_enc) {
+      case ERLANG_LATIN1:
+	  if (len >= 256/2) offs++;
+	  len = latin1_to_utf8((buf ? s+offs : NULL), p, len, MAXATOMLEN_UTF8-1, NULL);
+	  break;
+      case ERLANG_ASCII:
+      case ERLANG_UTF8:
+	  if (len >= 256) offs++;
+	  if (buf) memcpy(s+offs, p, len);
+	  break;
+      default:
+	  return -1;
+      }
+      if (buf) {
+	  if (offs == 2) {
+	      put8(s, ERL_SMALL_ATOM_UTF8_EXT);
+	      put8(s, len);
+	  }
+	  else {
+	      put8(s, ERL_ATOM_UTF8_EXT);
+	      put16be(s, len);
+	  }
+      }
+      else s+= offs;
+      break;
 
-    memmove(s,p,len); /* unterminated string */
+  default:
+      return -1;
   }
   s += len;
 
@@ -53,3 +123,13 @@ int ei_encode_atom_len(char *buf, int *index, const char *p, int len)
   return 0; 
 }
 
+int
+ei_internal_put_atom(char** bufp, const char* p, int slen,
+		     enum erlang_char_encoding to_enc)
+{
+    int ix = 0;
+    if (ei_encode_atom_len_as(*bufp, &ix, p, slen, ERLANG_UTF8, to_enc) < 0)
+	return -1;
+    *bufp += ix;
+    return 0;
+}
diff --git a/lib/erl_interface/src/encode/encode_fun.c b/lib/erl_interface/src/encode/encode_fun.c
index 54ee2083d6..4daee32648 100644
--- a/lib/erl_interface/src/encode/encode_fun.c
+++ b/lib/erl_interface/src/encode/encode_fun.c
@@ -35,7 +35,7 @@ int ei_encode_fun(char *buf, int *index, const erlang_fun *p)
 	ix += sizeof(char) + 4;
 	if (ei_encode_pid(buf, &ix, &p->pid) < 0)
 	    return -1;
-	if (ei_encode_atom(buf, &ix, p->module) < 0)
+	if (ei_encode_atom_as(buf, &ix, p->module, ERLANG_UTF8, p->module_org_enc) < 0)
 	    return -1;
 	if (ei_encode_long(buf, &ix, p->index) < 0)
 	    return -1;
@@ -60,7 +60,7 @@ int ei_encode_fun(char *buf, int *index, const erlang_fun *p)
 	} else
 	    size_p = NULL;
 	ix += 1 + 4 + 1 + sizeof(p->md5) + 4 + 4;
-	if (ei_encode_atom(buf, &ix, p->module) < 0)
+	if (ei_encode_atom_as(buf, &ix, p->module, ERLANG_UTF8, p->module_org_enc) < 0)
 	    return -1;
 	if (ei_encode_long(buf, &ix, p->old_index) < 0)
 	    return -1;
diff --git a/lib/erl_interface/src/encode/encode_pid.c b/lib/erl_interface/src/encode/encode_pid.c
index ee7f235c17..0cf3ef4efb 100644
--- a/lib/erl_interface/src/encode/encode_pid.c
+++ b/lib/erl_interface/src/encode/encode_pid.c
@@ -24,29 +24,23 @@
 int ei_encode_pid(char *buf, int *index, const erlang_pid *p)
 {
   char *s = buf + *index;
-  char *s0 = s;
-  int len = strlen(p->node);
-  
-  if (!buf) s += 13 + len;
-  else {
-    put8(s,ERL_PID_EXT);
 
-    /* first the nodename */
-    put8(s,ERL_ATOM_EXT);
+  ++(*index); /* skip ERL_PID_EXT */
+  if (ei_encode_atom_len_as(buf, index, p->node, strlen(p->node), ERLANG_UTF8, p->node_org_enc) < 0)
+      return -1;
+
+  if (buf) {
+    put8(s,ERL_PID_EXT);
 
-    put16be(s,len);
-  
-    memmove(s, p->node, len);
-    s += len;
+    s = buf + *index;
 
     /* now the integers */
     put32be(s,p->num & 0x7fff); /* 15 bits */
     put32be(s,p->serial & 0x1fff); /* 13 bits */
     put8(s,(p->creation & 0x03)); /* 2 bits */
   }
-  
-  *index += s-s0;
-  
+
+  *index += 4 + 4 + 1;  
   return 0;
 }
 
diff --git a/lib/erl_interface/src/encode/encode_port.c b/lib/erl_interface/src/encode/encode_port.c
index fbbb33182e..2bf9e26d78 100644
--- a/lib/erl_interface/src/encode/encode_port.c
+++ b/lib/erl_interface/src/encode/encode_port.c
@@ -24,28 +24,23 @@
 int ei_encode_port(char *buf, int *index, const erlang_port *p)
 {
   char *s = buf + *index;
-  char *s0 = s;
-  int len = strlen(p->node);
-  
-  if (!buf) s += 9 + len;
-  else {
-    put8(s,ERL_PORT_EXT);
 
-    /* first the nodename */
-    put8(s,ERL_ATOM_EXT);
+  ++(*index); /* skip ERL_PORT_EXT */
+  if (ei_encode_atom_len_as(buf, index, p->node, strlen(p->node), ERLANG_UTF8,
+			    p->node_org_enc) < 0) {
+      return -1;
+  }
+  if (buf) {
+    put8(s,ERL_PORT_EXT);
 
-    put16be(s,len);
-  
-    memmove(s, p->node, len);
-    s += len;
+    s = buf + *index;
 
     /* now the integers */
     put32be(s,p->id & 0x0fffffff /* 28 bits */);
     put8(s,(p->creation & 0x03));
   }
   
-  *index += s-s0;
-  
+  *index += 4 + 1;
   return 0;
 }
 
diff --git a/lib/erl_interface/src/encode/encode_ref.c b/lib/erl_interface/src/encode/encode_ref.c
index 292b452864..e8b3173315 100644
--- a/lib/erl_interface/src/encode/encode_ref.c
+++ b/lib/erl_interface/src/encode/encode_ref.c
@@ -24,36 +24,32 @@
 int ei_encode_ref(char *buf, int *index, const erlang_ref *p)
 {
   char *s = buf + *index;
-  char *s0 = s;
-  int len = strlen(p->node);
   int i;
 
+  (*index) += 1 + 2; /* skip to node atom */
+  if (ei_encode_atom_len_as(buf, index, p->node, strlen(p->node), ERLANG_UTF8,
+			    p->node_org_enc) < 0) {
+      return -1;
+  }
+
   /* Always encode as an extended reference; all participating parties
      are now expected to be able to decode extended references. */
-      if (!buf) s += 1 + 2 + (3+len) + p->len*4 + 1;
-      else {
+  if (buf) {
 	  put8(s,ERL_NEW_REFERENCE_EXT);
 
 	  /* first, number of integers */
 	  put16be(s, p->len);
 
 	  /* then the nodename */
-	  put8(s,ERL_ATOM_EXT);
-
-	  put16be(s,len);
-  
-	  memmove(s, p->node, len);
-	  s += len;
+	  s = buf + *index;
 
 	  /* now the integers */
 	  put8(s,(p->creation & 0x03));
 	  for (i = 0; i < p->len; i++)
 	      put32be(s,p->n[i]);
-
-      }
-  
-  *index += s-s0;
+  }
   
+  *index += p->len*4 + 1;  
   return 0;
 }
 
diff --git a/lib/erl_interface/src/legacy/erl_connect.c b/lib/erl_interface/src/legacy/erl_connect.c
index 41d4fa3138..be83fa8469 100644
--- a/lib/erl_interface/src/legacy/erl_connect.c
+++ b/lib/erl_interface/src/legacy/erl_connect.c
@@ -247,9 +247,15 @@ int erl_send(int fd, ETERM *to ,ETERM *msg)
 	erl_errno = EINVAL;
 	return -1;
     }
-    
-    strncpy(topid.node, (char *)ERL_PID_NODE(to), sizeof(topid.node));
-    topid.node[sizeof(topid.node)-1] = '\0';
+
+    if (to->uval.pidval.node.latin1) {
+	strcpy(topid.node, to->uval.pidval.node.latin1);
+	topid.node_org_enc = ERLANG_LATIN1;
+    }
+    else {
+	strcpy(topid.node, to->uval.pidval.node.utf8);
+	topid.node_org_enc = ERLANG_UTF8;
+    }    
     topid.num = ERL_PID_NUMBER(to);
     topid.serial = ERL_PID_SERIAL(to);
     topid.creation = ERL_PID_CREATION(to);
@@ -263,7 +269,7 @@ static int erl_do_receive_msg(int fd, ei_x_buff* x, ErlMessage* emsg)
     erlang_msg msg;
 
     int r;
-    msg.from.node[0] = msg.to.node[0] = '\0';
+    msg.from.node[0] = msg.to.node[0] = msg.toname[0] = '\0';
     r = ei_do_receive_msg(fd, 0, &msg, x, 0);
 
     if (r == ERL_MSG) {
@@ -299,7 +305,7 @@ static int erl_do_receive_msg(int fd, ei_x_buff* x, ErlMessage* emsg)
 	emsg->to = erl_mk_pid(msg.to.node, msg.to.num, msg.to.serial, msg.to.creation);
     else
 	emsg->to = NULL;
-    memcpy(emsg->to_name, msg.toname, MAXATOMLEN+1);
+    strcpy(emsg->to_name, msg.toname);
     return r;
 }
 
diff --git a/lib/erl_interface/src/legacy/erl_eterm.c b/lib/erl_interface/src/legacy/erl_eterm.c
index 8d559f0f55..aa0fd5ddcf 100644
--- a/lib/erl_interface/src/legacy/erl_eterm.c
+++ b/lib/erl_interface/src/legacy/erl_eterm.c
@@ -36,6 +36,7 @@
 #include "erl_error.h"
 #include "erl_internal.h"
 #include "ei_internal.h"
+#include "putget.h"
 
 #define ERL_IS_BYTE(x) (ERL_IS_INTEGER(x) && (ERL_INT_VALUE(x) & ~0xFF) == 0)
 
@@ -142,9 +143,7 @@ ETERM *erl_mk_atom (const char *s)
 
   ep = erl_alloc_eterm(ERL_ATOM);
   ERL_COUNT(ep) = 1;
-  ERL_ATOM_SIZE(ep) = strlen(s);
-  if ((ERL_ATOM_PTR(ep) = strsave(s)) == NULL)
-  {
+  if (erl_atom_init_latin1(&ep->uval.aval.d, s) == NULL) {
       erl_free_term(ep);
       erl_errno = ENOMEM;
       return NULL;
@@ -152,6 +151,65 @@ ETERM *erl_mk_atom (const char *s)
   return ep;
 } 
 
+char* erl_atom_ptr_latin1(Erl_Atom_data* a)
+{
+    if (a->latin1 == NULL) {
+	enum erlang_char_encoding enc;
+	a->lenL = utf8_to_latin1(NULL, a->utf8, a->lenU, a->lenU, &enc);
+	if (a->lenL < 0) {
+	    a->lenL = 0;
+	    return NULL;
+	}
+	if (enc == ERLANG_ASCII) {
+	    a->latin1 = a->utf8; 
+	}
+	else {
+	    a->latin1 = malloc(a->lenL+1);
+	    utf8_to_latin1(a->latin1, a->utf8, a->lenU, a->lenL, NULL);
+	    a->latin1[a->lenL] = '\0';
+	}
+    }
+    return a->latin1;
+}
+
+char* erl_atom_ptr_utf8(Erl_Atom_data* a)
+{
+    if (a->utf8 == NULL) {
+	int dlen = a->lenL * 2; /* over estimation */
+	a->utf8 = malloc(dlen + 1); 
+	a->lenU = latin1_to_utf8(a->utf8, a->latin1, a->lenL, dlen, NULL);
+	a->utf8[a->lenU] = '\0';
+    }
+    return a->utf8;
+
+}
+int erl_atom_size_latin1(Erl_Atom_data* a)
+{
+    if (a->latin1 == NULL) {
+	erl_atom_ptr_latin1(a);
+    }
+    return a->lenL;
+}
+int erl_atom_size_utf8(Erl_Atom_data* a)
+{
+    if (a->utf8 == NULL) {
+	erl_atom_ptr_utf8(a);
+    }
+    return a->lenU;
+}
+char* erl_atom_init_latin1(Erl_Atom_data* a, const char* s)
+{
+    a->lenL = strlen(s);
+    if ((a->latin1 = strsave(s)) == NULL)
+    {
+	return NULL;
+    }
+    a->utf8 = NULL;
+    a->lenU = 0;
+    return a->latin1;
+}
+
+
 /*
  * Given a string as input, creates a list.
  */
@@ -208,12 +266,19 @@ ETERM *erl_mk_pid(const char *node,
 
     ep = erl_alloc_eterm(ERL_PID);
     ERL_COUNT(ep) = 1;
-    if ((ERL_PID_NODE(ep) = strsave(node)) == NULL)
+    if (erl_atom_init_latin1(&ep->uval.pidval.node, node) == NULL)
     {	     
 	erl_free_term(ep);
 	erl_errno = ENOMEM;
 	return NULL;
     }
+    erl_mk_pid_helper(ep, number, serial, creation);
+    return ep;
+}
+
+void erl_mk_pid_helper(ETERM *ep, unsigned int number, 
+		       unsigned int serial, unsigned char creation)
+{
     ERL_PID_NUMBER(ep)   = number & 0x7fff; /* 15 bits */
     if (ei_internal_use_r9_pids_ports()) {
 	ERL_PID_SERIAL(ep)   = serial & 0x07;  /* 3 bits */
@@ -222,7 +287,6 @@ ETERM *erl_mk_pid(const char *node,
 	ERL_PID_SERIAL(ep)   = serial & 0x1fff;  /* 13 bits */
     }
     ERL_PID_CREATION(ep) = creation & 0x03; /* 2 bits */
-    return ep;
 }
 
 /*
@@ -239,12 +303,18 @@ ETERM *erl_mk_port(const char *node,
 
     ep = erl_alloc_eterm(ERL_PORT);
     ERL_COUNT(ep) = 1;
-    if ((ERL_PORT_NODE(ep) = strsave(node)) == NULL)
+    if (erl_atom_init_latin1(&ep->uval.portval.node, node) == NULL)
     {	     
 	erl_free_term(ep);
 	erl_errno = ENOMEM;
 	return NULL;
     }
+    erl_mk_port_helper(ep, number, creation);
+    return ep;
+}
+
+void erl_mk_port_helper(ETERM* ep, unsigned number, unsigned char creation)
+{
     if (ei_internal_use_r9_pids_ports()) {
 	ERL_PORT_NUMBER(ep)   = number & 0x3ffff; /* 18 bits */
     }
@@ -252,29 +322,29 @@ ETERM *erl_mk_port(const char *node,
 	ERL_PORT_NUMBER(ep)   = number & 0x0fffffff; /* 18 bits */
     }
     ERL_PORT_CREATION(ep) = creation & 0x03; /* 2 bits */
-    return ep;
 }
 
 /*
  * Create any kind of reference.
  */
-ETERM *__erl_mk_reference (const char *node,
+ETERM *__erl_mk_reference (ETERM* t,
+			   const char *node,
 			   size_t len,
 			   unsigned int n[],
 			   unsigned char creation)
 {
-    ETERM * t;
-
-    if (node == NULL) return NULL;
-
-    t = erl_alloc_eterm(ERL_REF);
-    ERL_COUNT(t) = 1;
-
-    if ((ERL_REF_NODE(t) = strsave(node)) == NULL)
-    {	     
-	erl_free_term(t);
-	erl_errno = ENOMEM;
-	return NULL;
+    if (t == NULL) {
+	if (node == NULL) return NULL;
+    
+	t = erl_alloc_eterm(ERL_REF);
+	ERL_COUNT(t) = 1;
+    
+	if (erl_atom_init_latin1(&t->uval.refval.node, node) == NULL)
+	{	     
+	    erl_free_term(t);
+	    erl_errno = ENOMEM;
+	    return NULL;
+	}
     }
     ERL_REF_LEN(t) = len;
     ERL_REF_NUMBERS(t)[0]   = n[0] & 0x3ffff; /* 18 bits */
@@ -294,7 +364,7 @@ ETERM *erl_mk_ref (const char *node,
 {
     unsigned int n[3] = {0, 0, 0};
     n[0] = number;
-    return __erl_mk_reference(node, 1, n, creation);
+    return __erl_mk_reference(NULL, node, 1, n, creation);
 }
 
 /*
@@ -307,7 +377,7 @@ erl_mk_long_ref (const char *node,
 {
     unsigned int n[3] = {0, 0, 0};
     n[0] = n3; n[1] = n2; n[2] = n1;
-    return __erl_mk_reference(node, 3, n, creation);
+    return __erl_mk_reference(NULL, node, 3, n, creation);
 }
 
 /*
@@ -758,6 +828,28 @@ int erl_iolist_length (const ETERM* term)
 	return -1;
 }
 
+static int erl_atom_copy(Erl_Atom_data* dst, const Erl_Atom_data* src)
+{
+    if (src->latin1 == src->utf8) {
+	dst->latin1 = dst->utf8 = strsave(src->latin1);
+	dst->lenL = dst->lenU = strlen(src->latin1);
+    }
+    else if (src->latin1) {
+	dst->latin1 = strsave(src->latin1);
+	dst->lenL = strlen(src->latin1);
+	dst->utf8 = NULL;
+	dst->lenU = 0;
+    }
+    else {
+	dst->utf8 = strsave(src->utf8);
+	dst->lenU = strlen(src->utf8);
+	dst->latin1 = NULL;
+	dst->lenL = 0;
+    }
+    return (dst->latin1 != NULL || dst->utf8 == NULL);
+}
+
+
 /*
  * Return a brand NEW COPY of an ETERM.
  */
@@ -796,9 +888,7 @@ ETERM *erl_copy_term(const ETERM *ep)
 	ERL_FLOAT_VALUE(cp) = ERL_FLOAT_VALUE(ep);
 	break;
     case ERL_ATOM:
-	ERL_ATOM_SIZE(cp) = ERL_ATOM_SIZE(ep);
-	ERL_ATOM_PTR(cp) = strsave(ERL_ATOM_PTR(ep));
-	if (ERL_ATOM_PTR(cp) == NULL)
+	if (!erl_atom_copy(&cp->uval.aval.d, &ep->uval.aval.d))
 	{
 	    erl_free_term(cp);
 	    erl_errno = ENOMEM;
@@ -810,17 +900,17 @@ ETERM *erl_copy_term(const ETERM *ep)
            name and plug in. Somewhat ugly (also done with port and
            ref below). */
 	memcpy(&cp->uval.pidval, &ep->uval.pidval, sizeof(Erl_Pid));
-	ERL_PID_NODE(cp) = strsave(ERL_PID_NODE(ep));
+	erl_atom_copy(&cp->uval.pidval.node, &ep->uval.pidval.node);
 	ERL_COUNT(cp) = 1;
 	break;
     case ERL_PORT:
 	memcpy(&cp->uval.portval, &ep->uval.portval, sizeof(Erl_Port));
-	ERL_PORT_NODE(cp) = strsave(ERL_PORT_NODE(ep));
+	erl_atom_copy(&cp->uval.portval.node, &ep->uval.portval.node);
 	ERL_COUNT(cp) = 1;
 	break;
     case ERL_REF:
 	memcpy(&cp->uval.refval, &ep->uval.refval, sizeof(Erl_Ref));
-	ERL_REF_NODE(cp) = strsave(ERL_REF_NODE(ep));
+	erl_atom_copy(&cp->uval.refval.node, &ep->uval.refval.node);
 	ERL_COUNT(cp) = 1;
 	break;
     case ERL_LIST:
@@ -883,29 +973,29 @@ int erl_print_term(FILE *fp, const ETERM *ep)
     j = i = doquote = 0;
     switch(ERL_TYPE(ep)) 
     {
-    case ERL_ATOM:
+    case ERL_ATOM: {
+	char* adata = ERL_ATOM_PTR(ep);
 	/* FIXME: what if some weird locale is in use? */
-	if (!islower((int)ERL_ATOM_PTR(ep)[0]))
+	if (!islower(adata[0]))
 	    doquote = 1;
 
 	for (i = 0; !doquote && i < ERL_ATOM_SIZE(ep); i++) 
 	{
-	    doquote = !(isalnum((int)ERL_ATOM_PTR(ep)[i]) 
-			|| (ERL_ATOM_PTR(ep)[i] == '_'));
+	    doquote = !(isalnum(adata[i]) || (adata[i] == '_'));
 	}
 
 	if (doquote) {
 	    putc('\'', fp);
 	    ch_written++; 
 	}
-	fputs(ERL_ATOM_PTR(ep), fp);
+	fputs(adata, fp);
 	ch_written += ERL_ATOM_SIZE(ep);	
 	if (doquote) {
 	    putc('\'', fp);
 	    ch_written++;
 	}
 	break;
-
+    }
     case ERL_VARIABLE:
 	if (!isupper((int)ERL_VAR_NAME(ep)[0])) {
 	    doquote = 1;
diff --git a/lib/erl_interface/src/legacy/erl_eterm.h b/lib/erl_interface/src/legacy/erl_eterm.h
index 41b008f04f..2e8129d9cd 100644
--- a/lib/erl_interface/src/legacy/erl_eterm.h
+++ b/lib/erl_interface/src/legacy/erl_eterm.h
@@ -55,7 +55,9 @@ typedef struct _heapmark {
 } Erl_HeapMark;
 
 
-ETERM * __erl_mk_reference(const char *, size_t, unsigned int n[], unsigned char);
+void erl_mk_port_helper(ETERM* ep, unsigned number, unsigned char creation);
+void erl_mk_pid_helper(ETERM*, unsigned,unsigned, unsigned char);
+ETERM * __erl_mk_reference(ETERM*, const char *, size_t, unsigned int n[], unsigned char);
 int erl_current_fix_desc(void);
 
 #endif /* _ERL_ETERM_H */
diff --git a/lib/erl_interface/src/legacy/erl_format.c b/lib/erl_interface/src/legacy/erl_format.c
index dc85806c36..533241e396 100644
--- a/lib/erl_interface/src/legacy/erl_format.c
+++ b/lib/erl_interface/src/legacy/erl_format.c
@@ -574,10 +574,22 @@ static int ematch(ETERM *p, ETERM *t)
 
   switch (type_p) {
 
-  case ERL_ATOM:
-    return p->uval.aval.len == t->uval.aval.len &&
-      memcmp(p->uval.aval.a, t->uval.aval.a, p->uval.aval.len) == 0;
-	
+  case ERL_ATOM: {
+      Erl_Atom_data* pa = &p->uval.aval.d;
+      Erl_Atom_data* ta = &t->uval.aval.d;
+      if (pa->utf8 && ta->utf8) {
+	  return pa->lenU == ta->lenU && memcmp(pa->utf8, ta->utf8, pa->lenU)==0; 
+      }
+      else if (pa->latin1 && ta->latin1) {
+	  return pa->lenL == ta->lenL && memcmp(pa->latin1, ta->latin1, pa->lenL)==0; 
+      }
+      else if (pa->latin1) {
+	  return cmp_latin1_vs_utf8(pa->latin1, pa->lenL, ta->utf8, ta->lenU)==0;
+      }
+      else {
+	  return cmp_latin1_vs_utf8(ta->latin1, ta->lenL, pa->utf8, pa->lenU)==0;
+      }
+  }
   case ERL_VARIABLE:
     if (strcmp(p->uval.vval.name, "_") == 0) /* anon. variable */
       return ERL_TRUE;
diff --git a/lib/erl_interface/src/legacy/erl_malloc.c b/lib/erl_interface/src/legacy/erl_malloc.c
index f51a6c69b3..d09239e02d 100644
--- a/lib/erl_interface/src/legacy/erl_malloc.c
+++ b/lib/erl_interface/src/legacy/erl_malloc.c
@@ -112,6 +112,18 @@ do {						\
     (ptr) = NULL;				\
 } while (0)
 
+static void erl_atom_free(Erl_Atom_data* p)
+{
+    erl_free(p->latin1);
+    if (p->utf8 != p->latin1) {
+	erl_free(p->utf8);
+    }
+    p->latin1 = NULL;
+    p->utf8 = NULL;
+    p->lenL = 0;
+    p->lenU = 0;
+}
+
 static void _erl_free_term (ETERM *ep, int external, int compound)
 {
 restart:
@@ -122,7 +134,7 @@ restart:
 	switch(ERL_TYPE(ep)) 
 	    {
 	    case ERL_ATOM:
-		FREE_AND_CLEAR(ERL_ATOM_PTR(ep));
+		erl_atom_free(&ep->uval.aval.d);
 		break;
 	    case ERL_VARIABLE:
 		FREE_AND_CLEAR(ERL_VAR_NAME(ep));
@@ -161,13 +173,13 @@ restart:
 		FREE_AND_CLEAR(ERL_BIN_PTR(ep));
 		break;
 	    case ERL_PID:
-		FREE_AND_CLEAR(ERL_PID_NODE(ep));
+		erl_atom_free(&ep->uval.pidval.node);
 		break;
 	    case ERL_PORT:
-		FREE_AND_CLEAR(ERL_PORT_NODE(ep));
+		erl_atom_free(&ep->uval.portval.node);
 		break;
 	    case ERL_REF:
-		FREE_AND_CLEAR(ERL_REF_NODE(ep));
+		erl_atom_free(&ep->uval.refval.node);
 		break;
 	    case ERL_EMPTY_LIST:
 	    case ERL_INTEGER:
diff --git a/lib/erl_interface/src/legacy/erl_marshal.c b/lib/erl_interface/src/legacy/erl_marshal.c
index 775d7e82ca..884e9d421b 100644
--- a/lib/erl_interface/src/legacy/erl_marshal.c
+++ b/lib/erl_interface/src/legacy/erl_marshal.c
@@ -44,6 +44,9 @@ int erl_fp_compare(unsigned *a, unsigned *b);
 static void erl_long_to_fp(long l, unsigned *d);
 #endif
 
+static int cmpbytes(unsigned char* s1,int l1,unsigned char* s2,int l2);
+static int cmpatoms(unsigned char* s1, int l1, unsigned char tag1, unsigned char* s2, int l2, unsigned char tag2);
+
 /* Used when comparing two encoded byte arrays */
 /* this global data is ok (from threading point of view) since it is
  * initialized once and never changed
@@ -111,8 +114,9 @@ void erl_init_marshal(void)
     cmp_array[ERL_SMALL_BIG_EXT]     = ERL_NUM_CMP;
     cmp_array[ERL_LARGE_BIG_EXT]     = ERL_NUM_CMP;
     cmp_array[ERL_ATOM_EXT]          = ERL_ATOM_CMP;
+    cmp_array[ERL_ATOM_UTF8_EXT]     = ERL_ATOM_CMP;
     cmp_array[ERL_SMALL_ATOM_EXT]    = ERL_ATOM_CMP;
-    cmp_array[ERL_UNICODE_ATOM_EXT]  = ERL_ATOM_CMP;
+    cmp_array[ERL_SMALL_ATOM_UTF8_EXT] = ERL_ATOM_CMP;
     cmp_array[ERL_REFERENCE_EXT]     = ERL_REF_CMP;
     cmp_array[ERL_NEW_REFERENCE_EXT] = ERL_REF_CMP;
     cmp_array[ERL_FUN_EXT]           = ERL_FUN_CMP;
@@ -162,6 +166,21 @@ static int erl_length_x(const ETERM *ep) {
  *==============================================================
  */
 
+static void encode_atom(Erl_Atom_data* a, unsigned char **ext)
+{
+    int ix = 0;
+    if (a->latin1) {
+	ei_encode_atom_len_as((char*)*ext, &ix, a->latin1, a->lenL,
+			      ERLANG_LATIN1, ERLANG_LATIN1);
+    }
+    else if (ei_encode_atom_len_as((char*)*ext, &ix, a->utf8, a->lenU,
+				   ERLANG_UTF8, ERLANG_LATIN1) < 0) {
+	ei_encode_atom_len_as((char*)*ext, &ix, a->utf8, a->lenU,
+			      ERLANG_UTF8, ERLANG_UTF8);
+    }
+    *ext += ix;
+}
+
 /* 
  * The actual ENCODE engine.
  * Returns 0 on success, otherwise 1.
@@ -176,12 +195,7 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist)
     switch(ERL_TYPE(ep)) 
     {
     case ERL_ATOM:
-	i =  ep->uval.aval.len;
-	*(*ext)++ = ERL_ATOM_EXT;
-	*(*ext)++ = (i >>8) &0xff;
-	*(*ext)++ = i &0xff;
-	memcpy((void *) *ext, (const void *) ep->uval.aval.a, i);
-	*ext += i;
+	encode_atom(&ep->uval.aval.d, ext);
 	return 0;
 
     case ERL_INTEGER:
@@ -292,12 +306,7 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist)
     case ERL_PID:
 	*(*ext)++ = ERL_PID_EXT;    
 	/* First poke in node as an atom */    
-	i = strlen((char *)ERL_PID_NODE(ep));
-	*(*ext)++ = ERL_ATOM_EXT;
-	*(*ext)++ = (i >>8) &0xff;
-	*(*ext)++ = i &0xff;
-	memcpy(*ext, ERL_PID_NODE(ep), i);
-	*ext += i;
+	encode_atom(&ep->uval.pidval.node, ext);
 	/* And then fill in the integer fields */
 	i = ERL_PID_NUMBER(ep);
 	*(*ext)++ = (i >> 24) &0xff;
@@ -325,11 +334,8 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist)
 	    *(*ext)++ = (len >> 8) &0xff;
 	    *(*ext)++ = len &0xff;
 
-	    *(*ext)++ = ERL_ATOM_EXT;
-	    *(*ext)++ = (i >> 8) &0xff;
-	    *(*ext)++ = i &0xff;
-	    memcpy(*ext, ERL_REF_NODE(ep), i);
-	    *ext += i;
+	    encode_atom(&ep->uval.refval.node, ext);
+
 	    *(*ext)++ = ERL_REF_CREATION(ep);
 	    /* Then the integer fields */
 	    for (j = 0; j < ERL_REF_LEN(ep); j++) {
@@ -344,12 +350,7 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist)
     case ERL_PORT:
 	*(*ext)++ = ERL_PORT_EXT;
 	/* First poke in node as an atom */
-	i = strlen((char *)ERL_PORT_NODE(ep));
-	*(*ext)++ = ERL_ATOM_EXT;
-	*(*ext)++ = (i >>8) &0xff;
-	*(*ext)++ = i &0xff;
-	memcpy(*ext, ERL_PORT_NODE(ep), i);
-	*ext += i;
+	encode_atom(&ep->uval.portval.node, ext);
 	/* Then the integer fields */
 	i = ERL_PORT_NUMBER(ep);
 	*(*ext)++ = (i >> 24) &0xff;
@@ -500,6 +501,16 @@ int erl_term_len(ETERM *ep)
   return 1+erl_term_len_helper(ep, 4);
 }
 
+static int atom_len_helper(Erl_Atom_data* a)
+{
+    if (erl_atom_ptr_latin1(a)) {
+	return 1 + 2 + a->lenL; /* ERL_ATOM_EXT */
+    }
+    else {
+	return 1 + 1 + (a->lenU > 255) + a->lenU;
+    }
+}
+
 static int erl_term_len_helper(ETERM *ep, int dist)
 {
   int len = 0;
@@ -511,8 +522,7 @@ static int erl_term_len_helper(ETERM *ep, int dist)
   if (ep) {
     switch (ERL_TYPE(ep)) {
     case ERL_ATOM:
-      i = ep->uval.aval.len;
-      len = i + 3;
+      len = atom_len_helper(&ep->uval.aval.d);
       break;
 
     case ERL_INTEGER:
@@ -544,20 +554,15 @@ static int erl_term_len_helper(ETERM *ep, int dist)
       break;
 
     case ERL_PID:
-      /* 1 + N + 4 + 4 + 1 where N = 3 + strlen */
-      i = strlen((char *)ERL_PID_NODE(ep));
-      len = 13 + i;
+      len = 1 + atom_len_helper(&ep->uval.pidval.node) + 4 + 4 + 1;
       break;
 
     case ERL_REF:
-      i = strlen((char *)ERL_REF_NODE(ep));
-      len = 1 + 2 + (i+3) + 1 + ERL_REF_LEN(ep) * 4;
+      len = 1 + 2 + atom_len_helper(&ep->uval.refval.node) + 1 + ERL_REF_LEN(ep) * 4;
       break;
 
     case ERL_PORT:
-      /* 1 + N + 4 + 1 where N = 3 + strlen */
-      i = strlen((char *)ERL_PORT_NODE(ep));
-      len = 9 + i;
+      len = 1 + atom_len_helper(&ep->uval.portval.node) + 4 + 1;
       break;
 
     case ERL_EMPTY_LIST:
@@ -651,11 +656,33 @@ int erl_encode_buf(ETERM *ep, unsigned char **ext)
 } /* erl_encode_buf */
 
 
-static int read_atom(unsigned char** ext, char* dst)
+static int read_atom(unsigned char** ext, Erl_Atom_data* a)
 {
+    char buf[MAXATOMLEN_UTF8];
     int offs = 0;
-    int ret = ei_decode_atom((char*)*ext, &offs, dst);
+    enum erlang_char_encoding enc;
+    int ret = ei_decode_atom_as((char*)*ext, &offs, buf, MAXATOMLEN_UTF8,
+				ERLANG_WHATEVER, NULL, &enc);
     *ext += offs;
+
+    if (ret == 0) {
+	int i = strlen(buf);
+	char* clone = erl_malloc(i+1);
+	memcpy(clone, buf, i+1);
+
+	a->latin1 = NULL; 	    
+	a->lenL = 0;
+	a->utf8 = NULL;
+	a->lenU = 0;
+	if (enc == ERLANG_LATIN1 || enc == ERLANG_ASCII) {
+	    a->latin1 = clone; 	    
+	    a->lenL = i;
+	}
+	if (enc == ERLANG_UTF8 || enc == ERLANG_ASCII) {
+	    a->utf8 = clone;
+	    a->lenU = i;
+	}
+    }
     return ret;
 }
 
@@ -665,7 +692,6 @@ static int read_atom(unsigned char** ext, char* dst)
  */
 static ETERM *erl_decode_it(unsigned char **ext)
 {
-    char atom_buf[MAXATOMLEN+1];
     char *cp;
     ETERM *ep,*tp,*np;
     unsigned int u,sign;
@@ -765,127 +791,89 @@ static ETERM *erl_decode_it(unsigned char **ext)
       
     case ERL_ATOM_EXT:
     case ERL_SMALL_ATOM_EXT:
-    case ERL_UNICODE_ATOM_EXT:
+    case ERL_ATOM_UTF8_EXT:
+    case ERL_SMALL_ATOM_UTF8_EXT:
+	
 	ERL_TYPE(ep) = ERL_ATOM;
 	--(*ext);
-	if (read_atom(ext, atom_buf) < 0) return NULL;
-
-	i = strlen(atom_buf);
-	ep->uval.aval.len = i;
-	ep->uval.aval.a = (char *) erl_malloc(i+1);
-	memcpy(ep->uval.aval.a, atom_buf, i+1);
+	if (read_atom(ext, &ep->uval.aval.d) < 0) return NULL;
 	return ep;
       
     case ERL_PID_EXT:
-	erl_free_term(ep);
-	{			/* Why not use the constructors? */
-	    char* node = atom_buf;
+	{
 	    unsigned int number, serial;
 	    unsigned char creation;
-	    ETERM *eterm_p;
 
-	    if (read_atom(ext, node) < 0) return NULL;
+	    ERL_TYPE(ep) = ERL_PID;
+	    if (read_atom(ext, &ep->uval.pidval.node) < 0) return NULL;
 
 	    /* get the integers */
-#if 0
-	    /* FIXME: Remove code or whatever....
-               Ints on the wire are big-endian (== network byte order)
-               so use ntoh[sl]. (But some are little-endian! Arrrgh!)
-               Also, the libc authors can be expected to optimize them
-               heavily. However, the marshalling makes no guarantees
-               about alignments -- so it won't work at all. */
-	    number = ntohl(*((unsigned int *)*ext)++);
-	    serial = ntohl(*((unsigned int *)*ext)++);
-#else
 	    number = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | 
 		((*ext)[2]) << 8 | ((*ext)[3]);	
 	    *ext += 4;
 	    serial = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | 
 		((*ext)[2]) << 8 | ((*ext)[3]);	
 	    *ext += 4;
-#endif
 	    creation =  *(*ext)++; 
-	    eterm_p = erl_mk_pid(node, number, serial, creation);
-	    return eterm_p;
+	    erl_mk_pid_helper(ep, number, serial, creation);
+	    return ep;
 	}
     case ERL_REFERENCE_EXT:
-	erl_free_term(ep);
 	{
-	    char* node = atom_buf;
-	    unsigned int number;
+	    unsigned int n[3] = {0, 0, 0};
 	    unsigned char creation;
-	    ETERM *eterm_p;
 
-	    if (read_atom(ext, node) < 0) return NULL;
+	    ERL_TYPE(ep) = ERL_REF;
+	    if (read_atom(ext, &ep->uval.refval.node) < 0) return NULL;
 
 	    /* get the integers */
-#if 0
-	    number = ntohl(*((unsigned int *)*ext)++);
-#else
-	    number = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | 
+	    n[0] = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | 
 		((*ext)[2]) << 8 | ((*ext)[3]);	
 	    *ext += 4;
-#endif
 	    creation =  *(*ext)++; 
-	    eterm_p = erl_mk_ref(node, number, creation);
-	    return eterm_p;
+	    __erl_mk_reference(ep, NULL, 1, n, creation);
+	    return ep;
 	}
 
     case ERL_NEW_REFERENCE_EXT: 
-	erl_free_term(ep);
 	{
-	    char* node = atom_buf;
 	    size_t cnt, i;
 	    unsigned int n[3];
 	    unsigned char creation;
-	    ETERM *eterm_p;
 
-#if 0
-	    cnt = ntohs(*((unsigned short *)*ext)++);
-#else
+	    ERL_TYPE(ep) = ERL_REF;
 	    cnt = ((*ext)[0] << 8) | (*ext)[1];
 	    *ext += 2;
-#endif
 
-	    if (read_atom(ext, node) < 0) return NULL;
+	    if (read_atom(ext, &ep->uval.refval.node) < 0) return NULL;
 
 	    /* get the integers */
 	    creation =  *(*ext)++; 
 	    for(i = 0; i < cnt; i++)
 	    {
-#if 0
-		n[i] = ntohl(*((unsigned int *)*ext)++);
-#else
 		n[i] = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | 
 		    ((*ext)[2]) << 8 | ((*ext)[3]);	
 		*ext += 4;
-#endif
 	    }
-	    eterm_p = __erl_mk_reference(node, cnt, n, creation);
-	    return eterm_p;
+	    __erl_mk_reference(ep, NULL, cnt, n, creation);
+	    return ep;
 	}
 
     case ERL_PORT_EXT:
-	erl_free_term(ep);
 	{
-	    char* node = atom_buf;
 	    unsigned int number;
 	    unsigned char creation;
-	    ETERM *eterm_p;
 
-	    if (read_atom(ext, node) < 0) return NULL;
+	    ERL_TYPE(ep) = ERL_PORT;
+	    if (read_atom(ext, &ep->uval.portval.node) < 0) return NULL;
 
 	    /* get the integers */
-#if 0
-	    number = ntohl(*((unsigned int *)*ext)++);
-#else
 	    number = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | 
 		((*ext)[2]) << 8 | ((*ext)[3]);	
 	    *ext += 4;
-#endif
 	    creation =  *(*ext)++; 
-	    eterm_p = erl_mk_port(node, number, creation);
-	    return eterm_p;
+	    erl_mk_port_helper(ep, number, creation);
+	    return ep;
 	}
 
     case ERL_NIL_EXT:
@@ -1120,8 +1108,9 @@ unsigned char erl_ext_type(unsigned char *ext)
     case ERL_INTEGER_EXT:
 	return ERL_INTEGER;
     case ERL_ATOM_EXT:
+    case ERL_ATOM_UTF8_EXT:
     case ERL_SMALL_ATOM_EXT:
-    case ERL_UNICODE_ATOM_EXT:
+    case ERL_SMALL_ATOM_UTF8_EXT:
 	return ERL_ATOM;
     case ERL_PID_EXT:
 	return ERL_PID;
@@ -1173,8 +1162,9 @@ int erl_ext_size(unsigned char *t)
     case ERL_SMALL_INTEGER_EXT:
     case ERL_INTEGER_EXT:
     case ERL_ATOM_EXT:
+    case ERL_ATOM_UTF8_EXT:
     case ERL_SMALL_ATOM_EXT:
-    case ERL_UNICODE_ATOM_EXT:
+    case ERL_SMALL_ATOM_UTF8_EXT:
     case ERL_PID_EXT:
     case ERL_PORT_EXT:
     case ERL_REFERENCE_EXT:
@@ -1221,12 +1211,13 @@ static int jump_atom(unsigned char** ext)
 
     switch (*e++) {
     case ERL_ATOM_EXT:
-    case ERL_UNICODE_ATOM_EXT:
+    case ERL_ATOM_UTF8_EXT:
 	len = (e[0] << 8) | e[1];
 	e += (len + 2);
 	break;
 
     case ERL_SMALL_ATOM_EXT:
+    case ERL_SMALL_ATOM_UTF8_EXT:
 	len = e[0];
 	e += (len + 1);
 	break;
@@ -1259,8 +1250,9 @@ static int jump(unsigned char **ext)
 	*ext += 1;
 	break;
     case ERL_ATOM_EXT:
+    case ERL_ATOM_UTF8_EXT:
     case ERL_SMALL_ATOM_EXT:
-    case ERL_UNICODE_ATOM_EXT:
+    case ERL_SMALL_ATOM_UTF8_EXT:
 	jump_atom(ext);
 	break;
     case ERL_PID_EXT:
@@ -1426,6 +1418,58 @@ static int cmpbytes(unsigned char* s1,int l1,unsigned char* s2,int l2)
 
 } /* cmpbytes */
 
+#define tag2enc(T) ((T)==ERL_ATOM_EXT || (T)==ERL_SMALL_ATOM_EXT ? ERLANG_LATIN1 : ERLANG_UTF8)
+
+static int cmpatoms(unsigned char* s1, int l1, unsigned char tag1,	 
+                    unsigned char* s2, int l2, unsigned char tag2)
+{
+    enum erlang_char_encoding enc1 = tag2enc(tag1);
+    enum erlang_char_encoding enc2 = tag2enc(tag2);
+
+    if (enc1 == enc2) {
+	return cmpbytes(s1, l1,s2,l2);
+    }
+    
+    if (enc1 == ERLANG_LATIN1) {
+	return cmp_latin1_vs_utf8((char*)s1, l1, (char*)s2, l2);
+    }
+    else {
+	return -cmp_latin1_vs_utf8((char*)s2, l2, (char*)s1, l1);
+    }
+}
+
+int cmp_latin1_vs_utf8(const char* strL, int lenL, const char* strU, int lenU)
+{
+    unsigned char* sL = (unsigned char*)strL;
+    unsigned char* sU = (unsigned char*)strU;
+    unsigned char* sL_end = sL + lenL;
+    unsigned char* sU_end = sU + lenU;
+
+    while(sL < sL_end && sU < sU_end) {
+	unsigned char UasL;
+	if (*sL >= 0x80) {
+	    if (*sU < 0xC4 && (sU+1) < sU_end) {
+		UasL = ((sU[0] & 0x3) << 6) | (sU[1] & 0x3F);
+	    }
+	    else return -1;
+	}
+	else {
+	    UasL = *sU;
+	}
+	if (*sL < UasL) return -1;
+	if (*sL > UasL) return 1;
+
+	sL++;
+	if (*sU < 0x80) sU++;
+	else if (*sU < 0xE0) sU += 2;
+	else if (*sU < 0xF0) sU += 3;
+	else /*if (*sU < 0xF8)*/ sU += 4;
+    }
+
+    return (sU >= sU_end) - (sL >= sL_end); /* -1, 0 or 1 */
+}
+
+
 #define CMP_EXT_ERROR_CODE 4711
 
 #define CMP_EXT_INT32_BE(AP, BP)				\
@@ -1561,6 +1605,7 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2)
   int min,  ret,i,j,k;
   double ff1, ff2;
   unsigned char *tmp1, *tmp2;
+  unsigned char tag1, tag2;
 
   if ( ((*e1)[0] == ERL_STRING_EXT) && ((*e2)[0] == ERL_LIST_EXT) ) {
     return cmp_string_list(e1, e2);
@@ -1568,9 +1613,10 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2)
     return -cmp_string_list(e2, e1);
   }
 
-  *e2 += 1;
+  tag1 = *(*e1)++;
+  tag2 = *(*e2)++;
   i = j = 0;
-  switch (*(*e1)++) 
+  switch (tag1) 
     {
     case ERL_SMALL_INTEGER_EXT:
       if (**e1 < **e2) ret = -1;
@@ -1590,14 +1636,15 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2)
       *e1 += 4; *e2 += 4;
       return ret;
     case ERL_ATOM_EXT:
-    case ERL_UNICODE_ATOM_EXT:
+    case ERL_ATOM_UTF8_EXT:
       i = (**e1) << 8; (*e1)++;
       j = (**e2) << 8; (*e2)++;
       /*fall through*/
     case ERL_SMALL_ATOM_EXT:
+    case ERL_SMALL_ATOM_UTF8_EXT:
       i |= (**e1); (*e1)++;
       j |= (**e2); (*e2)++;
-      ret = cmpbytes(*e1, i, *e2, j);
+      ret = cmpatoms(*e1, i, tag1, *e2, j, tag2);
       *e1 += i;
       *e2 += j;
       return ret;
diff --git a/lib/erl_interface/src/legacy/global_whereis.c b/lib/erl_interface/src/legacy/global_whereis.c
index 2afb193504..e6c556d907 100644
--- a/lib/erl_interface/src/legacy/global_whereis.c
+++ b/lib/erl_interface/src/legacy/global_whereis.c
@@ -85,7 +85,16 @@ ETERM *erl_global_whereis(int fd, const char *name, char *node)
   opid = erl_decode((unsigned char*)buf);
 
   /* extract the nodename for the caller */
-  if (node) strcpy(node,epid.node);
+  if (node) {
+      char* node_str = ERL_PID_NODE(opid);
+      if (node_str) {
+	  strcpy(node, node_str);
+      }
+      else {
+	  erl_free_term(opid);
+	  return NULL;
+      }
+  }
 
   return opid;
 }
diff --git a/lib/erl_interface/src/misc/ei_decode_term.c b/lib/erl_interface/src/misc/ei_decode_term.c
index 6773f90bfc..65afee89cc 100644
--- a/lib/erl_interface/src/misc/ei_decode_term.c
+++ b/lib/erl_interface/src/misc/ei_decode_term.c
@@ -32,7 +32,7 @@
 int ei_decode_ei_term(const char* buf, int* index, ei_term* term)
 {
     const char* s = buf + *index, * s0 = s;
-    int len, i, n, sign;
+    int i, n, sign;
     char c;
 
     if (term == NULL) return -1;
@@ -48,12 +48,13 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term)
     case NEW_FLOAT_EXT:
         return ei_decode_double(buf, index, &term->value.d_val);
     case ERL_ATOM_EXT:
+    case ERL_ATOM_UTF8_EXT:
     case ERL_SMALL_ATOM_EXT:
-    case ERL_UNICODE_ATOM_EXT:
+    case ERL_SMALL_ATOM_UTF8_EXT:
 	return ei_decode_atom(buf, index, term->value.atom_name);
     case ERL_REFERENCE_EXT:
 	/* first the nodename */
-	if (get_atom(&s, term->value.ref.node) < 0) return -1;
+	if (get_atom(&s, term->value.ref.node, &term->value.ref.node_org_enc) < 0) return -1;
         /* now the numbers: num (4), creation (1) */
 	term->value.ref.n[0] = get32be(s);
 	term->value.ref.len = 1;
@@ -63,7 +64,7 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term)
 	/* first the integer count */
 	term->value.ref.len = get16be(s);
 	/* then the nodename */
-	if (get_atom(&s, term->value.ref.node) < 0) return -1;
+	if (get_atom(&s, term->value.ref.node, &term->value.ref.node_org_enc) < 0) return -1;
 	/* creation */
 	term->value.ref.creation = get8(s) & 0x03;
 	/* finally the id integers */
@@ -75,12 +76,12 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term)
 	}
 	break;
     case ERL_PORT_EXT:
-	if (get_atom(&s, term->value.port.node) < 0) return -1;
+	if (get_atom(&s, term->value.port.node, &term->value.port.node_org_enc) < 0) return -1;
 	term->value.port.id = get32be(s) & 0x0fffffff; /* 28 bits */;
 	term->value.port.creation = get8(s) & 0x03;
 	break;
     case ERL_PID_EXT:
-	if (get_atom(&s, term->value.pid.node) < 0) return -1;
+	if (get_atom(&s, term->value.pid.node, &term->value.port.node_org_enc) < 0) return -1;
 	/* now the numbers: num (4), serial (4), creation (1) */
 	term->value.pid.num = get32be(s) & 0x7fff; /* 15 bits */
 	term->value.pid.serial = get32be(s) & 0x1fff; /* 13 bits */
diff --git a/lib/erl_interface/src/misc/ei_printterm.c b/lib/erl_interface/src/misc/ei_printterm.c
index 620c6e72e2..91fe73e68c 100644
--- a/lib/erl_interface/src/misc/ei_printterm.c
+++ b/lib/erl_interface/src/misc/ei_printterm.c
@@ -132,9 +132,10 @@ static int print_term(FILE* fp, ei_x_buff* x,
     doquote = 0;
     ei_get_type_internal(buf, index, &ty, &n);
     switch (ty) {
-    case ERL_ATOM_EXT:
+    case ERL_ATOM_EXT:   
+    case ERL_ATOM_UTF8_EXT:
     case ERL_SMALL_ATOM_EXT:
-    case ERL_UNICODE_ATOM_EXT:
+    case ERL_SMALL_ATOM_UTF8_EXT:
 	if (ei_decode_atom(buf, index, a) < 0)
 	   goto err;
 	doquote = !islower((int)a[0]);
diff --git a/lib/erl_interface/src/misc/ei_x_encode.c b/lib/erl_interface/src/misc/ei_x_encode.c
index fa1e26ccbb..44dcff7664 100644
--- a/lib/erl_interface/src/misc/ei_x_encode.c
+++ b/lib/erl_interface/src/misc/ei_x_encode.c
@@ -197,18 +197,33 @@ int ei_x_encode_tuple_header(ei_x_buff* x, long n)
 
 int ei_x_encode_atom(ei_x_buff* x, const char* s)
 {
-    return ei_x_encode_atom_len(x, s, strlen(s));
+    return ei_x_encode_atom_len_as(x, s, strlen(s), ERLANG_LATIN1, ERLANG_LATIN1);
 }
 
 int ei_x_encode_atom_len(ei_x_buff* x, const char* s, int len)
+{
+    return ei_x_encode_atom_len_as(x, s, len, ERLANG_LATIN1, ERLANG_LATIN1);
+}
+
+int ei_x_encode_atom_as(ei_x_buff* x, const char* s,
+			enum erlang_char_encoding from_enc,
+			enum erlang_char_encoding to_enc)
+{
+    return ei_x_encode_atom_len_as(x, s, strlen(s), from_enc, to_enc);
+}
+
+int ei_x_encode_atom_len_as(ei_x_buff* x, const char* s, int len,
+			    enum erlang_char_encoding from_enc,
+			    enum erlang_char_encoding to_enc)
 {
     int i = x->index;
-    ei_encode_atom_len(NULL, &i, s, len);
+    ei_encode_atom_len_as(NULL, &i, s, len, from_enc, to_enc);
     if (!x_fix_buff(x, i))
 	return -1;
-    return ei_encode_atom_len(x->buff, &x->index, s, len);
+    return ei_encode_atom_len_as(x->buff, &x->index, s, len, from_enc, to_enc);
 }
 
+
 int ei_x_encode_pid(ei_x_buff* x, const erlang_pid* pid)
 {
     int i = x->index;
diff --git a/lib/erl_interface/src/misc/get_type.c b/lib/erl_interface/src/misc/get_type.c
index c9a040fbbd..54465196b0 100644
--- a/lib/erl_interface/src/misc/get_type.c
+++ b/lib/erl_interface/src/misc/get_type.c
@@ -33,80 +33,6 @@ int ei_get_type(const char *buf, const int *index, int *type, int *len)
     return ei_get_type_internal(buf, index, type, len);
 }
 
-#if 0
-int ei_get_type(const char *buf, const int *index, int *type, int *len)
-{
-    const char *s = buf + *index;
-    int itype = get8(s);	/* Internal type */
-
-    *len = 0;
-  
-    switch (*type) {
-
-    case ERL_SMALL_INTEGER_EXT:
-    case ERL_INTEGER_EXT:
-    case ERL_SMALL_BIG_EXT:
-    case ERL_LARGE_BIG_EXT:
-	*type = EI_TYPE_INTEGER;
-	break;
-
-    case ERL_FLOAT_EXT:
-	*type = EI_TYPE_FLOAT;
-	break;
-
-    case ERL_SMALL_TUPLE_EXT:
-    case ERL_SMALL_ATOM_EXT:
-	*len  = get8(s);
-	break;
-    
-    case ERL_ATOM_EXT:
-    case ERL_UNICODE_ATOM_EXT:
-    case ERL_STRING_EXT:
-	*len  = get16be(s);
-	break;
-    
-    case ERL_LARGE_TUPLE_EXT:
-    case ERL_LIST_EXT:
-    case ERL_BINARY_EXT:
-	*len  = get32be(s);
-	break;
-    
-    case ERL_SMALL_BIG_EXT:
-	*len  = (get8(s)+1)/2; /* big arity */
-	break;
-
-    case ERL_LARGE_BIG_EXT:
-	*len  = (get32be(s)+1)/2; /* big arity */
-	break;
-
-    case ERL_BINARY_EXT:
-	*type = EI_TYPE_BINARY;
-	break;
-
-    case ERL_PID_EXT:
-	*type = EI_TYPE_PID;
-	break;
-
-    case ERL_PORT_EXT:
-	*type = EI_TYPE_PORT;
-	break;
-
-    case ERL_REFERENCE_EXT:
-    case ERL_NEW_REFERENCE_EXT:
-	*type = EI_TYPE_REF;
-	break;
-
-    default:
-	break;
-    }
-
-    /* leave index unchanged */
-    return 0;
-}
-#endif
-
-
-/* Old definition of function above */
    
 int ei_get_type_internal(const char *buf, const int *index,
 			 int *type, int *len)
@@ -117,12 +43,13 @@ int ei_get_type_internal(const char *buf, const int *index,
   
   switch (*type) {
   case ERL_SMALL_ATOM_EXT:
+  case ERL_SMALL_ATOM_UTF8_EXT:
     *type = ERL_ATOM_EXT;
   case ERL_SMALL_TUPLE_EXT:
     *len = get8(s);
     break;
 
-  case ERL_UNICODE_ATOM_EXT:
+  case ERL_ATOM_UTF8_EXT:
     *type = ERL_ATOM_EXT;
   case ERL_ATOM_EXT:
   case ERL_STRING_EXT:
diff --git a/lib/erl_interface/src/misc/putget.h b/lib/erl_interface/src/misc/putget.h
index 8b0d4d3404..77ae168f8c 100644
--- a/lib/erl_interface/src/misc/putget.h
+++ b/lib/erl_interface/src/misc/putget.h
@@ -105,8 +105,12 @@
        ((EI_ULONGLONG)((unsigned char *)(s))[-2] << 8)  | \
         (EI_ULONGLONG)((unsigned char *)(s))[-1]))
 
-int ei_internal_get_atom(const char** bufp, char* p);
+int utf8_to_latin1(char* dst, const char* src, int slen, int destlen, enum erlang_char_encoding* res_encp);
+int latin1_to_utf8(char* dst, const char* src, int slen, int destlen, enum erlang_char_encoding* res_encp);
+int ei_internal_get_atom(const char** bufp, char* p, enum erlang_char_encoding*);
+int ei_internal_put_atom(char** bufp, const char* p, int slen, enum erlang_char_encoding);
 #define get_atom ei_internal_get_atom
+#define put_atom ei_internal_put_atom
 
 typedef union float_ext {
     double d;
diff --git a/lib/erl_interface/src/misc/show_msg.c b/lib/erl_interface/src/misc/show_msg.c
index 194296798b..ca46b15aff 100644
--- a/lib/erl_interface/src/misc/show_msg.c
+++ b/lib/erl_interface/src/misc/show_msg.c
@@ -132,13 +132,13 @@ int ei_show_sendmsg(FILE *stream, const char *header, const char *msgbuf)
 
     switch (msg.msgtype) {
     case ERL_SEND:
-	if (ei_decode_atom(header,&index,msg.cookie) 
+	if (ei_decode_atom_as(header,&index,msg.cookie,sizeof(msg.cookie),ERLANG_UTF8,NULL,NULL) 
 	    || ei_decode_pid(header,&index,&msg.to)) return -1;
 	mbuf = msgbuf;
 	break;
 
     case ERL_SEND_TT:
-	if (ei_decode_atom(header,&index,msg.cookie) 
+	if (ei_decode_atom_as(header,&index,msg.cookie,sizeof(msg.cookie),ERLANG_UTF8,NULL,NULL) 
 	    || ei_decode_pid(header,&index,&msg.to)
 	    || ei_decode_trace(header,&index,&msg.token)) return -1;
 	mbuf = msgbuf;
@@ -146,15 +146,15 @@ int ei_show_sendmsg(FILE *stream, const char *header, const char *msgbuf)
     
     case ERL_REG_SEND:
 	if (ei_decode_pid(header,&index,&msg.from) 
-	    || ei_decode_atom(header,&index,msg.cookie) 
-	    || ei_decode_atom(header,&index,msg.toname)) return -1;
+	    || ei_decode_atom_as(header,&index,msg.cookie,sizeof(msg.cookie),ERLANG_UTF8,NULL,NULL) 
+	    || ei_decode_atom_as(header,&index,msg.toname,sizeof(msg.toname),ERLANG_UTF8,NULL,NULL)) return -1;
 	mbuf = msgbuf;
 	break;
     
     case ERL_REG_SEND_TT:
 	if (ei_decode_pid(header,&index,&msg.from) 
-	    || ei_decode_atom(header,&index,msg.cookie) 
-	    || ei_decode_atom(header,&index,msg.toname)
+	    || ei_decode_atom_as(header,&index,msg.cookie,sizeof(msg.cookie),ERLANG_UTF8,NULL,NULL) 
+	    || ei_decode_atom_as(header,&index,msg.toname,sizeof(msg.toname),ERLANG_UTF8,NULL,NULL)
 	    || ei_decode_trace(header,&index,&msg.token)) return -1;
 	mbuf = msgbuf;
 	break;
diff --git a/lib/erl_interface/test/ei_decode_encode_SUITE.erl b/lib/erl_interface/test/ei_decode_encode_SUITE.erl
index 85cb62239b..e8ae7a6f81 100644
--- a/lib/erl_interface/test/ei_decode_encode_SUITE.erl
+++ b/lib/erl_interface/test/ei_decode_encode_SUITE.erl
@@ -118,6 +118,9 @@ test_ei_decode_encode(Config) when is_list(Config) ->
     ?line send_rec(P, OXPort),
     ?line send_rec(P, OXRef),
 
+    %% Unicode atoms
+    [send_rec(P, Atom) || Atom <- unicode_atom_data()],
+
     ?line runner:recv_eot(P),
     ok.
 
@@ -127,7 +130,7 @@ test_ei_decode_encode(Config) when is_list(Config) ->
 % We read two packets for each test, the ei_decode_encode and ei_x_decode_encode  version....
 
 send_rec(P, Term) when is_port(P) ->
-    ?t:format("Testing: ~p~n", [Term]),
+    %%?t:format("Testing: ~p~n", [Term]),
     P ! {self(), {command, term_to_binary(Term)}},
     {_B,Term} = get_buf_and_term(P).
     
@@ -146,7 +149,7 @@ get_buf_and_term(P) ->
 	_ ->
 	    B1 = list_to_binary([131,B]),	% No magic, add
 	    T = binary_to_term(B1),
-	    io:format("~w\n~w\n(got no magic)\n",[B,T]),
+	    %io:format("~w\n~w\n(got no magic)\n",[B,T]),
 	    {B,T}
     end.
     
@@ -160,7 +163,7 @@ get_binary(P) ->
     case runner:get_term(P) of
 	{bytes,L} ->
 	    B = list_to_binary(L),
-	    io:format("~w\n",[L]),
+	    %%io:format("~w\n",[L]),
 % For strange reasons <<131>> show up as <>....
 %	    io:format("~w\n",[B]),
 	    B;
@@ -305,3 +308,60 @@ mk_ref({NodeName, Creation}, Numbers) when is_list(NodeName),
 	    exit({unexpected_binary_to_term_result, Other})
     end.
 
+
+
+unicode_atom_data() ->
+    [uc_atup(lists:seq(16#1f600, 16#1f600+254)),
+     uc_atup(lists:seq(16#1f600, 16#1f600+63)),
+     uc_atup(lists:seq(1, 255)),
+     uc_atup(lists:seq(100, 163)),
+     uc_atup(lists:seq(200, 354)),
+     uc_atup(lists:seq(200, 263)),
+     uc_atup(lists:seq(2000, 2254)),
+     uc_atup(lists:seq(2000, 2063)),
+     uc_atup(lists:seq(65500, 65754)),
+     uc_atup(lists:seq(65500, 65563))
+     | lists:map(fun (N) ->
+			 uc_atup(lists:seq(64000+N, 64254+N))
+		 end,
+		 lists:seq(1, 2000))].
+
+uc_atup(ATxt) ->
+    string_to_atom(ATxt).
+
+string_to_atom(String) ->
+    Utf8List = string_to_utf8_list(String),
+    Len = length(Utf8List),
+    TagLen = case Len < 256 of
+		 true -> [119, Len];
+		 false -> [118, Len bsr 8, Len band 16#ff]
+	     end,
+    binary_to_term(list_to_binary([131, TagLen, Utf8List])).
+
+string_to_utf8_list([]) ->
+    [];
+string_to_utf8_list([CP|CPs]) when is_integer(CP),
+				   0 =< CP,
+				   CP =< 16#7F ->
+    [CP | string_to_utf8_list(CPs)];
+string_to_utf8_list([CP|CPs]) when is_integer(CP),
+				   16#80 =< CP,
+				   CP =< 16#7FF ->
+    [16#C0 bor (CP bsr 6),
+     16#80 bor (16#3F band CP)
+     | string_to_utf8_list(CPs)];
+string_to_utf8_list([CP|CPs]) when is_integer(CP),
+				   16#800 =< CP,
+				   CP =< 16#FFFF ->
+    [16#E0 bor (CP bsr 12),
+     16#80 bor (16#3F band (CP bsr 6)),
+     16#80 bor (16#3F band CP)
+     | string_to_utf8_list(CPs)];
+string_to_utf8_list([CP|CPs]) when is_integer(CP),
+				   16#10000 =< CP,
+				   CP =< 16#10FFFF ->
+    [16#F0 bor (CP bsr 18),
+     16#80 bor (16#3F band (CP bsr 12)),
+     16#80 bor (16#3F band (CP bsr 6)),
+     16#80 bor (16#3F band CP)
+     | string_to_utf8_list(CPs)].
diff --git a/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c b/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c
index 406f02ecfb..194ce9057b 100644
--- a/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c
+++ b/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c
@@ -29,171 +29,222 @@
  * Author:  kent@erix.ericsson.se
  */
 
-#define EI_DECODE_ENCODE(FUNC,TYPE) \
-  { \
-    char *buf; \
-    char buf2[1024]; \
-    TYPE p; \
-    int size1 = 0; \
-    int size2 = 0; \
-    int size3 = 0; \
-    int err; \
-    ei_x_buff arg; \
-\
-    message("ei_decode_" #FUNC ", arg is type " #TYPE); \
-    buf = read_packet(NULL); \
-    err = ei_decode_ ## FUNC(buf+1, &size1, &p); \
-    if (err != 0) { \
-      if (err != -1) { \
-	fail("decode returned non zero but not -1"); \
-      } else { \
-	fail("decode returned non zero"); \
-      } \
-      return; \
-    } \
-    if (size1 < 1) { \
-      fail("size is < 1"); \
-      return; \
-    } \
-\
-    message("ei_encode_" #FUNC " buf is NULL, arg is type " #TYPE); \
-    err = ei_encode_ ## FUNC(NULL, &size2, &p); \
-    if (err != 0) { \
-      if (err != -1) { \
-	fail("size calculation returned non zero but not -1"); \
-	return; \
-      } else { \
-	fail("size calculation returned non zero"); \
-	return; \
-      } \
-    } \
-    if (size1 != size2) { \
-      message("size1 = %d, size2 = %d\n",size1,size2); \
-      fail("decode and encode size differs when buf is NULL"); \
-      return; \
-    } \
-    message("ei_encode_" #FUNC ", arg is type " #TYPE); \
-    err = ei_encode_ ## FUNC(buf2, &size3, &p); \
-    if (err != 0) { \
-      if (err != -1) { \
-	fail("returned non zero but not -1"); \
-      } else { \
-	fail("returned non zero"); \
-      } \
-      return; \
-    } \
-    if (size1 != size3) { \
-      message("size1 = %d, size2 = %d\n",size1,size3); \
-      fail("decode and encode size differs"); \
-      return; \
-    } \
-    send_buffer(buf2, size1); \
-\
-    message("ei_x_encode_" #FUNC ", arg is type " #TYPE); \
-    ei_x_new(&arg); \
-    err = ei_x_encode_ ## FUNC(&arg, &p); \
-    if (err != 0) { \
-      if (err != -1) { \
-	fail("returned non zero but not -1"); \
-      } else { \
-	fail("returned non zero"); \
-      } \
-      ei_x_free(&arg); \
-      return; \
-    } \
-    if (arg.index < 1) { \
-      fail("size is < 1"); \
-      ei_x_free(&arg); \
-      return; \
-    } \
-    send_buffer(arg.buff, arg.index); \
-    ei_x_free(&arg); \
-  }
-
-#define EI_DECODE_ENCODE_BIG(FUNC,TYPE) \
-  { \
-    char *buf; \
-    char buf2[2048]; \
-    TYPE *p; \
-    int size1 = 0; \
-    int size2 = 0; \
-    int size3 = 0; \
-    int err, index = 0, len, type; \
-    ei_x_buff arg; \
-\
-    message("ei_decode_" #FUNC ", arg is type " #TYPE); \
-    buf = read_packet(NULL); \
-    ei_get_type(buf+1, &index, &type, &len); \
-    p = ei_alloc_big(len); \
-    err = ei_decode_ ## FUNC(buf+1, &size1, p); \
-    if (err != 0) { \
-      if (err != -1) { \
-	fail("decode returned non zero but not -1"); \
-      } else { \
-	fail("decode returned non zero"); \
-      } \
-      return; \
-    } \
-    if (size1 < 1) { \
-      fail("size is < 1"); \
-      return; \
-    } \
-\
-    message("ei_encode_" #FUNC " buf is NULL, arg is type " #TYPE); \
-    err = ei_encode_ ## FUNC(NULL, &size2, p); \
-    if (err != 0) { \
-      if (err != -1) { \
-	fail("size calculation returned non zero but not -1"); \
-	return; \
-      } else { \
-	fail("size calculation returned non zero"); \
-	return; \
-      } \
-    } \
-    if (size1 != size2) { \
-      message("size1 = %d, size2 = %d\n",size1,size2); \
-      fail("decode and encode size differs when buf is NULL"); \
-      return; \
-    } \
-    message("ei_encode_" #FUNC ", arg is type " #TYPE); \
-    err = ei_encode_ ## FUNC(buf2, &size3, p); \
-    if (err != 0) { \
-      if (err != -1) { \
-	fail("returned non zero but not -1"); \
-      } else { \
-	fail("returned non zero"); \
-      } \
-      return; \
-    } \
-    if (size1 != size3) { \
-      message("size1 = %d, size2 = %d\n",size1,size3); \
-      fail("decode and encode size differs"); \
-      return; \
-    } \
-    send_buffer(buf2, size1); \
-\
-    message("ei_x_encode_" #FUNC ", arg is type " #TYPE); \
-    ei_x_new(&arg); \
-    err = ei_x_encode_ ## FUNC(&arg, p); \
-    if (err != 0) { \
-      if (err != -1) { \
-	fail("returned non zero but not -1"); \
-      } else { \
-	fail("returned non zero"); \
-      } \
-      ei_x_free(&arg); \
-      return; \
-    } \
-    if (arg.index < 1) { \
-      fail("size is < 1"); \
-      ei_x_free(&arg); \
-      return; \
-    } \
-    send_buffer(arg.buff, arg.index); \
-    ei_x_free(&arg); \
-    ei_free_big(p); \
-  }
+/*#define MESSAGE(FMT,A1,A2) message(FMT,A1,A2)*/
+#define MESSAGE(FMT,A1,A2)
 
+typedef int decodeFT(const char *buf, int *index, void*);
+typedef int encodeFT(char *buf, int *index, void*);
+typedef int x_encodeFT(ei_x_buff*, void*);
+
+struct Type {
+    char* name;
+    char* type;
+    decodeFT* ei_decode_fp;
+    encodeFT* ei_encode_fp;
+    x_encodeFT* ei_x_encode_fp;
+};
+
+typedef struct
+{
+    char name[MAXATOMLEN_UTF8];
+    enum erlang_char_encoding enc;
+}my_atom;
+
+int ei_decode_my_atom(const char *buf, int *index, my_atom* a)
+{
+    return ei_decode_atom_as(buf, index, a->name, sizeof(a->name), ERLANG_UTF8, &a->enc, NULL);
+}
+int ei_encode_my_atom(char *buf, int *index, my_atom* a)
+{
+    return ei_encode_atom_as(buf, index, a->name, ERLANG_UTF8, a->enc);
+}
+int ei_x_encode_my_atom(ei_x_buff* x, my_atom* a)
+{
+    return ei_x_encode_atom_as(x, a->name, ERLANG_UTF8, a->enc);
+}
+
+void decode_encode(struct Type* t, void* obj)
+{
+    char *buf;
+    char buf2[1024];
+    int size1 = 0;
+    int size2 = 0;
+    int size3 = 0;
+    int err;
+    ei_x_buff arg;
+    
+    MESSAGE("ei_decode_%s, arg is type %s", t->name, t->type);
+    buf = read_packet(NULL);
+    err = t->ei_decode_fp(buf+1, &size1, obj);
+    if (err != 0) {
+	if (err != -1) {
+	    fail("decode returned non zero but not -1");
+	} else {
+	    fail("decode returned non zero");
+	}
+	return;
+    }
+    if (size1 < 1) {
+	fail("size is < 1");
+	return;
+    }
+
+    MESSAGE("ei_encode_%s buf is NULL, arg is type %s", t->name, t->type);
+    err = t->ei_encode_fp(NULL, &size2, obj);
+    if (err != 0) {
+	if (err != -1) {
+	    fail("size calculation returned non zero but not -1");
+	    return;
+	} else {
+	    fail("size calculation returned non zero");
+	    return;
+	}
+    }
+    if (size1 != size2) {
+	MESSAGE("size1 = %d, size2 = %d\n",size1,size2);
+	fail("decode and encode size differs when buf is NULL");
+	return;
+    }
+    MESSAGE("ei_encode_%s, arg is type %s", t->name, t->type);
+    err = t->ei_encode_fp(buf2, &size3, obj);
+    if (err != 0) {
+	if (err != -1) {
+	    fail("returned non zero but not -1");
+	} else {
+	    fail("returned non zero");
+	}
+	return;
+    }
+    if (size1 != size3) {
+	MESSAGE("size1 = %d, size2 = %d\n",size1,size3);
+	fail("decode and encode size differs");
+	return;
+    }
+    send_buffer(buf2, size1);
+
+    MESSAGE("ei_x_encode_%s, arg is type %s", t->name, t->type);
+    ei_x_new(&arg);
+    err = t->ei_x_encode_fp(&arg, obj);
+    if (err != 0) {
+	if (err != -1) {
+	    fail("returned non zero but not -1");
+	} else {
+	    fail("returned non zero");
+	}
+	ei_x_free(&arg);
+	return;
+    }
+    if (arg.index < 1) {
+	fail("size is < 1");
+	ei_x_free(&arg);
+	return;
+    }
+    send_buffer(arg.buff, arg.index);
+    ei_x_free(&arg);
+}
+
+
+#define EI_DECODE_ENCODE(TYPE, ERLANG_TYPE) {			\
+	struct Type type_struct = {#TYPE, #ERLANG_TYPE,		\
+				   (decodeFT*)ei_decode_##TYPE,		\
+				   (encodeFT*)ei_encode_##TYPE,		\
+				   (x_encodeFT*)ei_x_encode_##TYPE };	\
+	ERLANG_TYPE type_obj;						\
+	decode_encode(&type_struct, &type_obj);				\
+    }
+
+
+void decode_encode_big(struct Type* t)
+{
+    char *buf;
+    char buf2[2048];
+    void *p; /* (TYPE*) */
+    int size1 = 0;
+    int size2 = 0;
+    int size3 = 0;
+    int err, index = 0, len, type;
+    ei_x_buff arg;
+
+    MESSAGE("ei_decode_%s, arg is type %s", t->name, t->type);
+    buf = read_packet(NULL);
+    ei_get_type(buf+1, &index, &type, &len);
+    p = ei_alloc_big(len);
+    err = t->ei_decode_fp(buf+1, &size1, p);
+    if (err != 0) {
+	if (err != -1) {
+	    fail("decode returned non zero but not -1");
+	} else {
+	    fail("decode returned non zero");
+	}
+	return;
+    }
+    if (size1 < 1) {
+	fail("size is < 1");
+	return;
+    }
+
+    MESSAGE("ei_encode_%s buf is NULL, arg is type %s", t->name, t->type);
+    err = t->ei_encode_fp(NULL, &size2, p);
+    if (err != 0) {
+	if (err != -1) {
+	    fail("size calculation returned non zero but not -1");
+	    return;
+	} else {
+	    fail("size calculation returned non zero");
+	    return;
+	}
+    }
+    if (size1 != size2) {
+	MESSAGE("size1 = %d, size2 = %d\n",size1,size2);
+	fail("decode and encode size differs when buf is NULL");
+	return;
+    }
+    MESSAGE("ei_encode_%s, arg is type %s", t->name, t->type);
+    err = t->ei_encode_fp(buf2, &size3, p);
+    if (err != 0) {
+	if (err != -1) {
+	    fail("returned non zero but not -1");
+	} else {
+	    fail("returned non zero");
+	}
+	return;
+    }
+    if (size1 != size3) {
+	MESSAGE("size1 = %d, size2 = %d\n",size1,size3);
+	fail("decode and encode size differs");
+	return;
+    }
+    send_buffer(buf2, size1);
+
+    MESSAGE("ei_x_encode_%s, arg is type %s", t->name, t->type);
+    ei_x_new(&arg);
+    err = t->ei_x_encode_fp(&arg, p);
+    if (err != 0) {
+	if (err != -1) {
+	    fail("returned non zero but not -1");
+	} else {
+	    fail("returned non zero");
+	}
+	ei_x_free(&arg);
+	return;
+    }
+    if (arg.index < 1) {
+	fail("size is < 1");
+	ei_x_free(&arg);
+	return;
+    }
+    send_buffer(arg.buff, arg.index);
+    ei_x_free(&arg);
+    ei_free_big(p);
+}
+
+#define EI_DECODE_ENCODE_BIG(TYPE, ERLANG_TYPE) {					\
+	struct Type type_struct = {#TYPE, #ERLANG_TYPE,		\
+				   (decodeFT*)ei_decode_##TYPE,	\
+				   (encodeFT*)ei_encode_##TYPE,		\
+				   (x_encodeFT*)ei_x_encode_##TYPE };	\
+	decode_encode_big(&type_struct);				\
+    }
 
 
@@ -201,6 +252,8 @@
 
 TESTCASE(test_ei_decode_encode)
 {
+    int i;
+
     EI_DECODE_ENCODE(fun  , erlang_fun);
     EI_DECODE_ENCODE(pid  , erlang_pid);
     EI_DECODE_ENCODE(port , erlang_port);
@@ -223,6 +276,11 @@ TESTCASE(test_ei_decode_encode)
     EI_DECODE_ENCODE(port , erlang_port);
     EI_DECODE_ENCODE(ref  , erlang_ref);
 
+    /* Unicode atoms */
+    for (i=0; i<2010; i++) {
+	EI_DECODE_ENCODE(my_atom , my_atom);
+    }
+
     report(1);
 }
 
diff --git a/lib/ic/examples/all-against-all/client.c b/lib/ic/examples/all-against-all/client.c
index e0a52b142d..5dece9cfa6 100644
--- a/lib/ic/examples/all-against-all/client.c
+++ b/lib/ic/examples/all-against-all/client.c
@@ -88,6 +88,7 @@ int main(){
 
   /* Initiating pid*/
   strcpy(pid.node,client_node);
+  pid.node_org_enc = ERLANG_LATIN1;
   pid.num = 99;
   pid.serial = 0;
   pid.creation = 0;
diff --git a/lib/ic/examples/c-client/client.c b/lib/ic/examples/c-client/client.c
index 816477cf15..5b11510ce3 100644
--- a/lib/ic/examples/c-client/client.c
+++ b/lib/ic/examples/c-client/client.c
@@ -64,6 +64,7 @@ int main()
 
     /* Initiating pid*/
     strcpy(pid.node,CLNODE);
+    pid.node_org_enc = ERLANG_LATIN1;
     pid.num = 99;
     pid.serial = 0;
     pid.creation = 0;
diff --git a/lib/ic/examples/c-server/client.c b/lib/ic/examples/c-server/client.c
index fa570089b5..605e41ddb1 100644
--- a/lib/ic/examples/c-server/client.c
+++ b/lib/ic/examples/c-server/client.c
@@ -58,6 +58,7 @@ int main()
 
     /* Initiating pid*/
     strcpy(pid.node, CLNODE);
+    pid.node_org_enc = ERLANG_LATIN1;
     pid.num = 99;
     pid.serial = 0;
     pid.creation = 0;
-- 
cgit v1.2.3


From a912b3c6f4759a6a8e60fc4ea559c19edb02448c Mon Sep 17 00:00:00 2001
From: Rickard Green <rickard@erlang.org>
Date: Thu, 17 Jan 2013 21:09:46 +0100
Subject: test case

---
 erts/emulator/test/distribution_SUITE.erl | 197 ++++++++++++++++++++++++++----
 1 file changed, 175 insertions(+), 22 deletions(-)

diff --git a/erts/emulator/test/distribution_SUITE.erl b/erts/emulator/test/distribution_SUITE.erl
index aaf6420ccd..e7bd8d19c3 100644
--- a/erts/emulator/test/distribution_SUITE.erl
+++ b/erts/emulator/test/distribution_SUITE.erl
@@ -18,7 +18,17 @@
 %%
 
 -module(distribution_SUITE).
--compile(r13).
+-compile(r15).
+
+-define(VERSION_MAGIC,       131).
+
+-define(ATOM_EXT,            100).
+-define(REFERENCE_EXT,       101).
+-define(PORT_EXT,            102).
+-define(PID_EXT,             103).
+-define(NEW_REFERENCE_EXT,   114).
+-define(ATOM_UTF8_EXT,       118).
+-define(SMALL_ATOM_UTF8_EXT, 119).
 
 %% Tests distribution and the tcp driver.
 
@@ -1139,27 +1149,66 @@ atom_data() ->
 	      lists:seq(1, 2000)).
 
 verify_atom_data(AtomData) ->
-    lists:foreach(fun ({Atom, AtomTxt}) ->
-			  AtomTxt = atom_to_list(Atom)
+    lists:foreach(fun ({Atom, AtomTxt}) when is_atom(Atom) ->
+			  AtomTxt = atom_to_list(Atom);
+		      ({PPR, AtomTxt}) ->
+			  % Pid, Port, or Ref
+			  AtomTxt = atom_to_list(node(PPR))
 		  end,
 		  AtomData).
 
-uc_atup(ATxt) ->
-    {string_to_atom(ATxt), ATxt}.
+uc_atom_tup(ATxt) ->
+    Atom = string_to_atom(ATxt),
+    ATxt = atom_to_list(Atom),
+    {Atom, ATxt}.
+
+uc_pid_tup(ATxt) ->
+    ATxtExt = string_to_atom_ext(ATxt),
+    Pid = mk_pid({ATxtExt, 1}, 4711,17),
+    true = is_pid(Pid),
+    Atom = node(Pid),
+    true = is_atom(Atom),
+    ATxt = atom_to_list(Atom),
+    {Pid, ATxt}.
+
+uc_port_tup(ATxt) ->
+    ATxtExt = string_to_atom_ext(ATxt),
+    Port = mk_port({ATxtExt, 2}, 4711),
+    true = is_port(Port),
+    Atom = node(Port),
+    true = is_atom(Atom),
+    ATxt = atom_to_list(Atom),
+    {Port, ATxt}.
+
+uc_ref_tup(ATxt) ->
+    ATxtExt = string_to_atom_ext(ATxt),
+    Ref = mk_ref({ATxtExt, 3}, [4711,17, 4711]),
+    true = is_reference(Ref),
+    Atom = node(Ref),
+    true = is_atom(Atom),
+    ATxt = atom_to_list(Atom),
+    {Ref, ATxt}.
+
 
 unicode_atom_data() ->
-    [uc_atup(lists:seq(16#1f600, 16#1f600+254)),
-     uc_atup(lists:seq(16#1f600, 16#1f600+63)),
-     uc_atup(lists:seq(0, 254)),
-     uc_atup(lists:seq(100, 163)),
-     uc_atup(lists:seq(200, 354)),
-     uc_atup(lists:seq(200, 263)),
-     uc_atup(lists:seq(2000, 2254)),
-     uc_atup(lists:seq(2000, 2063)),
-     uc_atup(lists:seq(65500, 65754)),
-     uc_atup(lists:seq(65500, 65563))
+    [uc_pid_tup(lists:seq(16#1f600, 16#1f600+249) ++ "@host"),
+     uc_pid_tup(lists:seq(16#1f600, 16#1f600+30) ++ "@host"),
+     uc_port_tup(lists:seq(16#1f600, 16#1f600+249) ++ "@host"),
+     uc_port_tup(lists:seq(16#1f600, 16#1f600+30) ++ "@host"),
+     uc_ref_tup(lists:seq(16#1f600, 16#1f600+249) ++ "@host"),
+     uc_ref_tup(lists:seq(16#1f600, 16#1f600+30) ++ "@host"),
+     uc_atom_tup(lists:seq(16#1f600, 16#1f600+254)),
+     uc_atom_tup(lists:seq(16#1f600, 16#1f600+63)),
+     uc_atom_tup(lists:seq(0, 254)),
+     uc_atom_tup(lists:seq(100, 163)),
+     uc_atom_tup(lists:seq(200, 354)),
+     uc_atom_tup(lists:seq(200, 263)),
+     uc_atom_tup(lists:seq(2000, 2254)),
+     uc_atom_tup(lists:seq(2000, 2063)),
+     uc_atom_tup(lists:seq(65500, 65754)),
+     uc_atom_tup(lists:seq(65500, 65563))
      | lists:map(fun (N) ->
-			 uc_atup(lists:seq(64000+N, 64254+N))
+			 uc_atom_tup(lists:seq(64000+N, 64254+N))
 		 end,
 		 lists:seq(1, 2000))].
 
@@ -2193,14 +2242,19 @@ repeat(Fun, N) ->
     Fun(),
     repeat(Fun, N-1).
 
-string_to_atom(String) ->
+string_to_atom_ext(String) ->
     Utf8List = string_to_utf8_list(String),
     Len = length(Utf8List),
-    TagLen = case Len < 256 of
-		 true -> [119, Len];
-		 false -> [118, Len bsr 8, Len band 16#ff]
-	     end,
-    binary_to_term(list_to_binary([131, TagLen, Utf8List])).
+    case Len < 256 of
+	true ->
+	    [?SMALL_ATOM_UTF8_EXT, Len | Utf8List];
+	false ->
+	    [?ATOM_UTF8_EXT, Len bsr 8, Len band 16#ff | Utf8List]
+    end.
+
+string_to_atom(String) ->
+    binary_to_term(list_to_binary([?VERSION_MAGIC
+				   | string_to_atom_ext(String)])).
 
 string_to_utf8_list([]) ->
     [];
@@ -2275,3 +2329,102 @@ utf8_list_to_string([B0, B1, B2, B3 | Bs]) when is_integer(B0),
 	  bor ((B2 band 16#3F) bsl 6)
 	  bor (B3 band 16#3F))
      | utf8_list_to_string(Bs)].
+
+mk_pid({NodeName, Creation}, Number, Serial) when is_atom(NodeName) ->
+    <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+    mk_pid({NodeNameExt, Creation}, Number, Serial);
+mk_pid({NodeNameExt, Creation}, Number, Serial) ->
+    case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
+					      ?PID_EXT,
+					      NodeNameExt,
+					      uint32_be(Number),
+					      uint32_be(Serial),
+					      uint8(Creation)])) of
+	Pid when is_pid(Pid) ->
+	    Pid;
+	{'EXIT', {badarg, _}} ->
+	    exit({badarg, mk_pid, [{NodeNameExt, Creation}, Number, Serial]});
+	Other ->
+	    exit({unexpected_binary_to_term_result, Other})
+    end.
+
+mk_port({NodeName, Creation}, Number) when is_atom(NodeName) ->
+    <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+    mk_port({NodeNameExt, Creation}, Number);
+mk_port({NodeNameExt, Creation}, Number) ->
+    case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
+					      ?PORT_EXT,
+					      NodeNameExt,
+					      uint32_be(Number),
+					      uint8(Creation)])) of
+	Port when is_port(Port) ->
+	    Port;
+	{'EXIT', {badarg, _}} ->
+	    exit({badarg, mk_port, [{NodeNameExt, Creation}, Number]});
+	Other ->
+	    exit({unexpected_binary_to_term_result, Other})
+    end.
+
+mk_ref({NodeName, Creation}, [Number] = NL) when is_atom(NodeName),
+						 is_integer(Creation),
+						 is_integer(Number) ->
+    <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+    mk_ref({NodeNameExt, Creation}, NL);
+mk_ref({NodeNameExt, Creation}, [Number]) when is_integer(Creation),
+					       is_integer(Number) ->
+    case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
+					      ?REFERENCE_EXT,
+					      NodeNameExt,
+					      uint32_be(Number),
+					      uint8(Creation)])) of
+	Ref when is_reference(Ref) ->
+	    Ref;
+	{'EXIT', {badarg, _}} ->
+	    exit({badarg, mk_ref, [{NodeNameExt, Creation}, [Number]]});
+	Other ->
+	    exit({unexpected_binary_to_term_result, Other})
+    end;
+mk_ref({NodeName, Creation}, Numbers) when is_atom(NodeName),
+					   is_integer(Creation),
+					   is_list(Numbers) ->
+    <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+    mk_ref({NodeNameExt, Creation}, Numbers);
+mk_ref({NodeNameExt, Creation}, Numbers) when is_integer(Creation),
+					      is_list(Numbers) ->
+    case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
+					      ?NEW_REFERENCE_EXT,
+					      uint16_be(length(Numbers)),
+					      NodeNameExt,
+					      uint8(Creation),
+					      lists:map(fun (N) ->
+								uint32_be(N)
+							end,
+							Numbers)])) of
+	Ref when is_reference(Ref) ->
+	    Ref;
+	{'EXIT', {badarg, _}} ->
+	    exit({badarg, mk_ref, [{NodeNameExt, Creation}, Numbers]});
+	Other ->
+	    exit({unexpected_binary_to_term_result, Other})
+    end.
+
+
+uint32_be(Uint) when is_integer(Uint), 0 =< Uint, Uint < 1 bsl 32 ->
+    [(Uint bsr 24) band 16#ff,
+     (Uint bsr 16) band 16#ff,
+     (Uint bsr 8) band 16#ff,
+     Uint band 16#ff];
+uint32_be(Uint) ->
+    exit({badarg, uint32_be, [Uint]}).
+
+
+uint16_be(Uint) when is_integer(Uint), 0 =< Uint, Uint < 1 bsl 16 ->
+    [(Uint bsr 8) band 16#ff,
+     Uint band 16#ff];
+uint16_be(Uint) ->
+    exit({badarg, uint16_be, [Uint]}).
+
+uint8(Uint) when is_integer(Uint), 0 =< Uint, Uint < 1 bsl 8 ->
+    Uint band 16#ff;
+uint8(Uint) ->
+    exit({badarg, uint8, [Uint]}).
-- 
cgit v1.2.3


From b15688d40d5147c1122aaad3b82495fbbc4dede8 Mon Sep 17 00:00:00 2001
From: Rickard Green <rickard@erlang.org>
Date: Sat, 19 Jan 2013 00:45:16 +0100
Subject: UTF-8 atom documentation

---
 erts/doc/src/erl_dist_protocol.xml                 | 288 ++++++++++++++++++++-
 erts/doc/src/erl_ext_dist.xml                      | 121 ++++++++-
 erts/doc/src/erlang.xml                            |  10 +-
 lib/kernel/internal_doc/distribution_handshake.txt | 216 +---------------
 4 files changed, 398 insertions(+), 237 deletions(-)

diff --git a/erts/doc/src/erl_dist_protocol.xml b/erts/doc/src/erl_dist_protocol.xml
index 6c725fc82d..0252187be5 100644
--- a/erts/doc/src/erl_dist_protocol.xml
+++ b/erts/doc/src/erl_dist_protocol.xml
@@ -547,13 +547,289 @@ If Result > 0, the packet only consists of [119, Result].
 -->
 
     </section>
-
+    <marker id="distribution_handshake"/>
       <section>
-	<title>Handshake</title>
-	<p>
-	  The handshake is discussed in detail in the internal documentation for 
-	  the kernel (Erlang) application.
-	</p>
+	<title>Distribution Handshake</title>
+	  <p>
+	    This section describes the distribution handshake protocol
+	    introduced in the OTP-R6 release of Erlang/OTP. This
+	    description was previously located in
+	    <c>$ERL_TOP/lib/kernel/internal_doc/distribution_handshake.txt</c>,
+	    and has more or less been copied and "formatted" here. It has been
+	    more or less unchanged since the year 1999, but the handshake
+	    should not have changed much since then either.
+	  </p>
+	<section>
+	  <title>General</title>
+	  <p>
+	    The TCP/IP distribution uses a handshake which expects a
+	    connection based protocol, i.e. the protocol does not include
+	    any authentication after the handshake procedure.
+	  </p>
+	  <p>
+	    This is not entirely safe, as it is vulnerable against takeover
+	    attacks, but it is a tradeoff between fair safety and performance.
+	  </p>
+	  <p>
+	    The cookies are never sent in cleartext and the handshake procedure
+	    expects the client (called A) to be the first one to prove that it can 
+	    generate a sufficient digest. The digest is generated with the 
+	    MD5 message digest algorithm and the challenges are expected to be very
+	    random numbers.
+	  </p>
+	</section>
+	<section>
+	  <title>Definitions</title>
+	  <p>
+	    A challenge is a 32 bit integer number in big endian order. Below the function
+	    <c>gen_challenge()</c> returns a random 32 bit integer used as a challenge.
+	  </p>
+	  <p>
+	    A digest is a (16 bytes) MD5 hash of the Challenge (as text) concatenated
+	    with the cookie (as text). Below, the function <c>gen_digest(Challenge, Cookie)</c>
+	    generates a digest as described above.
+	  </p>
+	  <p>
+	    An out_cookie is the cookie used in outgoing communication to a certain node,
+	    so that A's out_cookie for B should correspond with B's in_cookie for A and 
+	    the other way around. A's out_cookie for B and A's in_cookie for B need <em>NOT</em>
+	    be the same. Below the function <c>out_cookie(Node)</c> returns the current
+	    node's out_cookie for <c>Node</c>.
+	  </p>
+	  <p>
+	    An in_cookie is the cookie expected to be used by another node when 
+	    communicating with us, so that A's in_cookie for B corresponds with B's 
+	    out_cookie for A. Below the function <c>in_cookie(Node)</c> returns the current
+	    node's <c>in_cookie</c> for <c>Node</c>.
+	  </p>
+	  <p>
+	    The cookies are text strings that can be viewed as passwords.
+	  </p>
+	  <p>
+	    Every message in the handshake starts with a 16 bit big endian integer 
+	    which contains the length of the message (not counting the two initial bytes).
+	    In erlang this corresponds to the <c>gen_tcp</c> option <c>{packet, 2}</c>. Note that after 
+	    the handshake, the distribution switches to 4 byte packet headers.
+	  </p>
+
+	</section>
+	<section>
+	  <title>The Handshake in Detail</title>
+	  <p>
+	    Imagine two nodes, node A, which initiates the handshake and node B, which
+	    accepts the connection.
+	  </p>
+	  <taglist>
+	    <tag>1) connect/accept</tag>
+	    <item><p>A connects to B via TCP/IP and B accepts the connection.</p></item>
+	    <tag>2) send_name/receive_name</tag>
+	    <item><p>A sends an initial identification to B. B receives the message.
+	    The message looks like this (every "square" being one byte and the packet
+	    header removed):
+	  </p>
+<pre>
++---+--------+--------+-----+-----+-----+-----+-----+-----+-...-+-----+
+|'n'|Version0|Version1|Flag0|Flag1|Flag2|Flag3|Name0|Name1| ... |NameN|
++---+--------+--------+-----+-----+-----+-----+-----+-----+-... +-----+
+</pre>
+	  <p>
+	    The 'n' is just a message tag.
+	    Version0 and Version1 is the distribution version selected by node A,
+            based on information from EPMD. (16 bit big endian)
+	    Flag0 ... Flag3 are capability flags, the capabilities defined in
+	    <c>$ERL_TOP/lib/kernel/include/dist.hrl</c>.
+            (32 bit big endian)
+	    Name0 ... NameN is the full nodename of A, as a string of bytes (the
+            packet length denotes how long it is).
+	  </p></item>
+	  <tag>3) recv_status/send_status</tag>
+	  <item><p>B sends a status message to A, which indicates
+	    if the connection is allowed. The following status codes are defined:</p>
+	    <taglist>
+	      <tag><c>ok</c></tag>
+	      <item>The handshake will continue.</item>
+	      <tag><c>ok_simultaneous</c></tag>
+	      <item>The handshake will continue, but A is informed that B
+              has another ongoing connection attempt that will be
+	      shut down (simultaneous connect where A's name is 
+	      greater than B's name, compared literally).</item>
+	      <tag><c>nok</c></tag>
+	      <item>The handshake will not continue, as B already has an ongoing handshake
+	      which it itself has initiated. (simultaneous connect where B's name is 
+	      greater than A's).</item>
+	      <tag><c>not_allowed</c></tag>
+	      <item>The connection is disallowed for some (unspecified) security 
+              reason.</item>
+	      <tag><c>alive</c></tag>
+	      <item>A connection to the node is already active, which either means
+	      that node A is confused or that the TCP connection breakdown
+	      of a previous node with this name has not yet reached node B.
+	      See 3B below.</item>
+	    </taglist>
+	    <p>This is the format of the status message:</p>
+<pre>
++---+-------+-------+-...-+-------+
+|'s'|Status0|Status1| ... |StatusN|
++---+-------+-------+-...-+-------+
+</pre>
+	    <p>
+	      's' is the message tag Status0 ... StatusN is the status as a string (not terminated)
+	    </p>
+	  </item>
+	  <tag>3B) send_status/recv_status</tag>
+	  <item><p>If status was 'alive', node A will answer with
+	  another status message containing either 'true' which means that the
+	  connection should continue (The old connection from this node is broken), or
+	  <c>'false'</c>, which simply means that the connection should be closed, the 
+	  connection attempt was a mistake.</p></item>
+	  <tag>4) recv_challenge/send_challenge</tag>
+	  <item><p>If the status was <c>ok</c> or <c>ok_simultaneous</c>,
+	  The handshake continues with B sending A another message, the challenge.
+	  The challenge contains the same type of information as the "name" message 
+	  initially sent from A to B, with the addition of a 32 bit challenge:</p>
+<pre>
++---+--------+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-...-+-----+
+|'n'|Version0|Version1|Flag0|Flag1|Flag2|Flag3|Chal0|Chal1|Chal2|Chal3|Name0|Name1| ... |NameN|
++---+--------+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-... +-----+
+</pre>
+          <p>
+	    Where Chal0 ... Chal3 is the challenge as a 32 bit big endian integer
+	    and the other fields are B's version, flags and full nodename.
+	  </p></item>
+	  <tag>5) send_challenge_reply/recv_challenge_reply</tag>
+	  <item><p>Now A has generated a digest and its own challenge. Those are
+	  sent together in a package to B:</p>
+<pre>
++---+-----+-----+-----+-----+-----+-----+-----+-----+-...-+------+
+|'r'|Chal0|Chal1|Chal2|Chal3|Dige0|Dige1|Dige2|Dige3| ... |Dige15|
++---+-----+-----+-----+-----+-----+-----+-----+-----+-...-+------+
+</pre>
+          <p>
+	    Where 'r' is the tag, Chal0 ... Chal3 is A's challenge for B to handle and
+	    Dige0 ... Dige15 is the digest that A constructed from the challenge B sent
+	    in the previous step.
+	  </p></item>
+	  <tag>6) recv_challenge_ack/send_challenge_ack</tag>
+	  <item><p>B checks that the digest received from A is correct and generates a
+	  digest from the challenge received from  A. The digest is then sent to A. The
+	  message looks like this:</p>
+<pre>
++---+-----+-----+-----+-----+-...-+------+
+|'a'|Dige0|Dige1|Dige2|Dige3| ... |Dige15|
++---+-----+-----+-----+-----+-...-+------+
+</pre>
+          <p>
+	    Where 'a' is the tag and Dige0 ... Dige15 is the digest calculated by B
+	    for A's challenge.</p></item>
+	  <tag>7)</tag>
+	  <item><p>A checks the digest from B and the connection is up.</p></item>
+	</taglist>
+	</section>
+	<section>
+	  <title>Semigraphic View</title>
+<pre>
+A (initiator)						B (acceptor)
+
+TCP connect -----------------------------------------&gt;	
+							TCP accept
+
+send_name   -----------------------------------------&gt;
+							recv_name
+
+	    &lt;----------------------------------------	send_status
+recv_status
+(if status was 'alive'
+ send_status - - - - - - - - - - - - - - - - - - - -&gt;
+							recv_status)
+							ChB = gen_challenge()
+		          (ChB)
+	    &lt;----------------------------------------	send_challenge
+recv_challenge
+
+ChA = gen_challenge(),
+OCA = out_cookie(B),
+DiA = gen_digest(ChB,OCA)
+			  (ChA, DiA)
+send_challenge_reply --------------------------------&gt;
+							recv_challenge_reply
+							ICB = in_cookie(A),
+							check:
+							DiA == gen_digest
+								(ChB, ICB) ?
+							- if OK:
+	    						 OCB = out_cookie(A),
+							 DiB = gen_digest
+			(DiB)					(ChA, OCB)
+	    &lt;-----------------------------------------	 send_challenge_ack
+recv_challenge_ack					 DONE
+ICA = in_cookie(B),                                     - else
+check:                                                   CLOSE
+DiB == gen_digest(ChA,ICA) ?
+- if OK
+ DONE
+- else
+ CLOSE
+</pre>
+	</section>
+	<marker id="dflags"/>
+	<section>
+	  <title>The Currently Defined Distribution Flags</title>
+	  <p>
+	    Currently (OTP-R16) the following capability flags are defined:
+	  </p>
+<pre>
+%% The node should be published and part of the global namespace
+-define(DFLAG_PUBLISHED,1).
+
+%% The node implements an atom cache (obsolete)
+-define(DFLAG_ATOM_CACHE,2).
+
+%% The node implements extended (3 * 32 bits) references. This is
+%% required today. If not present connection will be refused.
+-define(DFLAG_EXTENDED_REFERENCES,4).
+
+%% The node implements distributed process monitoring.
+-define(DFLAG_DIST_MONITOR,8).
+
+%% The node uses separate tag for fun's (lambdas) in the distribution protocol.
+-define(DFLAG_FUN_TAGS,16#10).
+
+%% The node implements distributed named process monitoring.
+-define(DFLAG_DIST_MONITOR_NAME,16#20).
+
+%% The (hidden) node implements atom cache (obsolete)
+-define(DFLAG_HIDDEN_ATOM_CACHE,16#40).
+
+%% The node understand new fun-tags
+-define(DFLAG_NEW_FUN_TAGS,16#80).
+
+%% The node is capable of handling extended pids and ports. This is
+%% required today. If not present connection will be refused.
+-define(DFLAG_EXTENDED_PIDS_PORTS,16#100).
+
+%%
+-define(DFLAG_EXPORT_PTR_TAG,16#200).
+
+%%
+-define(DFLAG_BIT_BINARIES,16#400).
+
+%% The node understands new float format
+-define(DFLAG_NEW_FLOATS,16#800).
+
+%%
+-define(DFLAG_UNICODE_IO,16#1000).
+
+%% The node implements atom cache in distribution header.
+-define(DFLAG_DIST_HDR_ATOM_CACHE,16#2000).
+
+%% The node understand the SMALL_ATOM_EXT tag
+-define(DFLAG_SMALL_ATOM_TAGS, 16#4000).
+
+%% The node understand UTF-8 encoded atoms
+-define(DFLAG_UTF8_ATOMS, 16#10000).
+
+</pre>
+	</section>
       </section>
 
       <section>
diff --git a/erts/doc/src/erl_ext_dist.xml b/erts/doc/src/erl_ext_dist.xml
index fd2da2cfe3..28afea8b29 100644
--- a/erts/doc/src/erl_ext_dist.xml
+++ b/erts/doc/src/erl_ext_dist.xml
@@ -119,10 +119,39 @@
 	<cell align="center">Data</cell>
       </row>
     <tcaption></tcaption></table>
+    <marker id="utf8_atoms"/>
+    <note>
+    <p>As of ERTS version 5.10 (OTP-R16) support
+    for UTF-8 encoded atoms has been introduced in the external format.
+    However, only characters that can be encoded using Latin1 (ISO-8859-1)
+    are currently supported in atoms. The support for UTF-8 encoded atoms
+    in the external format has been implemented in order to be able to support
+    all Unicode characters in atoms in <em>some future release</em>. Full
+    support for Unicode atoms will not happen before OTP-R18, and might
+    be introduced even later than that. Until full Unicode support for
+    atoms has been introduced, it is an <em>error</em> to pass atoms containing
+    characters that cannot be encoded in Latin1, and <em>the behavior is
+    undefined</em>.</p>
+    <p>When the
+    <seealso marker="erl_dist_protocol#dflags"><c>DFLAG_UTF8_ATOMS</c></seealso>
+    distribution flag has been exchanged between both nodes in the
+    <seealso marker="erl_dist_protocol#distribution_handshake">distribution handshake</seealso>,
+    all atoms in the distribution header will be encoded in UTF-8; otherwise,
+    all atoms in the distribution header will be encoded in Latin1. The two
+    new tags <seealso marker="#ATOM_UTF8_EXT">ATOM_UTF8_EXT</seealso>, and
+    <seealso marker="#SMALL_ATOM_UTF8_EXT">SMALL_ATOM_UTF8_EXT</seealso>
+    will only be used if the <c>DFLAG_UTF8_ATOMS</c> distribution flag has
+    been exchanged between nodes, or if an atom containing characters
+    that cannot be encoded in Latin1 is encountered.
+    </p>
+    <p>The maximum number of allowed characters in an atom is 255. In the
+    UTF-8 case each character may need 4 bytes to be encoded.
+    </p>
+    </note>
   </section>
 
-  <section>
-    <marker id="distribution_header"/>
+  <marker id="distribution_header"/>
+  <section>  
     <title>Distribution header</title>
     <p>
       As of erts version 5.7.2 the old atom cache protocol was
@@ -219,8 +248,7 @@
     <p>
       The least significant bit in that half byte is the <c>LongAtoms</c>
       flag. If it is set, 2 bytes are used for atom lengths instead of
-      1 byte in the distribution header. However, the current emulator
-      cannot handle long atoms, so it will currently always be 0.
+      1 byte in the distribution header.
     </p>
     <p>
       After the <c>Flags</c> field follow the <c>AtomCacheRefs</c>. The
@@ -247,15 +275,25 @@
     <p>
       <c>InternalSegmentIndex</c> together with the <c>SegmentIndex</c>
       completely identify the location of an atom cache entry in the
-      atom cache. <c>Length</c> is number of one byte characters that
-      the atom text consists of. Length is a two byte big endian integer
+      atom cache. <c>Length</c> is number of bytes that <c>AtomText</c>
+      consists of. Length is a two byte big endian integer
       if the <c>LongAtoms</c> flag has been set, otherwise a one byte
-      integer. Subsequent <c>CachedAtomRef</c>s with the same
+      integer. When the
+      <seealso marker="erl_dist_protocol#dflags"><c>DFLAG_UTF8_ATOMS</c></seealso>
+      distribution flag has been exchanged between both nodes in the
+      <seealso marker="erl_dist_protocol#distribution_handshake">distribution handshake</seealso>,
+      characters in <c>AtomText</c> is encoded in UTF-8; otherwise,
+      encoded in Latin1. Subsequent <c>CachedAtomRef</c>s with the same
       <c>SegmentIndex</c> and <c>InternalSegmentIndex</c> as this
       <c>NewAtomCacheRef</c> will refer to this atom until a new
       <c>NewAtomCacheRef</c> with the same <c>SegmentIndex</c>
       and <c>InternalSegmentIndex</c> appear.
     </p>
+    <p>
+      For more information on encoding of atoms, see
+      <seealso marker="#utf8_atoms">note on UTF-8 encoded atoms</seealso>
+      in the beginning of this document.
+    </p>
     <p>
       If the <c>NewCacheEntryFlag</c> for the next <c>AtomCacheRef</c>
       has not been set, a <c>CachedAtomRef</c> on the following format
@@ -383,9 +421,9 @@
 	<tcaption></tcaption></table>
       <p>
 	An atom is stored with a 2 byte unsigned length in big-endian order, 
-	followed by <c>Len</c> numbers of 8 bit characters that forms the 
-	<c>AtomName</c>.
-	Note: The maximum allowed value for <c>Len</c> is 255.
+	followed by <c>Len</c> numbers of 8 bit Latin1 characters that forms
+	the <c>AtomName</c>.
+	<em>Note</em>: The maximum allowed value for <c>Len</c> is 255.
       </p>
     </section>
 
@@ -754,12 +792,14 @@
 	<tcaption></tcaption></table>
       <p>
 	An atom is stored with a 1 byte unsigned length, 
-	followed by <c>Len</c> numbers of 8 bit characters that
+	followed by <c>Len</c> numbers of 8 bit Latin1 characters that
 	forms the <c>AtomName</c>. Longer atoms can be represented
 	by <seealso marker="#ATOM_EXT">ATOM_EXT</seealso>. <em>Note</em>
 	the <c>SMALL_ATOM_EXT</c> was introduced in erts version 5.7.2 and
-	require a small atom distribution flag exchanged in the distribution
-	handshake.
+	require an exchange of the
+	<seealso marker="erl_dist_protocol#dflags"><c>DFLAG_SMALL_ATOM_TAGS</c></seealso>
+	distribution flag in the
+	<seealso marker="erl_dist_protocol#distribution_handshake">distribution handshake</seealso>.
       </p>
     </section>
 
@@ -1007,7 +1047,62 @@
 	  This term is used in minor version 1 of the external format.
 	</p>
     </section>
+    <section>
+      <marker id="ATOM_UTF8_EXT"/>
+      <title>ATOM_UTF8_EXT</title>
+
+	<table align="left">
+	  <row>
+	    <cell align="center">1</cell>
+	    <cell align="center">2</cell>
+	    <cell align="center">Len</cell>
+	  </row>
+	  <row>
+	    <cell align="center"><c>118</c></cell>
+	    <cell align="center"><c>Len</c></cell>
+	    <cell align="center"><c>AtomName</c></cell>
+	  </row>
+	<tcaption></tcaption></table>
+      <p>
+	An atom is stored with a 2 byte unsigned length in big-endian order,
+	followed by <c>Len</c> bytes containing the <c>AtomName</c> encoded
+	in UTF-8.
+      </p>
+      <p>
+	For more information on encoding of atoms, see
+	<seealso marker="#utf8_atoms">note on UTF-8 encoded atoms</seealso>
+	in the beginning of this document.
+      </p>
+    </section>
 
+    <section>
+      <marker id="SMALL_ATOM_UTF8_EXT"/>
+      <title>SMALL_ATOM_UTF8_EXT</title>
+
+	<table align="left">
+	  <row>
+	    <cell align="center">1</cell>
+	    <cell align="center">1</cell>
+	    <cell align="center">Len</cell>
+	  </row>
+	  <row>
+	    <cell align="center"><c>119</c></cell>
+	    <cell align="center"><c>Len</c></cell>
+	    <cell align="center"><c>AtomName</c></cell>
+	  </row>
+	<tcaption></tcaption></table>
+      <p>
+	An atom is stored with a 1 byte unsigned length, 
+	followed by <c>Len</c> bytes containing the <c>AtomName</c> encoded
+	in UTF-8. Longer atoms encoded in UTF-8 can be represented using
+	<seealso marker="#ATOM_UTF8_EXT">ATOM_UTF8_EXT</seealso>.
+      </p>
+      <p>
+	For more information on encoding of atoms, see
+	<seealso marker="#utf8_atoms">note on UTF-8 encoded atoms</seealso>
+	in the beginning of this document.
+      </p>
+    </section>
 
   </chapter>
   
diff --git a/erts/doc/src/erlang.xml b/erts/doc/src/erlang.xml
index 5002c48ca1..0077c0096c 100644
--- a/erts/doc/src/erlang.xml
+++ b/erts/doc/src/erlang.xml
@@ -277,7 +277,9 @@
 	the binary contains Unicode characters greater than 16#FF.
 	In a future release, such Unicode characters might be allowed
 	and <c>binary_to_atom(<anno>Binary</anno>, utf8)</c>
-	will not fail in that case.</p></note>
+	will not fail in that case. For more information on Unicode support in atoms
+	see <seealso marker="erl_ext_dist#utf8_atoms">note on UTF-8 encoded atoms</seealso>
+	in the chapter about the external term format in the ERTS User's Guide.</p></note>
 
         <pre>
 > <input>binary_to_atom(&lt;&lt;"Erlang"&gt;&gt;, latin1).</input>
@@ -1647,9 +1649,11 @@ os_prompt% </pre>
       <desc>
         <p>Returns the atom whose text representation is <c><anno>String</anno></c>.</p>
 	<p><c><anno>String</anno></c> may only contain ISO-latin-1
-	characterns (i.e. numbers below 256) as the current
+	characters (i.e. numbers below 256) as the current
 	implementation does not allow unicode characters >= 256 in
-	atoms.</p>
+	atoms. For more information on Unicode support in atoms
+	see <seealso marker="erl_ext_dist#utf8_atoms">note on UTF-8 encoded atoms</seealso>
+	in the chapter about the external term format in the ERTS User's Guide.</p>
         <pre>
 > <input>list_to_atom("Erlang").</input>
 'Erlang'</pre>
diff --git a/lib/kernel/internal_doc/distribution_handshake.txt b/lib/kernel/internal_doc/distribution_handshake.txt
index 6a3ee22ed3..d00c4ceb02 100644
--- a/lib/kernel/internal_doc/distribution_handshake.txt
+++ b/lib/kernel/internal_doc/distribution_handshake.txt
@@ -1,215 +1 @@
-HOW THE DISTRIBUTION HANDSHAKE WORKS
-------------------------------------
-
-This document describes the distribution handshake introduced in 
-the R6 release of Erlang/OTP.
-
-GENERAL
--------
-
-The TCP/IP distribution uses a handshake which expects a
-connection based protocol, i.e. the protocol does not include
-any authentication after the handshake procedure.
-
-This is not entirely safe, as it is vulnerable against takeover
-attacks, but it is a tradeoff between fair safety and performance.
-
-The cookies are never sent in cleartext and the handshake procedure
-expects the client (called A) to be the first one to prove that it can 
-generate a sufficient digest. The digest is generated with the 
-MD5 message digest algorithm and the challenges are expected to be very
-random numbers.
-
-DEFINITIONS
------------
-
-A challenge is a 32 bit integer number in big endian order. Below the function
-gen_challenge() returns a random 32 bit integer used as a challenge.
-
-A digest is a (16 bytes) MD5 hash of [the Challenge (as text) concatenated
-with the cookie (as text)]. Below, the function gen_digest(Challenge, Cookie)
-generates a digest as described above.
-
-An out_cookie is the cookie used in outgoing communication to a certain node,
-so that A's out_cookie for B should correspond with B's in_cookie for A and 
-the other way around. A's out_cookie for B and A's in_cookie for B need *NOT*
-be the same. Below the function out_cookie(Node) returns the current
-node's out_cookie for Node.
-
-An in_cookie is the cookie expected to be used by another node when 
-communicating with us, so that A's in_cookie for B corresponds with B's 
-out_cookie for A. Below the function in_cookie(Node) returns the current
-node's in_cookie for Node.
-
-The cookies are text strings that can be viewed as passwords.
-
-Every message in the handshake starts with a 16 bit big endian integer 
-which contains the length of the message (not counting the two initial bytes).
-In erlang this corresponds to the gen_tcp option {packet, 2}. Note that after 
-the handshake, the distribution switches to 4 byte packet headers.
-
-THE HANDSHAKE IN DETAIL
------------------------
- 
-Imagine two nodes, node A, which initiates the handshake and node B, which
-accepts the connection.
-
-1) connect/accept: A connects to B via TCP/IP and B accepts the connection.
-
-2) send_name/receive_name: A sends an initial identification to B. 
-B receives the message. The message looks
-like this (every "square" being one byte and the packet header removed):
-
-+---+--------+--------+-----+-----+-----+-----+-----+-----+-...-+-----+
-|'n'|Version0|Version1|Flag0|Flag1|Flag2|Flag3|Name0|Name1| ... |NameN|
-+---+--------+--------+-----+-----+-----+-----+-----+-----+-... +-----+
-
-The 'n' is just a message tag,
-Version0 & Version1 is the distribution version selected by node A,
-                    based on information from EPMD. (16 bit big endian)
-Flag0 ... Flag3 are capability flags, the capabilities defined in dist.hrl.
-                (32 bit big endian)
-Name0 ... NameN is the full nodename of A, as a string of bytes (the
-                packet length denotes how long it is).
-
-3) recv_status/send_status: B sends a status message to A, which indicates
-if the connection is allowed. Four different status codes are defined:
-ok: The handshake will continue.
-ok_simultaneous: The handshake will continue, but A is informed that B
-                 has another ongoing connection attempt that will be
-		 shut down (simultaneous connect where A's name is 
-		 greater than B's name, compared literally),
-nok: The handshake will not continue, as B already has an ongoing handshake
-     which it itself has initiated. (simultaneous connect where B's name is 
-     greater than A's)
-not_allowed: The connection is disallowed for some (unspecified) security 
-             reason.
-alive: A connection to the node is already active, which either means
-       that node A is confused or that the TCP connection breakdown
-       of a previous node with this name has not yet reached node B.
-       See 3B below.
-
-This is the format of the status message:
-
-+---+-------+-------+-...-+-------+
-|'s'|Status0|Status1| ... |StatusN|
-+---+-------+-------+-...-+-------+
-
-'s' is the message tag
-Status0 ... StatusN is the status as a string (not terminated)
-
-3B) send_status/recv_status: If status was 'alive', node A will answer with
-another status message containing either 'true' which means that the
-connection should continue (The old connection from this node is broken), or
-'false', which simply means that the connection should be closed, the 
-connection attempt was a mistake.
-
-4) recv_challenge/send_challenge: If the status was 'ok' or 'ok_simultaneous',
-The handshake continues with B sending A another message, the challenge.
-The challenge contains the same type of information as the "name" message 
-initially sent from A to B, with the addition of a 32 bit challenge:
-
-+---+--------+--------+-----+-----+-----+-----+-----+-----+-----+-----+---
-|'n'|Version0|Version1|Flag0|Flag1|Flag2|Flag3|Chal0|Chal1|Chal2|Chal3|
-+---+--------+--------+-----+-----+-----+-----+-----+-----+-----+-----+---
-   ------+-----+-...-+-----+
-    Name0|Name1| ... |NameN|
-   ------+-----+-... +-----+
-
-Where Chal0 ... Chal3 is the challenge as a 32 bit big endian integer
-and the other fields are B's version, flags and full nodename.
-
-5) send_challenge_reply/recv_challenge_reply: Now A has generated
-a digest and its own challenge. Those are sent together in a package
-to B:
-
-+---+-----+-----+-----+-----+-----+-----+-----+-----+-...-+------+
-|'r'|Chal0|Chal1|Chal2|Chal3|Dige0|Dige1|Dige2|Dige3| ... |Dige15|
-+---+-----+-----+-----+-----+-----+-----+-----+-----+-...-+------+
-
-Where 'r' is the tag, Chal0 ... Chal3 is A's challenge for B to handle and
-Dige0 ... Dige15 is the digest that A constructed from the challenge B sent
-in the previous step.
-
-6) recv_challenge_ack/send_challenge_ack: B checks that the digest received
-from A is correct and generates a digest from the challenge received from 
-A. The digest is then sent to A. The message looks like this:
-
-+---+-----+-----+-----+-----+-...-+------+
-|'a'|Dige0|Dige1|Dige2|Dige3| ... |Dige15|
-+---+-----+-----+-----+-----+-...-+------+
-
-Where 'a' is the tag and Dige0 ... Dige15 is the digest calculated by B
-for A's challenge.
-
-7) A checks the digest from B and the connection is up.
-
-SEMIGRAPHIC VIEW
-----------------
-
-A (initiator)						B (acceptor)
-
-TCP connect ----------------------------------------->	
-							TCP accept
-
-send_name   ----------------------------------------->	
-							recv_name
-
-	    <----------------------------------------	send_status
-recv_status
-(if status was 'alive'
- send_status - - - - - - - - - - - - - - - - - - - ->   
-							recv_status)
-							ChB = gen_challenge()
-		          (ChB)
-	    <----------------------------------------	send_challenge
-recv_challenge
-
-ChA = gen_challenge(),
-OCA = out_cookie(B),
-DiA = gen_digest(ChB,OCA)
-			  (ChA, DiA)
-send_challenge_reply -------------------------------->
-							recv_challenge_reply
-							ICB = in_cookie(A),
-							check:
-							DiA == gen_digest
-								(ChB, ICB) ?
-							- if OK:
-	    						 OCB = out_cookie(A),
-							 DiB = gen_digest
-			(DiB)					(ChA, OCB)
-	    <-----------------------------------------	 send_challenge_ack
-recv_challenge_ack					 DONE
-ICA = in_cookie(B),                                     - else
-check:                                                   CLOSE
-DiB == gen_digest(ChA,ICA) ?
-- if OK
- DONE
-- else
- CLOSE
-
-
-THE CURRENTLY DEFINED FLAGS
----------------------------
-Currently the following capability flags are defined:
-
-%% The node should be published and part of the global namespace
--define(DFLAG_PUBLISHED,1).
-
-%% The node implements an atom cache
--define(DFLAG_ATOM_CACHE,2).
-
-%% The node implements extended (3 * 32 bits) references
--define(DFLAG_EXTENDED_REFERENCES,4).
-
-%% The node implements distributed process monitoring.
--define(DFLAG_DIST_MONITOR,8).
- 
-%% The node uses separate tag for fun's (lambdas) in the distribution protocol.
--define(DFLAG_FUN_TAGS,16).
- 
-An R6 erlang node implements all of the above, while a C or Java node only
-implements DFLAG_EXTENDED_REFERENCES.
-
-Last modified 1999-11-08 -- Patrik Nyblom, OTP
+This information has been moved to the "Distribution Protocol" chapter of "ERTS User's Guide".
-- 
cgit v1.2.3


From 6347bc07dac9f958b4ac3cb751dabae08d350b8b Mon Sep 17 00:00:00 2001
From: Rickard Green <rickard@erlang.org>
Date: Sat, 19 Jan 2013 00:46:21 +0100
Subject: Fix merge conflict with hasse

---
 lib/stdlib/src/io_lib.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/stdlib/src/io_lib.erl b/lib/stdlib/src/io_lib.erl
index 0f1f417d01..513d904c39 100644
--- a/lib/stdlib/src/io_lib.erl
+++ b/lib/stdlib/src/io_lib.erl
@@ -276,7 +276,7 @@ write_atom(Atom) ->
     Chars = atom_to_list(Atom),
     case quote_atom(Atom, Chars) of
 	true ->
-	    write_unicode_string(Chars, $');   %'
+	    write_string(Chars, $');   %'
 	false ->
 	    Chars
     end.
-- 
cgit v1.2.3


From 5ee4c7136ce8f311e3d3384ae0feb29bcbff6e85 Mon Sep 17 00:00:00 2001
From: Rickard Green <rickard@erlang.org>
Date: Mon, 21 Jan 2013 15:44:33 +0100
Subject: workaround...

---
 erts/emulator/test/distribution_SUITE.erl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/erts/emulator/test/distribution_SUITE.erl b/erts/emulator/test/distribution_SUITE.erl
index e7bd8d19c3..7edd92d56f 100644
--- a/erts/emulator/test/distribution_SUITE.erl
+++ b/erts/emulator/test/distribution_SUITE.erl
@@ -1240,9 +1240,9 @@ contended_atom_cache_entry_test(Config, Type) ->
 					get_conflicting_unicode_atoms(CIX,
 								      ProcessPairs)
 				end,
-		    io:format("Testing with the following atoms all using "
-			      "cache index ~p:~n ~p~n",
-			      [CIX, TestAtoms]),
+%		    io:format("Testing with the following atoms all using "
+%			      "cache index ~p:~n ~p~n",
+%			      [CIX, TestAtoms]),
 		    Ps = lists:map(
 			   fun (A) ->
 				   Ref = make_ref(),
-- 
cgit v1.2.3


From 3aa60cc472bc330dbe9360eb27a1f340b7e23dc6 Mon Sep 17 00:00:00 2001
From: Rickard Green <rickard@erlang.org>
Date: Tue, 22 Jan 2013 18:35:35 +0100
Subject: Add UTF-8 node name support for epmd

---
 erts/epmd/src/epmd_cli.c      |  13 ++-
 erts/epmd/src/epmd_int.h      |  20 +++-
 erts/epmd/src/epmd_srv.c      | 215 ++++++++++++++++++++++++++++++++++++------
 erts/epmd/test/epmd_SUITE.erl |  47 ++++++++-
 4 files changed, 254 insertions(+), 41 deletions(-)

diff --git a/erts/epmd/src/epmd_cli.c b/erts/epmd/src/epmd_cli.c
index 74408e3ebe..1d4de64b63 100644
--- a/erts/epmd/src/epmd_cli.c
+++ b/erts/epmd/src/epmd_cli.c
@@ -22,6 +22,7 @@
 #endif
 #include "epmd.h"     /* Renamed from 'epmd_r4.h' */
 #include "epmd_int.h"
+#include "erl_printf.h" /* erts_snprintf */
 
 /* forward declarations */
 
@@ -114,16 +115,18 @@ void epmd_call(EpmdVars *g,int what)
 	epmd_cleanup_exit(g,1);
     }
     j = ntohl(i);
-    if (!g->silent)
-	printf("epmd: up and running on port %d with data:\n", j);
+    if (!g->silent) {
+	rval = erts_snprintf(buf, OUTBUF_SIZE,
+			     "epmd: up and running on port %d with data:\n", j);
+	write(1, buf, rval);
+    }
     while(1) {
-	if ((rval = read(fd,buf,1)) <= 0)  {
+	if ((rval = read(fd,buf,OUTBUF_SIZE)) <= 0)  {
 	    close(fd);
 	    epmd_cleanup_exit(g,0);
 	}
-	buf[rval] = '\0';
 	if (!g->silent)
-	    printf("%s",buf);
+	    write(1, buf, rval); /* Potentially UTF-8 encoded */
     }
 }
 
diff --git a/erts/epmd/src/epmd_int.h b/erts/epmd/src/epmd_int.h
index 14d05c3f19..b25412c905 100644
--- a/erts/epmd/src/epmd_int.h
+++ b/erts/epmd/src/epmd_int.h
@@ -226,13 +226,25 @@
 #define MAX_UNREG_COUNT 1000
 #define DEBUG_MAX_UNREG_COUNT 5
 
-/* Maximum length of a node name == atom name */
-#define MAXSYMLEN 255
+/*
+ * Maximum length of a node name == atom name
+ *   255 characters; UTF-8 encoded -> max 255*4
+ */
+#define MAXSYMLEN (255*4)
 
 #define MAX_LISTEN_SOCKETS 16
 
-#define INBUF_SIZE 1024
-#define OUTBUF_SIZE 1024
+/*
+ * Largest request: ALIVE2_REQ
+ *  2 + 13 + 2*MAXSYMLEN
+ * Largest response: PORT2_RESP
+ *  2 + 14 + 2*MAXSYMLEN
+ *
+ * That is, 3*MAXSYMLEN should be large enough
+ */
+
+#define INBUF_SIZE (3*MAXSYMLEN)
+#define OUTBUF_SIZE (3*MAXSYMLEN)
 
 #define get_int16(s) ((((unsigned char*)  (s))[0] << 8) | \
                       (((unsigned char*)  (s))[1]))
diff --git a/erts/epmd/src/epmd_srv.c b/erts/epmd/src/epmd_srv.c
index 36565b7438..2a74c4955e 100644
--- a/erts/epmd/src/epmd_srv.c
+++ b/erts/epmd/src/epmd_srv.c
@@ -73,7 +73,7 @@ static int conn_open(EpmdVars*,int);
 static int conn_close_fd(EpmdVars*,int);
 
 static void node_init(EpmdVars*);
-static Node *node_reg2(EpmdVars*,char*, int, int, unsigned char, unsigned char, int, int, int, char*);
+static Node *node_reg2(EpmdVars*, int, char*, int, int, unsigned char, unsigned char, int, int, int, char*);
 static int node_unreg(EpmdVars*,char*);
 static int node_unreg_sock(EpmdVars*,int);
 
@@ -81,6 +81,113 @@ static int reply(EpmdVars*,int,char *,int);
 static void dbg_print_buf(EpmdVars*,char *,int);
 static void print_names(EpmdVars*);
 
+static int is_same_str(char *x, char *y)
+{
+    int i = 0;
+    /*
+     * Using strcmp() == 0 is probably ok, but just to be sure,
+     * since we got UTF-8 strings, we do it ourselves.
+     *
+     * We assume null-terminated correctly encoded UTF-8.
+     */
+    while (x[i] == y[i]) {
+	if (x[i] == '\0')
+	    return 1;
+	i++;
+    }
+    return 0;
+}
+
+static int copy_str(char *x, char *y)
+{
+    int i = 0;
+    /*
+     * Using strcpy() is probably ok, but just to be sure,
+     * since we got UTF-8 strings, we do it ourselves.
+     *
+     * We assume null-terminated correctly encoded UTF-8.
+     */
+    while (1) {
+	x[i] = y[i];
+	if (y[i] == '\0')
+	    return i;
+	i++;
+    }
+}
+
+static int length_str(char *x)
+{
+    int i = 0;
+    /*
+     * Using strlen is probably ok, but just to be sure,
+     * since we got UTF-8 strings, we do it ourselves.
+     *
+     * We assume null-terminated correctly encoded UTF-8.
+     */
+    while (x[i])
+	i++;
+    return i;
+}
+
+static int verify_utf8(const char *src, int sz, int null_term)
+{
+    unsigned char *source = (unsigned char *) src;
+    int size = sz;
+    int num_chars = 0;
+    while (size) {
+	if (null_term && (*source) == 0)
+	    return num_chars;
+	if (((*source) & ((unsigned char) 0x80)) == 0) {
+	    source++;
+	    --size;
+	} else if (((*source) & ((unsigned char) 0xE0)) == 0xC0) {
+	    if (size < 2)
+		return -1;
+	    if (((source[1] & ((unsigned char) 0xC0)) != 0x80) ||
+		((*source) < 0xC2) /* overlong */) {
+		return -1;
+	    }
+	    source += 2;
+	    size -= 2;
+	} else if (((*source) & ((unsigned char) 0xF0)) == 0xE0) {
+	    if (size < 3)
+		return -1;
+	    if (((source[1] & ((unsigned char) 0xC0)) != 0x80) ||
+		((source[2] & ((unsigned char) 0xC0)) != 0x80) ||
+		(((*source) == 0xE0) && (source[1] < 0xA0)) /* overlong */ ) {
+		return -1;
+	    }
+	    if ((((*source) & ((unsigned char) 0xF)) == 0xD) && 
+		((source[1] & 0x20) != 0)) {
+		return -1;
+	    }
+	    source += 3;
+	    size -= 3;
+	} else if (((*source) & ((unsigned char) 0xF8)) == 0xF0) {
+	    if (size < 4)
+		return -1;
+	    if (((source[1] & ((unsigned char) 0xC0)) != 0x80) ||
+		((source[2] & ((unsigned char) 0xC0)) != 0x80) ||
+		((source[3] & ((unsigned char) 0xC0)) != 0x80) ||
+		(((*source) == 0xF0) && (source[1] < 0x90)) /* overlong */) {
+		return -1;
+	    }
+	    if ((((*source) & ((unsigned char)0x7)) > 0x4U) ||
+		((((*source) & ((unsigned char)0x7)) == 0x4U) && 
+		 ((source[1] & ((unsigned char)0x3F)) > 0xFU))) {
+		return -1;
+	    }
+	    source += 4;
+	    size -= 4; 
+	} else {
+	    return -1;
+	}
+	++num_chars;
+    }
+    return num_chars;
+}
+
+
 static EPMD_INLINE void select_fd_set(EpmdVars* g, int fd)
 {
     FD_SET(fd, &g->orig_read_mask);
@@ -525,10 +632,11 @@ static void do_request(g, fd, s, buf, bsize)
 	    }
 	name = &buf[11];
 	name[namelen]='\000';
+
 	extra = &buf[11+namelen+2];
 	extra[extralen]='\000';
 	wbuf[0] = EPMD_ALIVE2_RESP;
-	if ((node = node_reg2(g, name, fd, eport, nodetype, protocol,
+	if ((node = node_reg2(g, namelen, name, fd, eport, nodetype, protocol,
 			      highvsn, lowvsn, extralen, extra)) == NULL) {
 	    wbuf[1] = 1; /* error */
 	    put_int16(99, wbuf+2);
@@ -573,22 +681,28 @@ static void do_request(g, fd, s, buf, bsize)
 
       {
 	char *name = &buf[1]; /* Points to node name */
+	int nsz;
 	Node *node;
-	
+
+	nsz = verify_utf8(name, bsize, 0);
+	if (nsz < 1 || 255 < nsz) {
+	    dbg_printf(g,0,"invalid node name in PORT2_REQ");
+	    return;
+	}
+
 	wbuf[0] = EPMD_PORT2_RESP;
 	for (node = g->nodes.reg; node; node = node->next) {
 	    int offset;
-	    if (strcmp(node->symname, name) == 0) {
+	    if (is_same_str(node->symname, name)) {
 		wbuf[1] = 0; /* ok */
 		put_int16(node->port,wbuf+2);
 		wbuf[4] = node->nodetype;
 		wbuf[5] = node->protocol;
 		put_int16(node->highvsn,wbuf+6);
 		put_int16(node->lowvsn,wbuf+8);
-		put_int16(strlen(node->symname),wbuf+10);
+		put_int16(length_str(node->symname),wbuf+10);
 		offset = 12;
-		strcpy(wbuf + offset,node->symname);
-		offset += strlen(node->symname);
+		offset += copy_str(wbuf + offset,node->symname);
 		put_int16(node->extralen,wbuf + offset);
 		offset += 2;
 		memcpy(wbuf + offset,node->extra,node->extralen);
@@ -629,15 +743,22 @@ static void do_request(g, fd, s, buf, bsize)
 
 	for (node = g->nodes.reg; node; node = node->next)
 	  {
-	    int len;
+	    int len = 0;
+	    int r;
 
 	    /* CAREFUL!!! These are parsed by "erl_epmd.erl" so a slight
 	       change in syntax will break < OTP R3A */
 
-	    erts_snprintf(wbuf, sizeof(wbuf), "name %s at port %d\n",node->symname, node->port);
-	    len = strlen(wbuf);
+	    len += copy_str(&wbuf[len], "name ");
+	    len += copy_str(&wbuf[len], node->symname);
+	    r = erts_snprintf(&wbuf[len], sizeof(wbuf)-len,
+			      " at port %d\n", node->port);
+	    if (r < 0)
+		goto failed_names_resp;
+	    len += r;
 	    if (reply(g, fd, wbuf, len) != len)
 	      {
+	      failed_names_resp:
 		dbg_tty_printf(g,1,"failed to send NAMES_RESP");
 		return;
 	      }
@@ -665,16 +786,22 @@ static void do_request(g, fd, s, buf, bsize)
 
 	for (node = g->nodes.reg; node; node = node->next)
 	  {
-	    int len;
+	      int len = 0, r;
 
 	    /* CAREFUL!!! These are parsed by "erl_epmd.erl" so a slight
 	       change in syntax will break < OTP R3A */
 
-	    erts_snprintf(wbuf, sizeof(wbuf), "active name     <%s> at port %d, fd = %d\n",
-		    node->symname, node->port, node->fd);
-	    len = strlen(wbuf) + 1;
-	    if (reply(g, fd,wbuf,len) != len)
+	      len += copy_str(&wbuf[len], "active name     <");
+	      len += copy_str(&wbuf[len], node->symname);
+	      r = erts_snprintf(&wbuf[len], sizeof(wbuf)-len,
+				"> at port %d, fd = %d\n",
+				node->port, node->fd);
+	      if (r < 0)
+		  goto failed_dump_resp;
+	      len += r + 1;
+	      if (reply(g, fd,wbuf,len) != len)
 	      {
+	      failed_dump_resp:
 		dbg_tty_printf(g,1,"failed to send DUMP_RESP");
 		return;
 	      }
@@ -682,16 +809,22 @@ static void do_request(g, fd, s, buf, bsize)
 
 	for (node = g->nodes.unreg; node; node = node->next)
 	  {
-	    int len;
+	      int len = 0, r;
 
 	    /* CAREFUL!!! These are parsed by "erl_epmd.erl" so a slight
 	       change in syntax will break < OTP R3A */
 
-	    erts_snprintf(wbuf, sizeof(wbuf), "old/unused name <%s>, port = %d, fd = %d \n",
-		    node->symname,node->port, node->fd);
-	    len = strlen(wbuf) + 1;
-	    if (reply(g, fd,wbuf,len) != len)
+	      len += copy_str(&wbuf[len], "old/unused name <");
+	      len += copy_str(&wbuf[len], node->symname);
+	      r = erts_snprintf(&wbuf[len], sizeof(wbuf)-len,
+				">, port = %d, fd = %d \n",
+				node->port, node->fd);
+	      if (r < 0)
+		  goto failed_dump_resp2;
+	      len += r + 1;
+	      if (reply(g, fd,wbuf,len) != len)
 	      {
+	      failed_dump_resp2:
 		dbg_tty_printf(g,1,"failed to send DUMP_RESP");
 		return;
 	      }
@@ -933,7 +1066,7 @@ static int node_unreg(EpmdVars *g,char *name)
   Node *node  = g->nodes.reg;	/* Point to first node */
 
   for (; node; prev = &node->next, node = node->next)
-    if (strcmp(node->symname, name) == 0)
+    if (is_same_str(node->symname, name))
       {
 	dbg_tty_printf(g,1,"unregistering '%s:%d', port %d",
 		       node->symname, node->creation, node->port);
@@ -1013,6 +1146,7 @@ static int node_unreg_sock(EpmdVars *g,int fd)
  */
 
 static Node *node_reg2(EpmdVars *g,
+		       int namelen,
 		       char* name,
 		       int fd,
 		       int port,
@@ -1025,6 +1159,7 @@ static Node *node_reg2(EpmdVars *g,
 {
   Node *prev;			/* Point to previous node or NULL */
   Node *node;			/* Point to first node */
+  int sz;
 
   /* Can be NULL; means old style */
   if (extra == NULL)
@@ -1032,21 +1167,47 @@ static Node *node_reg2(EpmdVars *g,
 
   /* Fail if node name is too long */
 
-  if (strlen(name) > MAXSYMLEN)
+
+  if (namelen > MAXSYMLEN)
     {
-      dbg_printf(g,0,"node name is too long (%d) %s", strlen(name), name);
+    too_long_name:
+      dbg_printf(g,0,"node name is too long (%d) %s", namelen, name);
       return NULL;
     }
+
+  sz = verify_utf8(name, namelen, 0);
+  if (sz > 255)
+      goto too_long_name;
+
+  if (sz < 0) {
+      dbg_printf(g,0,"invalid node name encoding");
+      return NULL;
+  }
+
   if (extralen > MAXSYMLEN)
     {
-      dbg_printf(g,0,"extra data is too long (%d) %s", strlen(name), name);
+#if 0
+    too_long_extra:
+#endif
+      dbg_printf(g,0,"extra data is too long (%d) %s", extralen, extra);
       return NULL;
     }
 
+#if 0 /* Should we require valid utf8 here? */
+  sz = verify_utf8(extra, extralen, 0);
+  if (sz > 255)
+      goto too_long_extra;
+
+  if (sz < 0) {
+      dbg_printf(g,0,"invalid extra data encoding");
+      return NULL;
+  }
+#endif
+
   /* Fail if it is already registered */
 
   for (node = g->nodes.reg; node; node = node->next)
-    if (strcmp(node->symname, name) == 0)
+    if (is_same_str(node->symname, name))
       {
 	dbg_printf(g,0,"node name already occupied %s", name);
 	return NULL;
@@ -1058,7 +1219,7 @@ static Node *node_reg2(EpmdVars *g,
   prev = NULL;
 
   for (node = g->nodes.unreg; node; prev = node, node = node->next)
-    if (strcmp(node->symname, name) == 0)
+    if (is_same_str(node->symname, name))
       {
 	dbg_tty_printf(g,1,"reusing slot with same name '%s'", node->symname);
 
@@ -1126,7 +1287,7 @@ static Node *node_reg2(EpmdVars *g,
   node->lowvsn   = lowvsn;
   node->extralen = extralen;
   memcpy(node->extra,extra,extralen);
-  strcpy(node->symname,name);
+  copy_str(node->symname,name);
   select_fd_set(g, fd);
 
   if (highvsn == 0) {
diff --git a/erts/epmd/test/epmd_SUITE.erl b/erts/epmd/test/epmd_SUITE.erl
index fd9969ae2b..fc0abef400 100644
--- a/erts/epmd/test/epmd_SUITE.erl
+++ b/erts/epmd/test/epmd_SUITE.erl
@@ -45,6 +45,8 @@
     register_names_1/1,
     register_names_2/1,
     register_duplicate_name/1,
+    unicode_name/1,
+    long_unicode_name/1,
     get_port_nr/1,
     slow_get_port_nr/1,
     unregister_others_name_1/1,
@@ -107,7 +109,8 @@ suite() -> [{ct_hooks,[ts_install_cth]}].
 
 all() -> 
     [register_name, register_names_1, register_names_2,
-     register_duplicate_name, get_port_nr, slow_get_port_nr,
+     register_duplicate_name, unicode_name, long_unicode_name,
+     get_port_nr, slow_get_port_nr,
      unregister_others_name_1, unregister_others_name_2,
      register_overflow, name_with_null_inside,
      name_null_terminated, stupid_names_req, no_data,
@@ -197,6 +200,37 @@ register_duplicate_name(Config) when is_list(Config) ->
     ?line ok = close(Sock),			% Unregister
     ok.
 
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+unicode_name(doc) ->
+    ["Check that we can register and lookup a unicode name"];
+unicode_name(suite) ->
+    [];
+unicode_name(Config) when is_list(Config) ->
+    ok = epmdrun(),
+    NodeName = [16#1f608],
+    {ok,Sock} = register_node_v2(4711, 72, 0, 5, 5, NodeName, []),
+    {ok,NodeInfo} = port_please_v2(NodeName),
+    NodeName = NodeInfo#node_info.node_name,
+    ok = close(Sock),
+    ok.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+long_unicode_name(doc) ->
+    ["Check that we can register and lookup a long unicode name"];
+long_unicode_name(suite) ->
+    [];
+long_unicode_name(Config) when is_list(Config) ->
+    ok = epmdrun(),
+    BaseChar = 16#1f600,
+    NodeName = lists:seq(BaseChar, BaseChar+200), % will be 800 bytes long
+    {ok,Sock} = register_node_v2(4711, 72, 0, 5, 5, NodeName, []),
+    {ok,NodeInfo} = port_please_v2(NodeName),
+    NodeName = NodeInfo#node_info.node_name,
+    ok = close(Sock),
+    ok.
+
 % Internal function to register a node name, no close, i.e. unregister
 
 register_node(Name) ->
@@ -205,9 +239,10 @@ register_node(Name,Port) ->
     register_node_v2(Port,$M,0,5,5,Name,"").
 
 register_node_v2(Port, NodeType, Prot, HVsn, LVsn, Name, Extra) ->
+    Utf8Name = unicode:characters_to_binary(Name),
     Req = [?EPMD_ALIVE2_REQ, put16(Port), NodeType, Prot,
 	   put16(HVsn), put16(LVsn),
-	   size16(Name), Name,
+	   put16(size(Utf8Name)), binary_to_list(Utf8Name),
 	   size16(Extra), Extra],
     case send_req(Req) of
 	{ok,Sock} ->
@@ -226,7 +261,8 @@ register_node_v2(Port, NodeType, Prot, HVsn, LVsn, Name, Extra) ->
 % Internal function to fetch information about a node
 
 port_please_v2(Name) ->
-    case send_req([?EPMD_PORT_PLEASE2_REQ, Name]) of
+    case send_req([?EPMD_PORT_PLEASE2_REQ,
+		   binary_to_list(unicode:characters_to_binary(Name))]) of
 	{ok,Sock} ->
 	    case recv_until_sock_closes(Sock) of
 		{ok, Resp} ->
@@ -247,7 +283,7 @@ parse_port2_resp(Resp) ->
 	  ELen:16,Extra:ELen/binary>> when Res =:= 0 ->
 	    {ok, #node_info{port=Port,node_type=NodeType,prot=Prot,
 			    hvsn=HVsn,lvsn=LVsn,
-			    node_name=binary_to_list(NodeName),
+			    node_name=unicode:characters_to_list(NodeName),
 			    extra=binary_to_list(Extra)}};
 	_Other ->
 	    test_server:format("invalid port2 resp: ~p~n",
@@ -737,7 +773,7 @@ buffer_overrun_2(doc) ->
     ["Test security vulnerability in fake extra lengths in alive2_req"];
 buffer_overrun_2(Config) when is_list(Config) ->
     ?line ok = epmdrun(),
-    ?line [false | Rest] = [hostile2(N) || N <- lists:seq(255,10000)],
+    ?line [false | Rest] = [hostile2(N) || N <- lists:seq(255*4,10000)],
     ?line true = alltrue(Rest),
     ok.
 hostile(N) ->
@@ -880,6 +916,7 @@ no_live_killing(Config) when is_list(Config) ->
     ?line close(Sock3),
     ok.
 
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % Terminate all tests with killing epmd.
 
-- 
cgit v1.2.3


From 8eb544073fe243a8935a54f83f9c9f1f7478e3c5 Mon Sep 17 00:00:00 2001
From: Sverker Eriksson <sverker@erlang.org>
Date: Tue, 22 Jan 2013 17:20:17 +0100
Subject: erts: Fix bug in analyze_utf8 causing faulty latin1 detection

---
 erts/emulator/beam/erl_unicode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
index c00293de89..883405d066 100644
--- a/erts/emulator/beam/erl_unicode.c
+++ b/erts/emulator/beam/erl_unicode.c
@@ -1180,13 +1180,13 @@ analyze_utf8(byte *source, Uint size, byte **err_pos, Uint *num_chars, int *left
 		((*source) < 0xC2) /* overlong */) {
 		return ERTS_UTF8_ERROR;
 	    }
-	    source += 2;
-	    size -= 2;
 	    if (num_latin1_chars) {
 		latin1_count++;
 		if ((source[0] & ((byte) 0xFC)) != ((byte) 0xC0))
 		    is_latin1 = 0;
 	    }
+	    source += 2;
+	    size -= 2;
 	} else if (((*source) & ((byte) 0xF0)) == 0xE0) {
 	    if (size < 3) {
 		return ERTS_UTF8_INCOMPLETE;
-- 
cgit v1.2.3


From 1f4765cca4874fa92fcfad888fbe6d5f2fbf74d1 Mon Sep 17 00:00:00 2001
From: Sverker Eriksson <sverker@erlang.org>
Date: Tue, 22 Jan 2013 19:25:36 +0100
Subject: erl_interface: even more utf8 atom stuff

---
 lib/erl_interface/doc/src/ei.xml                   | 12 +++--
 lib/erl_interface/src/connect/ei_connect.c         |  2 +-
 lib/erl_interface/src/encode/encode_atom.c         | 59 +++++++++++++++++++-
 lib/erl_interface/src/legacy/erl_connect.c         |  2 +-
 lib/erl_interface/src/misc/ei_format.c             |  4 +-
 lib/erl_interface/src/misc/ei_printterm.c          |  2 +-
 lib/erl_interface/src/misc/show_msg.c              |  2 +-
 lib/erl_interface/src/prog/ei_fake_prog.c          |  6 +++
 lib/erl_interface/test/ei_decode_encode_SUITE.erl  | 63 +++++++++++-----------
 .../ei_decode_encode_test.c                        | 16 ++++--
 10 files changed, 122 insertions(+), 46 deletions(-)

diff --git a/lib/erl_interface/doc/src/ei.xml b/lib/erl_interface/doc/src/ei.xml
index 0b0b1eeb79..e9c7c644b5 100644
--- a/lib/erl_interface/doc/src/ei.xml
+++ b/lib/erl_interface/doc/src/ei.xml
@@ -94,7 +94,11 @@ enum erlang_char_encoding {
     ERLANG_ASCII, ERLANG_LATIN1, ERLANG_UTF8, ERLANG_WHATEVER
 };
 </code>
-         <p>The character encoding used for atoms.</p>
+         <p>The character encoding used for atoms. <c>ERLANG_ASCII</c> represents 7-bit ASCII.
+	 Latin1 and UTF8 are different extensions of 7-bit ASCII. All 7-bit ASCII characters
+	 are valid Latin1 and UTF8 characters. ASCII and Latin1 both represent each character
+	 by one byte. A UTF8 character can consist of one to four bytes. <c>ERLANG_WHATEVER</c>
+	 is not an encoding but rather used as a wildcard.</p>
       </item>
     </taglist>
   </section>
@@ -256,11 +260,11 @@ enum erlang_char_encoding {
         <p>Encodes an atom in the binary format with character encoding
 	<c><seealso marker="#erlang_char_encoding">to_enc</seealso></c> (latin1 or utf8).
 	The <c>p</c> parameter is the name of the atom with character encoding
-	<c><seealso marker="#erlang_char_encoding">from_enc</seealso></c>.
+	<c><seealso marker="#erlang_char_encoding">from_enc</seealso></c>  (ascii, latin1 or utf8).
 	The name must either be zero-terminated or a function variant with a <c>len</c>
 	parameter must be used.</p>
-	<p>The encoding will fail if the atom is too long or if it can not be represented
-	with character encoding <c>to_enc</c>.</p>
+	<p>The encoding will fail if <c>p</c> is not a valid string in encoding <c>from_enc</c>,
+	if the string is too long or if it can not be represented with character encoding <c>to_enc</c>.</p>
 	<p>These functions were introduced in R16 release of Erlang/OTP as part of a first step
 	to support UTF8 atoms. Atoms encoded with <c>ERLANG_UTF8</c>
 	can not be decoded by earlier releases than R16.</p>
diff --git a/lib/erl_interface/src/connect/ei_connect.c b/lib/erl_interface/src/connect/ei_connect.c
index a17257795e..4421bbb7fe 100644
--- a/lib/erl_interface/src/connect/ei_connect.c
+++ b/lib/erl_interface/src/connect/ei_connect.c
@@ -1071,7 +1071,7 @@ int ei_rpc(ei_cnode* ec, int fd, char *mod, char *fun,
     int i, index;
     ei_term t;
     erlang_msg msg;
-    char rex[MAXATOMLEN+1];
+    char rex[MAXATOMLEN];
 
     if (ei_rpc_to(ec, fd, mod, fun, inbuf, inbuflen) < 0) {
 	return -1;
diff --git a/lib/erl_interface/src/encode/encode_atom.c b/lib/erl_interface/src/encode/encode_atom.c
index a3d7c4c759..8bbe962396 100644
--- a/lib/erl_interface/src/encode/encode_atom.c
+++ b/lib/erl_interface/src/encode/encode_atom.c
@@ -22,6 +22,11 @@
 #include "eiext.h"
 #include "putget.h"
 
+
+static int copy_ascii_atom(char* dst, const char* src, int slen);
+static int copy_utf8_atom(char* dst, const char* src, int slen);
+
+
 int ei_encode_atom(char *buf, int *index, const char *p)
 {
     size_t len = strlen(p);
@@ -54,7 +59,8 @@ int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len,
   char *s0 = s;
   int offs;
 
-  if (from_enc == ERLANG_LATIN1 && len >= MAXATOMLEN) {
+  if (len >= MAXATOMLEN && (from_enc == ERLANG_LATIN1 ||
+			    from_enc == ERLANG_ASCII)) {
       return -1;
   }
 
@@ -68,6 +74,8 @@ int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len,
 	      if (len < 0) return -1;
 	      break;
 	  case ERLANG_ASCII:
+	      if (copy_ascii_atom(s+2, p, len) < 0) return -1;
+	      break;
 	  case ERLANG_LATIN1:
 	      memcpy(s+2, p, len);
 	      break;
@@ -93,9 +101,11 @@ int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len,
 	  len = latin1_to_utf8((buf ? s+offs : NULL), p, len, MAXATOMLEN_UTF8-1, NULL);
 	  break;
       case ERLANG_ASCII:
+	  if (buf && copy_ascii_atom(s+offs, p, len) < 0) return -1;
+	  break;
       case ERLANG_UTF8:
 	  if (len >= 256) offs++;
-	  if (buf) memcpy(s+offs, p, len);
+	  if (buf && copy_utf8_atom(s+offs, p, len) < 0) return -1;
 	  break;
       default:
 	  return -1;
@@ -133,3 +143,48 @@ ei_internal_put_atom(char** bufp, const char* p, int slen,
     *bufp += ix;
     return 0;
 }
+
+
+int copy_ascii_atom(char* dst, const char* src, int slen)
+{
+    while (slen > 0) {
+	if ((src[0] & 0x80) != 0) return -1;
+	*dst++ = *src++;
+	slen--;
+    }
+    return 0;
+}
+
+int copy_utf8_atom(char* dst, const char* src, int slen)
+{
+    int num_chars = 0;
+
+    while (slen > 0) {
+	if (++num_chars >= MAXATOMLEN) return -1;
+	if ((src[0] & 0x80) != 0) {
+	    if ((src[0] & 0xE0) == 0xC0) {
+		if (slen < 2 || (src[1] & 0xC0) != 0x80) return -1;
+		*dst++ = *src++;
+		slen--;
+	    }
+	    else if ((src[0] & 0xF0) == 0xE0) {
+		if (slen < 3 || (src[1] & 0xC0) != 0x80 || (src[2] & 0xC0) != 0x80) return -1;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		slen -= 2;
+	    }
+	    else if ((src[0] & 0xF8) == 0xF0) {
+		if (slen < 4 || (src[1] & 0xC0) != 0x80 || (src[2] & 0xC0) != 0x80 || (src[3] & 0xC0) != 0x80) return -1;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		slen -= 3;
+	    }
+	    else return -1;
+	}
+	*dst++ = *src++;
+	slen--;
+    }
+    return 0;
+}
+
diff --git a/lib/erl_interface/src/legacy/erl_connect.c b/lib/erl_interface/src/legacy/erl_connect.c
index be83fa8469..f82704ea8b 100644
--- a/lib/erl_interface/src/legacy/erl_connect.c
+++ b/lib/erl_interface/src/legacy/erl_connect.c
@@ -125,7 +125,7 @@ static ei_cnode erl_if_ec;
 
 int erl_connect_init(int this_node_number, char *cookie, short creation)
 {
-    char nn[MAXATOMLEN+1];
+    char nn[MAXATOMLEN];
 
     sprintf(nn, "c%d", this_node_number);
 
diff --git a/lib/erl_interface/src/misc/ei_format.c b/lib/erl_interface/src/misc/ei_format.c
index 281a192535..b5f11e618e 100644
--- a/lib/erl_interface/src/misc/ei_format.c
+++ b/lib/erl_interface/src/misc/ei_format.c
@@ -139,8 +139,8 @@ static int patom(const char** fmt, ei_x_buff* x)
     --(*fmt);
     len = *fmt - start;
     /* FIXME why truncate atom name and not fail?! */
-    if (len > MAXATOMLEN)
-	len = MAXATOMLEN;
+    if (len >= MAXATOMLEN)
+	len = MAXATOMLEN-1;
     return ei_x_encode_atom_len(x, start, len);
 }
 
diff --git a/lib/erl_interface/src/misc/ei_printterm.c b/lib/erl_interface/src/misc/ei_printterm.c
index 91fe73e68c..f3003a6172 100644
--- a/lib/erl_interface/src/misc/ei_printterm.c
+++ b/lib/erl_interface/src/misc/ei_printterm.c
@@ -115,7 +115,7 @@ static int print_term(FILE* fp, ei_x_buff* x,
 			       const char* buf, int* index)
 {
     int i, doquote, n, m, ty, r;
-    char a[MAXATOMLEN+1], *p;
+    char a[MAXATOMLEN], *p;
     int ch_written = 0;		/* counter of written chars */
     erlang_pid pid;
     erlang_port port;
diff --git a/lib/erl_interface/src/misc/show_msg.c b/lib/erl_interface/src/misc/show_msg.c
index ca46b15aff..33b09643ca 100644
--- a/lib/erl_interface/src/misc/show_msg.c
+++ b/lib/erl_interface/src/misc/show_msg.c
@@ -457,7 +457,7 @@ static void show_term(const char *termbuf, int *index, FILE *stream)
 	break;
     
     case ERL_FUN_EXT: {
-	char atom[MAXATOMLEN+1];
+	char atom[MAXATOMLEN];
 	long idx;
 	long uniq;
 	const char* s = termbuf + *index, * s0 = s;
diff --git a/lib/erl_interface/src/prog/ei_fake_prog.c b/lib/erl_interface/src/prog/ei_fake_prog.c
index 68eb537211..34101a2851 100644
--- a/lib/erl_interface/src/prog/ei_fake_prog.c
+++ b/lib/erl_interface/src/prog/ei_fake_prog.c
@@ -96,6 +96,7 @@ int main(void)
   EI_ULONGLONG *ulonglongp = (EI_ULONGLONG*)NULL;
   EI_ULONGLONG ulonglongx = 0;
 #endif
+  enum erlang_char_encoding enc;
 
   intx = erl_errno;
 
@@ -148,9 +149,13 @@ int main(void)
   ei_x_encode_string(&eix, charp);
   ei_x_encode_string_len(&eix, charp, intx);
   ei_encode_atom(charp, intp, charp);
+  ei_encode_atom_as(charp, intp, charp, ERLANG_LATIN1, ERLANG_UTF8);
   ei_encode_atom_len(charp, intp, charp, intx);
+  ei_encode_atom_len_as(charp, intp, charp, intx, ERLANG_ASCII, ERLANG_LATIN1);
   ei_x_encode_atom(&eix, charp);
+  ei_x_encode_atom_as(&eix, charp, ERLANG_LATIN1, ERLANG_UTF8);
   ei_x_encode_atom_len(&eix, charp, intx);
+  ei_x_encode_atom_len_as(&eix, charp, intx, ERLANG_LATIN1, ERLANG_UTF8);
   ei_encode_binary(charp, intp, (void *)0, longx);
   ei_x_encode_binary(&eix, (void*)0, intx);
   ei_encode_pid(charp, intp, &epid);
@@ -181,6 +186,7 @@ int main(void)
   ei_decode_char(charp, intp, charp);
   ei_decode_string(charp, intp, charp);
   ei_decode_atom(charp, intp, charp);
+  ei_decode_atom_as(charp, intp, charp, MAXATOMLEN_UTF8, ERLANG_WHATEVER, &enc, &enc);
   ei_decode_binary(charp, intp, (void *)0, longp);
   ei_decode_fun(charp, intp, &efun);
   free_fun(&efun);
diff --git a/lib/erl_interface/test/ei_decode_encode_SUITE.erl b/lib/erl_interface/test/ei_decode_encode_SUITE.erl
index e8ae7a6f81..0c98b494ec 100644
--- a/lib/erl_interface/test/ei_decode_encode_SUITE.erl
+++ b/lib/erl_interface/test/ei_decode_encode_SUITE.erl
@@ -119,8 +119,12 @@ test_ei_decode_encode(Config) when is_list(Config) ->
     ?line send_rec(P, OXRef),
 
     %% Unicode atoms
-    [send_rec(P, Atom) || Atom <- unicode_atom_data()],
-
+    [begin send_rec(P, Atom),
+	   send_rec(P, mk_pid({Atom,1}, 23434, 3434)),
+	   send_rec(P, mk_port({Atom,1}, 2343434)),
+	   send_rec(P, mk_ref({Atom,1}, [262143, 8723648, 24097245])),
+	   void
+     end || Atom <- unicode_atom_data()],
     ?line runner:recv_eot(P),
     ok.
 
@@ -229,38 +233,36 @@ uint8(Uint) ->
 
 
 mk_pid({NodeName, Creation}, Number, Serial) when is_atom(NodeName) ->
-    mk_pid({atom_to_list(NodeName), Creation}, Number, Serial);
-mk_pid({NodeName, Creation}, Number, Serial) ->
+    <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+    mk_pid({NodeNameExt, Creation}, Number, Serial);
+mk_pid({NodeNameExt, Creation}, Number, Serial) ->
     case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
 					?PID_EXT,
-					?ATOM_EXT,
-					uint16_be(length(NodeName)),
-					NodeName,
+					NodeNameExt,
 					uint32_be(Number),
 					uint32_be(Serial),
 					uint8(Creation)])) of
 	Pid when is_pid(Pid) ->
 	    Pid;
 	{'EXIT', {badarg, _}} ->
-	    exit({badarg, mk_pid, [{NodeName, Creation}, Number, Serial]});
+	    exit({badarg, mk_pid, [{NodeNameExt, Creation}, Number, Serial]});
 	Other ->
 	    exit({unexpected_binary_to_term_result, Other})
     end.
 
 mk_port({NodeName, Creation}, Number) when is_atom(NodeName) ->
-    mk_port({atom_to_list(NodeName), Creation}, Number);
-mk_port({NodeName, Creation}, Number) ->
+    <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+    mk_port({NodeNameExt, Creation}, Number);
+mk_port({NodeNameExt, Creation}, Number) ->
     case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
 					      ?PORT_EXT,
-					      ?ATOM_EXT,
-					      uint16_be(length(NodeName)),
-					      NodeName,
+					      NodeNameExt,
 					      uint32_be(Number),
 					      uint8(Creation)])) of
 	Port when is_port(Port) ->
 	    Port;
 	{'EXIT', {badarg, _}} ->
-	    exit({badarg, mk_port, [{NodeName, Creation}, Number]});
+	    exit({badarg, mk_port, [{NodeNameExt, Creation}, Number]});
 	Other ->
 	    exit({unexpected_binary_to_term_result, Other})
     end.
@@ -268,33 +270,30 @@ mk_port({NodeName, Creation}, Number) ->
 mk_ref({NodeName, Creation}, Numbers) when is_atom(NodeName),
 					   is_integer(Creation),
 					   is_list(Numbers) ->
-    mk_ref({atom_to_list(NodeName), Creation}, Numbers);
-mk_ref({NodeName, Creation}, [Number]) when is_list(NodeName),
-					    is_integer(Creation),
-					    is_integer(Number) ->
+    <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+    mk_ref({NodeNameExt, Creation}, Numbers);
+mk_ref({NodeNameExt, Creation}, [Number]) when is_binary(NodeNameExt),
+					       is_integer(Creation),
+					       is_integer(Number) ->
     case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
 					      ?REFERENCE_EXT,
-					      ?ATOM_EXT,
-					      uint16_be(length(NodeName)),
-					      NodeName,
+					      NodeNameExt,
 					      uint32_be(Number),
 					      uint8(Creation)])) of
 	Ref when is_reference(Ref) ->
 	    Ref;
 	{'EXIT', {badarg, _}} ->
-	    exit({badarg, mk_ref, [{NodeName, Creation}, [Number]]});
+	    exit({badarg, mk_ref, [{NodeNameExt, Creation}, [Number]]});
 	Other ->
 	    exit({unexpected_binary_to_term_result, Other})
     end;
-mk_ref({NodeName, Creation}, Numbers) when is_list(NodeName),
-					   is_integer(Creation),
-					   is_list(Numbers) ->
+mk_ref({NodeNameExt, Creation}, Numbers) when is_binary(NodeNameExt),
+					      is_integer(Creation),
+					      is_list(Numbers) ->
     case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
 					      ?NEW_REFERENCE_EXT,
 					      uint16_be(length(Numbers)),
-					      ?ATOM_EXT,
-					      uint16_be(length(NodeName)),
-					      NodeName,
+					      NodeNameExt,
 					      uint8(Creation),
 					      lists:map(fun (N) ->
 								uint32_be(N)
@@ -303,7 +302,7 @@ mk_ref({NodeName, Creation}, Numbers) when is_list(NodeName),
 	Ref when is_reference(Ref) ->
 	    Ref;
 	{'EXIT', {badarg, _}} ->
-	    exit({badarg, mk_ref, [{NodeName, Creation}, Numbers]});
+	    exit({badarg, mk_ref, [{NodeNameExt, Creation}, Numbers]});
 	Other ->
 	    exit({unexpected_binary_to_term_result, Other})
     end.
@@ -322,9 +321,11 @@ unicode_atom_data() ->
      uc_atup(lists:seq(65500, 65754)),
      uc_atup(lists:seq(65500, 65563))
      | lists:map(fun (N) ->
-			 uc_atup(lists:seq(64000+N, 64254+N))
+			 Pow2 = (1 bsl N),
+			 uc_atup(lists:seq(Pow2 - 127, Pow2 + 127))
 		 end,
-		 lists:seq(1, 2000))].
+		 lists:seq(7, 20))
+    ].
 
 uc_atup(ATxt) ->
     string_to_atom(ATxt).
diff --git a/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c b/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c
index 194ce9057b..e57663f984 100644
--- a/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c
+++ b/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c
@@ -63,10 +63,12 @@ int ei_x_encode_my_atom(ei_x_buff* x, my_atom* a)
     return ei_x_encode_atom_as(x, a->name, ERLANG_UTF8, a->enc);
 }
 
+#define BUFSZ 2000
+
 void decode_encode(struct Type* t, void* obj)
 {
     char *buf;
-    char buf2[1024];
+    char buf2[BUFSZ];
     int size1 = 0;
     int size2 = 0;
     int size3 = 0;
@@ -89,6 +91,11 @@ void decode_encode(struct Type* t, void* obj)
 	return;
     }
 
+    if (size1 > BUFSZ) {
+	fail("size is > BUFSZ");
+	return;
+    }
+
     MESSAGE("ei_encode_%s buf is NULL, arg is type %s", t->name, t->type);
     err = t->ei_encode_fp(NULL, &size2, obj);
     if (err != 0) {
@@ -277,8 +284,11 @@ TESTCASE(test_ei_decode_encode)
     EI_DECODE_ENCODE(ref  , erlang_ref);
 
     /* Unicode atoms */
-    for (i=0; i<2010; i++) {
-	EI_DECODE_ENCODE(my_atom , my_atom);
+    for (i=0; i<24; i++) {
+	EI_DECODE_ENCODE(my_atom, my_atom);
+	EI_DECODE_ENCODE(pid, erlang_pid);
+	EI_DECODE_ENCODE(port, erlang_port);
+	EI_DECODE_ENCODE(ref, erlang_ref);
     }
 
     report(1);
-- 
cgit v1.2.3


From 5d2aaef89f2e1dce2147ac56a105f129390f31a3 Mon Sep 17 00:00:00 2001
From: Sverker Eriksson <sverker@erlang.org>
Date: Wed, 23 Jan 2013 12:06:15 +0100
Subject: erts: Testcase doing unicode atom printout with ~w

---
 erts/emulator/test/distribution_SUITE.erl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/erts/emulator/test/distribution_SUITE.erl b/erts/emulator/test/distribution_SUITE.erl
index 7edd92d56f..101007c288 100644
--- a/erts/emulator/test/distribution_SUITE.erl
+++ b/erts/emulator/test/distribution_SUITE.erl
@@ -1240,9 +1240,9 @@ contended_atom_cache_entry_test(Config, Type) ->
 					get_conflicting_unicode_atoms(CIX,
 								      ProcessPairs)
 				end,
-%		    io:format("Testing with the following atoms all using "
-%			      "cache index ~p:~n ~p~n",
-%			      [CIX, TestAtoms]),
+		    io:format("Testing with the following atoms all using "
+			      "cache index ~p:~n ~w~n",
+			      [CIX, TestAtoms]),
 		    Ps = lists:map(
 			   fun (A) ->
 				   Ref = make_ref(),
-- 
cgit v1.2.3


From c596e17cf3d69cf5e10d28ee2a8ee35162786da1 Mon Sep 17 00:00:00 2001
From: Sverker Eriksson <sverker@erlang.org>
Date: Wed, 23 Jan 2013 16:04:38 +0100
Subject: erl_interface: Changed erlang_char_encoding interface

to allow bitwise-or'd combinations.
---
 lib/erl_interface/doc/src/ei.xml           | 13 +++++++------
 lib/erl_interface/include/ei.h             |  7 ++++++-
 lib/erl_interface/src/decode/decode_atom.c |  2 +-
 lib/erl_interface/src/encode/encode_atom.c |  3 +--
 lib/erl_interface/src/legacy/erl_marshal.c |  6 +++---
 5 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/lib/erl_interface/doc/src/ei.xml b/lib/erl_interface/doc/src/ei.xml
index e9c7c644b5..117c787da6 100644
--- a/lib/erl_interface/doc/src/ei.xml
+++ b/lib/erl_interface/doc/src/ei.xml
@@ -91,14 +91,13 @@
 	 <p/>
 	 <code type="none">
 enum erlang_char_encoding {
-    ERLANG_ASCII, ERLANG_LATIN1, ERLANG_UTF8, ERLANG_WHATEVER
+    ERLANG_ASCII, ERLANG_LATIN1, ERLANG_UTF8
 };
 </code>
          <p>The character encoding used for atoms. <c>ERLANG_ASCII</c> represents 7-bit ASCII.
 	 Latin1 and UTF8 are different extensions of 7-bit ASCII. All 7-bit ASCII characters
 	 are valid Latin1 and UTF8 characters. ASCII and Latin1 both represent each character
-	 by one byte. A UTF8 character can consist of one to four bytes. <c>ERLANG_WHATEVER</c>
-	 is not an encoding but rather used as a wildcard.</p>
+	 by one byte. A UTF8 character can consist of one to four bytes.</p>
       </item>
     </taglist>
   </section>
@@ -545,11 +544,13 @@ ei_x_encode_empty_list(&amp;x);
 	want</seealso></c>. The original encoding used in the
 	binary format (latin1 or utf8) can be obtained from <c>*was</c>. The actual encoding of the resulting string
 	(7-bit ascii, latin1 or utf8) can be obtained from <c>*result</c>. Both <c>was</c> and <c>result</c> can be <c>NULL</c>.
-	<c>*result</c> may differ from <c>want</c> if <c>want</c> is <c>ERLANG_WHATEVER</c> or if
-	<c>*result</c> turn out to be pure 7-bit ascii (compatible with both latin1 and utf8).</p>
+	
+	<c>*result</c> may differ from <c>want</c> if <c>want</c> is a bitwise-or'd combination like
+	<c>ERLANG_LATIN1|ERLANG_UTF8</c> or if <c>*result</c> turn out to be pure 7-bit ascii
+	(compatible with both latin1 and utf8).</p>
 	<p>This function fails if the atom is too long for the buffer
 	or if it can not be represented with encoding <c>want</c>.</p>
-	<p>This functions was introduced in R16 release of Erlang/OTP as part of a first step
+	<p>This function was introduced in R16 release of Erlang/OTP as part of a first step
 	to support UTF8 atoms.</p>
       </desc>
     </func>
diff --git a/lib/erl_interface/include/ei.h b/lib/erl_interface/include/ei.h
index 20e575f64d..2278a28adb 100644
--- a/lib/erl_interface/include/ei.h
+++ b/lib/erl_interface/include/ei.h
@@ -190,7 +190,12 @@ extern volatile int __erl_errno;
 #define MAXATOMLEN_UTF8 (255*4 + 1)
 #define MAXNODELEN EI_MAXALIVELEN+1+EI_MAXHOSTNAMELEN
 
-enum erlang_char_encoding { ERLANG_ASCII, ERLANG_LATIN1, ERLANG_UTF8, ERLANG_WHATEVER };
+enum erlang_char_encoding { 
+    ERLANG_ASCII = 1,
+    ERLANG_LATIN1 = 2,
+    ERLANG_UTF8 = 4,
+    ERLANG_ANY = ERLANG_ASCII|ERLANG_LATIN1|ERLANG_UTF8
+};
 
 /* a pid */
 typedef struct {
diff --git a/lib/erl_interface/src/decode/decode_atom.c b/lib/erl_interface/src/decode/decode_atom.c
index 2ada418243..556c400cb3 100644
--- a/lib/erl_interface/src/decode/decode_atom.c
+++ b/lib/erl_interface/src/decode/decode_atom.c
@@ -58,7 +58,7 @@ int ei_decode_atom_as(const char *buf, int *index, char* p, int destlen,
 	return -1;
     }
     
-    if (want_enc == got_enc || want_enc == ERLANG_WHATEVER || want_enc == ERLANG_ASCII) {
+    if ((want_enc & got_enc) || want_enc == ERLANG_ASCII) {
 	int i, found_non_ascii = 0;
 	if (len >= destlen)
 	    return -1;
diff --git a/lib/erl_interface/src/encode/encode_atom.c b/lib/erl_interface/src/encode/encode_atom.c
index 8bbe962396..044f17cb60 100644
--- a/lib/erl_interface/src/encode/encode_atom.c
+++ b/lib/erl_interface/src/encode/encode_atom.c
@@ -59,8 +59,7 @@ int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len,
   char *s0 = s;
   int offs;
 
-  if (len >= MAXATOMLEN && (from_enc == ERLANG_LATIN1 ||
-			    from_enc == ERLANG_ASCII)) {
+  if (len >= MAXATOMLEN && (from_enc & (ERLANG_LATIN1|ERLANG_ASCII))) {
       return -1;
   }
 
diff --git a/lib/erl_interface/src/legacy/erl_marshal.c b/lib/erl_interface/src/legacy/erl_marshal.c
index 884e9d421b..4c45cebb02 100644
--- a/lib/erl_interface/src/legacy/erl_marshal.c
+++ b/lib/erl_interface/src/legacy/erl_marshal.c
@@ -662,7 +662,7 @@ static int read_atom(unsigned char** ext, Erl_Atom_data* a)
     int offs = 0;
     enum erlang_char_encoding enc;
     int ret = ei_decode_atom_as((char*)*ext, &offs, buf, MAXATOMLEN_UTF8,
-				ERLANG_WHATEVER, NULL, &enc);
+				ERLANG_LATIN1|ERLANG_UTF8, NULL, &enc);
     *ext += offs;
 
     if (ret == 0) {
@@ -674,11 +674,11 @@ static int read_atom(unsigned char** ext, Erl_Atom_data* a)
 	a->lenL = 0;
 	a->utf8 = NULL;
 	a->lenU = 0;
-	if (enc == ERLANG_LATIN1 || enc == ERLANG_ASCII) {
+	if (enc & (ERLANG_LATIN1 | ERLANG_ASCII)) {
 	    a->latin1 = clone; 	    
 	    a->lenL = i;
 	}
-	if (enc == ERLANG_UTF8 || enc == ERLANG_ASCII) {
+	if (enc & (ERLANG_UTF8 | ERLANG_ASCII)) {
 	    a->utf8 = clone;
 	    a->lenU = i;
 	}
-- 
cgit v1.2.3


From d6e3e256b850050b7a86323b2948009d5fcc30a9 Mon Sep 17 00:00:00 2001
From: Lukas Larsson <lukas@erlang-solutions.com>
Date: Wed, 23 Jan 2013 10:17:29 +0100
Subject: erl_interface: Fix bug when transcoding atoms from and to UTF8

---
 lib/erl_interface/src/decode/decode_atom.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/erl_interface/src/decode/decode_atom.c b/lib/erl_interface/src/decode/decode_atom.c
index 556c400cb3..9779ad3f35 100644
--- a/lib/erl_interface/src/decode/decode_atom.c
+++ b/lib/erl_interface/src/decode/decode_atom.c
@@ -76,8 +76,8 @@ int ei_decode_atom_as(const char *buf, int *index, char* p, int destlen,
     }
     else {
 	int plen = (got_enc == ERLANG_LATIN1) ?
-	    utf8_to_latin1(p, s, len, destlen-1, res_encp) :
-	    latin1_to_utf8(p, s, len, destlen-1, res_encp);
+	  latin1_to_utf8(p, s, len, destlen-1, res_encp) :
+	  utf8_to_latin1(p, s, len, destlen-1, res_encp);
 	if (plen < 0) return -1;
 	if (p) p[plen] = 0;
     }
-- 
cgit v1.2.3