erts: Change internal representation of atoms to utf8

author: Sverker Eriksson <sverker@erlang.org> 2012-12-14 19:52:59 +0100
committer: Sverker Eriksson <sverker@erlang.org> 2013-01-08 11:15:01 +0100
commit: a9a57385f0a5648bbfeccc5e8ef0bfe2cdac80c7 (patch)
tree: b7fd2766b020c37a68b29b0aac47ace685841349 /erts
parent: 81a5f0f91ce50416d44f48752cb2b7f4ae02d953 (diff)
download: otp-a9a57385f0a5648bbfeccc5e8ef0bfe2cdac80c7.tar.gz
otp-a9a57385f0a5648bbfeccc5e8ef0bfe2cdac80c7.tar.bz2
otp-a9a57385f0a5648bbfeccc5e8ef0bfe2cdac80c7.zip
13 files changed, 312 insertions, 169 deletions
diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c
index d7c7f117cf..b41a98f2a2 100644
--- a/erts/emulator/beam/atom.c
+++ b/erts/emulator/beam/atom.c
@@ -111,7 +111,7 @@ atom_text_alloc(int bytes)
 {
     byte *res;
 
-    ASSERT(bytes <= MAX_ATOM_LENGTH);
+    ASSERT(bytes <= MAX_ATOM_SZ_LIMIT);
     if (atom_text_pos + bytes >= atom_text_end) {
 	more_atom_space();
     }
@@ -198,7 +198,11 @@ am_atom_put(const char* name, int len)
     Atom a;
     Eterm ret;
     int aix;
-
+#ifdef DEBUG
+    byte* err_pos;
+    Uint num_chars;
+    ASSERT(erts_analyze_utf8(name, len, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK);
+#endif
     /*
      * Silently truncate the atom if it is too long. Overlong atoms
      * could occur in situations where we have no good way to return
@@ -209,8 +213,8 @@ am_atom_put(const char* name, int len)
      * list_to_atom/1), the caller should check the length before
      * calling this function.
      */
-    if (len > MAX_ATOM_LENGTH) {
-	len = MAX_ATOM_LENGTH;
+    if (len > MAX_ATOM_SZ_LIMIT) {
+	len = MAX_ATOM_SZ_LIMIT;  /*SVERK Urk... */
     }
 #ifdef ERTS_ATOM_PUT_OPS_STAT
     erts_smp_atomic_inc_nob(&atom_put_ops);
@@ -230,6 +234,49 @@ am_atom_put(const char* name, int len)
     return ret;
 }
 
+/* Convert the Latin-1 text *srcp / *lenp to UTF-8 in conv_buf, but only if some
+ * byte has its high bit set; otherwise *srcp and *lenp are left untouched.
+ * conv_buf must have room for two bytes per input byte. */
+static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp)
+{
+    const byte* in = *srcp;
+    const int n = *lenp;
+    byte* out;
+    int pos = 0;
+
+    /* Skip the leading 7-bit run; pure 7-bit input needs no conversion. */
+    while (pos < n && !(in[pos] & 0x80))
+	pos++;
+    if (pos >= n) return;
+
+    sys_memcpy(conv_buf, in, pos);	/* 7-bit prefix is copied verbatim */
+    out = conv_buf + pos;
+    for ( ; pos < n; pos++) {
+	unsigned char ch = in[pos];
+	if (ch & 0x80) {		/* 0x80..0xFF: emit a two-byte sequence */
+	    *out++ = 0xC0 | (ch >> 6);
+	    *out++ = 0x80 | (ch & 0x3F);
+	} else {
+	    *out++ = ch;
+	}
+    }
+    *srcp = conv_buf;
+    *lenp = out - conv_buf;
+}
+
+/* Intern an atom from 'len' bytes at 'name'. When is_latin1 is set the text is
+ * Latin-1 and is converted to UTF-8 first; otherwise it is passed on as is. */
+Eterm
+am_atom_put2(const byte* name, int len, int is_latin1)
+{
+    byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
+    if (is_latin1) {
+	if (len > MAX_ATOM_CHARACTERS)	/* truncate: utf8_copy holds 2 bytes/char */
+	    len = MAX_ATOM_CHARACTERS;
+	latin1_to_utf8(utf8_copy, &name, &len);
+    }
+    return am_atom_put((const char*)name, len);
+}
 
 int atom_table_size(void)
 {
@@ -264,14 +311,18 @@ int atom_table_sz(void)
 }
 
 int
-erts_atom_get(const char *name, int len, Eterm* ap)
+erts_atom_get(const char *name, int len, Eterm* ap, int is_latin1)
 {
+    byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
     Atom a;
     int i;
     int res;
 
     a.len = len;
     a.name = (byte *)name;
+    if (is_latin1) {
+	latin1_to_utf8(utf8_copy, (const byte**)&a.name, &a.len);
+    }
     atom_read_lock();
     i = index_get(&erts_atom_table, (void*) &a);
     res = i < 0 ? 0 : (*ap = make_atom(i), 1);
diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h
index fd9c04d3d0..84dd6d8901 100644
--- a/erts/emulator/beam/atom.h
+++ b/erts/emulator/beam/atom.h
@@ -26,7 +26,9 @@
 
 #include "erl_atom_table.h"
 
-#define MAX_ATOM_LENGTH 255
+#define MAX_ATOM_CHARACTERS 255
+#define MAX_ATOM_SZ_FROM_LATIN1 (2*MAX_ATOM_CHARACTERS)
+#define MAX_ATOM_SZ_LIMIT (4*MAX_ATOM_CHARACTERS) /* theoretical byte limit */
 #define ATOM_LIMIT (1024*1024)
 #define MIN_ATOM_TABLE_SIZE 8192
 
@@ -53,8 +55,8 @@ typedef struct atom {
 extern IndexTable erts_atom_table;
 
 ERTS_GLB_INLINE Atom* atom_tab(Uint i);
-ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term);
-ERTS_GLB_INLINE int erts_is_atom_str(char *str, Eterm term);
+ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term);
+ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1);
 
 #if ERTS_GLB_INLINE_INCL_FUNC_DEF
 ERTS_GLB_INLINE Atom*
@@ -63,7 +65,7 @@ atom_tab(Uint i)
     return (Atom *) erts_index_lookup(&erts_atom_table, i);
 }
 
-ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term)
+ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term)
 {
     Atom *a;
     if (!is_atom(term))
@@ -73,30 +75,50 @@ ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term)
 	    && sys_memcmp((void *) a->name, (void *) text, len) == 0);
 }
 
-ERTS_GLB_INLINE int erts_is_atom_str(char *str, Eterm term)
+ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1) /* nonzero iff 'term' is an atom whose text matches NUL-terminated 'str' */
 {
     Atom *a;
     int i, len;
-    char *aname;
+    const byte* aname;
+    const byte* s = (const byte*) str; /* byte view of str used by the comparisons below */
+
     if (!is_atom(term))
 	return 0;
     a = atom_tab(atom_val(term));
     len = a->len;
-    aname = (char *) a->name;
-    for (i = 0; i < len; i++)
-	if (aname[i] != str[i] || str[i] == '\0')
-	    return 0;
-    return str[len] == '\0';
+    aname = a->name; /* atom text as stored in the atom table */
+    if (is_latin1) { /* str is Latin-1: compare each char with its UTF-8 form */
+	for (i = 0; i < len; s++) { /* i steps atom bytes, s steps str chars */
+	    if (aname[i] < 0x80) { /* 7-bit atom byte must equal the str byte */
+		if (aname[i] != *s || *s == '\0')
+		    return 0;
+		i++;
+	    }
+	    else { /* otherwise expect the two-byte encoding of *s */
+		if (aname[i]   != (0xC0 | (*s >> 6)) || /* lead byte */
+		    aname[i+1] != (0x80 | (*s & 0x3F))) { /* continuation byte */
+		    return 0;
+		}
+		i += 2; /* both bytes of the sequence consumed */
+	    }
+	}
+    }
+    else { /* str is compared byte for byte with the atom text */
+	for (i = 0; i < len; i++, s++)
+	    if (aname[i] != *s || *s == '\0')
+		return 0;
+    }
+    return *s == '\0'; /* match only if str ends exactly here */
 }
 
 #endif
 
 /*
  * Note, ERTS_IS_ATOM_STR() expects the first argument to be a
- * string literal.
+ * 7-bit ASCII string literal.
  */
 #define ERTS_IS_ATOM_STR(LSTR, TERM) \
-  (erts_is_atom_bytes((byte *) LSTR, sizeof(LSTR) - 1, (TERM)))
+  (erts_is_atom_utf8_bytes((byte *) LSTR, sizeof(LSTR) - 1, (TERM)))
 #define ERTS_DECL_AM(S) Eterm AM_ ## S = am_atom_put(#S, sizeof(#S) - 1)
 #define ERTS_INIT_AM(S) AM_ ## S = am_atom_put(#S, sizeof(#S) - 1)
 
@@ -104,12 +126,13 @@ int atom_table_size(void);	/* number of elements */
 int atom_table_sz(void);	/* table size in bytes, excluding stored objects */
 
 Eterm am_atom_put(const char*, int); /* most callers pass plain char*'s */
+Eterm am_atom_put2(const byte*, int, int is_latin1);
 int atom_erase(byte*, int);
 int atom_static_put(byte*, int);
 void init_atom_table(void);
 void atom_info(int, void *);
 void dump_atoms(int, void *);
-int erts_atom_get(const char* name, int len, Eterm* ap);
+int erts_atom_get(const char* name, int len, Eterm* ap, int is_latin1);
 void erts_atom_get_text_space_sizes(Uint *reserved, Uint *used);
 #endif
 
diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c
index b51f076a5d..7bb964d5aa 100644
--- a/erts/emulator/beam/beam_load.c
+++ b/erts/emulator/beam/beam_load.c
@@ -1230,7 +1230,7 @@ load_atom_table(LoaderState* stp)
 
 	GetByte(stp, n);
 	GetString(stp, atom, n);
-	stp->atom[i] = am_atom_put((char*)atom, n);
+	stp->atom[i] = am_atom_put2(atom, n, 1);
     }
 
     /*
@@ -1240,7 +1240,7 @@ load_atom_table(LoaderState* stp)
     if (is_nil(stp->module)) {
 	stp->module = stp->atom[1];
     } else if (stp->atom[1] != stp->module) {
-	char sbuf[256];
+	char sbuf[MAX_ATOM_SZ_FROM_LATIN1];
 	Atom* ap;
 
 	ap = atom_tab(atom_val(stp->atom[1]));
@@ -1620,7 +1620,7 @@ read_line_table(LoaderState* stp)
 
 	    GetInt(stp, 2, n);
 	    GetString(stp, fname, n);
-	    stp->fname[i] = am_atom_put((char*)fname, n);
+	    stp->fname[i] = am_atom_put((char*)fname, n); /*SVERK ? */
 	}
     }
 
diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c
index 1cdce49eef..89157068c0 100644
--- a/erts/emulator/beam/bif.c
+++ b/erts/emulator/beam/bif.c
@@ -2536,9 +2536,11 @@ BIF_RETTYPE append_element_2(BIF_ALIST_2)
 
 BIF_RETTYPE atom_to_list_1(BIF_ALIST_1)
 {
-    Uint need;
-    Eterm* hp;
+    Eterm do_utf8_to_list(Process*, Uint num, byte *bytes, Uint sz, Uint left,
+			  Uint *num_built, Uint *num_eaten, Eterm tail); /*SVERK */
     Atom* ap;
+    Uint num_chars, num_built, num_eaten;
+    Eterm res;
 
     if (is_not_atom(BIF_ARG_1))
 	BIF_ERROR(BIF_P, BADARG);
@@ -2547,9 +2549,19 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1)
     ap = atom_tab(atom_val(BIF_ARG_1));
     if (ap->len == 0)
 	BIF_RET(NIL);	/* the empty atom */
-    need = ap->len*2;
-    hp = HAlloc(BIF_P, need);
-    BIF_RET(buf_to_intlist(&hp,(char*)ap->name,ap->len, NIL));
+    {
+	byte* err_pos;
+	if (erts_analyze_utf8(ap->name, ap->len, &err_pos, &num_chars, NULL)
+	    != ERTS_UTF8_OK) {
+	    BIF_ERROR(BIF_P, BADARG);
+	}
+    }
+    
+    res = do_utf8_to_list(BIF_P, num_chars, ap->name, ap->len, ap->len,
+			  &num_built, &num_eaten, NIL);
+    ASSERT(num_built == num_chars);
+    ASSERT(num_eaten == ap->len);
+    BIF_RET(res);
 }
 
 /**********************************************************************/
@@ -2559,18 +2571,18 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1)
 BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)
 {
     Eterm res;
-    char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_LENGTH);
-    int i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_LENGTH);
+    char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS); /* temp buffer, freed on both paths below */
+    int i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS); /* a negative result is treated as failure below */
 
     if (i < 0) {
 	erts_free(ERTS_ALC_T_TMP, (void *) buf);
 	i = list_length(BIF_ARG_1);
-	if (i > MAX_ATOM_LENGTH) {
+	if (i > MAX_ATOM_CHARACTERS) { /* over-long list => system_limit, anything else => badarg */
 	    BIF_ERROR(BIF_P, SYSTEM_LIMIT);
 	}
 	BIF_ERROR(BIF_P, BADARG);
     }
-    res = am_atom_put(buf, i);
+    res = am_atom_put2((byte*)buf, i, 1); /* is_latin1 = 1: buf is handed over as Latin-1 text */
     erts_free(ERTS_ALC_T_TMP, (void *) buf);
     BIF_RET(res);
 }
@@ -2580,16 +2592,16 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)
 BIF_RETTYPE list_to_existing_atom_1(BIF_ALIST_1)
 {
     int i;
-    char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_LENGTH);
+    char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS);
 
-    if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_LENGTH)) < 0) {
+    if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS)) < 0) {
     error:
 	erts_free(ERTS_ALC_T_TMP, (void *) buf);
 	BIF_ERROR(BIF_P, BADARG);
     } else {
 	Eterm a;
 	
-	if (erts_atom_get(buf, i, &a)) {
+	if (erts_atom_get(buf, i, &a, 1)) {
 	    erts_free(ERTS_ALC_T_TMP, (void *) buf);
 	    BIF_RET(a);
 	} else {
diff --git a/erts/emulator/beam/dist.h b/erts/emulator/beam/dist.h
index 0db7d56cb2..129f637dba 100644
--- a/erts/emulator/beam/dist.h
+++ b/erts/emulator/beam/dist.h
@@ -39,6 +39,7 @@
 #define DFLAG_DIST_HDR_ATOM_CACHE 0x2000
 #define DFLAG_SMALL_ATOM_TAGS     0x4000
 #define DFLAG_INTERNAL_TAGS       0x8000
+#define DFLAG_UTF8_ATOMS          0x10000
 
 /* All flags that should be enabled when term_to_binary/1 is used. */
 #define TERM_TO_BINARY_DFLAGS (DFLAG_EXTENDED_REFERENCES	\
diff --git a/erts/emulator/beam/erl_alloc.c b/erts/emulator/beam/erl_alloc.c
index 3eee53eba3..061f229f59 100644
--- a/erts/emulator/beam/erl_alloc.c
+++ b/erts/emulator/beam/erl_alloc.c
@@ -3045,13 +3045,13 @@ erts_request_alloc_info(struct process *c_p,
 	Eterm alloc = CAR(consp);
 
 	for (ai = ERTS_ALC_A_MIN; ai <= ERTS_ALC_A_MAX; ai++)
-	    if (erts_is_atom_str((char *) erts_alc_a2ad[ai], alloc))
+	    if (erts_is_atom_str(erts_alc_a2ad[ai], alloc, 0))
 		goto save_alloc;
-	if (erts_is_atom_str("mseg_alloc", alloc)) {
+	if (erts_is_atom_str("mseg_alloc", alloc, 0)) {
 	    ai = ERTS_ALC_INFO_A_MSEG_ALLOC;
 	    goto save_alloc;
 	}
-	if (erts_is_atom_str("alloc_util", alloc)) {
+	if (erts_is_atom_str("alloc_util", alloc, 0)) {
 	    ai = ERTS_ALC_INFO_A_ALLOC_UTIL;
 	save_alloc:
 	    if (req_ai[ai])
diff --git a/erts/emulator/beam/erl_alloc_util.c b/erts/emulator/beam/erl_alloc_util.c
index 97ba306a79..f8a8c00715 100644
--- a/erts/emulator/beam/erl_alloc_util.c
+++ b/erts/emulator/beam/erl_alloc_util.c
@@ -2815,10 +2815,10 @@ make_name_atoms(Allctr_t *allctr)
     char alloc[] = "alloc";
     char realloc[] = "realloc";
     char free[] = "free";
-    char buf[MAX_ATOM_LENGTH];
+    char buf[MAX_ATOM_CHARACTERS];
     size_t prefix_len = strlen(allctr->name_prefix);
 
-    if (prefix_len > MAX_ATOM_LENGTH + sizeof(realloc) - 1)
+    if (prefix_len > MAX_ATOM_CHARACTERS + sizeof(realloc) - 1)
 	erl_exit(1,"Too long allocator name: %salloc\n",allctr->name_prefix);
 
     memcpy((void *) buf, (void *) allctr->name_prefix, prefix_len);
diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c
index 632d756481..185ac75d73 100644
--- a/erts/emulator/beam/erl_nif.c
+++ b/erts/emulator/beam/erl_nif.c
@@ -974,7 +974,7 @@ int enif_make_existing_atom_len(ErlNifEnv* env, const char* name, size_t len,
 				ERL_NIF_TERM* atom, ErlNifCharEncoding encoding)
 {
     ASSERT(encoding == ERL_NIF_LATIN1);
-    return erts_atom_get(name, len, atom);
+    return erts_atom_get(name, len, atom, 1);
 }
 
 ERL_NIF_TERM enif_make_tuple(ErlNifEnv* env, unsigned cnt, ...)
@@ -1633,7 +1633,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
 			     "this vm variant (%s).",
 			     entry->vm_variant, ERL_NIF_VM_VARIANT);
     }
-    else if (!erts_is_atom_str((char*)entry->name, mod_atom)) {
+    else if (!erts_is_atom_str((char*)entry->name, mod_atom, 1)) {
 	ret = load_nif_error(BIF_P, bad_lib, "Library module name '%s' does not"
 			     " match calling module '%T'", entry->name, mod_atom);
     }
@@ -1643,7 +1643,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
 	for (i=0; i < entry->num_of_funcs && ret==am_ok; i++) {
 	    BeamInstr** code_pp;
 	    ErlNifFunc* f = &entry->funcs[i];
-	    if (!erts_atom_get(f->name, sys_strlen(f->name), &f_atom)
+	    if (!erts_atom_get(f->name, sys_strlen(f->name), &f_atom, 1)
 		|| (code_pp = get_func_pp(mod->curr.code, f_atom, f->arity))==NULL) {
 		ret = load_nif_error(BIF_P,bad_lib,"Function not found %T:%s/%u",
 				     mod_atom, f->name, f->arity);
@@ -1746,7 +1746,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
 	for (i=0; i < entry->num_of_funcs; i++)
 	{
 	    BeamInstr* code_ptr;
-	    erts_atom_get(entry->funcs[i].name, sys_strlen(entry->funcs[i].name), &f_atom); 
+	    erts_atom_get(entry->funcs[i].name, sys_strlen(entry->funcs[i].name), &f_atom, 1); 
 	    code_ptr = *get_func_pp(mod->curr.code, f_atom, entry->funcs[i].arity);
 	    
 	    if (code_ptr[1] == 0) {
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
index 51559aea1c..e24b6f1458 100644
--- a/erts/emulator/beam/erl_unicode.c
+++ b/erts/emulator/beam/erl_unicode.c
@@ -1155,7 +1155,7 @@ BIF_RETTYPE unicode_characters_to_list_2(BIF_ALIST_2)
  * a faster analyze and size count with this function.
  */
 int erts_analyze_utf8(byte *source, Uint size, 
-			byte **err_pos, Uint *num_chars, int *left)
+		      byte **err_pos, Uint *num_chars, int *left)
 {
     *err_pos = source;
     *num_chars = 0;
@@ -1210,7 +1210,7 @@ int erts_analyze_utf8(byte *source, Uint size,
 	}
 	++(*num_chars);
 	*err_pos = source;
-	if (left && --(*left) <= 0) {
+	if (left && --(*left) <= 0 && size) {
 	    return ERTS_UTF8_ANALYZE_MORE;
 	}
     }
@@ -1220,7 +1220,7 @@ int erts_analyze_utf8(byte *source, Uint size,
 /*
  * No errors should be able to occur - no overlongs, no malformed, no nothing
  */    
-static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, 
+Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, 
 			     Uint left,
 			     Uint *num_built, Uint *num_eaten, Eterm tail)
 {
@@ -1812,31 +1812,36 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2)
     ap = atom_tab(atom_val(BIF_ARG_1));
 
     if (BIF_ARG_2 == am_latin1) {
-	BIF_RET(new_binary(BIF_P, ap->name, ap->len));
-    } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) {
-	int bin_size = 0;
 	int i;
 	Eterm bin_term;
-	byte* bin_p;
-
-	for (i = 0; i < ap->len; i++) {
-	    bin_size += (ap->name[i] >= 0x80) ? 2 : 1;
+	int bin_size = ap->len;
+
+	for (i = 0; i < ap->len; ) {  
+	    if (ap->name[i] < 0x80) i++;
+	    else {
+		ASSERT(ap->name[i] >= 0xC0);
+		if (ap->name[i] < 0xE0) {
+		    ASSERT(i+1 < ap->len && (ap->name[i+1] & 0xC0) == 0x80);
+		    i += 2;
+		    bin_size -= 1;
+		}
+		else goto error;
+	    }
 	}
 	if (bin_size == ap->len) {
-	    BIF_RET(new_binary(BIF_P, ap->name, ap->len));
+	    bin_term = new_binary(BIF_P, ap->name, ap->len);
 	}
-	bin_term = new_binary(BIF_P, 0, bin_size);
-	bin_p = binary_bytes(bin_term);
-	for (i = 0; i < ap->len; i++) {
-	    byte b = ap->name[i];
-	    if (b < 0x80) {
-		*bin_p++ = b;
-	    } else {
-		*bin_p++ = 0xC0 | (b >> 6);
-		*bin_p++ = 0x80 | (b & 0x3F);
-	    }
+	else {
+	    byte* bin_p;
+	    int dbg_sz;
+	    bin_term = new_binary(BIF_P, 0, bin_size);
+	    bin_p = binary_bytes(bin_term);
+	    dbg_sz = erts_utf8_to_latin1(bin_p, ap->name, ap->len);
+	    ASSERT(dbg_sz == bin_size); (void)dbg_sz; 
 	}
 	BIF_RET(bin_term);
+    } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) {
+	BIF_RET(new_binary(BIF_P, ap->name, ap->len));
     } else {
     error:
 	BIF_ERROR(BIF_P, BADARG);
@@ -1856,102 +1861,52 @@ binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist)
     bin_size = binary_size(bin);
     if (enc == am_latin1) {
 	Eterm a;
-	if (bin_size > MAX_ATOM_LENGTH) {
+	if (bin_size > MAX_ATOM_CHARACTERS) {
 	system_limit:
 	    erts_free_aligned_binary_bytes(temp_alloc);
 	    BIF_ERROR(p, SYSTEM_LIMIT);
 	}
 	if (!must_exist) {
-	    a = am_atom_put((char *)bytes, bin_size);
+	    a = am_atom_put2(bytes, bin_size, 1);
 	    erts_free_aligned_binary_bytes(temp_alloc);
 	    BIF_RET(a);
-	} else if (erts_atom_get((char *)bytes, bin_size, &a)) {
+	} else if (erts_atom_get((char *)bytes, bin_size, &a, 1)) {
 	    erts_free_aligned_binary_bytes(temp_alloc);
 	    BIF_RET(a);
 	} else {
 	    goto badarg;
 	}
     } else if (enc == am_utf8 || enc == am_unicode) {
-	char *buf;
-	char *dst;
-	int i;
-	int num_chars;
 	Eterm res;
+	Uint num_chars = 0;
+	const byte* p = bytes;
+	Uint left = bin_size;
 
-	if (bin_size > 2*MAX_ATOM_LENGTH) {
-	    byte* err_pos;
-	    Uint n;
-	    int reds_left = bin_size+1; /* Number of reductions left. */
-
-	    if (erts_analyze_utf8(bytes, bin_size, &err_pos,
-			     &n, &reds_left) == ERTS_UTF8_OK) {
-		/* 
-		 * Correct UTF-8 encoding, but too many characters to
-		 * fit in an atom.
-		 */
+	while (left) {
+	    if (++num_chars > MAX_ATOM_CHARACTERS) {
 		goto system_limit;
-	    } else {
-		/*
-		 * Something wrong in the UTF-8 encoding or Unicode code
-		 * points > 255.
-		 */
-		goto badarg;
 	    }
-	}
-
-	/*
-	 * Allocate a temporary buffer the same size as the binary,
-	 * so that we don't need an extra overflow test.
-	 */
-	buf = (char *) erts_alloc(ERTS_ALC_T_TMP, bin_size);
-	dst = buf;
-	for (i = 0; i < bin_size; i++) {
-	    int c = bytes[i];
-	    if (c < 0x80) {
-		*dst++ = c;
-	    } else if (i < bin_size-1) {
-		int c2;
-		if ((c & 0xE0) != 0xC0) {
-		    goto free_badarg;
-		}
-		i++;
-		c = (c & 0x3F) << 6;
-		c2 = bytes[i];
-		if ((c2 & 0xC0) != 0x80) {
-		    goto free_badarg;
-		}
-		c = c | (c2 & 0x3F);
-		if (0x80 <= c && c < 256) {
-		    *dst++ = c;
-		} else {
-		    goto free_badarg;
-		}
-	    } else {
-	    free_badarg:
-		erts_free(ERTS_ALC_T_TMP, (void *) buf);
-		goto badarg;
+	    if ((p[0] & 0x80) == 0) {
+		++p;
+		--left;
 	    }
+	    else if (left >= 2
+		     && (p[0] & 0xFE) == 0xC2 /* only allow latin1 subset */
+		     && (p[1] & 0xC0) == 0x80) {
+		p += 2;
+		left -= 2;
+	    }
+	    else goto badarg;
 	}
-	num_chars = dst - buf;
-	if (num_chars > MAX_ATOM_LENGTH) {
-	    erts_free(ERTS_ALC_T_TMP, (void *) buf);
-	    goto system_limit;
-	}
+
 	if (!must_exist) {
-	    res = am_atom_put(buf, num_chars);
-	    erts_free(ERTS_ALC_T_TMP, (void *) buf);
-	    erts_free_aligned_binary_bytes(temp_alloc);
-	    BIF_RET(res);
-	} else {
-	    int exists = erts_atom_get(buf, num_chars, &res);
-	    erts_free(ERTS_ALC_T_TMP, (void *) buf);
-	    if (exists) {
-		erts_free_aligned_binary_bytes(temp_alloc);
-		BIF_RET(res);
-	    } else {
-		goto badarg;
-	    }
+	    res = am_atom_put((char*)bytes, bin_size);
+	}
+	else if (!erts_atom_get((char*)bytes, bin_size, &res, 0)) {
+	    goto badarg;
 	}
+	erts_free_aligned_binary_bytes(temp_alloc);
+	BIF_RET(res);
     } else {
     badarg:
 	erts_free_aligned_binary_bytes(temp_alloc);
@@ -2670,3 +2625,30 @@ BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0)
     }
 }
 
+/* Convert 'slen' bytes of UTF-8 at 'source' to Latin-1 at 'dest' and return
+ * the number of bytes written. Assumes 'dest' has enough room. A two-byte
+ * sequence with lead byte 0xC2 or 0xC3 becomes one byte; any other octet is
+ * let through unchanged (unconvertible octets should not happen in a
+ * correctly upgraded system).
+ */
+int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen)
+{
+    const byte* sp = source;
+    const byte* end = source + slen;
+    byte* dp = dest;
+
+    while (sp < end) {
+	int two_byte = end - sp > 1
+	    && (sp[0] & 0xFE) == 0xC2
+	    && (sp[1] & 0xC0) == 0x80;
+
+	if (two_byte) {
+	    *dp++ = (byte) ((sp[0] << 6) | (sp[1] & 0x3F));
+	    sp += 2;
+	} else {
+	    *dp++ = *sp++;
+	}
+    }
+    return dp - dest;
+}
+
diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c
index 263ffc4eb3..68edcd0fa6 100644
--- a/erts/emulator/beam/external.c
+++ b/erts/emulator/beam/external.c
@@ -707,7 +707,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
 		    if (cix >= ERTS_ATOM_CACHE_SIZE)
 			ERTS_EXT_HDR_FAIL;
 		    ep++;
-#if MAX_ATOM_LENGTH > 255
+#if MAX_ATOM_CHARACTERS > 255
 		    if (long_atoms) {
 			CHKSIZE(2);
 			len = get_int16(ep);
@@ -720,7 +720,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
 			len = get_int8(ep);
 			ep++;
 		    }
-		    if (len > MAX_ATOM_LENGTH)
+		    if (len > MAX_ATOM_CHARACTERS)
 			ERTS_EXT_HDR_FAIL; /* Too long atom */
 		    CHKSIZE(len);
 		    atom = am_atom_put((char *) ep, len);
@@ -1431,18 +1431,33 @@ enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags)
     if (iix < 0) { 
 	i = atom_val(atom);
 	j = atom_tab(i)->len;
-	if ((MAX_ATOM_LENGTH <= 255 || j <= 255)
-	    && (dflags & DFLAG_SMALL_ATOM_TAGS)) {
-	    *ep++ = SMALL_ATOM_EXT;
-	    put_int8(j, ep);
-	    ep++;
+	if (dflags & DFLAG_UTF8_ATOMS) {
+	    if (j > 255) {	/* too long for the small tag: 16-bit length */
+		*ep++ = ATOM_UTF8_EXT;
+		put_int16(j, ep);
+		ep += 2;
+	    }
+	    else {		/* fits the 8-bit length of the small tag */
+		*ep++ = SMALL_ATOM_UTF8_EXT;
+		put_int8(j, ep);
+		ep++;		/* only one length byte was written */
+	    }
+	    sys_memcpy((char *) ep, (char*)atom_tab(i)->name, (int) j);
+	}
 	else {
-	    *ep++ = ATOM_EXT;
-	    put_int16(j, ep);
-	    ep += 2;
+	    if (j <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) {
+		*ep++ = SMALL_ATOM_EXT;
+		j = erts_utf8_to_latin1(ep+1, atom_tab(i)->name, j);
+		put_int8(j, ep);
+		ep++;
+	    }
+	    else {
+		*ep++ = ATOM_EXT;
+		j = erts_utf8_to_latin1(ep+2, atom_tab(i)->name, j);
+		put_int16(j, ep);
+		ep += 2;
+	    }	    
 	}
-	sys_memcpy((char *) ep, (char*)atom_tab(i)->name, (int) j);
 	ep += j;
 	return ep;
     }
@@ -1482,7 +1497,7 @@ static byte*
 dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp)
 {
     Uint len;
-    int n;
+    int n, is_latin1;
 
     switch (*ep++) {
     case ATOM_CACHE_REF:
@@ -1498,17 +1513,29 @@ dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp)
     case ATOM_EXT:
 	len = get_int16(ep),
 	ep += 2;
+	is_latin1 = 1;
         goto dec_atom_common;
     case SMALL_ATOM_EXT:
 	len = get_int8(ep);
 	ep++;
+	is_latin1 = 1;
+	goto dec_atom_common;
+    case ATOM_UTF8_EXT:
+	len = get_int16(ep),
+	ep += 2;
+	is_latin1 = 0;
+	goto dec_atom_common;
+    case SMALL_ATOM_UTF8_EXT:
+	len = get_int8(ep),
+	ep++;
+	is_latin1 = 0;
     dec_atom_common:
         if (edep && (edep->flags & ERTS_DIST_EXT_BTT_SAFE)) {
-	    if (!erts_atom_get((char*)ep, len, objp)) {
+	    if (!erts_atom_get((char*)ep, len, objp, is_latin1)) {
                 goto error;
 	    }
         } else {
-            *objp = am_atom_put((char*)ep, len);
+            *objp = am_atom_put2(ep, len, is_latin1);
         }
 	ep += len;
 	break;
@@ -2113,7 +2140,7 @@ static byte*
 dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Eterm* objp)
 {
     Eterm* hp_saved = *hpp;
-    int n;
+    int n, is_latin1;
     register Eterm* hp = *hpp;	/* Please don't take the address of hp */
     Eterm* next = objp;
 
@@ -2199,17 +2226,29 @@ dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Et
 	case ATOM_EXT:
 	    n = get_int16(ep);
 	    ep += 2;
-            goto dec_term_atom_common;
+	    is_latin1 = 1;
+	    goto dec_term_atom_common;
 	case SMALL_ATOM_EXT:
 	    n = get_int8(ep);
 	    ep++;
+	    is_latin1 = 1;
+	    goto dec_term_atom_common;
+	case ATOM_UTF8_EXT:
+	    n = get_int16(ep);
+	    ep += 2;
+	    is_latin1 = 0;
+	    goto dec_term_atom_common;
+	case SMALL_ATOM_UTF8_EXT:
+	    n = get_int8(ep);
+	    ep++;
+	    is_latin1 = 0;
 dec_term_atom_common:
 	    if (edep && (edep->flags & ERTS_DIST_EXT_BTT_SAFE)) {
-		if (!erts_atom_get((char*)ep, n, objp)) {
+		if (!erts_atom_get((char*)ep, n, objp, is_latin1)) {
 		    goto error;
 		}
 	    } else {
-	        *objp = am_atom_put((char*)ep, n);
+	        *objp = am_atom_put2(ep, n, is_latin1);
 	    }
 	    ep += n;
 	    break;
@@ -2879,14 +2918,15 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags)
 	    }
 	    else {
 		int alen = atom_tab(atom_val(obj))->len;
-		if ((MAX_ATOM_LENGTH <= 255 || alen <= 255)
-		    && (dflags & DFLAG_SMALL_ATOM_TAGS)) {
-		    /* Make sure a SMALL_ATOM_EXT fits: SMALL_ATOM_EXT l t1 t2... */
-			result += 1 + 1 + alen;
-		}
-		else {
-		    /* Make sure an ATOM_EXT fits: ATOM_EXT l1 l0 t1 t2... */
-			result += 1 + 2 + alen;
+		result += 1 + 1 + alen;
+		if (dflags & DFLAG_UTF8_ATOMS) {
+		    if (alen > 255) {
+			result++; /* ATOM_UTF8_EXT (not small) */
+		    }
+		    /*SVERK we use utf8 length which is an over estimation */
+		}		    
+		else if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS)) {
+		    result++; /* ATOM_EXT (not small) */
 		}
 		insert_acache_map(acmp, obj);
 	    }
@@ -3058,6 +3098,17 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags)
     return result;
 }
 
+/* Nonzero iff the bytes fit the atom size limits and pass erts_analyze_utf8().
+ * SVERK: do we really need to validate correct utf8 here? */
+static int is_valid_utf8_atom(byte* bytes, Uint nbytes)
+{
+    byte* err_pos;
+    Uint num_chars;
+    if (nbytes > MAX_ATOM_SZ_LIMIT) return 0;	/* over the byte limit: no scan */
+    return erts_analyze_utf8(bytes, nbytes, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK
+	&& num_chars <= MAX_ATOM_CHARACTERS;
+}
+
 static Sint
 decoded_size(byte *ep, byte* endp, int internal_tags)
 {
@@ -3125,21 +3176,41 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
 	case ATOM_EXT:
 	    CHKSIZE(2);
 	    n = get_int16(ep);
-	    if (n > MAX_ATOM_LENGTH) {
+	    if (n > MAX_ATOM_CHARACTERS) {
 		return -1;
 	    }
 	    SKIP(n+2+atom_extra_skip);
 	    atom_extra_skip = 0;
 	    break;
+	case ATOM_UTF8_EXT:
+	    CHKSIZE(2);
+	    n = get_int16(ep);
+	    ep += 2;
+	    if (!is_valid_utf8_atom(ep, n)) {
+		return -1;
+	    }
+	    SKIP(n+atom_extra_skip);
+	    atom_extra_skip = 0;
+	    break;
 	case SMALL_ATOM_EXT:
 	    CHKSIZE(1);
 	    n = get_int8(ep);
-	    if (n > MAX_ATOM_LENGTH) {
+	    if (n > MAX_ATOM_CHARACTERS) {
 		return -1;
 	    }
 	    SKIP(n+1+atom_extra_skip);
 	    atom_extra_skip = 0;
 	    break;
+	case SMALL_ATOM_UTF8_EXT:
+	    CHKSIZE(1);
+	    n = get_int8(ep);
+	    ep++;
+	    if (!is_valid_utf8_atom(ep, n)) {
+		return -1;
+	    }
+	    SKIP(n+atom_extra_skip);
+	    atom_extra_skip = 0;
+	    break;
 	case ATOM_CACHE_REF:
 	    SKIP(1+atom_extra_skip);
 	    atom_extra_skip = 0;
diff --git a/erts/emulator/beam/external.h b/erts/emulator/beam/external.h
index eddd4571dd..50eea62225 100644
--- a/erts/emulator/beam/external.h
+++ b/erts/emulator/beam/external.h
@@ -51,6 +51,8 @@
 #define NEW_FUN_EXT       'p'
 #define EXPORT_EXT        'q'
 #define FUN_EXT           'u'
+#define ATOM_UTF8_EXT     'v'
+#define SMALL_ATOM_UTF8_EXT 'w'
 
 #define DIST_HEADER       'D'
 #define ATOM_CACHE_REF    'R'
diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h
index 4c0d3421c8..1500424d3e 100755
--- a/erts/emulator/beam/global.h
+++ b/erts/emulator/beam/global.h
@@ -1525,6 +1525,7 @@ char *erts_convert_filename_to_native(Eterm name, char *statbuf,
 				      int allow_empty, int allow_atom,
 				      Sint *used /* out */);
 Eterm erts_convert_native_to_filename(Process *p, byte *bytes);
+int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen);
 #define ERTS_UTF8_OK 0
 #define ERTS_UTF8_INCOMPLETE 1
 #define ERTS_UTF8_ERROR 2
diff --git a/erts/emulator/test/bif_SUITE.erl b/erts/emulator/test/bif_SUITE.erl
index e2442861c7..02c6de8cb1 100644
--- a/erts/emulator/test/bif_SUITE.erl
+++ b/erts/emulator/test/bif_SUITE.erl
@@ -481,8 +481,6 @@ binary_to_atom(Config) when is_list(Config) ->
     %% Bad UTF8 sequences.
     ?line ?BADARG(binary_to_atom(id(<<255>>), utf8)),
     ?line ?BADARG(binary_to_atom(id(<<255,0>>), utf8)),
-    ?line ?BADARG(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)),
-    ?line ?BADARG(binary_to_atom(id(<<0:512/unit:8,255,0>>), utf8)),
     ?line ?BADARG(binary_to_atom(id(<<16#C0,16#80>>), utf8)), %Overlong 0.
     ?line [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) ||
 	      C <- lists:seq(256, 16#D7FF)],
@@ -494,6 +492,8 @@ binary_to_atom(Config) when is_list(Config) ->
 	      C <- lists:seq(16#90000, 16#10FFFF)],
 
     %% system_limit failures.
+    ?line ?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)),
+    ?line ?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255,0>>), utf8)),
     ?line ?SYS_LIMIT(binary_to_atom(<<0:256/unit:8>>, latin1)),
     ?line ?SYS_LIMIT(binary_to_atom(<<0:257/unit:8>>, latin1)),
     ?line ?SYS_LIMIT(binary_to_atom(<<0:512/unit:8>>, latin1)),
author	Sverker Eriksson <sverker@erlang.org>	2012-12-14 19:52:59 +0100
committer	Sverker Eriksson <sverker@erlang.org>	2013-01-08 11:15:01 +0100
commit	a9a57385f0a5648bbfeccc5e8ef0bfe2cdac80c7 (patch)
tree	b7fd2766b020c37a68b29b0aac47ace685841349 /erts
parent	81a5f0f91ce50416d44f48752cb2b7f4ae02d953 (diff)
download	otp-a9a57385f0a5648bbfeccc5e8ef0bfe2cdac80c7.tar.gz otp-a9a57385f0a5648bbfeccc5e8ef0bfe2cdac80c7.tar.bz2 otp-a9a57385f0a5648bbfeccc5e8ef0bfe2cdac80c7.zip