From 0dd3b88cdf90283d9c276ee415f985cb764e522f Mon Sep 17 00:00:00 2001
From: Rickard Green <rickard@erlang.org>
Date: Thu, 10 Jan 2013 12:47:46 +0100
Subject: UTF-8 support for distribution

---
 erts/emulator/beam/atom.c | 164 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 114 insertions(+), 50 deletions(-)

(limited to 'erts/emulator/beam/atom.c')

diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c
index b41a98f2a2..82dd320ea9 100644
--- a/erts/emulator/beam/atom.c
+++ b/erts/emulator/beam/atom.c
@@ -162,6 +162,7 @@ atom_alloc(Atom* tmpl)
     obj->name = atom_text_alloc(tmpl->len);
     sys_memcpy(obj->name, tmpl->name, tmpl->len);
     obj->len = tmpl->len;
+    obj->latin1_chars = tmpl->latin1_chars;
     obj->slot.index = -1;
 
     /*
@@ -192,48 +193,6 @@ atom_free(Atom* obj)
     erts_free(ERTS_ALC_T_ATOM, (void*) obj);
 }
 
-Eterm
-am_atom_put(const char* name, int len)
-{
-    Atom a;
-    Eterm ret;
-    int aix;
-#ifdef DEBUG
-    byte* err_pos;
-    Uint num_chars;
-    ASSERT(erts_analyze_utf8(name, len, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK);
-#endif
-    /*
-     * Silently truncate the atom if it is too long. Overlong atoms
-     * could occur in situations where we have no good way to return
-     * an error, such as in the I/O system. (Unfortunately, many
-     * drivers don't check for errors.)
-     *
-     * If an error should be produced for overlong atoms (such in
-     * list_to_atom/1), the caller should check the length before
-     * calling this function.
-     */
-    if (len > MAX_ATOM_SZ_LIMIT) {
-	len = MAX_ATOM_SZ_LIMIT;  /*SVERK Urk... */
-    }
-#ifdef ERTS_ATOM_PUT_OPS_STAT
-    erts_smp_atomic_inc_nob(&atom_put_ops);
-#endif
-    a.len = len;
-    a.name = (byte*)name;
-    atom_read_lock();
-    aix = index_get(&erts_atom_table, (void*) &a);
-    atom_read_unlock();
-    if (aix >= 0)
-	ret = make_atom(aix);
-    else {
-	atom_write_lock();
-	ret = make_atom(index_put(&erts_atom_table, (void*) &a));
-	atom_write_unlock();
-    }
-    return ret;
-}
-
 static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp)
 {
     byte* dst;
@@ -264,19 +223,116 @@ need_convertion:
     *lenp = dst - conv_buf;
 }
 
-
+/*
+ * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned!
+ */
 Eterm
-am_atom_put2(const byte* name, int len, int is_latin1)
+erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
 {
     byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
+    const byte *text = name;
+    int tlen = len;
+    Sint no_latin1_chars;
+    Atom a;
+    int aix;
 
-    if (is_latin1) {
-	latin1_to_utf8(utf8_copy, &name, &len);
+#ifdef ERTS_ATOM_PUT_OPS_STAT
+    erts_smp_atomic_inc_nob(&atom_put_ops);
+#endif
+
+    if (tlen < 0) {
+	if (trunc)
+	    tlen = 0;
+	else
+	    return THE_NON_VALUE;
     }
-    return am_atom_put((const char*)name, len);
-}
 
+    switch (enc) {
+    case ERTS_ATOM_ENC_7BIT_ASCII:
+	if (tlen > MAX_ATOM_CHARACTERS) {
+	    if (trunc)
+		tlen = MAX_ATOM_CHARACTERS;
+	    else
+		return THE_NON_VALUE;
+	}
+#ifdef DEBUG
+	for (aix = 0; aix < len; aix++) {
+	    ASSERT((name[aix] & 0x80) == 0);
+	}
+#endif
+	no_latin1_chars = tlen;
+	break;
+    case ERTS_ATOM_ENC_LATIN1:
+	if (tlen > MAX_ATOM_CHARACTERS) {
+	    if (trunc)
+		tlen = MAX_ATOM_CHARACTERS;
+	    else
+		return THE_NON_VALUE;
+	}
+	no_latin1_chars = tlen;
+	latin1_to_utf8(utf8_copy, &text, &tlen);
+	break;
+    case ERTS_ATOM_ENC_UTF8:
+	/* First sanity check; need to verify later */
+	if (tlen > MAX_ATOM_SZ_LIMIT && !trunc)
+	    return THE_NON_VALUE;
+	break;
+    }
 
+    a.len = tlen;
+    a.name = (byte *) text;
+    atom_read_lock();
+    aix = index_get(&erts_atom_table, (void*) &a);
+    atom_read_unlock();
+    if (aix >= 0) {
+	/* Already in table no need to verify it */
+	return make_atom(aix);
+    }
+
+    if (enc == ERTS_ATOM_ENC_UTF8) {
+	/* Need to verify encoding and length */
+	byte *err_pos;
+	Uint no_chars;
+	switch (erts_analyze_utf8_x((byte *) text,
+				    (Uint) tlen,
+				    &err_pos,
+				    &no_chars, NULL,
+				    &no_latin1_chars,
+				    MAX_ATOM_CHARACTERS)) {
+	case ERTS_UTF8_OK:
+	    ASSERT(no_chars <= MAX_ATOM_CHARACTERS);
+	    break;
+	case ERTS_UTF8_OK_MAX_CHARS:
+	    /* Truncated... */
+	    if (!trunc)
+		return THE_NON_VALUE;
+	    ASSERT(no_chars == MAX_ATOM_CHARACTERS);
+	    tlen = err_pos - text;
+	    break;
+	default:
+	    /* Bad utf8... */
+	    return THE_NON_VALUE;
+	}
+    }
+
+    ASSERT(tlen <= MAX_ATOM_SZ_LIMIT);
+    ASSERT(-1 <= no_latin1_chars && no_latin1_chars <= MAX_ATOM_CHARACTERS);
+
+    a.len = tlen;
+    a.latin1_chars = (Sint16) no_latin1_chars;
+    a.name = (byte *) text;
+    atom_write_lock();
+    aix = index_put(&erts_atom_table, (void*) &a);
+    atom_write_unlock();
+    return make_atom(aix);
+}
+
+Eterm
+am_atom_put(const char* name, int len)
+{
+    /* Assumes 7-bit ascii; use erts_atom_put() for other encodings... */
+    return erts_atom_put((byte *) name, len, ERTS_ATOM_ENC_7BIT_ASCII, 1);
+}
 
 int atom_table_size(void)
 {
@@ -318,10 +374,11 @@ erts_atom_get(const char *name, int len, Eterm* ap, int is_latin1)
     int i;
     int res;
 
-    a.len = len;
+    a.len = (Sint16) len;
     a.name = (byte *)name;
     if (is_latin1) {
-	latin1_to_utf8(utf8_copy, (const byte**)&a.name, &a.len);
+	latin1_to_utf8(utf8_copy, (const byte**)&a.name, &len);
+	a.len = (Sint16) len;
     }
     atom_read_lock();
     i = index_get(&erts_atom_table, (void*) &a);
@@ -384,8 +441,15 @@ init_atom_table(void)
     for (i = 0; erl_atom_names[i] != 0; i++) {
 	int ix;
 	a.len = strlen(erl_atom_names[i]);
+	a.latin1_chars = a.len;
 	a.name = (byte*)erl_atom_names[i];
 	a.slot.index = i;
+#ifdef DEBUG
+	/* Verify 7-bit ascii */
+	for (ix = 0; ix < a.len; ix++) {
+	    ASSERT((a.name[ix] & 0x80) == 0);
+	}
+#endif
 	ix = index_put(&erts_atom_table, (void*) &a);
 	atom_text_pos -= a.len;
 	atom_space -= a.len;
-- 
cgit v1.2.3