aboutsummaryrefslogtreecommitdiffstats
path: root/erts
diff options
context:
space:
mode:
author Rickard Green <rickard@erlang.org> 2013-01-10 12:47:46 +0100
committer Rickard Green <rickard@erlang.org> 2013-01-16 17:16:52 +0100
commit 0dd3b88cdf90283d9c276ee415f985cb764e522f (patch)
tree 1584d76d9960339a03c04412ef7919473e7b2efc /erts
parent 5d79f55ca441727578d34b78ee0d6d8aa80976ee (diff)
download otp-0dd3b88cdf90283d9c276ee415f985cb764e522f.tar.gz
otp-0dd3b88cdf90283d9c276ee415f985cb764e522f.tar.bz2
otp-0dd3b88cdf90283d9c276ee415f985cb764e522f.zip
UTF-8 support for distribution
Diffstat (limited to 'erts')
-rw-r--r-- erts/emulator/beam/atom.c 164
-rw-r--r-- erts/emulator/beam/atom.h 13
-rw-r--r-- erts/emulator/beam/atom.names 2
-rw-r--r-- erts/emulator/beam/beam_load.c 4
-rw-r--r-- erts/emulator/beam/bif.c 26
-rw-r--r-- erts/emulator/beam/dist.c 8
-rw-r--r-- erts/emulator/beam/erl_alloc.types 2
-rw-r--r-- erts/emulator/beam/erl_bif_ddll.c 5
-rwxr-xr-x erts/emulator/beam/erl_bif_info.c 22
-rw-r--r-- erts/emulator/beam/erl_bif_port.c 2
-rw-r--r-- erts/emulator/beam/erl_db.c 2
-rw-r--r-- erts/emulator/beam/erl_db_util.c 3
-rw-r--r-- erts/emulator/beam/erl_init.c 4
-rw-r--r-- erts/emulator/beam/erl_unicode.c 139
-rw-r--r-- erts/emulator/beam/external.c 193
-rw-r--r-- erts/emulator/beam/external.h 5
-rwxr-xr-x erts/emulator/beam/global.h 10
-rw-r--r-- erts/emulator/beam/io.c 28
-rw-r--r-- erts/emulator/beam/utils.c 2
19 files changed, 440 insertions, 194 deletions
diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c
index b41a98f2a2..82dd320ea9 100644
--- a/erts/emulator/beam/atom.c
+++ b/erts/emulator/beam/atom.c
@@ -162,6 +162,7 @@ atom_alloc(Atom* tmpl)
obj->name = atom_text_alloc(tmpl->len);
sys_memcpy(obj->name, tmpl->name, tmpl->len);
obj->len = tmpl->len;
+ obj->latin1_chars = tmpl->latin1_chars;
obj->slot.index = -1;
/*
@@ -192,48 +193,6 @@ atom_free(Atom* obj)
erts_free(ERTS_ALC_T_ATOM, (void*) obj);
}
-Eterm
-am_atom_put(const char* name, int len)
-{
- Atom a;
- Eterm ret;
- int aix;
-#ifdef DEBUG
- byte* err_pos;
- Uint num_chars;
- ASSERT(erts_analyze_utf8(name, len, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK);
-#endif
- /*
- * Silently truncate the atom if it is too long. Overlong atoms
- * could occur in situations where we have no good way to return
- * an error, such as in the I/O system. (Unfortunately, many
- * drivers don't check for errors.)
- *
- * If an error should be produced for overlong atoms (such in
- * list_to_atom/1), the caller should check the length before
- * calling this function.
- */
- if (len > MAX_ATOM_SZ_LIMIT) {
- len = MAX_ATOM_SZ_LIMIT; /*SVERK Urk... */
- }
-#ifdef ERTS_ATOM_PUT_OPS_STAT
- erts_smp_atomic_inc_nob(&atom_put_ops);
-#endif
- a.len = len;
- a.name = (byte*)name;
- atom_read_lock();
- aix = index_get(&erts_atom_table, (void*) &a);
- atom_read_unlock();
- if (aix >= 0)
- ret = make_atom(aix);
- else {
- atom_write_lock();
- ret = make_atom(index_put(&erts_atom_table, (void*) &a));
- atom_write_unlock();
- }
- return ret;
-}
-
static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp)
{
byte* dst;
@@ -264,19 +223,116 @@ need_convertion:
*lenp = dst - conv_buf;
}
-
+/*
+ * Look up the atom with the given name in the atom table, inserting
+ * it if it is not already present.
+ *
+ * name, len - atom text and its length in bytes, encoded as 'enc'
+ * enc       - encoding of name:
+ *               ERTS_ATOM_ENC_7BIT_ASCII - used as is (that every byte
+ *                                          has its high bit clear is
+ *                                          only checked by ASSERT)
+ *               ERTS_ATOM_ENC_LATIN1     - converted to UTF-8 first
+ *               ERTS_ATOM_ENC_UTF8       - verified before insertion
+ * trunc     - if non-zero, a negative length is treated as zero and an
+ *             over-long name is truncated; if zero, such input fails
+ *
+ * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned!
+ * It fails for invalid UTF-8, and for bad lengths when trunc is zero.
+ */
Eterm
-am_atom_put2(const byte* name, int len, int is_latin1)
+erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
{
byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
+    const byte *text = name;
+    int tlen = len;
+    Sint no_latin1_chars = -1; /* -1: cannot be encoded in latin1 */
+    Atom a;
+    int aix;
- if (is_latin1) {
- latin1_to_utf8(utf8_copy, &name, &len);
+#ifdef ERTS_ATOM_PUT_OPS_STAT
+    erts_smp_atomic_inc_nob(&atom_put_ops);
+#endif
+
+    if (tlen < 0) {
+        if (trunc)
+            tlen = 0;
+        else
+            return THE_NON_VALUE;
}
- return am_atom_put((const char*)name, len);
-}
+    switch (enc) {
+    case ERTS_ATOM_ENC_7BIT_ASCII:
+        if (tlen > MAX_ATOM_CHARACTERS) {
+            if (trunc)
+                tlen = MAX_ATOM_CHARACTERS;
+            else
+                return THE_NON_VALUE;
+        }
+#ifdef DEBUG
+        /* Only check the bytes that are actually used (after truncation) */
+        for (aix = 0; aix < tlen; aix++) {
+            ASSERT((name[aix] & 0x80) == 0);
+        }
+#endif
+        no_latin1_chars = tlen;
+        break;
+    case ERTS_ATOM_ENC_LATIN1:
+        if (tlen > MAX_ATOM_CHARACTERS) {
+            if (trunc)
+                tlen = MAX_ATOM_CHARACTERS;
+            else
+                return THE_NON_VALUE;
+        }
+        /* One latin1 byte is one character; record before conversion */
+        no_latin1_chars = tlen;
+        latin1_to_utf8(utf8_copy, &text, &tlen);
+        break;
+    case ERTS_ATOM_ENC_UTF8:
+        /* First sanity check; need to verify later */
+        if (tlen > MAX_ATOM_SZ_LIMIT && !trunc)
+            return THE_NON_VALUE;
+        break;
+    }
+    a.len = tlen;
+    a.name = (byte *) text;
+    atom_read_lock();
+    aix = index_get(&erts_atom_table, (void*) &a);
+    atom_read_unlock();
+    if (aix >= 0) {
+        /* Already in table no need to verify it */
+        return make_atom(aix);
+    }
+
+    if (enc == ERTS_ATOM_ENC_UTF8) {
+        /* Need to verify encoding and length */
+        byte *err_pos;
+        Uint no_chars;
+        switch (erts_analyze_utf8_x((byte *) text,
+                                    (Uint) tlen,
+                                    &err_pos,
+                                    &no_chars, NULL,
+                                    &no_latin1_chars,
+                                    MAX_ATOM_CHARACTERS)) {
+        case ERTS_UTF8_OK:
+            ASSERT(no_chars <= MAX_ATOM_CHARACTERS);
+            break;
+        case ERTS_UTF8_OK_MAX_CHARS:
+            /* Truncated... */
+            if (!trunc)
+                return THE_NON_VALUE;
+            ASSERT(no_chars == MAX_ATOM_CHARACTERS);
+            tlen = err_pos - text;
+            /*
+             * When the analysis stops at the character limit it
+             * returns without storing the latin1 character count.
+             * Analyze the truncated text once more, without a limit,
+             * so that no_latin1_chars is set for the text we keep.
+             */
+            if (erts_analyze_utf8_x((byte *) text,
+                                    (Uint) tlen,
+                                    &err_pos,
+                                    &no_chars, NULL,
+                                    &no_latin1_chars,
+                                    0) != ERTS_UTF8_OK)
+                return THE_NON_VALUE;
+            break;
+        default:
+            /* Bad utf8... */
+            return THE_NON_VALUE;
+        }
+    }
+
+    ASSERT(tlen <= MAX_ATOM_SZ_LIMIT);
+    ASSERT(-1 <= no_latin1_chars && no_latin1_chars <= MAX_ATOM_CHARACTERS);
+
+    a.len = tlen;
+    a.latin1_chars = (Sint16) no_latin1_chars;
+    a.name = (byte *) text;
+    atom_write_lock();
+    aix = index_put(&erts_atom_table, (void*) &a);
+    atom_write_unlock();
+    return make_atom(aix);
+}
+
+Eterm
+am_atom_put(const char* name, int len)
+{
+    /*
+     * Convenience wrapper for plain C strings. The name must be 7-bit
+     * ascii; callers with other encodings use erts_atom_put() directly.
+     * Truncation is requested (last argument is 1).
+     */
+    const byte *ascii_name = (const byte *) name;
+
+    return erts_atom_put(ascii_name, len, ERTS_ATOM_ENC_7BIT_ASCII, 1);
+}
int atom_table_size(void)
{
@@ -318,10 +374,11 @@ erts_atom_get(const char *name, int len, Eterm* ap, int is_latin1)
int i;
int res;
- a.len = len;
+ a.len = (Sint16) len;
a.name = (byte *)name;
if (is_latin1) {
- latin1_to_utf8(utf8_copy, (const byte**)&a.name, &a.len);
+ latin1_to_utf8(utf8_copy, (const byte**)&a.name, &len);
+ a.len = (Sint16) len;
}
atom_read_lock();
i = index_get(&erts_atom_table, (void*) &a);
@@ -384,8 +441,15 @@ init_atom_table(void)
for (i = 0; erl_atom_names[i] != 0; i++) {
int ix;
a.len = strlen(erl_atom_names[i]);
+ a.latin1_chars = a.len;
a.name = (byte*)erl_atom_names[i];
a.slot.index = i;
+#ifdef DEBUG
+ /* Verify 7-bit ascii */
+ for (ix = 0; ix < a.len; ix++) {
+ ASSERT((a.name[ix] & 0x80) == 0);
+ }
+#endif
ix = index_put(&erts_atom_table, (void*) &a);
atom_text_pos -= a.len;
atom_space -= a.len;
diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h
index 84dd6d8901..f721999a4c 100644
--- a/erts/emulator/beam/atom.h
+++ b/erts/emulator/beam/atom.h
@@ -47,7 +47,8 @@
*/
typedef struct atom {
IndexSlot slot; /* MUST BE LOCATED AT TOP OF STRUCT!!! */
- int len; /* length of atom name */
+ Sint16 len; /* length in bytes of atom name (UTF-8 encoded) */
+ Sint16 latin1_chars; /* number of characters (0-255) if atom can be encoded in latin1; otherwise, -1 */
int ord0; /* ordinal value of first 3 bytes + 7 bits */
byte* name; /* name of atom */
} Atom;
@@ -113,6 +114,12 @@ ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1)
#endif
+/* Encoding of the name passed to erts_atom_put() */
+typedef enum {
+ ERTS_ATOM_ENC_7BIT_ASCII, /* high bit of every byte clear (only checked by ASSERT) */
+ ERTS_ATOM_ENC_LATIN1, /* converted to UTF-8 by erts_atom_put() */
+ ERTS_ATOM_ENC_UTF8 /* verified by erts_atom_put() before insertion */
+} ErtsAtomEncoding;
+
/*
* Note, ERTS_IS_ATOM_STR() expects the first argument to be a
* 7-bit ASCII string literal.
@@ -125,8 +132,8 @@ ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1)
int atom_table_size(void); /* number of elements */
int atom_table_sz(void); /* table size in bytes, excluding stored objects */
-Eterm am_atom_put(const char*, int); /* most callers pass plain char*'s */
-Eterm am_atom_put2(const byte*, int, int is_latin1);
+Eterm am_atom_put(const char*, int); /* ONLY 7-bit ascii! */
+Eterm erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc);
int atom_erase(byte*, int);
int atom_static_put(byte*, int);
void init_atom_table(void);
diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names
index afcbd732df..59c9f39e7b 100644
--- a/erts/emulator/beam/atom.names
+++ b/erts/emulator/beam/atom.names
@@ -18,6 +18,8 @@
#
#
+# IMPORTANT! All atoms defined here *need* to be in 7-bit ascii!
+#
# File format:
#
# Lines starting with '#' are ignored.
diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c
index 7bb964d5aa..8b4135e21d 100644
--- a/erts/emulator/beam/beam_load.c
+++ b/erts/emulator/beam/beam_load.c
@@ -1230,7 +1230,7 @@ load_atom_table(LoaderState* stp)
GetByte(stp, n);
GetString(stp, atom, n);
- stp->atom[i] = am_atom_put2(atom, n, 1);
+ stp->atom[i] = erts_atom_put(atom, n, ERTS_ATOM_ENC_LATIN1, 1);
}
/*
@@ -1620,7 +1620,7 @@ read_line_table(LoaderState* stp)
GetInt(stp, 2, n);
GetString(stp, fname, n);
- stp->fname[i] = am_atom_put((char*)fname, n); /*SVERK ? */
+ stp->fname[i] = erts_atom_put(fname, n, ERTS_ATOM_ENC_LATIN1, 1);
}
}
diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c
index 89157068c0..a0b4a8c049 100644
--- a/erts/emulator/beam/bif.c
+++ b/erts/emulator/beam/bif.c
@@ -2536,11 +2536,13 @@ BIF_RETTYPE append_element_2(BIF_ALIST_2)
BIF_RETTYPE atom_to_list_1(BIF_ALIST_1)
{
- Eterm do_utf8_to_list(Process*, Uint num, byte *bytes, Uint sz, Uint left,
- Uint *num_built, Uint *num_eaten, Eterm tail); /*SVERK */
Atom* ap;
Uint num_chars, num_built, num_eaten;
+ byte* err_pos;
Eterm res;
+#ifdef DEBUG
+ int ares;
+#endif
if (is_not_atom(BIF_ARG_1))
BIF_ERROR(BIF_P, BADARG);
@@ -2549,16 +2551,15 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1)
ap = atom_tab(atom_val(BIF_ARG_1));
if (ap->len == 0)
BIF_RET(NIL); /* the empty atom */
- {
- byte* err_pos;
- if (erts_analyze_utf8(ap->name, ap->len, &err_pos, &num_chars, NULL)
- != ERTS_UTF8_OK) {
- BIF_ERROR(BIF_P, BADARG);
- }
- }
+
+#ifdef DEBUG
+ ares =
+#endif
+ erts_analyze_utf8(ap->name, ap->len, &err_pos, &num_chars, NULL);
+ ASSERT(ares == ERTS_UTF8_OK);
- res = do_utf8_to_list(BIF_P, num_chars, ap->name, ap->len, ap->len,
- &num_built, &num_eaten, NIL);
+ res = erts_utf8_to_list(BIF_P, num_chars, ap->name, ap->len, ap->len,
+ &num_built, &num_eaten, NIL);
ASSERT(num_built == num_chars);
ASSERT(num_eaten == ap->len);
BIF_RET(res);
@@ -2582,7 +2583,8 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1)
}
BIF_ERROR(BIF_P, BADARG);
}
- res = am_atom_put2((byte*)buf, i, 1);
+ res = erts_atom_put((byte *) buf, i, ERTS_ATOM_ENC_LATIN1, 1);
+ ASSERT(is_atom(res));
erts_free(ERTS_ALC_T_TMP, (void *) buf);
BIF_RET(res);
}
diff --git a/erts/emulator/beam/dist.c b/erts/emulator/beam/dist.c
index 28c4621ff2..8c3bcd1de4 100644
--- a/erts/emulator/beam/dist.c
+++ b/erts/emulator/beam/dist.c
@@ -1646,7 +1646,7 @@ dsig_send(ErtsDSigData *dsdp, Eterm ctl, Eterm msg, int force_busy)
data_size += erts_encode_dist_ext_size(ctl, flags, acmp);
if (is_value(msg))
data_size += erts_encode_dist_ext_size(msg, flags, acmp);
- erts_finalize_atom_cache_map(acmp);
+ erts_finalize_atom_cache_map(acmp, flags);
dhdr_ext_size = erts_encode_ext_dist_header_size(acmp);
data_size += dhdr_ext_size;
@@ -1996,7 +1996,8 @@ erts_dist_command(Port *prt, int reds_limit)
ASSERT(ob);
do {
ob->extp = erts_encode_ext_dist_header_finalize(ob->extp,
- dep->cache);
+ dep->cache,
+ flags);
if (!(flags & DFLAG_DIST_HDR_ATOM_CACHE))
*--ob->extp = PASS_THROUGH; /* Old node; 'pass through'
needed */
@@ -2040,7 +2041,8 @@ erts_dist_command(Port *prt, int reds_limit)
Uint size;
oq.first->extp
= erts_encode_ext_dist_header_finalize(oq.first->extp,
- dep->cache);
+ dep->cache,
+ flags);
reds += ERTS_PORT_REDS_DIST_CMD_FINALIZE;
if (!(flags & DFLAG_DIST_HDR_ATOM_CACHE))
*--oq.first->extp = PASS_THROUGH; /* Old node; 'pass through'
diff --git a/erts/emulator/beam/erl_alloc.types b/erts/emulator/beam/erl_alloc.types
index 0a4407f009..2b649b589b 100644
--- a/erts/emulator/beam/erl_alloc.types
+++ b/erts/emulator/beam/erl_alloc.types
@@ -49,6 +49,8 @@
# true after a "+enable X" statement or if it has been passed as a
# command line argument to make_alloc_types. The variable X is false
# after a "+disable X" statement or if it has never been mentioned.
+#
+# IMPORTANT! Only use 7-bit ascii text in this file!
+if smp
+disable threads_no_smp
diff --git a/erts/emulator/beam/erl_bif_ddll.c b/erts/emulator/beam/erl_bif_ddll.c
index 7f7c975e78..59a53870f3 100644
--- a/erts/emulator/beam/erl_bif_ddll.c
+++ b/erts/emulator/beam/erl_bif_ddll.c
@@ -1869,7 +1869,10 @@ static Eterm build_load_error_hp(Eterm *hp, int code)
static Eterm mkatom(char *str)
{
- return am_atom_put(str, sys_strlen(str));
+    /*
+     * Make an atom from a NUL-terminated C string. The text is passed
+     * on as latin1, with truncation requested.
+     */
+    const byte *text = (const byte *) str;
+    int text_len = sys_strlen(str);
+
+    return erts_atom_put(text, text_len, ERTS_ATOM_ENC_LATIN1, 1);
}
static char *pick_list_or_atom(Eterm name_term)
diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c
index a3811ccdb0..c910bd0cb6 100755
--- a/erts/emulator/beam/erl_bif_info.c
+++ b/erts/emulator/beam/erl_bif_info.c
@@ -2296,8 +2296,10 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1)
for (i = num_instructions-1; i >= 0; i--) {
res = erts_bld_cons(hpp, hszp,
erts_bld_tuple(hpp, hszp, 2,
- am_atom_put(opc[i].name,
- strlen(opc[i].name)),
+ erts_atom_put(opc[i].name,
+ strlen(opc[i].name),
+ ERTS_ATOM_ENC_LATIN1,
+ 1),
erts_bld_uint(hpp, hszp,
opc[i].count)),
res);
@@ -3901,7 +3903,7 @@ static Eterm lcnt_build_lock_stats_term(Eterm **hpp, Uint *szp, erts_lcnt_lock_s
timer_ns = stats->timer.ns;
timer_n = stats->timer_n;
- af = am_atom_put(stats->file, strlen(stats->file));
+ af = erts_atom_put(stats->file, strlen(stats->file), ERTS_ATOM_ENC_LATIN1, 1);
uil = erts_bld_uint( hpp, szp, line);
tloc = erts_bld_tuple(hpp, szp, 2, af, uil);
@@ -3938,13 +3940,13 @@ static Eterm lcnt_build_lock_term(Eterm **hpp, Uint *szp, erts_lcnt_lock_t *lock
ASSERT(ltype);
- type = am_atom_put(ltype, strlen(ltype));
- name = am_atom_put(lock->name, strlen(lock->name));
+ type = erts_atom_put(ltype, strlen(ltype), ERTS_ATOM_ENC_LATIN1, 1);
+ name = erts_atom_put(lock->name, strlen(lock->name), ERTS_ATOM_ENC_LATIN1, 1);
if (lock->flag & ERTS_LCNT_LT_ALLOC) {
/* use allocator types names as id's for allocator locks */
ltype = (char *) ERTS_ALC_A2AD(signed_val(lock->id));
- id = am_atom_put(ltype, strlen(ltype));
+ id = erts_atom_put(ltype, strlen(ltype), ERTS_ATOM_ENC_LATIN1, 1);
} else if (lock->flag & ERTS_LCNT_LT_PROCLOCK) {
/* use registered names as id's for process locks if available */
proc = erts_proc_lookup(lock->id);
@@ -3984,12 +3986,12 @@ static Eterm lcnt_build_result_term(Eterm **hpp, Uint *szp, erts_lcnt_data_t *da
dtns = erts_bld_uint( hpp, szp, data->duration.ns);
tdt = erts_bld_tuple(hpp, szp, 2, dts, dtns);
- adur = am_atom_put(str_duration, strlen(str_duration));
+ adur = erts_atom_put(str_duration, strlen(str_duration), ERTS_ATOM_ENC_LATIN1, 1);
tdur = erts_bld_tuple(hpp, szp, 2, adur, tdt);
/* lock tuple */
- aloc = am_atom_put(str_locks, strlen(str_locks));
+ aloc = erts_atom_put(str_locks, strlen(str_locks), ERTS_ATOM_ENC_LATIN1, 1);
for (lock = data->current_locks->head; lock != NULL ; lock = lock->next ) {
lloc = lcnt_build_lock_term(hpp, szp, lock, lloc);
@@ -4125,14 +4127,14 @@ BIF_RETTYPE erts_debug_lock_counters_1(BIF_ALIST_1)
static void os_info_init(void)
{
- Eterm type = am_atom_put(os_type, strlen(os_type));
+ Eterm type = erts_atom_put((byte *) os_type, strlen(os_type), ERTS_ATOM_ENC_LATIN1, 1);
Eterm flav;
int major, minor, build;
char* buf = erts_alloc(ERTS_ALC_T_TMP, 1024); /* More than enough */
Eterm* hp;
os_flavor(buf, 1024);
- flav = am_atom_put(buf, strlen(buf));
+ flav = erts_atom_put((byte *) buf, strlen(buf), ERTS_ATOM_ENC_LATIN1, 1);
erts_free(ERTS_ALC_T_TMP, (void *) buf);
hp = erts_alloc(ERTS_ALC_T_LL_TEMP_TERM, (3+4)*sizeof(Eterm));
os_type_tuple = TUPLE2(hp, type, flav);
diff --git a/erts/emulator/beam/erl_bif_port.c b/erts/emulator/beam/erl_bif_port.c
index f9009166c0..4b270414cb 100644
--- a/erts/emulator/beam/erl_bif_port.c
+++ b/erts/emulator/beam/erl_bif_port.c
@@ -68,7 +68,7 @@ BIF_RETTYPE open_port_2(BIF_ALIST_2)
} else {
str = "einval";
}
- BIF_P->fvalue = am_atom_put(str, strlen(str));
+ BIF_P->fvalue = erts_atom_put((byte *) str, strlen(str), ERTS_ATOM_ENC_LATIN1, 1);
BIF_ERROR(BIF_P, EXC_ERROR);
}
diff --git a/erts/emulator/beam/erl_db.c b/erts/emulator/beam/erl_db.c
index 1ba1048afa..f8a4882ec0 100644
--- a/erts/emulator/beam/erl_db.c
+++ b/erts/emulator/beam/erl_db.c
@@ -3815,7 +3815,7 @@ erts_ets_colliding_names(Process* p, Eterm name, Uint cnt)
while (index >= atom_table_size()) {
char tmp[20];
erts_snprintf(tmp, sizeof(tmp), "am%x", atom_table_size());
- am_atom_put(tmp,strlen(tmp));
+ erts_atom_put((byte *) tmp, strlen(tmp), ERTS_ATOM_ENC_LATIN1, 1);
}
list = CONS(hp, make_atom(index), list);
hp += 2;
diff --git a/erts/emulator/beam/erl_db_util.c b/erts/emulator/beam/erl_db_util.c
index 0c9ca83ce4..407a55a3d7 100644
--- a/erts/emulator/beam/erl_db_util.c
+++ b/erts/emulator/beam/erl_db_util.c
@@ -4768,7 +4768,8 @@ static int match_compact(ErlHeapFragment *expr, DMCErrInfo *err_info)
ASSERT(j < x);
erts_snprintf(buff+1, sizeof(buff) - 1, "%u", (unsigned) j);
/* Yes, writing directly into terms, they ARE off heap */
- *p = am_atom_put(buff, strlen(buff));
+ *p = erts_atom_put((byte *) buff, strlen(buff),
+ ERTS_ATOM_ENC_LATIN1, 1);
}
++p;
}
diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c
index 4b90e5394a..369eab5980 100644
--- a/erts/emulator/beam/erl_init.c
+++ b/erts/emulator/beam/erl_init.c
@@ -344,7 +344,7 @@ erl_first_process_otp(char* modname, void* code, unsigned size, int argc, char**
ErlSpawnOpts so;
Eterm env;
- start_mod = am_atom_put(modname, sys_strlen(modname));
+ start_mod = erts_atom_put((byte *) modname, sys_strlen(modname), ERTS_ATOM_ENC_LATIN1, 1);
if (erts_find_function(start_mod, am_start, 2,
erts_active_code_ix()) == NULL) {
erl_exit(5, "No function %s:start/2\n", modname);
@@ -441,7 +441,7 @@ load_preloaded(void)
i = 0;
while ((name = preload_p[i].name) != NULL) {
length = preload_p[i].size;
- module_name = am_atom_put(name, sys_strlen(name));
+ module_name = erts_atom_put((byte *) name, sys_strlen(name), ERTS_ATOM_ENC_LATIN1, 1);
if ((code = sys_preload_begin(&preload_p[i])) == 0)
erl_exit(1, "Failed to find preloaded code for module %s\n",
name);
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
index e24b6f1458..6600ce4a4a 100644
--- a/erts/emulator/beam/erl_unicode.c
+++ b/erts/emulator/beam/erl_unicode.c
@@ -1154,15 +1154,24 @@ BIF_RETTYPE unicode_characters_to_list_2(BIF_ALIST_2)
* When input to characters_to_list is a plain binary and the format is 'unicode', we do
* a faster analyze and size count with this function.
*/
-int erts_analyze_utf8(byte *source, Uint size,
- byte **err_pos, Uint *num_chars, int *left)
+static ERTS_INLINE int
+analyze_utf8(byte *source, Uint size, byte **err_pos, Uint *num_chars, int *left,
+ Sint *num_latin1_chars, Uint max_chars)
{
+ Uint latin1_count;
+ int is_latin1;
*err_pos = source;
+ if (num_latin1_chars) {
+ is_latin1 = 1;
+ latin1_count = 0;
+ }
*num_chars = 0;
while (size) {
if (((*source) & ((byte) 0x80)) == 0) {
source++;
- --size;
+ --size;
+ if (num_latin1_chars)
+ latin1_count++;
} else if (((*source) & ((byte) 0xE0)) == 0xC0) {
if (size < 2) {
return ERTS_UTF8_INCOMPLETE;
@@ -1173,6 +1182,11 @@ int erts_analyze_utf8(byte *source, Uint size,
}
source += 2;
size -= 2;
+ if (num_latin1_chars) {
+ latin1_count++;
+ if ((source[0] & ((byte) 0xFC)) != ((byte) 0xC0))
+ is_latin1 = 0;
+ }
} else if (((*source) & ((byte) 0xF0)) == 0xE0) {
if (size < 3) {
return ERTS_UTF8_INCOMPLETE;
@@ -1188,6 +1202,8 @@ int erts_analyze_utf8(byte *source, Uint size,
}
source += 3;
size -= 3;
+ if (num_latin1_chars)
+ is_latin1 = 0;
} else if (((*source) & ((byte) 0xF8)) == 0xF0) {
if (size < 4) {
return ERTS_UTF8_INCOMPLETE;
@@ -1205,22 +1221,41 @@ int erts_analyze_utf8(byte *source, Uint size,
}
source += 4;
size -= 4;
+ if (num_latin1_chars)
+ is_latin1 = 0;
} else {
return ERTS_UTF8_ERROR;
}
++(*num_chars);
*err_pos = source;
+ if (max_chars && size > 0 && *num_chars == max_chars)
+ return ERTS_UTF8_OK_MAX_CHARS;
if (left && --(*left) <= 0 && size) {
return ERTS_UTF8_ANALYZE_MORE;
}
}
+ if (num_latin1_chars)
+ *num_latin1_chars = is_latin1 ? latin1_count : -1;
return ERTS_UTF8_OK;
}
+/*
+ * Plain analysis: no latin1 accounting (NULL) and no limit on the
+ * number of characters (0).
+ */
+int erts_analyze_utf8(byte *source, Uint size,
+                      byte **err_pos, Uint *num_chars, int *left)
+{
+    int result;
+
+    result = analyze_utf8(source, size, err_pos, num_chars, left, NULL, 0);
+    return result;
+}
+
+/*
+ * Extended analysis: as erts_analyze_utf8(), but also passes on
+ * num_latin1_chars and max_chars to analyze_utf8().
+ */
+int erts_analyze_utf8_x(byte *source, Uint size,
+                        byte **err_pos, Uint *num_chars, int *left,
+                        Sint *num_latin1_chars, Uint max_chars)
+{
+    int result;
+
+    result = analyze_utf8(source, size, err_pos, num_chars, left,
+                          num_latin1_chars, max_chars);
+    return result;
+}
+
/*
* No errors should be able to occur - no overlongs, no malformed, no nothing
- */
-Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz,
+ */
+static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz,
Uint left,
Uint *num_built, Uint *num_eaten, Eterm tail)
{
@@ -1275,6 +1310,12 @@ Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz,
return ret;
}
+/*
+ * Exported entry point for the file-local do_utf8_to_list(); all
+ * arguments are passed through unchanged.
+ */
+Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left,
+                        Uint *num_built, Uint *num_eaten, Eterm tail)
+{
+    Eterm list;
+
+    list = do_utf8_to_list(p, num, bytes, sz, left,
+                           num_built, num_eaten, tail);
+    return list;
+}
+
static int is_candidate(Uint cp)
{
int index,pos;
@@ -1849,14 +1890,14 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2)
}
static BIF_RETTYPE
-binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist)
+binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist)
{
byte* bytes;
byte *temp_alloc = NULL;
Uint bin_size;
if ((bytes = erts_get_aligned_binary_bytes(bin, &temp_alloc)) == 0) {
- BIF_ERROR(p, BADARG);
+ BIF_ERROR(proc, BADARG);
}
bin_size = binary_size(bin);
if (enc == am_latin1) {
@@ -1864,11 +1905,16 @@ binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist)
if (bin_size > MAX_ATOM_CHARACTERS) {
system_limit:
erts_free_aligned_binary_bytes(temp_alloc);
- BIF_ERROR(p, SYSTEM_LIMIT);
+ BIF_ERROR(proc, SYSTEM_LIMIT);
}
if (!must_exist) {
- a = am_atom_put2(bytes, bin_size, 1);
- erts_free_aligned_binary_bytes(temp_alloc);
+ a = erts_atom_put((byte *) bytes,
+ bin_size,
+ ERTS_ATOM_ENC_LATIN1,
+ 0);
+ erts_free_aligned_binary_bytes(temp_alloc);
+ if (is_non_value(a))
+ goto badarg;
BIF_RET(a);
} else if (erts_atom_get((char *)bytes, bin_size, &a, 1)) {
erts_free_aligned_binary_bytes(temp_alloc);
@@ -1900,17 +1946,22 @@ binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist)
}
if (!must_exist) {
- res = am_atom_put((char*)bytes, bin_size);
+ res = erts_atom_put((byte *) bytes,
+ bin_size,
+ ERTS_ATOM_ENC_UTF8,
+ 0);
}
else if (!erts_atom_get((char*)bytes, bin_size, &res, 0)) {
goto badarg;
}
erts_free_aligned_binary_bytes(temp_alloc);
+ if (is_non_value(res))
+ goto badarg;
BIF_RET(res);
} else {
badarg:
erts_free_aligned_binary_bytes(temp_alloc);
- BIF_ERROR(p, BADARG);
+ BIF_ERROR(proc, BADARG);
}
}
@@ -2625,30 +2676,70 @@ BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0)
}
}
-/* Assumes 'dest' has enough room.
- */
-int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen)
+/*
+ * Returns 1 if every character of the UTF-8 encoded string can be
+ * represented in latin1, i.e. the string consists only of 7-bit
+ * ascii bytes and two-byte sequences with lead byte 0xC2 or 0xC3;
+ * otherwise 0. An empty string (len <= 0) yields 1.
+ */
+int erts_utf8_is_latin1_string(const byte *string, int len)
+{
+    /* Assumes string is encoded in valid UTF-8 */
+    int i = 0; /* scan position; must start at the first byte */
+    while (i < len) {
+        if ((string[i] & 0x80) == 0)
+            i++;
+        else if (i+1 < len
+                 && (string[i] & 0xFE) == 0xC2
+                 && (string[i+1] & 0xC0) == 0x80)
+            i +=2;
+        else
+            return 0;
+    }
+    return 1;
+}
+
+int erts_utf8_to_latin1(byte* dest, const byte* source, int slen)
{
+ /*
+ * Converts slen bytes of utf8 in source to latin1 in dest and
+ * returns the number of bytes written to dest.
+ *
+ * Assumes source contains valid utf8 that can be encoded as latin1,
+ * and that dest has enough room. These assumptions are only
+ * checked by ASSERT.
+ */
byte* dp = dest;
while (slen > 0) {
if ((source[0] & 0x80) == 0) {
*dp++ = *source++;
--slen;
}
- else if (slen > 1 &&
- (source[0] & 0xFE) == 0xC2 &&
- (source[1] & 0xC0) == 0x80) {
+ else {
+ ASSERT(slen > 1);
+ ASSERT((source[0] & 0xFE) == 0xC2);
+ ASSERT((source[1] & 0xC0) == 0x80);
*dp++ = (char) ((source[0] << 6) | (source[1] & 0x3F));
source += 2;
slen -= 2;
}
- else {
- /* Just let unconvertable octets through. This should not happen
- in a correctly upgraded system */
- *dp++ = *source++;
- --slen;
- }
}
return dp - dest;
}
+int erts_utf8_to_latin1_backwards(byte *dest, const byte *source, int slen)
+{
+    /*
+     * Converts source from its last byte towards its first, storing
+     * the latin1 bytes at dest[-1], dest[-2], ... and returning the
+     * number of bytes stored.
+     *
+     * Assumes source contains valid utf8 that can be encoded as latin1,
+     * and that there is enough room in front of dest.
+     */
+    int written = 0;
+    int pos;
+    for (pos = slen - 1; pos >= 0; pos--) {
+        byte ch = source[pos];
+        if (ch & 0x80) {
+            /* Continuation byte; combine it with its lead byte */
+            ASSERT(pos > 0);
+            ASSERT((ch & 0xC0) == 0x80);
+            ASSERT((source[pos-1] & 0xFE) == 0xC2);
+            pos--;
+            ch = (ch & 0x3F) | (source[pos] << 6);
+        }
+        written++;
+        dest[-written] = ch;
+    }
+    return written;
+}
diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c
index 68edcd0fa6..8c4d9108d4 100644
--- a/erts/emulator/beam/external.c
+++ b/erts/emulator/beam/external.c
@@ -142,6 +142,7 @@ erts_init_atom_cache_map(ErtsAtomCacheMap *acmp)
{
if (acmp) {
int ix;
+ acmp->long_atoms = 0;
for (ix = 0; ix < ERTS_ATOM_CACHE_SIZE; ix++)
acmp->cache[ix].iix = -1;
acmp->sz = 0;
@@ -154,6 +155,7 @@ erts_reset_atom_cache_map(ErtsAtomCacheMap *acmp)
{
if (acmp) {
int i;
+ acmp->long_atoms = 0;
for (i = 0; i < acmp->sz; i++) {
ASSERT(0 <= acmp->cix[i] && acmp->cix[i] < ERTS_ATOM_CACHE_SIZE);
acmp->cache[acmp->cix[i]].iix = -1;
@@ -175,9 +177,23 @@ erts_destroy_atom_cache_map(ErtsAtomCacheMap *acmp)
}
static ERTS_INLINE void
-insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
+insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom, Uint32 dflags)
{
- if (acmp && acmp->sz < ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES) {
+ /*
+ * If the receiver does not understand utf8 atoms
+ * and this atom cannot be represented in latin1,
+ * we are not allowed to cache it.
+ *
+ * In this case all atoms are assumed to have
+ * latin1 encoding in the cache. By refusing it
+ * in the cache we will instead encode it using
+ * ATOM_UTF8_EXT/SMALL_ATOM_UTF8_EXT, which the
+ * receiver does not recognize; it will then
+ * tear down the connection.
+ */
+ if (acmp && acmp->sz < ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES
+ && ((dflags & DFLAG_UTF8_ATOMS)
+ || atom_tab(atom_val(atom))->latin1_chars >= 0)) {
int ix;
ASSERT(acmp->hdr_sz < 0);
ix = atom2cix(atom);
@@ -190,7 +206,7 @@ insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
}
static ERTS_INLINE int
-get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
+get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom, Uint32 dflags)
{
if (!acmp)
return -1;
@@ -199,7 +215,9 @@ get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
ASSERT(is_atom(atom));
ix = atom2cix(atom);
if (acmp->cache[ix].iix < 0) {
- ASSERT(acmp->sz == ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES);
+ ASSERT(acmp->sz == ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES
+ || (!(dflags & DFLAG_UTF8_ATOMS)
+ && atom_tab(atom_val(atom))->latin1_chars < 0));
return -1;
}
else {
@@ -210,18 +228,17 @@ get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
}
void
-erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp)
+erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp, Uint32 dflags)
{
if (acmp) {
-#if MAX_ATOM_LENGTH > 255
-#error "This code is not complete; long_atoms info need to be passed to the following stages."
- int long_atoms = 0; /* !0 if one or more atoms are long than 255. */
-#endif
+ int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS);
+ int long_atoms = 0; /* !0 if one or more atoms are longer than 255. */
int i;
int sz;
int fix_sz
= 1 /* VERSION_MAGIC */
+ 1 /* DIST_HEADER */
+ + 1 /* dist header flags */
+ 1 /* number of internal cache entries */
;
int min_sz;
@@ -230,22 +247,23 @@ erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp)
min_sz = fix_sz+(2+4)*acmp->sz;
sz = fix_sz;
for (i = 0; i < acmp->sz; i++) {
+ Atom *a;
Eterm atom;
int len;
atom = acmp->cache[acmp->cix[i]].atom;
ASSERT(is_atom(atom));
- len = atom_tab(atom_val(atom))->len;
-#if MAX_ATOM_LENGTH > 255
+ a = atom_tab(atom_val(atom));
+ len = (int) (utf8_atoms ? a->len : a->latin1_chars);
+ ASSERT(len >= 0);
if (!long_atoms && len > 255)
long_atoms = 1;
-#endif
/* Enough for a new atom cache value */
sz += 1 /* cix */ + 1 /* length */ + len /* text */;
}
-#if MAX_ATOM_LENGTH > 255
- if (long_atoms)
+ if (long_atoms) {
+ acmp->long_atoms = 1;
sz += acmp->sz; /* we need 2 bytes per atom for length */
-#endif
+ }
/* Dynamically sized flag field */
sz += ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTES(acmp->sz);
if (sz < min_sz)
@@ -274,6 +292,7 @@ byte *erts_encode_ext_dist_header_setup(byte *ctl_ext, ErtsAtomCacheMap *acmp)
else {
int i;
byte *ep = ctl_ext;
+ byte dist_hdr_flags = acmp->long_atoms ? ERTS_DIST_HDR_LONG_ATOMS_FLG : 0;
ASSERT(acmp->hdr_sz >= 0);
/*
* Write cache update instructions. Note that this is a purely
@@ -296,28 +315,36 @@ byte *erts_encode_ext_dist_header_setup(byte *ctl_ext, ErtsAtomCacheMap *acmp)
}
--ep;
put_int8(acmp->sz, ep);
+ --ep;
+ put_int8(dist_hdr_flags, ep);
*--ep = DIST_HEADER;
*--ep = VERSION_MAGIC;
return ep;
}
}
-byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache)
+byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache, Uint32 dflags)
{
byte *ip;
byte instr_buf[(2+4)*ERTS_ATOM_CACHE_SIZE];
int ci, sz;
+ byte dist_hdr_flags;
+ int long_atoms;
+ int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS);
register byte *ep = ext;
ASSERT(ep[0] == VERSION_MAGIC);
if (ep[1] != DIST_HEADER)
return ext;
+ dist_hdr_flags = ep[2];
+ long_atoms = ERTS_DIST_HDR_LONG_ATOMS_FLG & ((int) dist_hdr_flags);
+
/*
* Update output atom cache and write the external version of
* the dist header. We write the header backwards just
* before the actual term(s).
*/
- ep += 2;
+ ep += 3;
ci = (int) get_int8(ep);
ASSERT(0 <= ci && ci < ERTS_ATOM_CACHE_SIZE);
ep += 1;
@@ -342,12 +369,7 @@ byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache)
flgs_bytes = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTES(ci);
ASSERT(flgs_bytes <= sizeof(flgs_buf));
-#if MAX_ATOM_LENGTH > 255
- /* long_atoms info needs to be passed from previous stages */
- if (long_atoms)
- flgs |= ERTS_DIST_HDR_LONG_ATOMS_FLG;
-#endif
- flgs = 0;
+ flgs = (Uint32) dist_hdr_flags;
flgs_buf_ix = 0;
if ((ci & 1) == 0)
used_half_bytes = 2;
@@ -382,17 +404,22 @@ byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache)
Atom *a;
cache->out_arr[cix] = atom;
a = atom_tab(atom_val(atom));
- sz = a->len;
- ep -= sz;
- sys_memcpy((void *) ep, (void *) a->name, sz);
-#if MAX_ATOM_LENGTH > 255
+ if (utf8_atoms) {
+ sz = a->len;
+ ep -= sz;
+ sys_memcpy((void *) ep, (void *) a->name, sz);
+ }
+ else {
+ ASSERT(0 <= a->latin1_chars && a->latin1_chars <= MAX_ATOM_CHARACTERS);
+ ep -= a->latin1_chars;
+ sz = erts_utf8_to_latin1(ep, a->name, a->len);
+ ASSERT(a->latin1_chars == sz);
+ }
if (long_atoms) {
ep -= 2;
put_int16(sz, ep);
}
- else
-#endif
- {
+ else {
ASSERT(0 <= sz && sz <= 255);
--ep;
put_int8(sz, ep);
@@ -553,6 +580,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
#endif
register byte *ep = ext;
+ int utf8_atoms = (int) (dep->flags & DFLAG_UTF8_ATOMS);
edep->heap_size = -1;
edep->ext_endp = ext+size;
@@ -611,9 +639,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
ERTS_EXT_HDR_FAIL;
ep++;
if (no_atoms) {
-#if MAX_ATOM_LENGTH > 255
int long_atoms = 0;
-#endif
#ifdef DEBUG
byte *flgs_buf = ep;
#endif
@@ -632,14 +658,8 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
*/
byte_ix = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTE_IX(no_atoms);
bit_ix = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BIT_IX(no_atoms);
- if (flgsp[byte_ix] & (((byte) ERTS_DIST_HDR_LONG_ATOMS_FLG)
- << bit_ix)) {
-#if MAX_ATOM_LENGTH > 255
+ if (flgsp[byte_ix] & (((byte) ERTS_DIST_HDR_LONG_ATOMS_FLG) << bit_ix))
long_atoms = 1;
-#else
- ERTS_EXT_HDR_FAIL; /* Long atoms not supported yet */
-#endif
- }
#ifdef DEBUG
byte_ix = 0;
@@ -707,23 +727,25 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
if (cix >= ERTS_ATOM_CACHE_SIZE)
ERTS_EXT_HDR_FAIL;
ep++;
-#if MAX_ATOM_CHARACTERS > 255
if (long_atoms) {
CHKSIZE(2);
len = get_int16(ep);
ep += 2;
}
- else
-#endif
- {
+ else {
CHKSIZE(1);
len = get_int8(ep);
ep++;
}
- if (len > MAX_ATOM_CHARACTERS)
- ERTS_EXT_HDR_FAIL; /* Too long atom */
CHKSIZE(len);
- atom = am_atom_put((char *) ep, len);
+ atom = erts_atom_put((byte *) ep,
+ len,
+ (utf8_atoms
+ ? ERTS_ATOM_ENC_UTF8
+ : ERTS_ATOM_ENC_LATIN1),
+ 0);
+ if (is_non_value(atom))
+ ERTS_EXT_HDR_FAIL;
ep += len;
cache->in_arr[cix] = atom;
edep->attab.atom[tix] = atom;
@@ -1404,7 +1426,8 @@ static byte*
enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags)
{
int iix;
- int i, j;
+ int len;
+ int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS);
ASSERT(is_atom(atom));
@@ -1423,42 +1446,46 @@ enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags)
}
return ep;
}
+
/*
* term_to_binary/1,2 and the initial distribution message
* don't use the cache.
*/
- iix = get_iix_acache_map(acmp, atom);
- if (iix < 0) {
- i = atom_val(atom);
- j = atom_tab(i)->len;
- if (dflags & DFLAG_UTF8_ATOMS) {
- if (j <= 255) {
+
+ iix = get_iix_acache_map(acmp, atom, dflags);
+ if (iix < 0) {
+ Atom *a = atom_tab(atom_val(atom));
+ if (utf8_atoms || a->latin1_chars < 0) {
+ len = a->len;
+ if (len > 255) {
*ep++ = ATOM_UTF8_EXT;
- put_int16(j, ep);
+ put_int16(len, ep);
ep += 2;
}
else {
*ep++ = SMALL_ATOM_UTF8_EXT;
- put_int8(j, ep);
- ep += 2;
+ put_int8(len, ep);
+ ep += 1;
}
- sys_memcpy((char *) ep, (char*)atom_tab(i)->name, (int) j);
+ sys_memcpy((char *) ep, (char *) a->name, len);
}
else {
- if (j <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) {
+ if (a->latin1_chars <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) {
*ep++ = SMALL_ATOM_EXT;
- j = erts_utf8_to_latin1(ep+1, atom_tab(i)->name, j);
- put_int8(j, ep);
+ len = erts_utf8_to_latin1(ep+1, a->name, a->len);
+ ASSERT(len == a->latin1_chars);
+ put_int8(len, ep);
ep++;
}
else {
*ep++ = ATOM_EXT;
- j = erts_utf8_to_latin1(ep+2, atom_tab(i)->name, j);
- put_int16(j, ep);
+ len = erts_utf8_to_latin1(ep+2, a->name, a->len);
+ ASSERT(len == a->latin1_chars);
+ put_int16(len, ep);
ep += 2;
}
}
- ep += j;
+ ep += len;
return ep;
}
@@ -1535,7 +1562,15 @@ dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp)
goto error;
}
} else {
- *objp = am_atom_put2(ep, len, is_latin1);
+ Eterm atom = erts_atom_put(ep,
+ len,
+ (is_latin1
+ ? ERTS_ATOM_ENC_LATIN1
+ : ERTS_ATOM_ENC_UTF8),
+ 0);
+ if (is_non_value(atom))
+ goto error;
+ *objp = atom;
}
ep += len;
break;
@@ -2248,7 +2283,15 @@ dec_term_atom_common:
goto error;
}
} else {
- *objp = am_atom_put2(ep, n, is_latin1);
+ Eterm atom = erts_atom_put(ep,
+ n,
+ (is_latin1
+ ? ERTS_ATOM_ENC_LATIN1
+ : ERTS_ATOM_ENC_UTF8),
+ 0);
+ if (is_non_value(atom))
+ goto error;
+ *objp = atom;
}
ep += n;
break;
@@ -2917,18 +2960,22 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags)
}
}
else {
- int alen = atom_tab(atom_val(obj))->len;
- result += 1 + 1 + alen;
- if (dflags & DFLAG_UTF8_ATOMS) {
+ Atom *a = atom_tab(atom_val(obj));
+ int alen;
+ if ((dflags & DFLAG_UTF8_ATOMS) || a->latin1_chars < 0) {
+ alen = a->len;
+ result += 1 + 1 + alen;
if (alen > 255) {
result++; /* ATOM_UTF8_EXT (not small) */
}
- /*SVERK we use utf8 length which is an over estimation */
- }
- else if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS)) {
- result++; /* ATOM_EXT (not small) */
}
- insert_acache_map(acmp, obj);
+ else {
+ alen = a->latin1_chars;
+ result += 1 + 1 + alen;
+ if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS))
+ result++; /* ATOM_EXT (not small) */
+ }
+ insert_acache_map(acmp, obj, dflags);
}
break;
case SMALL_DEF:
diff --git a/erts/emulator/beam/external.h b/erts/emulator/beam/external.h
index 50eea62225..ad430117c8 100644
--- a/erts/emulator/beam/external.h
+++ b/erts/emulator/beam/external.h
@@ -92,6 +92,7 @@ typedef struct cache {
typedef struct {
int hdr_sz;
int sz;
+ int long_atoms;
int cix[ERTS_ATOM_CACHE_SIZE];
struct {
Eterm atom;
@@ -152,12 +153,12 @@ typedef struct {
void erts_init_atom_cache_map(ErtsAtomCacheMap *);
void erts_reset_atom_cache_map(ErtsAtomCacheMap *);
void erts_destroy_atom_cache_map(ErtsAtomCacheMap *);
-void erts_finalize_atom_cache_map(ErtsAtomCacheMap *);
+void erts_finalize_atom_cache_map(ErtsAtomCacheMap *, Uint32);
Uint erts_encode_ext_dist_header_size(ErtsAtomCacheMap *);
Uint erts_encode_ext_dist_header_size(ErtsAtomCacheMap *);
byte *erts_encode_ext_dist_header_setup(byte *, ErtsAtomCacheMap *);
-byte *erts_encode_ext_dist_header_finalize(byte *, ErtsAtomCache *);
+byte *erts_encode_ext_dist_header_finalize(byte *, ErtsAtomCache *, Uint32);
Uint erts_encode_dist_ext_size(Eterm, Uint32, ErtsAtomCacheMap *);
void erts_encode_dist_ext(Eterm, byte **, Uint32, ErtsAtomCacheMap *);
diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h
index 1500424d3e..eccdf10c75 100755
--- a/erts/emulator/beam/global.h
+++ b/erts/emulator/beam/global.h
@@ -1519,17 +1519,25 @@ Sint erts_native_filename_need(Eterm ioterm, int encoding);
void erts_copy_utf8_to_utf16_little(byte *target, byte *bytes, int num_chars);
int erts_analyze_utf8(byte *source, Uint size,
byte **err_pos, Uint *num_chars, int *left);
+int erts_analyze_utf8_x(byte *source, Uint size,
+ byte **err_pos, Uint *num_chars, int *left,
+ Sint *num_latin1_chars, Uint max_chars);
char *erts_convert_filename_to_native(Eterm name, char *statbuf,
size_t statbuf_size,
ErtsAlcType_t alloc_type,
int allow_empty, int allow_atom,
Sint *used /* out */);
Eterm erts_convert_native_to_filename(Process *p, byte *bytes);
-int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen);
+Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left,
+ Uint *num_built, Uint *num_eaten, Eterm tail);
+int erts_utf8_is_latin1_string(const byte *string, int len);
+int erts_utf8_to_latin1(byte* dest, const byte* source, int slen);
+int erts_utf8_to_latin1_backwards(byte* dest, const byte* source, int slen);
#define ERTS_UTF8_OK 0
#define ERTS_UTF8_INCOMPLETE 1
#define ERTS_UTF8_ERROR 2
#define ERTS_UTF8_ANALYZE_MORE 3
+#define ERTS_UTF8_OK_MAX_CHARS 4
/* erl_trace.c */
void erts_init_trace(void);
diff --git a/erts/emulator/beam/io.c b/erts/emulator/beam/io.c
index 60b9238d38..b1eb75bede 100644
--- a/erts/emulator/beam/io.c
+++ b/erts/emulator/beam/io.c
@@ -646,8 +646,11 @@ erts_open_driver(erts_driver_t* driver, /* Pointer to driver. */
if (IS_TRACED_FL(port, F_TRACE_PORTS)) {
trace_port_open(port,
- pid,
- am_atom_put(port->name, strlen(port->name)));
+ pid,
+ erts_atom_put((byte *) port->name,
+ strlen(port->name),
+ ERTS_ATOM_ENC_LATIN1,
+ 1));
}
if (driver->start) {
@@ -4765,7 +4768,8 @@ int driver_exit(ErlDrvPort ix, int err)
return driver_failure_term(ix, am_normal, 0);
else {
char* err_str = erl_errno_id(err);
- Eterm am_err = am_atom_put(err_str, sys_strlen(err_str));
+ Eterm am_err = erts_atom_put((byte *) err_str, sys_strlen(err_str),
+ ERTS_ATOM_ENC_LATIN1, 1);
return driver_failure_term(ix, am_err, 0);
}
}
@@ -4778,8 +4782,12 @@ int driver_failure(ErlDrvPort ix, int code)
int driver_failure_atom(ErlDrvPort ix, char* string)
{
- Eterm am = am_atom_put(string, strlen(string));
- return driver_failure_term(ix, am, 0);
+ return driver_failure_term(ix,
+ erts_atom_put((byte *) string,
+ strlen(string),
+ ERTS_ATOM_ENC_LATIN1,
+ 1),
+ 0);
}
int driver_failure_posix(ErlDrvPort ix, int err)
@@ -4796,7 +4804,10 @@ int driver_failure_eof(ErlDrvPort ix)
ErlDrvTermData driver_mk_atom(char* string)
{
- Eterm am = am_atom_put(string, sys_strlen(string));
+ Eterm am = erts_atom_put((byte *) string,
+ sys_strlen(string),
+ ERTS_ATOM_ENC_LATIN1,
+ 1);
ERTS_SMP_CHK_NO_PROC_LOCKS;
return (ErlDrvTermData) am;
}
@@ -5091,7 +5102,10 @@ init_driver(erts_driver_t *drv, ErlDrvEntry *de, DE_Handle *handle)
erts_smp_mtx_init_x(drv->lock,
"driver_lock",
#if defined(ERTS_ENABLE_LOCK_CHECK) || defined(ERTS_ENABLE_LOCK_COUNT)
- am_atom_put(drv->name, sys_strlen(drv->name))
+ erts_atom_put((byte *) drv->name,
+ sys_strlen(drv->name),
+ ERTS_ATOM_ENC_LATIN1,
+ 1)
#else
NIL
#endif
diff --git a/erts/emulator/beam/utils.c b/erts/emulator/beam/utils.c
index 1969fc762c..97b6d01207 100644
--- a/erts/emulator/beam/utils.c
+++ b/erts/emulator/beam/utils.c
@@ -370,7 +370,7 @@ Eterm
erts_bld_atom(Uint **hpp, Uint *szp, char *str)
{
if (hpp)
- return am_atom_put(str, sys_strlen(str));
+ return erts_atom_put((byte *) str, sys_strlen(str), ERTS_ATOM_ENC_LATIN1, 1);
else
return THE_NON_VALUE;
}