aboutsummaryrefslogtreecommitdiffstats
path: root/erts/emulator/beam/external.c
diff options
context:
space:
mode:
authorRickard Green <[email protected]>2013-01-10 12:47:46 +0100
committerRickard Green <[email protected]>2013-01-16 17:16:52 +0100
commit0dd3b88cdf90283d9c276ee415f985cb764e522f (patch)
tree1584d76d9960339a03c04412ef7919473e7b2efc /erts/emulator/beam/external.c
parent5d79f55ca441727578d34b78ee0d6d8aa80976ee (diff)
downloadotp-0dd3b88cdf90283d9c276ee415f985cb764e522f.tar.gz
otp-0dd3b88cdf90283d9c276ee415f985cb764e522f.tar.bz2
otp-0dd3b88cdf90283d9c276ee415f985cb764e522f.zip
UTF-8 support for distribution
Diffstat (limited to 'erts/emulator/beam/external.c')
-rw-r--r--erts/emulator/beam/external.c193
1 files changed, 120 insertions, 73 deletions
diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c
index 68edcd0fa6..8c4d9108d4 100644
--- a/erts/emulator/beam/external.c
+++ b/erts/emulator/beam/external.c
@@ -142,6 +142,7 @@ erts_init_atom_cache_map(ErtsAtomCacheMap *acmp)
{
if (acmp) {
int ix;
+ acmp->long_atoms = 0;
for (ix = 0; ix < ERTS_ATOM_CACHE_SIZE; ix++)
acmp->cache[ix].iix = -1;
acmp->sz = 0;
@@ -154,6 +155,7 @@ erts_reset_atom_cache_map(ErtsAtomCacheMap *acmp)
{
if (acmp) {
int i;
+ acmp->long_atoms = 0;
for (i = 0; i < acmp->sz; i++) {
ASSERT(0 <= acmp->cix[i] && acmp->cix[i] < ERTS_ATOM_CACHE_SIZE);
acmp->cache[acmp->cix[i]].iix = -1;
@@ -175,9 +177,23 @@ erts_destroy_atom_cache_map(ErtsAtomCacheMap *acmp)
}
static ERTS_INLINE void
-insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
+insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom, Uint32 dflags)
{
- if (acmp && acmp->sz < ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES) {
+ /*
+ * If the receiver do not understand utf8 atoms
+ * and this atom cannot be represented in latin1,
+ * we are not allowed to cache it.
+ *
+ * In this case all atoms are assumed to have
+ * latin1 encoding in the cache. By refusing it
+ * in the cache we will instead encode it using
+ * ATOM_UTF8_EXT/SMALL_ATOM_UTF8_EXT which the
+ * receiver do not recognize and tear down the
+ * connection.
+ */
+ if (acmp && acmp->sz < ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES
+ && ((dflags & DFLAG_UTF8_ATOMS)
+ || atom_tab(atom_val(atom))->latin1_chars >= 0)) {
int ix;
ASSERT(acmp->hdr_sz < 0);
ix = atom2cix(atom);
@@ -190,7 +206,7 @@ insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
}
static ERTS_INLINE int
-get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
+get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom, Uint32 dflags)
{
if (!acmp)
return -1;
@@ -199,7 +215,9 @@ get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
ASSERT(is_atom(atom));
ix = atom2cix(atom);
if (acmp->cache[ix].iix < 0) {
- ASSERT(acmp->sz == ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES);
+ ASSERT(acmp->sz == ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES
+ || (!(dflags & DFLAG_UTF8_ATOMS)
+ && atom_tab(atom_val(atom))->latin1_chars < 0));
return -1;
}
else {
@@ -210,18 +228,17 @@ get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom)
}
void
-erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp)
+erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp, Uint32 dflags)
{
if (acmp) {
-#if MAX_ATOM_LENGTH > 255
-#error "This code is not complete; long_atoms info need to be passed to the following stages."
- int long_atoms = 0; /* !0 if one or more atoms are long than 255. */
-#endif
+ int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS);
+ int long_atoms = 0; /* !0 if one or more atoms are longer than 255. */
int i;
int sz;
int fix_sz
= 1 /* VERSION_MAGIC */
+ 1 /* DIST_HEADER */
+ + 1 /* dist header flags */
+ 1 /* number of internal cache entries */
;
int min_sz;
@@ -230,22 +247,23 @@ erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp)
min_sz = fix_sz+(2+4)*acmp->sz;
sz = fix_sz;
for (i = 0; i < acmp->sz; i++) {
+ Atom *a;
Eterm atom;
int len;
atom = acmp->cache[acmp->cix[i]].atom;
ASSERT(is_atom(atom));
- len = atom_tab(atom_val(atom))->len;
-#if MAX_ATOM_LENGTH > 255
+ a = atom_tab(atom_val(atom));
+ len = (int) (utf8_atoms ? a->len : a->latin1_chars);
+ ASSERT(len >= 0);
if (!long_atoms && len > 255)
long_atoms = 1;
-#endif
/* Enough for a new atom cache value */
sz += 1 /* cix */ + 1 /* length */ + len /* text */;
}
-#if MAX_ATOM_LENGTH > 255
- if (long_atoms)
+ if (long_atoms) {
+ acmp->long_atoms = 1;
sz += acmp->sz; /* we need 2 bytes per atom for length */
-#endif
+ }
/* Dynamically sized flag field */
sz += ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTES(acmp->sz);
if (sz < min_sz)
@@ -274,6 +292,7 @@ byte *erts_encode_ext_dist_header_setup(byte *ctl_ext, ErtsAtomCacheMap *acmp)
else {
int i;
byte *ep = ctl_ext;
+ byte dist_hdr_flags = acmp->long_atoms ? ERTS_DIST_HDR_LONG_ATOMS_FLG : 0;
ASSERT(acmp->hdr_sz >= 0);
/*
* Write cache update instructions. Note that this is a purely
@@ -296,28 +315,36 @@ byte *erts_encode_ext_dist_header_setup(byte *ctl_ext, ErtsAtomCacheMap *acmp)
}
--ep;
put_int8(acmp->sz, ep);
+ --ep;
+ put_int8(dist_hdr_flags, ep);
*--ep = DIST_HEADER;
*--ep = VERSION_MAGIC;
return ep;
}
}
-byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache)
+byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache, Uint32 dflags)
{
byte *ip;
byte instr_buf[(2+4)*ERTS_ATOM_CACHE_SIZE];
int ci, sz;
+ byte dist_hdr_flags;
+ int long_atoms;
+ int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS);
register byte *ep = ext;
ASSERT(ep[0] == VERSION_MAGIC);
if (ep[1] != DIST_HEADER)
return ext;
+ dist_hdr_flags = ep[2];
+ long_atoms = ERTS_DIST_HDR_LONG_ATOMS_FLG & ((int) dist_hdr_flags);
+
/*
* Update output atom cache and write the external version of
* the dist header. We write the header backwards just
* before the actual term(s).
*/
- ep += 2;
+ ep += 3;
ci = (int) get_int8(ep);
ASSERT(0 <= ci && ci < ERTS_ATOM_CACHE_SIZE);
ep += 1;
@@ -342,12 +369,7 @@ byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache)
flgs_bytes = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTES(ci);
ASSERT(flgs_bytes <= sizeof(flgs_buf));
-#if MAX_ATOM_LENGTH > 255
- /* long_atoms info needs to be passed from previous stages */
- if (long_atoms)
- flgs |= ERTS_DIST_HDR_LONG_ATOMS_FLG;
-#endif
- flgs = 0;
+ flgs = (Uint32) dist_hdr_flags;
flgs_buf_ix = 0;
if ((ci & 1) == 0)
used_half_bytes = 2;
@@ -382,17 +404,22 @@ byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache)
Atom *a;
cache->out_arr[cix] = atom;
a = atom_tab(atom_val(atom));
- sz = a->len;
- ep -= sz;
- sys_memcpy((void *) ep, (void *) a->name, sz);
-#if MAX_ATOM_LENGTH > 255
+ if (utf8_atoms) {
+ sz = a->len;
+ ep -= sz;
+ sys_memcpy((void *) ep, (void *) a->name, sz);
+ }
+ else {
+ ASSERT(0 <= a->latin1_chars && a->latin1_chars <= MAX_ATOM_CHARACTERS);
+ ep -= a->latin1_chars;
+ sz = erts_utf8_to_latin1(ep, a->name, a->len);
+ ASSERT(a->latin1_chars == sz);
+ }
if (long_atoms) {
ep -= 2;
put_int16(sz, ep);
}
- else
-#endif
- {
+ else {
ASSERT(0 <= sz && sz <= 255);
--ep;
put_int8(sz, ep);
@@ -553,6 +580,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
#endif
register byte *ep = ext;
+ int utf8_atoms = (int) (dep->flags & DFLAG_UTF8_ATOMS);
edep->heap_size = -1;
edep->ext_endp = ext+size;
@@ -611,9 +639,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
ERTS_EXT_HDR_FAIL;
ep++;
if (no_atoms) {
-#if MAX_ATOM_LENGTH > 255
int long_atoms = 0;
-#endif
#ifdef DEBUG
byte *flgs_buf = ep;
#endif
@@ -632,14 +658,8 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
*/
byte_ix = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTE_IX(no_atoms);
bit_ix = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BIT_IX(no_atoms);
- if (flgsp[byte_ix] & (((byte) ERTS_DIST_HDR_LONG_ATOMS_FLG)
- << bit_ix)) {
-#if MAX_ATOM_LENGTH > 255
+ if (flgsp[byte_ix] & (((byte) ERTS_DIST_HDR_LONG_ATOMS_FLG) << bit_ix))
long_atoms = 1;
-#else
- ERTS_EXT_HDR_FAIL; /* Long atoms not supported yet */
-#endif
- }
#ifdef DEBUG
byte_ix = 0;
@@ -707,23 +727,25 @@ erts_prepare_dist_ext(ErtsDistExternal *edep,
if (cix >= ERTS_ATOM_CACHE_SIZE)
ERTS_EXT_HDR_FAIL;
ep++;
-#if MAX_ATOM_CHARACTERS > 255
if (long_atoms) {
CHKSIZE(2);
len = get_int16(ep);
ep += 2;
}
- else
-#endif
- {
+ else {
CHKSIZE(1);
len = get_int8(ep);
ep++;
}
- if (len > MAX_ATOM_CHARACTERS)
- ERTS_EXT_HDR_FAIL; /* Too long atom */
CHKSIZE(len);
- atom = am_atom_put((char *) ep, len);
+ atom = erts_atom_put((byte *) ep,
+ len,
+ (utf8_atoms
+ ? ERTS_ATOM_ENC_UTF8
+ : ERTS_ATOM_ENC_LATIN1),
+ 0);
+ if (is_non_value(atom))
+ ERTS_EXT_HDR_FAIL;
ep += len;
cache->in_arr[cix] = atom;
edep->attab.atom[tix] = atom;
@@ -1404,7 +1426,8 @@ static byte*
enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags)
{
int iix;
- int i, j;
+ int len;
+ int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS);
ASSERT(is_atom(atom));
@@ -1423,42 +1446,46 @@ enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags)
}
return ep;
}
+
/*
* term_to_binary/1,2 and the initial distribution message
* don't use the cache.
*/
- iix = get_iix_acache_map(acmp, atom);
- if (iix < 0) {
- i = atom_val(atom);
- j = atom_tab(i)->len;
- if (dflags & DFLAG_UTF8_ATOMS) {
- if (j <= 255) {
+
+ iix = get_iix_acache_map(acmp, atom, dflags);
+ if (iix < 0) {
+ Atom *a = atom_tab(atom_val(atom));
+ if (utf8_atoms || a->latin1_chars < 0) {
+ len = a->len;
+ if (len > 255) {
*ep++ = ATOM_UTF8_EXT;
- put_int16(j, ep);
+ put_int16(len, ep);
ep += 2;
}
else {
*ep++ = SMALL_ATOM_UTF8_EXT;
- put_int8(j, ep);
- ep += 2;
+ put_int8(len, ep);
+ ep += 1;
}
- sys_memcpy((char *) ep, (char*)atom_tab(i)->name, (int) j);
+ sys_memcpy((char *) ep, (char *) a->name, len);
}
else {
- if (j <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) {
+ if (a->latin1_chars <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) {
*ep++ = SMALL_ATOM_EXT;
- j = erts_utf8_to_latin1(ep+1, atom_tab(i)->name, j);
- put_int8(j, ep);
+ len = erts_utf8_to_latin1(ep+1, a->name, a->len);
+ ASSERT(len == a->latin1_chars);
+ put_int8(len, ep);
ep++;
}
else {
*ep++ = ATOM_EXT;
- j = erts_utf8_to_latin1(ep+2, atom_tab(i)->name, j);
- put_int16(j, ep);
+ len = erts_utf8_to_latin1(ep+2, a->name, a->len);
+ ASSERT(len == a->latin1_chars);
+ put_int16(len, ep);
ep += 2;
}
}
- ep += j;
+ ep += len;
return ep;
}
@@ -1535,7 +1562,15 @@ dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp)
goto error;
}
} else {
- *objp = am_atom_put2(ep, len, is_latin1);
+ Eterm atom = erts_atom_put(ep,
+ len,
+ (is_latin1
+ ? ERTS_ATOM_ENC_LATIN1
+ : ERTS_ATOM_ENC_UTF8),
+ 0);
+ if (is_non_value(atom))
+ goto error;
+ *objp = atom;
}
ep += len;
break;
@@ -2248,7 +2283,15 @@ dec_term_atom_common:
goto error;
}
} else {
- *objp = am_atom_put2(ep, n, is_latin1);
+ Eterm atom = erts_atom_put(ep,
+ n,
+ (is_latin1
+ ? ERTS_ATOM_ENC_LATIN1
+ : ERTS_ATOM_ENC_UTF8),
+ 0);
+ if (is_non_value(atom))
+ goto error;
+ *objp = atom;
}
ep += n;
break;
@@ -2917,18 +2960,22 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags)
}
}
else {
- int alen = atom_tab(atom_val(obj))->len;
- result += 1 + 1 + alen;
- if (dflags & DFLAG_UTF8_ATOMS) {
+ Atom *a = atom_tab(atom_val(obj));
+ int alen;
+ if ((dflags & DFLAG_UTF8_ATOMS) || a->latin1_chars < 0) {
+ alen = a->len;
+ result += 1 + 1 + alen;
if (alen > 255) {
result++; /* ATOM_UTF8_EXT (not small) */
}
- /*SVERK we use utf8 length which is an over estimation */
- }
- else if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS)) {
- result++; /* ATOM_EXT (not small) */
}
- insert_acache_map(acmp, obj);
+ else {
+ alen = a->latin1_chars;
+ result += 1 + 1 + alen;
+ if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS))
+ result++; /* ATOM_EXT (not small) */
+ }
+ insert_acache_map(acmp, obj, dflags);
}
break;
case SMALL_DEF: