diff options
Diffstat (limited to 'erts/emulator/beam/external.c')
-rw-r--r-- | erts/emulator/beam/external.c | 292 |
1 files changed, 205 insertions, 87 deletions
diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c index ab1065aaa1..8c4d9108d4 100644 --- a/erts/emulator/beam/external.c +++ b/erts/emulator/beam/external.c @@ -142,6 +142,7 @@ erts_init_atom_cache_map(ErtsAtomCacheMap *acmp) { if (acmp) { int ix; + acmp->long_atoms = 0; for (ix = 0; ix < ERTS_ATOM_CACHE_SIZE; ix++) acmp->cache[ix].iix = -1; acmp->sz = 0; @@ -154,6 +155,7 @@ erts_reset_atom_cache_map(ErtsAtomCacheMap *acmp) { if (acmp) { int i; + acmp->long_atoms = 0; for (i = 0; i < acmp->sz; i++) { ASSERT(0 <= acmp->cix[i] && acmp->cix[i] < ERTS_ATOM_CACHE_SIZE); acmp->cache[acmp->cix[i]].iix = -1; @@ -175,9 +177,23 @@ erts_destroy_atom_cache_map(ErtsAtomCacheMap *acmp) } static ERTS_INLINE void -insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) +insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom, Uint32 dflags) { - if (acmp && acmp->sz < ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES) { + /* + * If the receiver do not understand utf8 atoms + * and this atom cannot be represented in latin1, + * we are not allowed to cache it. + * + * In this case all atoms are assumed to have + * latin1 encoding in the cache. By refusing it + * in the cache we will instead encode it using + * ATOM_UTF8_EXT/SMALL_ATOM_UTF8_EXT which the + * receiver do not recognize and tear down the + * connection. + */ + if (acmp && acmp->sz < ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES + && ((dflags & DFLAG_UTF8_ATOMS) + || atom_tab(atom_val(atom))->latin1_chars >= 0)) { int ix; ASSERT(acmp->hdr_sz < 0); ix = atom2cix(atom); @@ -190,7 +206,7 @@ insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) } static ERTS_INLINE int -get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) +get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom, Uint32 dflags) { if (!acmp) return -1; @@ -199,7 +215,9 @@ get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) ASSERT(is_atom(atom)); ix = atom2cix(atom); if (acmp->cache[ix].iix < 0) { - ASSERT(acmp->sz == ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES); + ASSERT(acmp->sz == ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES + || (!(dflags & DFLAG_UTF8_ATOMS) + && atom_tab(atom_val(atom))->latin1_chars < 0)); return -1; } else { @@ -210,18 +228,17 @@ get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) } void -erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp) +erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp, Uint32 dflags) { if (acmp) { -#if MAX_ATOM_LENGTH > 255 -#error "This code is not complete; long_atoms info need to be passed to the following stages." - int long_atoms = 0; /* !0 if one or more atoms are long than 255. */ -#endif + int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS); + int long_atoms = 0; /* !0 if one or more atoms are longer than 255. */ int i; int sz; int fix_sz = 1 /* VERSION_MAGIC */ + 1 /* DIST_HEADER */ + + 1 /* dist header flags */ + 1 /* number of internal cache entries */ ; int min_sz; @@ -230,22 +247,23 @@ erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp) min_sz = fix_sz+(2+4)*acmp->sz; sz = fix_sz; for (i = 0; i < acmp->sz; i++) { + Atom *a; Eterm atom; int len; atom = acmp->cache[acmp->cix[i]].atom; ASSERT(is_atom(atom)); - len = atom_tab(atom_val(atom))->len; -#if MAX_ATOM_LENGTH > 255 + a = atom_tab(atom_val(atom)); + len = (int) (utf8_atoms ? a->len : a->latin1_chars); + ASSERT(len >= 0); if (!long_atoms && len > 255) long_atoms = 1; -#endif /* Enough for a new atom cache value */ sz += 1 /* cix */ + 1 /* length */ + len /* text */; } -#if MAX_ATOM_LENGTH > 255 - if (long_atoms) + if (long_atoms) { + acmp->long_atoms = 1; sz += acmp->sz; /* we need 2 bytes per atom for length */ -#endif + } /* Dynamically sized flag field */ sz += ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTES(acmp->sz); if (sz < min_sz) @@ -274,6 +292,7 @@ byte *erts_encode_ext_dist_header_setup(byte *ctl_ext, ErtsAtomCacheMap *acmp) else { int i; byte *ep = ctl_ext; + byte dist_hdr_flags = acmp->long_atoms ? ERTS_DIST_HDR_LONG_ATOMS_FLG : 0; ASSERT(acmp->hdr_sz >= 0); /* * Write cache update instructions. Note that this is a purely @@ -296,28 +315,36 @@ byte *erts_encode_ext_dist_header_setup(byte *ctl_ext, ErtsAtomCacheMap *acmp) } --ep; put_int8(acmp->sz, ep); + --ep; + put_int8(dist_hdr_flags, ep); *--ep = DIST_HEADER; *--ep = VERSION_MAGIC; return ep; } } -byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache) +byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache, Uint32 dflags) { byte *ip; byte instr_buf[(2+4)*ERTS_ATOM_CACHE_SIZE]; int ci, sz; + byte dist_hdr_flags; + int long_atoms; + int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS); register byte *ep = ext; ASSERT(ep[0] == VERSION_MAGIC); if (ep[1] != DIST_HEADER) return ext; + dist_hdr_flags = ep[2]; + long_atoms = ERTS_DIST_HDR_LONG_ATOMS_FLG & ((int) dist_hdr_flags); + /* * Update output atom cache and write the external version of * the dist header. We write the header backwards just * before the actual term(s). */ - ep += 2; + ep += 3; ci = (int) get_int8(ep); ASSERT(0 <= ci && ci < ERTS_ATOM_CACHE_SIZE); ep += 1; @@ -342,12 +369,7 @@ byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache) flgs_bytes = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTES(ci); ASSERT(flgs_bytes <= sizeof(flgs_buf)); -#if MAX_ATOM_LENGTH > 255 - /* long_atoms info needs to be passed from previous stages */ - if (long_atoms) - flgs |= ERTS_DIST_HDR_LONG_ATOMS_FLG; -#endif - flgs = 0; + flgs = (Uint32) dist_hdr_flags; flgs_buf_ix = 0; if ((ci & 1) == 0) used_half_bytes = 2; @@ -382,17 +404,22 @@ byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache) Atom *a; cache->out_arr[cix] = atom; a = atom_tab(atom_val(atom)); - sz = a->len; - ep -= sz; - sys_memcpy((void *) ep, (void *) a->name, sz); -#if MAX_ATOM_LENGTH > 255 + if (utf8_atoms) { + sz = a->len; + ep -= sz; + sys_memcpy((void *) ep, (void *) a->name, sz); + } + else { + ASSERT(0 <= a->latin1_chars && a->latin1_chars <= MAX_ATOM_CHARACTERS); + ep -= a->latin1_chars; + sz = erts_utf8_to_latin1(ep, a->name, a->len); + ASSERT(a->latin1_chars == sz); + } if (long_atoms) { ep -= 2; put_int16(sz, ep); } - else -#endif - { + else { ASSERT(0 <= sz && sz <= 255); --ep; put_int8(sz, ep); @@ -467,7 +494,7 @@ Uint erts_encode_ext_size_2(Eterm term, unsigned dflags) Uint erts_encode_ext_size_ets(Eterm term) { - return encode_size_struct2(NULL, term, TERM_TO_BINARY_DFLAGS|DFLAGS_INTERNAL_TAGS); + return encode_size_struct2(NULL, term, TERM_TO_BINARY_DFLAGS|DFLAG_INTERNAL_TAGS); } @@ -500,7 +527,7 @@ void erts_encode_ext(Eterm term, byte **ext) byte* erts_encode_ext_ets(Eterm term, byte *ep, struct erl_off_heap_header** off_heap) { - return enc_term(NULL, term, ep, TERM_TO_BINARY_DFLAGS|DFLAGS_INTERNAL_TAGS, + return enc_term(NULL, term, ep, TERM_TO_BINARY_DFLAGS|DFLAG_INTERNAL_TAGS, off_heap); } @@ -553,6 +580,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, #endif register byte *ep = ext; + int utf8_atoms = (int) (dep->flags & DFLAG_UTF8_ATOMS); edep->heap_size = -1; edep->ext_endp = ext+size; @@ -611,9 +639,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, ERTS_EXT_HDR_FAIL; ep++; if (no_atoms) { -#if MAX_ATOM_LENGTH > 255 int long_atoms = 0; -#endif #ifdef DEBUG byte *flgs_buf = ep; #endif @@ -632,14 +658,8 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, */ byte_ix = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTE_IX(no_atoms); bit_ix = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BIT_IX(no_atoms); - if (flgsp[byte_ix] & (((byte) ERTS_DIST_HDR_LONG_ATOMS_FLG) - << bit_ix)) { -#if MAX_ATOM_LENGTH > 255 + if (flgsp[byte_ix] & (((byte) ERTS_DIST_HDR_LONG_ATOMS_FLG) << bit_ix)) long_atoms = 1; -#else - ERTS_EXT_HDR_FAIL; /* Long atoms not supported yet */ -#endif - } #ifdef DEBUG byte_ix = 0; @@ -707,23 +727,25 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, if (cix >= ERTS_ATOM_CACHE_SIZE) ERTS_EXT_HDR_FAIL; ep++; -#if MAX_ATOM_LENGTH > 255 if (long_atoms) { CHKSIZE(2); len = get_int16(ep); ep += 2; } - else -#endif - { + else { CHKSIZE(1); len = get_int8(ep); ep++; } - if (len > MAX_ATOM_LENGTH) - ERTS_EXT_HDR_FAIL; /* Too long atom */ CHKSIZE(len); - atom = am_atom_put((char *) ep, len); + atom = erts_atom_put((byte *) ep, + len, + (utf8_atoms + ? ERTS_ATOM_ENC_UTF8 + : ERTS_ATOM_ENC_LATIN1), + 0); + if (is_non_value(atom)) + ERTS_EXT_HDR_FAIL; ep += len; cache->in_arr[cix] = atom; edep->attab.atom[tix] = atom; @@ -1404,11 +1426,12 @@ static byte* enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags) { int iix; - int i, j; + int len; + int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS); ASSERT(is_atom(atom)); - if (dflags & DFLAGS_INTERNAL_TAGS) { + if (dflags & DFLAG_INTERNAL_TAGS) { Uint aval = atom_val(atom); ASSERT(aval < (1<<24)); if (aval >= (1 << 16)) { @@ -1423,27 +1446,46 @@ enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags) } return ep; } + /* * term_to_binary/1,2 and the initial distribution message * don't use the cache. */ - iix = get_iix_acache_map(acmp, atom); - if (iix < 0) { - i = atom_val(atom); - j = atom_tab(i)->len; - if ((MAX_ATOM_LENGTH <= 255 || j <= 255) - && (dflags & DFLAG_SMALL_ATOM_TAGS)) { - *ep++ = SMALL_ATOM_EXT; - put_int8(j, ep); - ep++; + + iix = get_iix_acache_map(acmp, atom, dflags); + if (iix < 0) { + Atom *a = atom_tab(atom_val(atom)); + if (utf8_atoms || a->latin1_chars < 0) { + len = a->len; + if (len > 255) { + *ep++ = ATOM_UTF8_EXT; + put_int16(len, ep); + ep += 2; + } + else { + *ep++ = SMALL_ATOM_UTF8_EXT; + put_int8(len, ep); + ep += 1; + } + sys_memcpy((char *) ep, (char *) a->name, len); } else { - *ep++ = ATOM_EXT; - put_int16(j, ep); - ep += 2; + if (a->latin1_chars <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) { + *ep++ = SMALL_ATOM_EXT; + len = erts_utf8_to_latin1(ep+1, a->name, a->len); + ASSERT(len == a->latin1_chars); + put_int8(len, ep); + ep++; + } + else { + *ep++ = ATOM_EXT; + len = erts_utf8_to_latin1(ep+2, a->name, a->len); + ASSERT(len == a->latin1_chars); + put_int16(len, ep); + ep += 2; + } } - sys_memcpy((char *) ep, (char*)atom_tab(i)->name, (int) j); - ep += j; + ep += len; return ep; } @@ -1472,7 +1514,7 @@ enc_pid(ErtsAtomCacheMap *acmp, Eterm pid, byte* ep, Uint32 dflags) ep += 4; put_int32(os, ep); ep += 4; - *ep++ = (is_internal_pid(pid) && (dflags & DFLAGS_INTERNAL_TAGS)) ? + *ep++ = (is_internal_pid(pid) && (dflags & DFLAG_INTERNAL_TAGS)) ? INTERNAL_CREATION : pid_creation(pid); return ep; } @@ -1482,7 +1524,7 @@ static byte* dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp) { Uint len; - int n; + int n, is_latin1; switch (*ep++) { case ATOM_CACHE_REF: @@ -1498,17 +1540,37 @@ dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp) case ATOM_EXT: len = get_int16(ep), ep += 2; + is_latin1 = 1; goto dec_atom_common; case SMALL_ATOM_EXT: len = get_int8(ep); ep++; + is_latin1 = 1; + goto dec_atom_common; + case ATOM_UTF8_EXT: + len = get_int16(ep), + ep += 2; + is_latin1 = 0; + goto dec_atom_common; + case SMALL_ATOM_UTF8_EXT: + len = get_int8(ep), + ep++; + is_latin1 = 0; dec_atom_common: if (edep && (edep->flags & ERTS_DIST_EXT_BTT_SAFE)) { - if (!erts_atom_get((char*)ep, len, objp)) { + if (!erts_atom_get((char*)ep, len, objp, is_latin1)) { goto error; } } else { - *objp = am_atom_put((char*)ep, len); + Eterm atom = erts_atom_put(ep, + len, + (is_latin1 + ? ERTS_ATOM_ENC_LATIN1 + : ERTS_ATOM_ENC_UTF8), + 0); + if (is_non_value(atom)) + goto error; + *objp = atom; } ep += len; break; @@ -1770,7 +1832,7 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags, put_int16(i, ep); ep += 2; ep = enc_atom(acmp,ref_node_name(obj),ep,dflags); - *ep++ = ((dflags & DFLAGS_INTERNAL_TAGS) && is_internal_ref(obj)) ? + *ep++ = ((dflags & DFLAG_INTERNAL_TAGS) && is_internal_ref(obj)) ? INTERNAL_CREATION : ref_creation(obj); ref_num = ref_numbers(obj); for (j = 0; j < i; j++) { @@ -1787,7 +1849,7 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags, j = port_number(obj); put_int32(j, ep); ep += 4; - *ep++ = ((dflags & DFLAGS_INTERNAL_TAGS) && is_internal_port(obj)) ? + *ep++ = ((dflags & DFLAG_INTERNAL_TAGS) && is_internal_port(obj)) ? INTERNAL_CREATION : port_creation(obj); break; @@ -1868,7 +1930,7 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags, byte* bytes; ERTS_GET_BINARY_BYTES(obj, bytes, bitoffs, bitsize); - if (dflags & DFLAGS_INTERNAL_TAGS) { + if (dflags & DFLAG_INTERNAL_TAGS) { ProcBin* pb = (ProcBin*) binary_val(obj); Uint bytesize = pb->size; if (pb->thing_word == HEADER_SUB_BIN) { @@ -2113,7 +2175,7 @@ static byte* dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Eterm* objp) { Eterm* hp_saved = *hpp; - int n; + int n, is_latin1; register Eterm* hp = *hpp; /* Please don't take the address of hp */ Eterm* next = objp; @@ -2199,17 +2261,37 @@ dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Et case ATOM_EXT: n = get_int16(ep); ep += 2; - goto dec_term_atom_common; + is_latin1 = 1; + goto dec_term_atom_common; case SMALL_ATOM_EXT: n = get_int8(ep); ep++; + is_latin1 = 1; + goto dec_term_atom_common; + case ATOM_UTF8_EXT: + n = get_int16(ep); + ep += 2; + is_latin1 = 0; + goto dec_term_atom_common; + case SMALL_ATOM_UTF8_EXT: + n = get_int8(ep); + ep++; + is_latin1 = 0; dec_term_atom_common: if (edep && (edep->flags & ERTS_DIST_EXT_BTT_SAFE)) { - if (!erts_atom_get((char*)ep, n, objp)) { + if (!erts_atom_get((char*)ep, n, objp, is_latin1)) { goto error; } } else { - *objp = am_atom_put((char*)ep, n); + Eterm atom = erts_atom_put(ep, + n, + (is_latin1 + ? ERTS_ATOM_ENC_LATIN1 + : ERTS_ATOM_ENC_UTF8), + 0); + if (is_non_value(atom)) + goto error; + *objp = atom; } ep += n; break; @@ -2869,7 +2951,7 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) result++; break; case ATOM_DEF: - if (dflags & DFLAGS_INTERNAL_TAGS) { + if (dflags & DFLAG_INTERNAL_TAGS) { if (atom_val(obj) >= (1<<16)) { result += 1 + 3; } @@ -2878,17 +2960,22 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) } } else { - int alen = atom_tab(atom_val(obj))->len; - if ((MAX_ATOM_LENGTH <= 255 || alen <= 255) - && (dflags & DFLAG_SMALL_ATOM_TAGS)) { - /* Make sure a SMALL_ATOM_EXT fits: SMALL_ATOM_EXT l t1 t2... */ - result += 1 + 1 + alen; + Atom *a = atom_tab(atom_val(obj)); + int alen; + if ((dflags & DFLAG_UTF8_ATOMS) || a->latin1_chars < 0) { + alen = a->len; + result += 1 + 1 + alen; + if (alen > 255) { + result++; /* ATOM_UTF8_EXT (not small) */ + } } else { - /* Make sure an ATOM_EXT fits: ATOM_EXT l1 l0 t1 t2... */ - result += 1 + 2 + alen; + alen = a->latin1_chars; + result += 1 + 1 + alen; + if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS)) + result++; /* ATOM_EXT (not small) */ } - insert_acache_map(acmp, obj); + insert_acache_map(acmp, obj, dflags); } break; case SMALL_DEF: @@ -2969,7 +3056,7 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) } break; case BINARY_DEF: - if (dflags & DFLAGS_INTERNAL_TAGS) { + if (dflags & DFLAG_INTERNAL_TAGS) { ProcBin* pb = (ProcBin*) binary_val(obj); Uint sub_extra = 0; Uint tot_bytes = pb->size; @@ -3058,6 +3145,17 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) return result; } +static int is_valid_utf8_atom(byte* bytes, Uint nbytes) +{ + byte* err_pos; + Uint num_chars; + + /*SVERK Do we really need to validate correct utf8? */ + return nbytes <= MAX_ATOM_SZ_LIMIT + && erts_analyze_utf8(bytes, nbytes, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK + && num_chars <= MAX_ATOM_CHARACTERS; +} + static Sint decoded_size(byte *ep, byte* endp, int internal_tags) { @@ -3125,21 +3223,41 @@ decoded_size(byte *ep, byte* endp, int internal_tags) case ATOM_EXT: CHKSIZE(2); n = get_int16(ep); - if (n > MAX_ATOM_LENGTH) { + if (n > MAX_ATOM_CHARACTERS) { return -1; } SKIP(n+2+atom_extra_skip); atom_extra_skip = 0; break; + case ATOM_UTF8_EXT: + CHKSIZE(2); + n = get_int16(ep); + ep += 2; + if (!is_valid_utf8_atom(ep, n)) { + return -1; + } + SKIP(n+atom_extra_skip); + atom_extra_skip = 0; + break; case SMALL_ATOM_EXT: CHKSIZE(1); n = get_int8(ep); - if (n > MAX_ATOM_LENGTH) { + if (n > MAX_ATOM_CHARACTERS) { return -1; } SKIP(n+1+atom_extra_skip); atom_extra_skip = 0; break; + case SMALL_ATOM_UTF8_EXT: + CHKSIZE(1); + n = get_int8(ep); + ep++; + if (!is_valid_utf8_atom(ep, n)) { + return -1; + } + SKIP(n+atom_extra_skip); + atom_extra_skip = 0; + break; case ATOM_CACHE_REF: SKIP(1+atom_extra_skip); atom_extra_skip = 0; |