diff options
author | Sverker Eriksson <sverker@erlang.org> | 2012-12-14 19:52:59 +0100 |
---|---|---|
committer | Sverker Eriksson <sverker@erlang.org> | 2013-01-08 11:15:01 +0100 |
commit | a9a57385f0a5648bbfeccc5e8ef0bfe2cdac80c7 (patch) | |
tree | b7fd2766b020c37a68b29b0aac47ace685841349 /erts | |
parent | 81a5f0f91ce50416d44f48752cb2b7f4ae02d953 (diff) | |
download | otp-a9a57385f0a5648bbfeccc5e8ef0bfe2cdac80c7.tar.gz otp-a9a57385f0a5648bbfeccc5e8ef0bfe2cdac80c7.tar.bz2 otp-a9a57385f0a5648bbfeccc5e8ef0bfe2cdac80c7.zip |
erts: Change internal representation of atoms to utf8
Diffstat (limited to 'erts')
-rw-r--r-- | erts/emulator/beam/atom.c | 61 | ||||
-rw-r--r-- | erts/emulator/beam/atom.h | 51 | ||||
-rw-r--r-- | erts/emulator/beam/beam_load.c | 6 | ||||
-rw-r--r-- | erts/emulator/beam/bif.c | 36 | ||||
-rw-r--r-- | erts/emulator/beam/dist.h | 1 | ||||
-rw-r--r-- | erts/emulator/beam/erl_alloc.c | 6 | ||||
-rw-r--r-- | erts/emulator/beam/erl_alloc_util.c | 4 | ||||
-rw-r--r-- | erts/emulator/beam/erl_nif.c | 8 | ||||
-rw-r--r-- | erts/emulator/beam/erl_unicode.c | 174 | ||||
-rw-r--r-- | erts/emulator/beam/external.c | 127 | ||||
-rw-r--r-- | erts/emulator/beam/external.h | 2 | ||||
-rwxr-xr-x | erts/emulator/beam/global.h | 1 | ||||
-rw-r--r-- | erts/emulator/test/bif_SUITE.erl | 4 |
13 files changed, 312 insertions, 169 deletions
diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c index d7c7f117cf..b41a98f2a2 100644 --- a/erts/emulator/beam/atom.c +++ b/erts/emulator/beam/atom.c @@ -111,7 +111,7 @@ atom_text_alloc(int bytes) { byte *res; - ASSERT(bytes <= MAX_ATOM_LENGTH); + ASSERT(bytes <= MAX_ATOM_SZ_LIMIT); if (atom_text_pos + bytes >= atom_text_end) { more_atom_space(); } @@ -198,7 +198,11 @@ am_atom_put(const char* name, int len) Atom a; Eterm ret; int aix; - +#ifdef DEBUG + byte* err_pos; + Uint num_chars; + ASSERT(erts_analyze_utf8(name, len, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK); +#endif /* * Silently truncate the atom if it is too long. Overlong atoms * could occur in situations where we have no good way to return @@ -209,8 +213,8 @@ am_atom_put(const char* name, int len) * list_to_atom/1), the caller should check the length before * calling this function. */ - if (len > MAX_ATOM_LENGTH) { - len = MAX_ATOM_LENGTH; + if (len > MAX_ATOM_SZ_LIMIT) { + len = MAX_ATOM_SZ_LIMIT; /*SVERK Urk... */ } #ifdef ERTS_ATOM_PUT_OPS_STAT erts_smp_atomic_inc_nob(&atom_put_ops); @@ -230,6 +234,49 @@ am_atom_put(const char* name, int len) return ret; } +static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp) +{ + byte* dst; + const byte* src = *srcp; + int i, len = *lenp; + + for (i=0 ; i < len; ++i) { + if (src[i] & 0x80) { + goto need_convertion; + } + } + return; + +need_convertion: + sys_memcpy(conv_buf, src, i); + dst = conv_buf + i; + for ( ; i < len; ++i) { + unsigned char chr = src[i]; + if (!(chr & 0x80)) { + *dst++ = chr; + } + else { + *dst++ = 0xC0 | (chr >> 6); + *dst++ = 0x80 | (chr & 0x3F); + } + } + *srcp = conv_buf; + *lenp = dst - conv_buf; +} + + +Eterm +am_atom_put2(const byte* name, int len, int is_latin1) +{ + byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1]; + + if (is_latin1) { + latin1_to_utf8(utf8_copy, &name, &len); + } + return am_atom_put((const char*)name, len); +} + + int atom_table_size(void) { @@ -264,14 +311,18 @@ int atom_table_sz(void) } int -erts_atom_get(const char *name, int len, Eterm* ap) +erts_atom_get(const char *name, int len, Eterm* ap, int is_latin1) { + byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1]; Atom a; int i; int res; a.len = len; a.name = (byte *)name; + if (is_latin1) { + latin1_to_utf8(utf8_copy, (const byte**)&a.name, &a.len); + } atom_read_lock(); i = index_get(&erts_atom_table, (void*) &a); res = i < 0 ? 0 : (*ap = make_atom(i), 1); diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h index fd9c04d3d0..84dd6d8901 100644 --- a/erts/emulator/beam/atom.h +++ b/erts/emulator/beam/atom.h @@ -26,7 +26,9 @@ #include "erl_atom_table.h" -#define MAX_ATOM_LENGTH 255 +#define MAX_ATOM_CHARACTERS 255 +#define MAX_ATOM_SZ_FROM_LATIN1 (2*MAX_ATOM_CHARACTERS) +#define MAX_ATOM_SZ_LIMIT (4*MAX_ATOM_CHARACTERS) /* theoretical byte limit */ #define ATOM_LIMIT (1024*1024) #define MIN_ATOM_TABLE_SIZE 8192 @@ -53,8 +55,8 @@ typedef struct atom { extern IndexTable erts_atom_table; ERTS_GLB_INLINE Atom* atom_tab(Uint i); -ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term); -ERTS_GLB_INLINE int erts_is_atom_str(char *str, Eterm term); +ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term); +ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1); #if ERTS_GLB_INLINE_INCL_FUNC_DEF ERTS_GLB_INLINE Atom* @@ -63,7 +65,7 @@ atom_tab(Uint i) return (Atom *) erts_index_lookup(&erts_atom_table, i); } -ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term) +ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term) { Atom *a; if (!is_atom(term)) @@ -73,30 +75,50 @@ ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term) && sys_memcmp((void *) a->name, (void *) text, len) == 0); } -ERTS_GLB_INLINE int erts_is_atom_str(char *str, Eterm term) +ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1) { Atom *a; int i, len; - char *aname; + const byte* aname; + const byte* s = (const byte*) str; + if (!is_atom(term)) return 0; a = atom_tab(atom_val(term)); len = a->len; - aname = (char *) a->name; - for (i = 0; i < len; i++) - if (aname[i] != str[i] || str[i] == '\0') - return 0; - return str[len] == '\0'; + aname = a->name; + if (is_latin1) { + for (i = 0; i < len; s++) { + if (aname[i] < 0x80) { + if (aname[i] != *s || *s == '\0') + return 0; + i++; + } + else { + if (aname[i] != (0xC0 | (*s >> 6)) || + aname[i+1] != (0x80 | (*s & 0x3F))) { + return 0; + } + i += 2; + } + } + } + else { + for (i = 0; i < len; i++, s++) + if (aname[i] != *s || *s == '\0') + return 0; + } + return *s == '\0'; } #endif /* * Note, ERTS_IS_ATOM_STR() expects the first argument to be a - * string literal. + * 7-bit ASCII string literal. */ #define ERTS_IS_ATOM_STR(LSTR, TERM) \ - (erts_is_atom_bytes((byte *) LSTR, sizeof(LSTR) - 1, (TERM))) + (erts_is_atom_utf8_bytes((byte *) LSTR, sizeof(LSTR) - 1, (TERM))) #define ERTS_DECL_AM(S) Eterm AM_ ## S = am_atom_put(#S, sizeof(#S) - 1) #define ERTS_INIT_AM(S) AM_ ## S = am_atom_put(#S, sizeof(#S) - 1) @@ -104,12 +126,13 @@ int atom_table_size(void); /* number of elements */ int atom_table_sz(void); /* table size in bytes, excluding stored objects */ Eterm am_atom_put(const char*, int); /* most callers pass plain char*'s */ +Eterm am_atom_put2(const byte*, int, int is_latin1); int atom_erase(byte*, int); int atom_static_put(byte*, int); void init_atom_table(void); void atom_info(int, void *); void dump_atoms(int, void *); -int erts_atom_get(const char* name, int len, Eterm* ap); +int erts_atom_get(const char* name, int len, Eterm* ap, int is_latin1); void erts_atom_get_text_space_sizes(Uint *reserved, Uint *used); #endif diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c index b51f076a5d..7bb964d5aa 100644 --- a/erts/emulator/beam/beam_load.c +++ b/erts/emulator/beam/beam_load.c @@ -1230,7 +1230,7 @@ load_atom_table(LoaderState* stp) GetByte(stp, n); GetString(stp, atom, n); - stp->atom[i] = am_atom_put((char*)atom, n); + stp->atom[i] = am_atom_put2(atom, n, 1); } /* @@ -1240,7 +1240,7 @@ load_atom_table(LoaderState* stp) if (is_nil(stp->module)) { stp->module = stp->atom[1]; } else if (stp->atom[1] != stp->module) { - char sbuf[256]; + char sbuf[MAX_ATOM_SZ_FROM_LATIN1]; Atom* ap; ap = atom_tab(atom_val(stp->atom[1])); @@ -1620,7 +1620,7 @@ read_line_table(LoaderState* stp) GetInt(stp, 2, n); GetString(stp, fname, n); - stp->fname[i] = am_atom_put((char*)fname, n); + stp->fname[i] = am_atom_put((char*)fname, n); /*SVERK ? */ } } diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c index 1cdce49eef..89157068c0 100644 --- a/erts/emulator/beam/bif.c +++ b/erts/emulator/beam/bif.c @@ -2536,9 +2536,11 @@ BIF_RETTYPE append_element_2(BIF_ALIST_2) BIF_RETTYPE atom_to_list_1(BIF_ALIST_1) { - Uint need; - Eterm* hp; + Eterm do_utf8_to_list(Process*, Uint num, byte *bytes, Uint sz, Uint left, + Uint *num_built, Uint *num_eaten, Eterm tail); /*SVERK */ Atom* ap; + Uint num_chars, num_built, num_eaten; + Eterm res; if (is_not_atom(BIF_ARG_1)) BIF_ERROR(BIF_P, BADARG); @@ -2547,9 +2549,19 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1) ap = atom_tab(atom_val(BIF_ARG_1)); if (ap->len == 0) BIF_RET(NIL); /* the empty atom */ - need = ap->len*2; - hp = HAlloc(BIF_P, need); - BIF_RET(buf_to_intlist(&hp,(char*)ap->name,ap->len, NIL)); + { + byte* err_pos; + if (erts_analyze_utf8(ap->name, ap->len, &err_pos, &num_chars, NULL) + != ERTS_UTF8_OK) { + BIF_ERROR(BIF_P, BADARG); + } + } + + res = do_utf8_to_list(BIF_P, num_chars, ap->name, ap->len, ap->len, + &num_built, &num_eaten, NIL); + ASSERT(num_built == num_chars); + ASSERT(num_eaten == ap->len); + BIF_RET(res); } /**********************************************************************/ @@ -2559,18 +2571,18 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1) BIF_RETTYPE list_to_atom_1(BIF_ALIST_1) { Eterm res; - char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_LENGTH); - int i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_LENGTH); + char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS); + int i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS); if (i < 0) { erts_free(ERTS_ALC_T_TMP, (void *) buf); i = list_length(BIF_ARG_1); - if (i > MAX_ATOM_LENGTH) { + if (i > MAX_ATOM_CHARACTERS) { BIF_ERROR(BIF_P, SYSTEM_LIMIT); } BIF_ERROR(BIF_P, BADARG); } - res = am_atom_put(buf, i); + res = am_atom_put2((byte*)buf, i, 1); erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_RET(res); } @@ -2580,16 +2592,16 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1) BIF_RETTYPE list_to_existing_atom_1(BIF_ALIST_1) { int i; - char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_LENGTH); + char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS); - if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_LENGTH)) < 0) { + if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS)) < 0) { error: erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_ERROR(BIF_P, BADARG); } else { Eterm a; - if (erts_atom_get(buf, i, &a)) { + if (erts_atom_get(buf, i, &a, 1)) { erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_RET(a); } else { diff --git a/erts/emulator/beam/dist.h b/erts/emulator/beam/dist.h index 0db7d56cb2..129f637dba 100644 --- a/erts/emulator/beam/dist.h +++ b/erts/emulator/beam/dist.h @@ -39,6 +39,7 @@ #define DFLAG_DIST_HDR_ATOM_CACHE 0x2000 #define DFLAG_SMALL_ATOM_TAGS 0x4000 #define DFLAG_INTERNAL_TAGS 0x8000 +#define DFLAG_UTF8_ATOMS 0x10000 /* All flags that should be enabled when term_to_binary/1 is used. */ #define TERM_TO_BINARY_DFLAGS (DFLAG_EXTENDED_REFERENCES \ diff --git a/erts/emulator/beam/erl_alloc.c b/erts/emulator/beam/erl_alloc.c index 3eee53eba3..061f229f59 100644 --- a/erts/emulator/beam/erl_alloc.c +++ b/erts/emulator/beam/erl_alloc.c @@ -3045,13 +3045,13 @@ erts_request_alloc_info(struct process *c_p, Eterm alloc = CAR(consp); for (ai = ERTS_ALC_A_MIN; ai <= ERTS_ALC_A_MAX; ai++) - if (erts_is_atom_str((char *) erts_alc_a2ad[ai], alloc)) + if (erts_is_atom_str(erts_alc_a2ad[ai], alloc, 0)) goto save_alloc; - if (erts_is_atom_str("mseg_alloc", alloc)) { + if (erts_is_atom_str("mseg_alloc", alloc, 0)) { ai = ERTS_ALC_INFO_A_MSEG_ALLOC; goto save_alloc; } - if (erts_is_atom_str("alloc_util", alloc)) { + if (erts_is_atom_str("alloc_util", alloc, 0)) { ai = ERTS_ALC_INFO_A_ALLOC_UTIL; save_alloc: if (req_ai[ai]) diff --git a/erts/emulator/beam/erl_alloc_util.c b/erts/emulator/beam/erl_alloc_util.c index 97ba306a79..f8a8c00715 100644 --- a/erts/emulator/beam/erl_alloc_util.c +++ b/erts/emulator/beam/erl_alloc_util.c @@ -2815,10 +2815,10 @@ make_name_atoms(Allctr_t *allctr) char alloc[] = "alloc"; char realloc[] = "realloc"; char free[] = "free"; - char buf[MAX_ATOM_LENGTH]; + char buf[MAX_ATOM_CHARACTERS]; size_t prefix_len = strlen(allctr->name_prefix); - if (prefix_len > MAX_ATOM_LENGTH + sizeof(realloc) - 1) + if (prefix_len > MAX_ATOM_CHARACTERS + sizeof(realloc) - 1) erl_exit(1,"Too long allocator name: %salloc\n",allctr->name_prefix); memcpy((void *) buf, (void *) allctr->name_prefix, prefix_len); diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c index 632d756481..185ac75d73 100644 --- a/erts/emulator/beam/erl_nif.c +++ b/erts/emulator/beam/erl_nif.c @@ -974,7 +974,7 @@ int enif_make_existing_atom_len(ErlNifEnv* env, const char* name, size_t len, ERL_NIF_TERM* atom, ErlNifCharEncoding encoding) { ASSERT(encoding == ERL_NIF_LATIN1); - return erts_atom_get(name, len, atom); + return erts_atom_get(name, len, atom, 1); } ERL_NIF_TERM enif_make_tuple(ErlNifEnv* env, unsigned cnt, ...) @@ -1633,7 +1633,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2) "this vm variant (%s).", entry->vm_variant, ERL_NIF_VM_VARIANT); } - else if (!erts_is_atom_str((char*)entry->name, mod_atom)) { + else if (!erts_is_atom_str((char*)entry->name, mod_atom, 1)) { ret = load_nif_error(BIF_P, bad_lib, "Library module name '%s' does not" " match calling module '%T'", entry->name, mod_atom); } @@ -1643,7 +1643,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2) for (i=0; i < entry->num_of_funcs && ret==am_ok; i++) { BeamInstr** code_pp; ErlNifFunc* f = &entry->funcs[i]; - if (!erts_atom_get(f->name, sys_strlen(f->name), &f_atom) + if (!erts_atom_get(f->name, sys_strlen(f->name), &f_atom, 1) || (code_pp = get_func_pp(mod->curr.code, f_atom, f->arity))==NULL) { ret = load_nif_error(BIF_P,bad_lib,"Function not found %T:%s/%u", mod_atom, f->name, f->arity); @@ -1746,7 +1746,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2) for (i=0; i < entry->num_of_funcs; i++) { BeamInstr* code_ptr; - erts_atom_get(entry->funcs[i].name, sys_strlen(entry->funcs[i].name), &f_atom); + erts_atom_get(entry->funcs[i].name, sys_strlen(entry->funcs[i].name), &f_atom, 1); code_ptr = *get_func_pp(mod->curr.code, f_atom, entry->funcs[i].arity); if (code_ptr[1] == 0) { diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index 51559aea1c..e24b6f1458 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -1155,7 +1155,7 @@ BIF_RETTYPE unicode_characters_to_list_2(BIF_ALIST_2) * a faster analyze and size count with this function. */ int erts_analyze_utf8(byte *source, Uint size, - byte **err_pos, Uint *num_chars, int *left) + byte **err_pos, Uint *num_chars, int *left) { *err_pos = source; *num_chars = 0; @@ -1210,7 +1210,7 @@ int erts_analyze_utf8(byte *source, Uint size, } ++(*num_chars); *err_pos = source; - if (left && --(*left) <= 0) { + if (left && --(*left) <= 0 && size) { return ERTS_UTF8_ANALYZE_MORE; } } @@ -1220,7 +1220,7 @@ int erts_analyze_utf8(byte *source, Uint size, /* * No errors should be able to occur - no overlongs, no malformed, no nothing */ -static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, +Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left, Uint *num_built, Uint *num_eaten, Eterm tail) { @@ -1812,31 +1812,36 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2) ap = atom_tab(atom_val(BIF_ARG_1)); if (BIF_ARG_2 == am_latin1) { - BIF_RET(new_binary(BIF_P, ap->name, ap->len)); - } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) { - int bin_size = 0; int i; Eterm bin_term; - byte* bin_p; - - for (i = 0; i < ap->len; i++) { - bin_size += (ap->name[i] >= 0x80) ? 2 : 1; + int bin_size = ap->len; + + for (i = 0; i < ap->len; ) { + if (ap->name[i] < 0x80) i++; + else { + ASSERT(ap->name[i] >= 0xC0); + if (ap->name[i] < 0xE0) { + ASSERT(i+1 < ap->len && (ap->name[i+1] & 0xC0) == 0x80); + i += 2; + bin_size -= 1; + } + else goto error; + } } if (bin_size == ap->len) { - BIF_RET(new_binary(BIF_P, ap->name, ap->len)); + bin_term = new_binary(BIF_P, ap->name, ap->len); } - bin_term = new_binary(BIF_P, 0, bin_size); - bin_p = binary_bytes(bin_term); - for (i = 0; i < ap->len; i++) { - byte b = ap->name[i]; - if (b < 0x80) { - *bin_p++ = b; - } else { - *bin_p++ = 0xC0 | (b >> 6); - *bin_p++ = 0x80 | (b & 0x3F); - } + else { + byte* bin_p; + int dbg_sz; + bin_term = new_binary(BIF_P, 0, bin_size); + bin_p = binary_bytes(bin_term); + dbg_sz = erts_utf8_to_latin1(bin_p, ap->name, ap->len); + ASSERT(dbg_sz == bin_size); (void)dbg_sz; } BIF_RET(bin_term); + } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) { + BIF_RET(new_binary(BIF_P, ap->name, ap->len)); } else { error: BIF_ERROR(BIF_P, BADARG); @@ -1856,102 +1861,52 @@ binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist) bin_size = binary_size(bin); if (enc == am_latin1) { Eterm a; - if (bin_size > MAX_ATOM_LENGTH) { + if (bin_size > MAX_ATOM_CHARACTERS) { system_limit: erts_free_aligned_binary_bytes(temp_alloc); BIF_ERROR(p, SYSTEM_LIMIT); } if (!must_exist) { - a = am_atom_put((char *)bytes, bin_size); + a = am_atom_put2(bytes, bin_size, 1); erts_free_aligned_binary_bytes(temp_alloc); BIF_RET(a); - } else if (erts_atom_get((char *)bytes, bin_size, &a)) { + } else if (erts_atom_get((char *)bytes, bin_size, &a, 1)) { erts_free_aligned_binary_bytes(temp_alloc); BIF_RET(a); } else { goto badarg; } } else if (enc == am_utf8 || enc == am_unicode) { - char *buf; - char *dst; - int i; - int num_chars; Eterm res; + Uint num_chars = 0; + const byte* p = bytes; + Uint left = bin_size; - if (bin_size > 2*MAX_ATOM_LENGTH) { - byte* err_pos; - Uint n; - int reds_left = bin_size+1; /* Number of reductions left. */ - - if (erts_analyze_utf8(bytes, bin_size, &err_pos, - &n, &reds_left) == ERTS_UTF8_OK) { - /* - * Correct UTF-8 encoding, but too many characters to - * fit in an atom. - */ + while (left) { + if (++num_chars > MAX_ATOM_CHARACTERS) { goto system_limit; - } else { - /* - * Something wrong in the UTF-8 encoding or Unicode code - * points > 255. - */ - goto badarg; } - } - - /* - * Allocate a temporary buffer the same size as the binary, - * so that we don't need an extra overflow test. - */ - buf = (char *) erts_alloc(ERTS_ALC_T_TMP, bin_size); - dst = buf; - for (i = 0; i < bin_size; i++) { - int c = bytes[i]; - if (c < 0x80) { - *dst++ = c; - } else if (i < bin_size-1) { - int c2; - if ((c & 0xE0) != 0xC0) { - goto free_badarg; - } - i++; - c = (c & 0x3F) << 6; - c2 = bytes[i]; - if ((c2 & 0xC0) != 0x80) { - goto free_badarg; - } - c = c | (c2 & 0x3F); - if (0x80 <= c && c < 256) { - *dst++ = c; - } else { - goto free_badarg; - } - } else { - free_badarg: - erts_free(ERTS_ALC_T_TMP, (void *) buf); - goto badarg; + if ((p[0] & 0x80) == 0) { + ++p; + --left; } + else if (left >= 2 + && (p[0] & 0xFE) == 0xC2 /* only allow latin1 subset */ + && (p[1] & 0xC0) == 0x80) { + p += 2; + left -= 2; + } + else goto badarg; } - num_chars = dst - buf; - if (num_chars > MAX_ATOM_LENGTH) { - erts_free(ERTS_ALC_T_TMP, (void *) buf); - goto system_limit; - } + if (!must_exist) { - res = am_atom_put(buf, num_chars); - erts_free(ERTS_ALC_T_TMP, (void *) buf); - erts_free_aligned_binary_bytes(temp_alloc); - BIF_RET(res); - } else { - int exists = erts_atom_get(buf, num_chars, &res); - erts_free(ERTS_ALC_T_TMP, (void *) buf); - if (exists) { - erts_free_aligned_binary_bytes(temp_alloc); - BIF_RET(res); - } else { - goto badarg; - } + res = am_atom_put((char*)bytes, bin_size); + } + else if (!erts_atom_get((char*)bytes, bin_size, &res, 0)) { + goto badarg; } + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(res); } else { badarg: erts_free_aligned_binary_bytes(temp_alloc); @@ -2670,3 +2625,30 @@ BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0) } } +/* Assumes 'dest' has enough room. + */ +int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen) +{ + byte* dp = dest; + while (slen > 0) { + if ((source[0] & 0x80) == 0) { + *dp++ = *source++; + --slen; + } + else if (slen > 1 && + (source[0] & 0xFE) == 0xC2 && + (source[1] & 0xC0) == 0x80) { + *dp++ = (char) ((source[0] << 6) | (source[1] & 0x3F)); + source += 2; + slen -= 2; + } + else { + /* Just let unconvertable octets through. This should not happen + in a correctly upgraded system */ + *dp++ = *source++; + --slen; + } + } + return dp - dest; +} + diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c index 263ffc4eb3..68edcd0fa6 100644 --- a/erts/emulator/beam/external.c +++ b/erts/emulator/beam/external.c @@ -707,7 +707,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, if (cix >= ERTS_ATOM_CACHE_SIZE) ERTS_EXT_HDR_FAIL; ep++; -#if MAX_ATOM_LENGTH > 255 +#if MAX_ATOM_CHARACTERS > 255 if (long_atoms) { CHKSIZE(2); len = get_int16(ep); @@ -720,7 +720,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, len = get_int8(ep); ep++; } - if (len > MAX_ATOM_LENGTH) + if (len > MAX_ATOM_CHARACTERS) ERTS_EXT_HDR_FAIL; /* Too long atom */ CHKSIZE(len); atom = am_atom_put((char *) ep, len); @@ -1431,18 +1431,33 @@ enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags) if (iix < 0) { i = atom_val(atom); j = atom_tab(i)->len; - if ((MAX_ATOM_LENGTH <= 255 || j <= 255) - && (dflags & DFLAG_SMALL_ATOM_TAGS)) { - *ep++ = SMALL_ATOM_EXT; - put_int8(j, ep); - ep++; + if (dflags & DFLAG_UTF8_ATOMS) { + if (j <= 255) { + *ep++ = ATOM_UTF8_EXT; + put_int16(j, ep); + ep += 2; + } + else { + *ep++ = SMALL_ATOM_UTF8_EXT; + put_int8(j, ep); + ep += 2; + } + sys_memcpy((char *) ep, (char*)atom_tab(i)->name, (int) j); } else { - *ep++ = ATOM_EXT; - put_int16(j, ep); - ep += 2; + if (j <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) { + *ep++ = SMALL_ATOM_EXT; + j = erts_utf8_to_latin1(ep+1, atom_tab(i)->name, j); + put_int8(j, ep); + ep++; + } + else { + *ep++ = ATOM_EXT; + j = erts_utf8_to_latin1(ep+2, atom_tab(i)->name, j); + put_int16(j, ep); + ep += 2; + } } - sys_memcpy((char *) ep, (char*)atom_tab(i)->name, (int) j); ep += j; return ep; } @@ -1482,7 +1497,7 @@ static byte* dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp) { Uint len; - int n; + int n, is_latin1; switch (*ep++) { case ATOM_CACHE_REF: @@ -1498,17 +1513,29 @@ dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp) case ATOM_EXT: len = get_int16(ep), ep += 2; + is_latin1 = 1; goto dec_atom_common; case SMALL_ATOM_EXT: len = get_int8(ep); ep++; + is_latin1 = 1; + goto dec_atom_common; + case ATOM_UTF8_EXT: + len = get_int16(ep), + ep += 2; + is_latin1 = 0; + goto dec_atom_common; + case SMALL_ATOM_UTF8_EXT: + len = get_int8(ep), + ep++; + is_latin1 = 0; dec_atom_common: if (edep && (edep->flags & ERTS_DIST_EXT_BTT_SAFE)) { - if (!erts_atom_get((char*)ep, len, objp)) { + if (!erts_atom_get((char*)ep, len, objp, is_latin1)) { goto error; } } else { - *objp = am_atom_put((char*)ep, len); + *objp = am_atom_put2(ep, len, is_latin1); } ep += len; break; @@ -2113,7 +2140,7 @@ static byte* dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Eterm* objp) { Eterm* hp_saved = *hpp; - int n; + int n, is_latin1; register Eterm* hp = *hpp; /* Please don't take the address of hp */ Eterm* next = objp; @@ -2199,17 +2226,29 @@ dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Et case ATOM_EXT: n = get_int16(ep); ep += 2; - goto dec_term_atom_common; + is_latin1 = 1; + goto dec_term_atom_common; case SMALL_ATOM_EXT: n = get_int8(ep); ep++; + is_latin1 = 1; + goto dec_term_atom_common; + case ATOM_UTF8_EXT: + n = get_int16(ep); + ep += 2; + is_latin1 = 0; + goto dec_term_atom_common; + case SMALL_ATOM_UTF8_EXT: + n = get_int8(ep); + ep++; + is_latin1 = 0; dec_term_atom_common: if (edep && (edep->flags & ERTS_DIST_EXT_BTT_SAFE)) { - if (!erts_atom_get((char*)ep, n, objp)) { + if (!erts_atom_get((char*)ep, n, objp, is_latin1)) { goto error; } } else { - *objp = am_atom_put((char*)ep, n); + *objp = am_atom_put2(ep, n, is_latin1); } ep += n; break; @@ -2879,14 +2918,15 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) } else { int alen = atom_tab(atom_val(obj))->len; - if ((MAX_ATOM_LENGTH <= 255 || alen <= 255) - && (dflags & DFLAG_SMALL_ATOM_TAGS)) { - /* Make sure a SMALL_ATOM_EXT fits: SMALL_ATOM_EXT l t1 t2... */ - result += 1 + 1 + alen; - } - else { - /* Make sure an ATOM_EXT fits: ATOM_EXT l1 l0 t1 t2... */ - result += 1 + 2 + alen; + result += 1 + 1 + alen; + if (dflags & DFLAG_UTF8_ATOMS) { + if (alen > 255) { + result++; /* ATOM_UTF8_EXT (not small) */ + } + /*SVERK we use utf8 length which is an over estimation */ + } + else if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS)) { + result++; /* ATOM_EXT (not small) */ } insert_acache_map(acmp, obj); } @@ -3058,6 +3098,17 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) return result; } +static int is_valid_utf8_atom(byte* bytes, Uint nbytes) +{ + byte* err_pos; + Uint num_chars; + + /*SVERK Do we really need to validate correct utf8? */ + return nbytes <= MAX_ATOM_SZ_LIMIT + && erts_analyze_utf8(bytes, nbytes, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK + && num_chars <= MAX_ATOM_CHARACTERS; +} + static Sint decoded_size(byte *ep, byte* endp, int internal_tags) { @@ -3125,21 +3176,41 @@ decoded_size(byte *ep, byte* endp, int internal_tags) case ATOM_EXT: CHKSIZE(2); n = get_int16(ep); - if (n > MAX_ATOM_LENGTH) { + if (n > MAX_ATOM_CHARACTERS) { return -1; } SKIP(n+2+atom_extra_skip); atom_extra_skip = 0; break; + case ATOM_UTF8_EXT: + CHKSIZE(2); + n = get_int16(ep); + ep += 2; + if (!is_valid_utf8_atom(ep, n)) { + return -1; + } + SKIP(n+atom_extra_skip); + atom_extra_skip = 0; + break; case SMALL_ATOM_EXT: CHKSIZE(1); n = get_int8(ep); - if (n > MAX_ATOM_LENGTH) { + if (n > MAX_ATOM_CHARACTERS) { return -1; } SKIP(n+1+atom_extra_skip); atom_extra_skip = 0; break; + case SMALL_ATOM_UTF8_EXT: + CHKSIZE(1); + n = get_int8(ep); + ep++; + if (!is_valid_utf8_atom(ep, n)) { + return -1; + } + SKIP(n+atom_extra_skip); + atom_extra_skip = 0; + break; case ATOM_CACHE_REF: SKIP(1+atom_extra_skip); atom_extra_skip = 0; diff --git a/erts/emulator/beam/external.h b/erts/emulator/beam/external.h index eddd4571dd..50eea62225 100644 --- a/erts/emulator/beam/external.h +++ b/erts/emulator/beam/external.h @@ -51,6 +51,8 @@ #define NEW_FUN_EXT 'p' #define EXPORT_EXT 'q' #define FUN_EXT 'u' +#define ATOM_UTF8_EXT 'v' +#define SMALL_ATOM_UTF8_EXT 'w' #define DIST_HEADER 'D' #define ATOM_CACHE_REF 'R' diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h index 4c0d3421c8..1500424d3e 100755 --- a/erts/emulator/beam/global.h +++ b/erts/emulator/beam/global.h @@ -1525,6 +1525,7 @@ char *erts_convert_filename_to_native(Eterm name, char *statbuf, int allow_empty, int allow_atom, Sint *used /* out */); Eterm erts_convert_native_to_filename(Process *p, byte *bytes); +int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen); #define ERTS_UTF8_OK 0 #define ERTS_UTF8_INCOMPLETE 1 #define ERTS_UTF8_ERROR 2 diff --git a/erts/emulator/test/bif_SUITE.erl b/erts/emulator/test/bif_SUITE.erl index e2442861c7..02c6de8cb1 100644 --- a/erts/emulator/test/bif_SUITE.erl +++ b/erts/emulator/test/bif_SUITE.erl @@ -481,8 +481,6 @@ binary_to_atom(Config) when is_list(Config) -> %% Bad UTF8 sequences. ?line ?BADARG(binary_to_atom(id(<<255>>), utf8)), ?line ?BADARG(binary_to_atom(id(<<255,0>>), utf8)), - ?line ?BADARG(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)), - ?line ?BADARG(binary_to_atom(id(<<0:512/unit:8,255,0>>), utf8)), ?line ?BADARG(binary_to_atom(id(<<16#C0,16#80>>), utf8)), %Overlong 0. ?line [?BADARG(binary_to_atom(<<C/utf8>>, utf8)) || C <- lists:seq(256, 16#D7FF)], @@ -494,6 +492,8 @@ binary_to_atom(Config) when is_list(Config) -> C <- lists:seq(16#90000, 16#10FFFF)], %% system_limit failures. + ?line ?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)), + ?line ?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255,0>>), utf8)), ?line ?SYS_LIMIT(binary_to_atom(<<0:256/unit:8>>, latin1)), ?line ?SYS_LIMIT(binary_to_atom(<<0:257/unit:8>>, latin1)), ?line ?SYS_LIMIT(binary_to_atom(<<0:512/unit:8>>, latin1)), |