From 81a5f0f91ce50416d44f48752cb2b7f4ae02d953 Mon Sep 17 00:00:00 2001 From: Sverker Eriksson Date: Tue, 8 Jan 2013 11:06:44 +0100 Subject: erts: Refactor rename DFLAG(S)_INTERNAL_TAGS for conformity --- erts/emulator/beam/dist.h | 2 +- erts/emulator/beam/external.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/erts/emulator/beam/dist.h b/erts/emulator/beam/dist.h index 845151c895..0db7d56cb2 100644 --- a/erts/emulator/beam/dist.h +++ b/erts/emulator/beam/dist.h @@ -38,7 +38,7 @@ #define DFLAG_UNICODE_IO 0x1000 #define DFLAG_DIST_HDR_ATOM_CACHE 0x2000 #define DFLAG_SMALL_ATOM_TAGS 0x4000 -#define DFLAGS_INTERNAL_TAGS 0x8000 +#define DFLAG_INTERNAL_TAGS 0x8000 /* All flags that should be enabled when term_to_binary/1 is used. */ #define TERM_TO_BINARY_DFLAGS (DFLAG_EXTENDED_REFERENCES \ diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c index ab1065aaa1..263ffc4eb3 100644 --- a/erts/emulator/beam/external.c +++ b/erts/emulator/beam/external.c @@ -467,7 +467,7 @@ Uint erts_encode_ext_size_2(Eterm term, unsigned dflags) Uint erts_encode_ext_size_ets(Eterm term) { - return encode_size_struct2(NULL, term, TERM_TO_BINARY_DFLAGS|DFLAGS_INTERNAL_TAGS); + return encode_size_struct2(NULL, term, TERM_TO_BINARY_DFLAGS|DFLAG_INTERNAL_TAGS); } @@ -500,7 +500,7 @@ void erts_encode_ext(Eterm term, byte **ext) byte* erts_encode_ext_ets(Eterm term, byte *ep, struct erl_off_heap_header** off_heap) { - return enc_term(NULL, term, ep, TERM_TO_BINARY_DFLAGS|DFLAGS_INTERNAL_TAGS, + return enc_term(NULL, term, ep, TERM_TO_BINARY_DFLAGS|DFLAG_INTERNAL_TAGS, off_heap); } @@ -1408,7 +1408,7 @@ enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags) ASSERT(is_atom(atom)); - if (dflags & DFLAGS_INTERNAL_TAGS) { + if (dflags & DFLAG_INTERNAL_TAGS) { Uint aval = atom_val(atom); ASSERT(aval < (1<<24)); if (aval >= (1 << 16)) { @@ -1472,7 +1472,7 @@ enc_pid(ErtsAtomCacheMap *acmp, Eterm pid, byte* ep, Uint32 dflags) ep += 4; put_int32(os, ep); ep += 4; - *ep++ = (is_internal_pid(pid) && (dflags & DFLAGS_INTERNAL_TAGS)) ? + *ep++ = (is_internal_pid(pid) && (dflags & DFLAG_INTERNAL_TAGS)) ? INTERNAL_CREATION : pid_creation(pid); return ep; } @@ -1770,7 +1770,7 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags, put_int16(i, ep); ep += 2; ep = enc_atom(acmp,ref_node_name(obj),ep,dflags); - *ep++ = ((dflags & DFLAGS_INTERNAL_TAGS) && is_internal_ref(obj)) ? + *ep++ = ((dflags & DFLAG_INTERNAL_TAGS) && is_internal_ref(obj)) ? INTERNAL_CREATION : ref_creation(obj); ref_num = ref_numbers(obj); for (j = 0; j < i; j++) { @@ -1787,7 +1787,7 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags, j = port_number(obj); put_int32(j, ep); ep += 4; - *ep++ = ((dflags & DFLAGS_INTERNAL_TAGS) && is_internal_port(obj)) ? + *ep++ = ((dflags & DFLAG_INTERNAL_TAGS) && is_internal_port(obj)) ? INTERNAL_CREATION : port_creation(obj); break; @@ -1868,7 +1868,7 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags, byte* bytes; ERTS_GET_BINARY_BYTES(obj, bytes, bitoffs, bitsize); - if (dflags & DFLAGS_INTERNAL_TAGS) { + if (dflags & DFLAG_INTERNAL_TAGS) { ProcBin* pb = (ProcBin*) binary_val(obj); Uint bytesize = pb->size; if (pb->thing_word == HEADER_SUB_BIN) { @@ -2869,7 +2869,7 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) result++; break; case ATOM_DEF: - if (dflags & DFLAGS_INTERNAL_TAGS) { + if (dflags & DFLAG_INTERNAL_TAGS) { if (atom_val(obj) >= (1<<16)) { result += 1 + 3; } @@ -2969,7 +2969,7 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) } break; case BINARY_DEF: - if (dflags & DFLAGS_INTERNAL_TAGS) { + if (dflags & DFLAG_INTERNAL_TAGS) { ProcBin* pb = (ProcBin*) binary_val(obj); Uint sub_extra = 0; Uint tot_bytes = pb->size; -- cgit v1.2.3 From a9a57385f0a5648bbfeccc5e8ef0bfe2cdac80c7 Mon Sep 17 00:00:00 2001 From: Sverker Eriksson Date: Fri, 14 Dec 2012 19:52:59 +0100 Subject: erts: Change internal representation of atoms to utf8 --- erts/emulator/beam/atom.c | 61 +++++++++++-- erts/emulator/beam/atom.h | 51 ++++++++--- erts/emulator/beam/beam_load.c | 6 +- erts/emulator/beam/bif.c | 36 +++++--- erts/emulator/beam/dist.h | 1 + erts/emulator/beam/erl_alloc.c | 6 +- erts/emulator/beam/erl_alloc_util.c | 4 +- erts/emulator/beam/erl_nif.c | 8 +- erts/emulator/beam/erl_unicode.c | 174 ++++++++++++++++-------------------- erts/emulator/beam/external.c | 127 ++++++++++++++++++++------ erts/emulator/beam/external.h | 2 + erts/emulator/beam/global.h | 1 + erts/emulator/test/bif_SUITE.erl | 4 +- 13 files changed, 312 insertions(+), 169 deletions(-) diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c index d7c7f117cf..b41a98f2a2 100644 --- a/erts/emulator/beam/atom.c +++ b/erts/emulator/beam/atom.c @@ -111,7 +111,7 @@ atom_text_alloc(int bytes) { byte *res; - ASSERT(bytes <= MAX_ATOM_LENGTH); + ASSERT(bytes <= MAX_ATOM_SZ_LIMIT); if (atom_text_pos + bytes >= atom_text_end) { more_atom_space(); } @@ -198,7 +198,11 @@ am_atom_put(const char* name, int len) Atom a; Eterm ret; int aix; - +#ifdef DEBUG + byte* err_pos; + Uint num_chars; + ASSERT(erts_analyze_utf8(name, len, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK); +#endif /* * Silently truncate the atom if it is too long. Overlong atoms * could occur in situations where we have no good way to return @@ -209,8 +213,8 @@ am_atom_put(const char* name, int len) * list_to_atom/1), the caller should check the length before * calling this function. */ - if (len > MAX_ATOM_LENGTH) { - len = MAX_ATOM_LENGTH; + if (len > MAX_ATOM_SZ_LIMIT) { + len = MAX_ATOM_SZ_LIMIT; /*SVERK Urk... */ } #ifdef ERTS_ATOM_PUT_OPS_STAT erts_smp_atomic_inc_nob(&atom_put_ops); @@ -230,6 +234,49 @@ am_atom_put(const char* name, int len) return ret; } +static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp) +{ + byte* dst; + const byte* src = *srcp; + int i, len = *lenp; + + for (i=0 ; i < len; ++i) { + if (src[i] & 0x80) { + goto need_convertion; + } + } + return; + +need_convertion: + sys_memcpy(conv_buf, src, i); + dst = conv_buf + i; + for ( ; i < len; ++i) { + unsigned char chr = src[i]; + if (!(chr & 0x80)) { + *dst++ = chr; + } + else { + *dst++ = 0xC0 | (chr >> 6); + *dst++ = 0x80 | (chr & 0x3F); + } + } + *srcp = conv_buf; + *lenp = dst - conv_buf; +} + + +Eterm +am_atom_put2(const byte* name, int len, int is_latin1) +{ + byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1]; + + if (is_latin1) { + latin1_to_utf8(utf8_copy, &name, &len); + } + return am_atom_put((const char*)name, len); +} + + int atom_table_size(void) { @@ -264,14 +311,18 @@ int atom_table_sz(void) } int -erts_atom_get(const char *name, int len, Eterm* ap) +erts_atom_get(const char *name, int len, Eterm* ap, int is_latin1) { + byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1]; Atom a; int i; int res; a.len = len; a.name = (byte *)name; + if (is_latin1) { + latin1_to_utf8(utf8_copy, (const byte**)&a.name, &a.len); + } atom_read_lock(); i = index_get(&erts_atom_table, (void*) &a); res = i < 0 ? 0 : (*ap = make_atom(i), 1); diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h index fd9c04d3d0..84dd6d8901 100644 --- a/erts/emulator/beam/atom.h +++ b/erts/emulator/beam/atom.h @@ -26,7 +26,9 @@ #include "erl_atom_table.h" -#define MAX_ATOM_LENGTH 255 +#define MAX_ATOM_CHARACTERS 255 +#define MAX_ATOM_SZ_FROM_LATIN1 (2*MAX_ATOM_CHARACTERS) +#define MAX_ATOM_SZ_LIMIT (4*MAX_ATOM_CHARACTERS) /* theoretical byte limit */ #define ATOM_LIMIT (1024*1024) #define MIN_ATOM_TABLE_SIZE 8192 @@ -53,8 +55,8 @@ typedef struct atom { extern IndexTable erts_atom_table; ERTS_GLB_INLINE Atom* atom_tab(Uint i); -ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term); -ERTS_GLB_INLINE int erts_is_atom_str(char *str, Eterm term); +ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term); +ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1); #if ERTS_GLB_INLINE_INCL_FUNC_DEF ERTS_GLB_INLINE Atom* @@ -63,7 +65,7 @@ atom_tab(Uint i) return (Atom *) erts_index_lookup(&erts_atom_table, i); } -ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term) +ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term) { Atom *a; if (!is_atom(term)) @@ -73,30 +75,50 @@ ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term) && sys_memcmp((void *) a->name, (void *) text, len) == 0); } -ERTS_GLB_INLINE int erts_is_atom_str(char *str, Eterm term) +ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1) { Atom *a; int i, len; - char *aname; + const byte* aname; + const byte* s = (const byte*) str; + if (!is_atom(term)) return 0; a = atom_tab(atom_val(term)); len = a->len; - aname = (char *) a->name; - for (i = 0; i < len; i++) - if (aname[i] != str[i] || str[i] == '\0') - return 0; - return str[len] == '\0'; + aname = a->name; + if (is_latin1) { + for (i = 0; i < len; s++) { + if (aname[i] < 0x80) { + if (aname[i] != *s || *s == '\0') + return 0; + i++; + } + else { + if (aname[i] != (0xC0 | (*s >> 6)) || + aname[i+1] != (0x80 | (*s & 0x3F))) { + return 0; + } + i += 2; + } + } + } + else { + for (i = 0; i < len; i++, s++) + if (aname[i] != *s || *s == '\0') + return 0; + } + return *s == '\0'; } #endif /* * Note, ERTS_IS_ATOM_STR() expects the first argument to be a - * string literal. + * 7-bit ASCII string literal. */ #define ERTS_IS_ATOM_STR(LSTR, TERM) \ - (erts_is_atom_bytes((byte *) LSTR, sizeof(LSTR) - 1, (TERM))) + (erts_is_atom_utf8_bytes((byte *) LSTR, sizeof(LSTR) - 1, (TERM))) #define ERTS_DECL_AM(S) Eterm AM_ ## S = am_atom_put(#S, sizeof(#S) - 1) #define ERTS_INIT_AM(S) AM_ ## S = am_atom_put(#S, sizeof(#S) - 1) @@ -104,12 +126,13 @@ int atom_table_size(void); /* number of elements */ int atom_table_sz(void); /* table size in bytes, excluding stored objects */ Eterm am_atom_put(const char*, int); /* most callers pass plain char*'s */ +Eterm am_atom_put2(const byte*, int, int is_latin1); int atom_erase(byte*, int); int atom_static_put(byte*, int); void init_atom_table(void); void atom_info(int, void *); void dump_atoms(int, void *); -int erts_atom_get(const char* name, int len, Eterm* ap); +int erts_atom_get(const char* name, int len, Eterm* ap, int is_latin1); void erts_atom_get_text_space_sizes(Uint *reserved, Uint *used); #endif diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c index b51f076a5d..7bb964d5aa 100644 --- a/erts/emulator/beam/beam_load.c +++ b/erts/emulator/beam/beam_load.c @@ -1230,7 +1230,7 @@ load_atom_table(LoaderState* stp) GetByte(stp, n); GetString(stp, atom, n); - stp->atom[i] = am_atom_put((char*)atom, n); + stp->atom[i] = am_atom_put2(atom, n, 1); } /* @@ -1240,7 +1240,7 @@ load_atom_table(LoaderState* stp) if (is_nil(stp->module)) { stp->module = stp->atom[1]; } else if (stp->atom[1] != stp->module) { - char sbuf[256]; + char sbuf[MAX_ATOM_SZ_FROM_LATIN1]; Atom* ap; ap = atom_tab(atom_val(stp->atom[1])); @@ -1620,7 +1620,7 @@ read_line_table(LoaderState* stp) GetInt(stp, 2, n); GetString(stp, fname, n); - stp->fname[i] = am_atom_put((char*)fname, n); + stp->fname[i] = am_atom_put((char*)fname, n); /*SVERK ? */ } } diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c index 1cdce49eef..89157068c0 100644 --- a/erts/emulator/beam/bif.c +++ b/erts/emulator/beam/bif.c @@ -2536,9 +2536,11 @@ BIF_RETTYPE append_element_2(BIF_ALIST_2) BIF_RETTYPE atom_to_list_1(BIF_ALIST_1) { - Uint need; - Eterm* hp; + Eterm do_utf8_to_list(Process*, Uint num, byte *bytes, Uint sz, Uint left, + Uint *num_built, Uint *num_eaten, Eterm tail); /*SVERK */ Atom* ap; + Uint num_chars, num_built, num_eaten; + Eterm res; if (is_not_atom(BIF_ARG_1)) BIF_ERROR(BIF_P, BADARG); @@ -2547,9 +2549,19 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1) ap = atom_tab(atom_val(BIF_ARG_1)); if (ap->len == 0) BIF_RET(NIL); /* the empty atom */ - need = ap->len*2; - hp = HAlloc(BIF_P, need); - BIF_RET(buf_to_intlist(&hp,(char*)ap->name,ap->len, NIL)); + { + byte* err_pos; + if (erts_analyze_utf8(ap->name, ap->len, &err_pos, &num_chars, NULL) + != ERTS_UTF8_OK) { + BIF_ERROR(BIF_P, BADARG); + } + } + + res = do_utf8_to_list(BIF_P, num_chars, ap->name, ap->len, ap->len, + &num_built, &num_eaten, NIL); + ASSERT(num_built == num_chars); + ASSERT(num_eaten == ap->len); + BIF_RET(res); } /**********************************************************************/ @@ -2559,18 +2571,18 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1) BIF_RETTYPE list_to_atom_1(BIF_ALIST_1) { Eterm res; - char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_LENGTH); - int i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_LENGTH); + char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS); + int i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS); if (i < 0) { erts_free(ERTS_ALC_T_TMP, (void *) buf); i = list_length(BIF_ARG_1); - if (i > MAX_ATOM_LENGTH) { + if (i > MAX_ATOM_CHARACTERS) { BIF_ERROR(BIF_P, SYSTEM_LIMIT); } BIF_ERROR(BIF_P, BADARG); } - res = am_atom_put(buf, i); + res = am_atom_put2((byte*)buf, i, 1); erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_RET(res); } @@ -2580,16 +2592,16 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1) BIF_RETTYPE list_to_existing_atom_1(BIF_ALIST_1) { int i; - char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_LENGTH); + char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS); - if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_LENGTH)) < 0) { + if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS)) < 0) { error: erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_ERROR(BIF_P, BADARG); } else { Eterm a; - if (erts_atom_get(buf, i, &a)) { + if (erts_atom_get(buf, i, &a, 1)) { erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_RET(a); } else { diff --git a/erts/emulator/beam/dist.h b/erts/emulator/beam/dist.h index 0db7d56cb2..129f637dba 100644 --- a/erts/emulator/beam/dist.h +++ b/erts/emulator/beam/dist.h @@ -39,6 +39,7 @@ #define DFLAG_DIST_HDR_ATOM_CACHE 0x2000 #define DFLAG_SMALL_ATOM_TAGS 0x4000 #define DFLAG_INTERNAL_TAGS 0x8000 +#define DFLAG_UTF8_ATOMS 0x10000 /* All flags that should be enabled when term_to_binary/1 is used. */ #define TERM_TO_BINARY_DFLAGS (DFLAG_EXTENDED_REFERENCES \ diff --git a/erts/emulator/beam/erl_alloc.c b/erts/emulator/beam/erl_alloc.c index 3eee53eba3..061f229f59 100644 --- a/erts/emulator/beam/erl_alloc.c +++ b/erts/emulator/beam/erl_alloc.c @@ -3045,13 +3045,13 @@ erts_request_alloc_info(struct process *c_p, Eterm alloc = CAR(consp); for (ai = ERTS_ALC_A_MIN; ai <= ERTS_ALC_A_MAX; ai++) - if (erts_is_atom_str((char *) erts_alc_a2ad[ai], alloc)) + if (erts_is_atom_str(erts_alc_a2ad[ai], alloc, 0)) goto save_alloc; - if (erts_is_atom_str("mseg_alloc", alloc)) { + if (erts_is_atom_str("mseg_alloc", alloc, 0)) { ai = ERTS_ALC_INFO_A_MSEG_ALLOC; goto save_alloc; } - if (erts_is_atom_str("alloc_util", alloc)) { + if (erts_is_atom_str("alloc_util", alloc, 0)) { ai = ERTS_ALC_INFO_A_ALLOC_UTIL; save_alloc: if (req_ai[ai]) diff --git a/erts/emulator/beam/erl_alloc_util.c b/erts/emulator/beam/erl_alloc_util.c index 97ba306a79..f8a8c00715 100644 --- a/erts/emulator/beam/erl_alloc_util.c +++ b/erts/emulator/beam/erl_alloc_util.c @@ -2815,10 +2815,10 @@ make_name_atoms(Allctr_t *allctr) char alloc[] = "alloc"; char realloc[] = "realloc"; char free[] = "free"; - char buf[MAX_ATOM_LENGTH]; + char buf[MAX_ATOM_CHARACTERS]; size_t prefix_len = strlen(allctr->name_prefix); - if (prefix_len > MAX_ATOM_LENGTH + sizeof(realloc) - 1) + if (prefix_len > MAX_ATOM_CHARACTERS + sizeof(realloc) - 1) erl_exit(1,"Too long allocator name: %salloc\n",allctr->name_prefix); memcpy((void *) buf, (void *) allctr->name_prefix, prefix_len); diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c index 632d756481..185ac75d73 100644 --- a/erts/emulator/beam/erl_nif.c +++ b/erts/emulator/beam/erl_nif.c @@ -974,7 +974,7 @@ int enif_make_existing_atom_len(ErlNifEnv* env, const char* name, size_t len, ERL_NIF_TERM* atom, ErlNifCharEncoding encoding) { ASSERT(encoding == ERL_NIF_LATIN1); - return erts_atom_get(name, len, atom); + return erts_atom_get(name, len, atom, 1); } ERL_NIF_TERM enif_make_tuple(ErlNifEnv* env, unsigned cnt, ...) @@ -1633,7 +1633,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2) "this vm variant (%s).", entry->vm_variant, ERL_NIF_VM_VARIANT); } - else if (!erts_is_atom_str((char*)entry->name, mod_atom)) { + else if (!erts_is_atom_str((char*)entry->name, mod_atom, 1)) { ret = load_nif_error(BIF_P, bad_lib, "Library module name '%s' does not" " match calling module '%T'", entry->name, mod_atom); } @@ -1643,7 +1643,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2) for (i=0; i < entry->num_of_funcs && ret==am_ok; i++) { BeamInstr** code_pp; ErlNifFunc* f = &entry->funcs[i]; - if (!erts_atom_get(f->name, sys_strlen(f->name), &f_atom) + if (!erts_atom_get(f->name, sys_strlen(f->name), &f_atom, 1) || (code_pp = get_func_pp(mod->curr.code, f_atom, f->arity))==NULL) { ret = load_nif_error(BIF_P,bad_lib,"Function not found %T:%s/%u", mod_atom, f->name, f->arity); @@ -1746,7 +1746,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2) for (i=0; i < entry->num_of_funcs; i++) { BeamInstr* code_ptr; - erts_atom_get(entry->funcs[i].name, sys_strlen(entry->funcs[i].name), &f_atom); + erts_atom_get(entry->funcs[i].name, sys_strlen(entry->funcs[i].name), &f_atom, 1); code_ptr = *get_func_pp(mod->curr.code, f_atom, entry->funcs[i].arity); if (code_ptr[1] == 0) { diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index 51559aea1c..e24b6f1458 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -1155,7 +1155,7 @@ BIF_RETTYPE unicode_characters_to_list_2(BIF_ALIST_2) * a faster analyze and size count with this function. */ int erts_analyze_utf8(byte *source, Uint size, - byte **err_pos, Uint *num_chars, int *left) + byte **err_pos, Uint *num_chars, int *left) { *err_pos = source; *num_chars = 0; @@ -1210,7 +1210,7 @@ int erts_analyze_utf8(byte *source, Uint size, } ++(*num_chars); *err_pos = source; - if (left && --(*left) <= 0) { + if (left && --(*left) <= 0 && size) { return ERTS_UTF8_ANALYZE_MORE; } } @@ -1220,7 +1220,7 @@ int erts_analyze_utf8(byte *source, Uint size, /* * No errors should be able to occur - no overlongs, no malformed, no nothing */ -static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, +Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left, Uint *num_built, Uint *num_eaten, Eterm tail) { @@ -1812,31 +1812,36 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2) ap = atom_tab(atom_val(BIF_ARG_1)); if (BIF_ARG_2 == am_latin1) { - BIF_RET(new_binary(BIF_P, ap->name, ap->len)); - } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) { - int bin_size = 0; int i; Eterm bin_term; - byte* bin_p; - - for (i = 0; i < ap->len; i++) { - bin_size += (ap->name[i] >= 0x80) ? 2 : 1; + int bin_size = ap->len; + + for (i = 0; i < ap->len; ) { + if (ap->name[i] < 0x80) i++; + else { + ASSERT(ap->name[i] >= 0xC0); + if (ap->name[i] < 0xE0) { + ASSERT(i+1 < ap->len && (ap->name[i+1] & 0xC0) == 0x80); + i += 2; + bin_size -= 1; + } + else goto error; + } } if (bin_size == ap->len) { - BIF_RET(new_binary(BIF_P, ap->name, ap->len)); + bin_term = new_binary(BIF_P, ap->name, ap->len); } - bin_term = new_binary(BIF_P, 0, bin_size); - bin_p = binary_bytes(bin_term); - for (i = 0; i < ap->len; i++) { - byte b = ap->name[i]; - if (b < 0x80) { - *bin_p++ = b; - } else { - *bin_p++ = 0xC0 | (b >> 6); - *bin_p++ = 0x80 | (b & 0x3F); - } + else { + byte* bin_p; + int dbg_sz; + bin_term = new_binary(BIF_P, 0, bin_size); + bin_p = binary_bytes(bin_term); + dbg_sz = erts_utf8_to_latin1(bin_p, ap->name, ap->len); + ASSERT(dbg_sz == bin_size); (void)dbg_sz; } BIF_RET(bin_term); + } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) { + BIF_RET(new_binary(BIF_P, ap->name, ap->len)); } else { error: BIF_ERROR(BIF_P, BADARG); @@ -1856,102 +1861,52 @@ binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist) bin_size = binary_size(bin); if (enc == am_latin1) { Eterm a; - if (bin_size > MAX_ATOM_LENGTH) { + if (bin_size > MAX_ATOM_CHARACTERS) { system_limit: erts_free_aligned_binary_bytes(temp_alloc); BIF_ERROR(p, SYSTEM_LIMIT); } if (!must_exist) { - a = am_atom_put((char *)bytes, bin_size); + a = am_atom_put2(bytes, bin_size, 1); erts_free_aligned_binary_bytes(temp_alloc); BIF_RET(a); - } else if (erts_atom_get((char *)bytes, bin_size, &a)) { + } else if (erts_atom_get((char *)bytes, bin_size, &a, 1)) { erts_free_aligned_binary_bytes(temp_alloc); BIF_RET(a); } else { goto badarg; } } else if (enc == am_utf8 || enc == am_unicode) { - char *buf; - char *dst; - int i; - int num_chars; Eterm res; + Uint num_chars = 0; + const byte* p = bytes; + Uint left = bin_size; - if (bin_size > 2*MAX_ATOM_LENGTH) { - byte* err_pos; - Uint n; - int reds_left = bin_size+1; /* Number of reductions left. */ - - if (erts_analyze_utf8(bytes, bin_size, &err_pos, - &n, &reds_left) == ERTS_UTF8_OK) { - /* - * Correct UTF-8 encoding, but too many characters to - * fit in an atom. - */ + while (left) { + if (++num_chars > MAX_ATOM_CHARACTERS) { goto system_limit; - } else { - /* - * Something wrong in the UTF-8 encoding or Unicode code - * points > 255. - */ - goto badarg; } - } - - /* - * Allocate a temporary buffer the same size as the binary, - * so that we don't need an extra overflow test. - */ - buf = (char *) erts_alloc(ERTS_ALC_T_TMP, bin_size); - dst = buf; - for (i = 0; i < bin_size; i++) { - int c = bytes[i]; - if (c < 0x80) { - *dst++ = c; - } else if (i < bin_size-1) { - int c2; - if ((c & 0xE0) != 0xC0) { - goto free_badarg; - } - i++; - c = (c & 0x3F) << 6; - c2 = bytes[i]; - if ((c2 & 0xC0) != 0x80) { - goto free_badarg; - } - c = c | (c2 & 0x3F); - if (0x80 <= c && c < 256) { - *dst++ = c; - } else { - goto free_badarg; - } - } else { - free_badarg: - erts_free(ERTS_ALC_T_TMP, (void *) buf); - goto badarg; + if ((p[0] & 0x80) == 0) { + ++p; + --left; } + else if (left >= 2 + && (p[0] & 0xFE) == 0xC2 /* only allow latin1 subset */ + && (p[1] & 0xC0) == 0x80) { + p += 2; + left -= 2; + } + else goto badarg; } - num_chars = dst - buf; - if (num_chars > MAX_ATOM_LENGTH) { - erts_free(ERTS_ALC_T_TMP, (void *) buf); - goto system_limit; - } + if (!must_exist) { - res = am_atom_put(buf, num_chars); - erts_free(ERTS_ALC_T_TMP, (void *) buf); - erts_free_aligned_binary_bytes(temp_alloc); - BIF_RET(res); - } else { - int exists = erts_atom_get(buf, num_chars, &res); - erts_free(ERTS_ALC_T_TMP, (void *) buf); - if (exists) { - erts_free_aligned_binary_bytes(temp_alloc); - BIF_RET(res); - } else { - goto badarg; - } + res = am_atom_put((char*)bytes, bin_size); + } + else if (!erts_atom_get((char*)bytes, bin_size, &res, 0)) { + goto badarg; } + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(res); } else { badarg: erts_free_aligned_binary_bytes(temp_alloc); @@ -2670,3 +2625,30 @@ BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0) } } +/* Assumes 'dest' has enough room. + */ +int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen) +{ + byte* dp = dest; + while (slen > 0) { + if ((source[0] & 0x80) == 0) { + *dp++ = *source++; + --slen; + } + else if (slen > 1 && + (source[0] & 0xFE) == 0xC2 && + (source[1] & 0xC0) == 0x80) { + *dp++ = (char) ((source[0] << 6) | (source[1] & 0x3F)); + source += 2; + slen -= 2; + } + else { + /* Just let unconvertable octets through. This should not happen + in a correctly upgraded system */ + *dp++ = *source++; + --slen; + } + } + return dp - dest; +} + diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c index 263ffc4eb3..68edcd0fa6 100644 --- a/erts/emulator/beam/external.c +++ b/erts/emulator/beam/external.c @@ -707,7 +707,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, if (cix >= ERTS_ATOM_CACHE_SIZE) ERTS_EXT_HDR_FAIL; ep++; -#if MAX_ATOM_LENGTH > 255 +#if MAX_ATOM_CHARACTERS > 255 if (long_atoms) { CHKSIZE(2); len = get_int16(ep); @@ -720,7 +720,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, len = get_int8(ep); ep++; } - if (len > MAX_ATOM_LENGTH) + if (len > MAX_ATOM_CHARACTERS) ERTS_EXT_HDR_FAIL; /* Too long atom */ CHKSIZE(len); atom = am_atom_put((char *) ep, len); @@ -1431,18 +1431,33 @@ enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags) if (iix < 0) { i = atom_val(atom); j = atom_tab(i)->len; - if ((MAX_ATOM_LENGTH <= 255 || j <= 255) - && (dflags & DFLAG_SMALL_ATOM_TAGS)) { - *ep++ = SMALL_ATOM_EXT; - put_int8(j, ep); - ep++; + if (dflags & DFLAG_UTF8_ATOMS) { + if (j <= 255) { + *ep++ = ATOM_UTF8_EXT; + put_int16(j, ep); + ep += 2; + } + else { + *ep++ = SMALL_ATOM_UTF8_EXT; + put_int8(j, ep); + ep += 2; + } + sys_memcpy((char *) ep, (char*)atom_tab(i)->name, (int) j); } else { - *ep++ = ATOM_EXT; - put_int16(j, ep); - ep += 2; + if (j <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) { + *ep++ = SMALL_ATOM_EXT; + j = erts_utf8_to_latin1(ep+1, atom_tab(i)->name, j); + put_int8(j, ep); + ep++; + } + else { + *ep++ = ATOM_EXT; + j = erts_utf8_to_latin1(ep+2, atom_tab(i)->name, j); + put_int16(j, ep); + ep += 2; + } } - sys_memcpy((char *) ep, (char*)atom_tab(i)->name, (int) j); ep += j; return ep; } @@ -1482,7 +1497,7 @@ static byte* dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp) { Uint len; - int n; + int n, is_latin1; switch (*ep++) { case ATOM_CACHE_REF: @@ -1498,17 +1513,29 @@ dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp) case ATOM_EXT: len = get_int16(ep), ep += 2; + is_latin1 = 1; goto dec_atom_common; case SMALL_ATOM_EXT: len = get_int8(ep); ep++; + is_latin1 = 1; + goto dec_atom_common; + case ATOM_UTF8_EXT: + len = get_int16(ep), + ep += 2; + is_latin1 = 0; + goto dec_atom_common; + case SMALL_ATOM_UTF8_EXT: + len = get_int8(ep), + ep++; + is_latin1 = 0; dec_atom_common: if (edep && (edep->flags & ERTS_DIST_EXT_BTT_SAFE)) { - if (!erts_atom_get((char*)ep, len, objp)) { + if (!erts_atom_get((char*)ep, len, objp, is_latin1)) { goto error; } } else { - *objp = am_atom_put((char*)ep, len); + *objp = am_atom_put2(ep, len, is_latin1); } ep += len; break; @@ -2113,7 +2140,7 @@ static byte* dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Eterm* objp) { Eterm* hp_saved = *hpp; - int n; + int n, is_latin1; register Eterm* hp = *hpp; /* Please don't take the address of hp */ Eterm* next = objp; @@ -2199,17 +2226,29 @@ dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Et case ATOM_EXT: n = get_int16(ep); ep += 2; - goto dec_term_atom_common; + is_latin1 = 1; + goto dec_term_atom_common; case SMALL_ATOM_EXT: n = get_int8(ep); ep++; + is_latin1 = 1; + goto dec_term_atom_common; + case ATOM_UTF8_EXT: + n = get_int16(ep); + ep += 2; + is_latin1 = 0; + goto dec_term_atom_common; + case SMALL_ATOM_UTF8_EXT: + n = get_int8(ep); + ep++; + is_latin1 = 0; dec_term_atom_common: if (edep && (edep->flags & ERTS_DIST_EXT_BTT_SAFE)) { - if (!erts_atom_get((char*)ep, n, objp)) { + if (!erts_atom_get((char*)ep, n, objp, is_latin1)) { goto error; } } else { - *objp = am_atom_put((char*)ep, n); + *objp = am_atom_put2(ep, n, is_latin1); } ep += n; break; @@ -2879,14 +2918,15 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) } else { int alen = atom_tab(atom_val(obj))->len; - if ((MAX_ATOM_LENGTH <= 255 || alen <= 255) - && (dflags & DFLAG_SMALL_ATOM_TAGS)) { - /* Make sure a SMALL_ATOM_EXT fits: SMALL_ATOM_EXT l t1 t2... */ - result += 1 + 1 + alen; - } - else { - /* Make sure an ATOM_EXT fits: ATOM_EXT l1 l0 t1 t2... */ - result += 1 + 2 + alen; + result += 1 + 1 + alen; + if (dflags & DFLAG_UTF8_ATOMS) { + if (alen > 255) { + result++; /* ATOM_UTF8_EXT (not small) */ + } + /*SVERK we use utf8 length which is an over estimation */ + } + else if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS)) { + result++; /* ATOM_EXT (not small) */ } insert_acache_map(acmp, obj); } @@ -3058,6 +3098,17 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) return result; } +static int is_valid_utf8_atom(byte* bytes, Uint nbytes) +{ + byte* err_pos; + Uint num_chars; + + /*SVERK Do we really need to validate correct utf8? */ + return nbytes <= MAX_ATOM_SZ_LIMIT + && erts_analyze_utf8(bytes, nbytes, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK + && num_chars <= MAX_ATOM_CHARACTERS; +} + static Sint decoded_size(byte *ep, byte* endp, int internal_tags) { @@ -3125,21 +3176,41 @@ decoded_size(byte *ep, byte* endp, int internal_tags) case ATOM_EXT: CHKSIZE(2); n = get_int16(ep); - if (n > MAX_ATOM_LENGTH) { + if (n > MAX_ATOM_CHARACTERS) { return -1; } SKIP(n+2+atom_extra_skip); atom_extra_skip = 0; break; + case ATOM_UTF8_EXT: + CHKSIZE(2); + n = get_int16(ep); + ep += 2; + if (!is_valid_utf8_atom(ep, n)) { + return -1; + } + SKIP(n+atom_extra_skip); + atom_extra_skip = 0; + break; case SMALL_ATOM_EXT: CHKSIZE(1); n = get_int8(ep); - if (n > MAX_ATOM_LENGTH) { + if (n > MAX_ATOM_CHARACTERS) { return -1; } SKIP(n+1+atom_extra_skip); atom_extra_skip = 0; break; + case SMALL_ATOM_UTF8_EXT: + CHKSIZE(1); + n = get_int8(ep); + ep++; + if (!is_valid_utf8_atom(ep, n)) { + return -1; + } + SKIP(n+atom_extra_skip); + atom_extra_skip = 0; + break; case ATOM_CACHE_REF: SKIP(1+atom_extra_skip); atom_extra_skip = 0; diff --git a/erts/emulator/beam/external.h b/erts/emulator/beam/external.h index eddd4571dd..50eea62225 100644 --- a/erts/emulator/beam/external.h +++ b/erts/emulator/beam/external.h @@ -51,6 +51,8 @@ #define NEW_FUN_EXT 'p' #define EXPORT_EXT 'q' #define FUN_EXT 'u' +#define ATOM_UTF8_EXT 'v' +#define SMALL_ATOM_UTF8_EXT 'w' #define DIST_HEADER 'D' #define ATOM_CACHE_REF 'R' diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h index 4c0d3421c8..1500424d3e 100755 --- a/erts/emulator/beam/global.h +++ b/erts/emulator/beam/global.h @@ -1525,6 +1525,7 @@ char *erts_convert_filename_to_native(Eterm name, char *statbuf, int allow_empty, int allow_atom, Sint *used /* out */); Eterm erts_convert_native_to_filename(Process *p, byte *bytes); +int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen); #define ERTS_UTF8_OK 0 #define ERTS_UTF8_INCOMPLETE 1 #define ERTS_UTF8_ERROR 2 diff --git a/erts/emulator/test/bif_SUITE.erl b/erts/emulator/test/bif_SUITE.erl index e2442861c7..02c6de8cb1 100644 --- a/erts/emulator/test/bif_SUITE.erl +++ b/erts/emulator/test/bif_SUITE.erl @@ -481,8 +481,6 @@ binary_to_atom(Config) when is_list(Config) -> %% Bad UTF8 sequences. ?line ?BADARG(binary_to_atom(id(<<255>>), utf8)), ?line ?BADARG(binary_to_atom(id(<<255,0>>), utf8)), - ?line ?BADARG(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)), - ?line ?BADARG(binary_to_atom(id(<<0:512/unit:8,255,0>>), utf8)), ?line ?BADARG(binary_to_atom(id(<<16#C0,16#80>>), utf8)), %Overlong 0. ?line [?BADARG(binary_to_atom(<>, utf8)) || C <- lists:seq(256, 16#D7FF)], @@ -494,6 +492,8 @@ binary_to_atom(Config) when is_list(Config) -> C <- lists:seq(16#90000, 16#10FFFF)], %% system_limit failures. + ?line ?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255>>), utf8)), + ?line ?SYS_LIMIT(binary_to_atom(id(<<0:512/unit:8,255,0>>), utf8)), ?line ?SYS_LIMIT(binary_to_atom(<<0:256/unit:8>>, latin1)), ?line ?SYS_LIMIT(binary_to_atom(<<0:257/unit:8>>, latin1)), ?line ?SYS_LIMIT(binary_to_atom(<<0:512/unit:8>>, latin1)), -- cgit v1.2.3 From e4e007afd032f7aca359d2665f91ddb12727521a Mon Sep 17 00:00:00 2001 From: Sverker Eriksson Date: Tue, 18 Dec 2012 18:31:46 +0100 Subject: stdlib: Fix printing of unicode atoms --- lib/stdlib/src/io_lib.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/stdlib/src/io_lib.erl b/lib/stdlib/src/io_lib.erl index 513d904c39..0f1f417d01 100644 --- a/lib/stdlib/src/io_lib.erl +++ b/lib/stdlib/src/io_lib.erl @@ -276,7 +276,7 @@ write_atom(Atom) -> Chars = atom_to_list(Atom), case quote_atom(Atom, Chars) of true -> - write_string(Chars, $'); %' + write_unicode_string(Chars, $'); %' false -> Chars end. -- cgit v1.2.3 From 685d009efcfd7521e9c918a14b58eac19755299d Mon Sep 17 00:00:00 2001 From: Sverker Eriksson Date: Fri, 21 Dec 2012 15:50:21 +0100 Subject: erl_interface: Enable decode of unicode atoms No API changes or additions. Just the ability for erl_interface to decode unicode atoms and convert them into latin1 strings to preserve backward compatibility for the existing API. --- lib/erl_interface/include/ei.h | 2 + lib/erl_interface/src/decode/decode_atom.c | 68 +++++++-- lib/erl_interface/src/decode/decode_boolean.c | 32 ++--- lib/erl_interface/src/decode/decode_pid.c | 13 +- lib/erl_interface/src/decode/decode_port.c | 12 +- lib/erl_interface/src/decode/decode_ref.c | 26 +--- lib/erl_interface/src/legacy/erl_marshal.c | 200 +++++++++++++------------- lib/erl_interface/src/misc/ei_decode_term.c | 37 +---- lib/erl_interface/src/misc/ei_printterm.c | 2 + lib/erl_interface/src/misc/get_type.c | 8 +- lib/erl_interface/src/misc/putget.h | 3 + 11 files changed, 202 insertions(+), 201 deletions(-) diff --git a/lib/erl_interface/include/ei.h b/lib/erl_interface/include/ei.h index ae815b414a..8f07b24852 100644 --- a/lib/erl_interface/include/ei.h +++ b/lib/erl_interface/include/ei.h @@ -115,6 +115,8 @@ #define ERL_FLOAT_EXT 'c' #define NEW_FLOAT_EXT 'F' #define ERL_ATOM_EXT 'd' +#define ERL_SMALL_ATOM_EXT 's' +#define ERL_UNICODE_ATOM_EXT 'v' #define ERL_REFERENCE_EXT 'e' #define ERL_NEW_REFERENCE_EXT 'r' #define ERL_PORT_EXT 'f' diff --git a/lib/erl_interface/src/decode/decode_atom.c b/lib/erl_interface/src/decode/decode_atom.c index c2e6a0426e..84edf1766a 100644 --- a/lib/erl_interface/src/decode/decode_atom.c +++ b/lib/erl_interface/src/decode/decode_atom.c @@ -21,24 +21,76 @@ #include "eiext.h" #include "putget.h" +static int utf8_to_latin1(char* dest, const char* source, unsigned len); + int ei_decode_atom(const char *buf, int *index, char *p) { const char *s = buf + *index; const char *s0 = s; int len; - if (get8(s) != ERL_ATOM_EXT) return -1; + switch (get8(s)) { + case ERL_ATOM_EXT: + len = get16be(s); + if (len > MAXATOMLEN) return -1; + if (p) { + memmove(p,s,len); + p[len] = (char)0; + } + break; + + case ERL_SMALL_ATOM_EXT: + len = get8(s); + if (p) { + memmove(p,s,len); + p[len] = (char)0; + } + break; + + case ERL_UNICODE_ATOM_EXT: + len = get16be(s); - len = get16be(s); + if (len > 2*MAXATOMLEN) return -1; - if (len > MAXATOMLEN) return -1; + if (p && utf8_to_latin1(p, s, len) < 0) return -1; + break; - if (p) { - memmove(p,s,len); - p[len] = (char)0; + default: + return -1; } + s += len; *index += s-s0; - - return 0; + return 0; +} + +int ei_internal_get_atom(const char** bufp, char* p) +{ + int ix = 0; + if (ei_decode_atom(*bufp, &ix, p) < 0) return -1; + *bufp += ix; + return 0; +} + +static int utf8_to_latin1(char* dest, const char* source, unsigned slen) +{ + const char* dest_end = dest + MAXATOMLEN - 1; + + while (slen > 0 && dest < dest_end) { + if ((source[0] & 0x80) == 0) { + *dest++ = *source++; + --slen; + } + else if (slen > 1 && + (source[0] & 0xFE) == 0xC2 && + (source[1] & 0xC0) == 0x80) { + *dest++ = (char) ((source[0] << 6) | (source[1] & 0x3F)); + source += 2; + slen -= 2; + } + else return -1; + } + *dest = 0; + return 0; } + diff --git a/lib/erl_interface/src/decode/decode_boolean.c b/lib/erl_interface/src/decode/decode_boolean.c index 9fd09c63f1..0a7a06f1d4 100644 --- a/lib/erl_interface/src/decode/decode_boolean.c +++ b/lib/erl_interface/src/decode/decode_boolean.c @@ -26,32 +26,20 @@ int ei_decode_boolean(const char *buf, int *index, int *p) { const char *s = buf + *index; const char *s0 = s; - int len; + char tbuf[MAXATOMLEN+1]; int t; - if (get8(s) != ERL_ATOM_EXT) return -1; + if (get_atom(&s, tbuf) < 0) return -1; - len = get16be(s); - - switch (len) { - case 4: - /* typecast makes ansi happy */ - if (strncmp((char*)s,"true",4)) return -1; - t = 1; - break; - - case 5: - if (strncmp((char*)s,"false",5)) return -1; - t = 0; - break; - - default: - return -1; - } - - s += len; + if (memcmp(tbuf, "true", 5) == 0) + t = 1; + else if (memcmp(tbuf, "false", 6) == 0) + t = 0; + else + return -1; + if (p) *p = t; *index += s-s0; - return 0; } + diff --git a/lib/erl_interface/src/decode/decode_pid.c b/lib/erl_interface/src/decode/decode_pid.c index 9ed1c36db6..a762ae499e 100644 --- a/lib/erl_interface/src/decode/decode_pid.c +++ b/lib/erl_interface/src/decode/decode_pid.c @@ -21,6 +21,7 @@ #include "eiext.h" #include "putget.h" + int ei_decode_pid(const char *buf, int *index, erlang_pid *p) { const char *s = buf + *index; @@ -30,17 +31,7 @@ int ei_decode_pid(const char *buf, int *index, erlang_pid *p) if (get8(s) != ERL_PID_EXT) return -1; /* first the nodename */ - if (get8(s) != ERL_ATOM_EXT) return -1; - - len = get16be(s); - - if (len > MAXATOMLEN) return -1; - - if (p) { - memmove(p->node, s, len); - p->node[len] = (char)0; - } - s += len; + if (get_atom(&s, p->node) < 0) return -1; /* now the numbers: num (4), serial (4), creation (1) */ if (p) { diff --git a/lib/erl_interface/src/decode/decode_port.c b/lib/erl_interface/src/decode/decode_port.c index 28abed801a..6eb2bc9197 100644 --- a/lib/erl_interface/src/decode/decode_port.c +++ b/lib/erl_interface/src/decode/decode_port.c @@ -30,17 +30,7 @@ int ei_decode_port(const char *buf, int *index, erlang_port *p) if (get8(s) != ERL_PORT_EXT) return -1; /* first the nodename */ - if (get8(s) != ERL_ATOM_EXT) return -1; - - len = get16be(s); - - if (len > MAXATOMLEN) return -1; - - if (p) { - memmove(p->node, s, len); - p->node[len] = (char)0; - } - s += len; + if (get_atom(&s, p->node) < 0) return -1; /* now the numbers: num (4), creation (1) */ if (p) { diff --git a/lib/erl_interface/src/decode/decode_ref.c b/lib/erl_interface/src/decode/decode_ref.c index 7b15808bc5..df3c30777b 100644 --- a/lib/erl_interface/src/decode/decode_ref.c +++ b/lib/erl_interface/src/decode/decode_ref.c @@ -21,6 +21,7 @@ #include "eiext.h" #include "putget.h" + int ei_decode_ref(const char *buf, int *index, erlang_ref *p) { const char *s = buf + *index; @@ -30,18 +31,8 @@ int ei_decode_ref(const char *buf, int *index, erlang_ref *p) switch (get8(s)) { case ERL_REFERENCE_EXT: - /* first the nodename */ - if (get8(s) != ERL_ATOM_EXT) return -1; - - len = get16be(s); - - if (len > MAXATOMLEN) return -1; - - if (p) { - memmove(p->node, s, len); - p->node[len] = (char)0; - } - s += len; + /* nodename */ + if (get_atom(&s, p->node) < 0) return -1; /* now the numbers: num (4), creation (1) */ if (p) { @@ -62,15 +53,7 @@ int ei_decode_ref(const char *buf, int *index, erlang_ref *p) if (p) p->len = count; /* then the nodename */ - if (get8(s) != ERL_ATOM_EXT) return -1; - len = get16be(s); - if (len > MAXATOMLEN) return -1; - - if (p) { - memmove(p->node, s, len); - p->node[len] = (char)0; - } - s += len; + if (get_atom(&s, p->node) < 0) return -1; /* creation */ if (p) { @@ -95,3 +78,4 @@ int ei_decode_ref(const char *buf, int *index, erlang_ref *p) return -1; } } + diff --git a/lib/erl_interface/src/legacy/erl_marshal.c b/lib/erl_interface/src/legacy/erl_marshal.c index dad715c762..775d7e82ca 100644 --- a/lib/erl_interface/src/legacy/erl_marshal.c +++ b/lib/erl_interface/src/legacy/erl_marshal.c @@ -51,7 +51,13 @@ static void erl_long_to_fp(long l, unsigned *d); #define CMP_ARRAY_SIZE 256 /* FIXME problem for threaded ? */ -static char cmp_array[CMP_ARRAY_SIZE]; + +static enum +{ + ERL_NUM_CMP=1, ERL_ATOM_CMP, ERL_REF_CMP, ERL_FUN_CMP, ERL_PORT_CMP, + ERL_PID_CMP, ERL_TUPLE_CMP, ERL_NIL_CMP, ERL_LIST_CMP, ERL_BIN_CMP +}cmp_array[CMP_ARRAY_SIZE]; + static int init_cmp_array_p=1; /* initialize array, the first time */ #if defined(VXWORKS) && CPU == PPC860 @@ -69,10 +75,8 @@ static int init_cmp_array_p=1; /* initialize array, the first time */ static int cmp_floats(double f1, double f2); static INLINE double to_float(long l); -#define ERL_NUM_CMP 1 -#define ERL_REF_CMP 3 - #define IS_ERL_NUM(t) (cmp_array[t]==ERL_NUM_CMP) +#define IS_ERL_ATOM(t) (cmp_array[t]==ERL_ATOM_CMP) #define CMP_NUM_CLASS_SIZE 256 static unsigned char cmp_num_class[CMP_NUM_CLASS_SIZE]; @@ -100,25 +104,27 @@ void erl_init_marshal(void) { if (init_cmp_array_p) { memset(cmp_array, 0, CMP_ARRAY_SIZE); - cmp_array[ERL_SMALL_INTEGER_EXT] = 1; - cmp_array[ERL_INTEGER_EXT] = 1; - cmp_array[ERL_FLOAT_EXT] = 1; - cmp_array[NEW_FLOAT_EXT] = 1; - cmp_array[ERL_SMALL_BIG_EXT] = 1; - cmp_array[ERL_LARGE_BIG_EXT] = 1; - cmp_array[ERL_ATOM_EXT] = 2; - cmp_array[ERL_REFERENCE_EXT] = 3; - cmp_array[ERL_NEW_REFERENCE_EXT] = 3; - cmp_array[ERL_FUN_EXT] = 4; - cmp_array[ERL_NEW_FUN_EXT] = 4; - cmp_array[ERL_PORT_EXT] = 5; - cmp_array[ERL_PID_EXT] = 6; - cmp_array[ERL_SMALL_TUPLE_EXT] = 7; - cmp_array[ERL_LARGE_TUPLE_EXT] = 7; - cmp_array[ERL_NIL_EXT] = 8; - cmp_array[ERL_STRING_EXT] = 9; - cmp_array[ERL_LIST_EXT] = 9; - cmp_array[ERL_BINARY_EXT] = 10; + cmp_array[ERL_SMALL_INTEGER_EXT] = ERL_NUM_CMP; + cmp_array[ERL_INTEGER_EXT] = ERL_NUM_CMP; + cmp_array[ERL_FLOAT_EXT] = ERL_NUM_CMP; + cmp_array[NEW_FLOAT_EXT] = ERL_NUM_CMP; + cmp_array[ERL_SMALL_BIG_EXT] = ERL_NUM_CMP; + cmp_array[ERL_LARGE_BIG_EXT] = ERL_NUM_CMP; + cmp_array[ERL_ATOM_EXT] = ERL_ATOM_CMP; + cmp_array[ERL_SMALL_ATOM_EXT] = ERL_ATOM_CMP; + cmp_array[ERL_UNICODE_ATOM_EXT] = ERL_ATOM_CMP; + cmp_array[ERL_REFERENCE_EXT] = ERL_REF_CMP; + cmp_array[ERL_NEW_REFERENCE_EXT] = ERL_REF_CMP; + cmp_array[ERL_FUN_EXT] = ERL_FUN_CMP; + cmp_array[ERL_NEW_FUN_EXT] = ERL_FUN_CMP; + cmp_array[ERL_PORT_EXT] = ERL_PORT_CMP; + cmp_array[ERL_PID_EXT] = ERL_PID_CMP; + cmp_array[ERL_SMALL_TUPLE_EXT] = ERL_TUPLE_CMP; + cmp_array[ERL_LARGE_TUPLE_EXT] = ERL_TUPLE_CMP; + cmp_array[ERL_NIL_EXT] = ERL_NIL_CMP; + cmp_array[ERL_STRING_EXT] = ERL_LIST_CMP; + cmp_array[ERL_LIST_EXT] = ERL_LIST_CMP; + cmp_array[ERL_BINARY_EXT] = ERL_BIN_CMP; init_cmp_array_p = 0; } if (init_cmp_num_class_p) { @@ -644,31 +650,14 @@ int erl_encode_buf(ETERM *ep, unsigned char **ext) } /* erl_encode_buf */ -/* - * A nice macro to make it look cleaner in the - * cases of PID's,PORT's and REF's below. - * It reads the NODE name from a buffer. - */ -#define READ_THE_NODE(ext,cp,len,i) \ -/* eat first atom, repr. the node */ \ -if (**ext != ERL_ATOM_EXT) \ - return (ETERM *) NULL; \ -*ext += 1; \ -i = (**ext << 8) | (*ext)[1]; \ -cp = (char *) *(ext) + 2; \ -*ext += (i + 2); \ -len = i - -#define STATIC_NODE_BUF_SZ 30 - -#define SET_NODE(node,node_buf,cp,len) \ -if (len >= STATIC_NODE_BUF_SZ) node = erl_malloc(len+1); \ -else node = node_buf; \ -memcpy(node, cp, len); \ -node[len] = '\0' - -#define RESET_NODE(node,len) \ -if (len >= STATIC_NODE_BUF_SZ) free(node) + +static int read_atom(unsigned char** ext, char* dst) +{ + int offs = 0; + int ret = ei_decode_atom((char*)*ext, &offs, dst); + *ext += offs; + return ret; +} /* * The actual DECODE engine. @@ -676,16 +665,17 @@ if (len >= STATIC_NODE_BUF_SZ) free(node) */ static ETERM *erl_decode_it(unsigned char **ext) { + char atom_buf[MAXATOMLEN+1]; char *cp; ETERM *ep,*tp,*np; unsigned int u,sign; - int i,j,len,arity; + int i,j,arity; double ff; /* Assume we are going to decode an integer */ ep = erl_alloc_eterm(ERL_INTEGER); ERL_COUNT(ep) = 1; - + switch (*(*ext)++) { case ERL_INTEGER_EXT: @@ -774,27 +764,27 @@ static ETERM *erl_decode_it(unsigned char **ext) return ep; case ERL_ATOM_EXT: + case ERL_SMALL_ATOM_EXT: + case ERL_UNICODE_ATOM_EXT: ERL_TYPE(ep) = ERL_ATOM; - i = (**ext << 8) | (*ext)[1]; - cp = (char *) *(ext) + 2; - *ext += (i + 2); + --(*ext); + if (read_atom(ext, atom_buf) < 0) return NULL; + + i = strlen(atom_buf); ep->uval.aval.len = i; ep->uval.aval.a = (char *) erl_malloc(i+1); - memcpy(ep->uval.aval.a, cp, i); - ep->uval.aval.a[i]='\0'; + memcpy(ep->uval.aval.a, atom_buf, i+1); return ep; case ERL_PID_EXT: erl_free_term(ep); { /* Why not use the constructors? */ - char *node; - char node_buf[STATIC_NODE_BUF_SZ]; + char* node = atom_buf; unsigned int number, serial; unsigned char creation; ETERM *eterm_p; - READ_THE_NODE(ext,cp,len,i); - SET_NODE(node,node_buf,cp,len); + if (read_atom(ext, node) < 0) return NULL; /* get the integers */ #if 0 @@ -816,20 +806,17 @@ static ETERM *erl_decode_it(unsigned char **ext) #endif creation = *(*ext)++; eterm_p = erl_mk_pid(node, number, serial, creation); - RESET_NODE(node,len); return eterm_p; } case ERL_REFERENCE_EXT: erl_free_term(ep); { - char *node; - char node_buf[STATIC_NODE_BUF_SZ]; + char* node = atom_buf; unsigned int number; unsigned char creation; ETERM *eterm_p; - READ_THE_NODE(ext,cp,len,i); - SET_NODE(node,node_buf,cp,len); + if (read_atom(ext, node) < 0) return NULL; /* get the integers */ #if 0 @@ -841,15 +828,13 @@ static ETERM *erl_decode_it(unsigned char **ext) #endif creation = *(*ext)++; eterm_p = erl_mk_ref(node, number, creation); - RESET_NODE(node,len); return eterm_p; } case ERL_NEW_REFERENCE_EXT: erl_free_term(ep); { - char *node; - char node_buf[STATIC_NODE_BUF_SZ]; + char* node = atom_buf; size_t cnt, i; unsigned int n[3]; unsigned char creation; @@ -862,8 +847,7 @@ static ETERM *erl_decode_it(unsigned char **ext) *ext += 2; #endif - READ_THE_NODE(ext,cp,len,i); - SET_NODE(node,node_buf,cp,len); + if (read_atom(ext, node) < 0) return NULL; /* get the integers */ creation = *(*ext)++; @@ -878,21 +862,18 @@ static ETERM *erl_decode_it(unsigned char **ext) #endif } eterm_p = __erl_mk_reference(node, cnt, n, creation); - RESET_NODE(node,len); return eterm_p; } case ERL_PORT_EXT: erl_free_term(ep); { - char *node; - char node_buf[STATIC_NODE_BUF_SZ]; + char* node = atom_buf; unsigned int number; unsigned char creation; ETERM *eterm_p; - READ_THE_NODE(ext,cp,len,i); - SET_NODE(node,node_buf,cp,len); + if (read_atom(ext, node) < 0) return NULL; /* get the integers */ #if 0 @@ -904,7 +885,6 @@ static ETERM *erl_decode_it(unsigned char **ext) #endif creation = *(*ext)++; eterm_p = erl_mk_port(node, number, creation); - RESET_NODE(node,len); return eterm_p; } @@ -1140,6 +1120,8 @@ unsigned char erl_ext_type(unsigned char *ext) case ERL_INTEGER_EXT: return ERL_INTEGER; case ERL_ATOM_EXT: + case ERL_SMALL_ATOM_EXT: + case ERL_UNICODE_ATOM_EXT: return ERL_ATOM; case ERL_PID_EXT: return ERL_PID; @@ -1191,6 +1173,8 @@ int erl_ext_size(unsigned char *t) case ERL_SMALL_INTEGER_EXT: case ERL_INTEGER_EXT: case ERL_ATOM_EXT: + case ERL_SMALL_ATOM_EXT: + case ERL_UNICODE_ATOM_EXT: case ERL_PID_EXT: case ERL_PORT_EXT: case ERL_REFERENCE_EXT: @@ -1229,15 +1213,31 @@ int erl_ext_size(unsigned char *t) } /* ext_size */ -/* - * A nice macro that eats up the atom pointed to. - */ -#define JUMP_ATOM(ext,i) \ -if (**ext != ERL_ATOM_EXT) \ - return 0; \ -*ext += 1; \ -i = (**ext << 8) | (*ext)[1]; \ -*ext += (i + 2) + +static int jump_atom(unsigned char** ext) +{ + unsigned char* e = *ext; + int len; + + switch (*e++) { + case ERL_ATOM_EXT: + case ERL_UNICODE_ATOM_EXT: + len = (e[0] << 8) | e[1]; + e += (len + 2); + break; + + case ERL_SMALL_ATOM_EXT: + len = e[0]; + e += (len + 1); + break; + + default: + return 0; + } + *ext = e; + return 1; +} + /* * MOVE the POINTER PAST the ENCODED ETERM we @@ -1259,25 +1259,26 @@ static int jump(unsigned char **ext) *ext += 1; break; case ERL_ATOM_EXT: - i = (**ext << 8) | (*ext)[1]; - *ext += (i + 2); + case ERL_SMALL_ATOM_EXT: + case ERL_UNICODE_ATOM_EXT: + jump_atom(ext); break; case ERL_PID_EXT: /* eat first atom */ - JUMP_ATOM(ext,i); + if (!jump_atom(ext)) return 0; *ext += 9; /* Two int's and the creation field */ break; case ERL_REFERENCE_EXT: case ERL_PORT_EXT: /* first field is an atom */ - JUMP_ATOM(ext,i); + if (!jump_atom(ext)) return 0; *ext += 5; /* One int and the creation field */ break; case ERL_NEW_REFERENCE_EXT: n = (**ext << 8) | (*ext)[1]; *ext += 2; /* first field is an atom */ - JUMP_ATOM(ext,i); + if (!jump_atom(ext)) return 0; *ext += 4*n+1; break; case ERL_NIL_EXT: @@ -1437,9 +1438,8 @@ do { \ #define CMP_EXT_SKIP_ATOM(EP) \ do { \ - if ((EP)[0] != ERL_ATOM_EXT) \ + if (!jump_atom(&(EP))) \ return CMP_EXT_ERROR_CODE; \ - (EP) += 3 + ((EP)[1] << 8 | (EP)[2]); \ } while (0) /* @@ -1569,6 +1569,7 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2) } *e2 += 1; + i = j = 0; switch (*(*e1)++) { case ERL_SMALL_INTEGER_EXT: @@ -1589,11 +1590,16 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2) *e1 += 4; *e2 += 4; return ret; case ERL_ATOM_EXT: - i = (**e1 << 8) | (*e1)[1]; - j = (**e2 << 8) | (*e2)[1]; - ret = cmpbytes(*e1 +2, i, *e2 +2, j); - *e1 += (i + 2); - *e2 += (j + 2); + case ERL_UNICODE_ATOM_EXT: + i = (**e1) << 8; (*e1)++; + j = (**e2) << 8; (*e2)++; + /*fall through*/ + case ERL_SMALL_ATOM_EXT: + i |= (**e1); (*e1)++; + j |= (**e2); (*e2)++; + ret = cmpbytes(*e1, i, *e2, j); + *e1 += i; + *e2 += j; return ret; case ERL_PID_EXT: { unsigned char *n1 = *e1; @@ -1622,7 +1628,7 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2) } case ERL_PORT_EXT: /* First compare node names ... */ - if (**e1 != ERL_ATOM_EXT || **e2 != ERL_ATOM_EXT) + if (!IS_ERL_ATOM(**e1) || !IS_ERL_ATOM(**e2)) return CMP_EXT_ERROR_CODE; ret = cmp_exe2(e1, e2); *e1 += 5; *e2 += 5; diff --git a/lib/erl_interface/src/misc/ei_decode_term.c b/lib/erl_interface/src/misc/ei_decode_term.c index 0b82ef0e35..6773f90bfc 100644 --- a/lib/erl_interface/src/misc/ei_decode_term.c +++ b/lib/erl_interface/src/misc/ei_decode_term.c @@ -48,20 +48,12 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term) case NEW_FLOAT_EXT: return ei_decode_double(buf, index, &term->value.d_val); case ERL_ATOM_EXT: - len = get16be(s); - if (len > MAXATOMLEN) return -1; - memcpy(term->value.atom_name, s, len); - term->value.atom_name[len] = '\0'; - s += len; - break; + case ERL_SMALL_ATOM_EXT: + case ERL_UNICODE_ATOM_EXT: + return ei_decode_atom(buf, index, term->value.atom_name); case ERL_REFERENCE_EXT: /* first the nodename */ - if (get8(s) != ERL_ATOM_EXT) return -1; - len = get16be(s); - if (len > MAXATOMLEN) return -1; - memcpy(term->value.ref.node, s, len); - term->value.ref.node[len] = '\0'; - s += len; + if (get_atom(&s, term->value.ref.node) < 0) return -1; /* now the numbers: num (4), creation (1) */ term->value.ref.n[0] = get32be(s); term->value.ref.len = 1; @@ -71,12 +63,7 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term) /* first the integer count */ term->value.ref.len = get16be(s); /* then the nodename */ - if (get8(s) != ERL_ATOM_EXT) return -1; - len = get16be(s); - if (len > MAXATOMLEN) return -1; - memcpy(term->value.ref.node, s, len); - term->value.ref.node[len] = '\0'; - s += len; + if (get_atom(&s, term->value.ref.node) < 0) return -1; /* creation */ term->value.ref.creation = get8(s) & 0x03; /* finally the id integers */ @@ -88,22 +75,12 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term) } break; case ERL_PORT_EXT: - if (get8(s) != ERL_ATOM_EXT) return -1; - len = get16be(s); - if (len > MAXATOMLEN) return -1; - memcpy(term->value.port.node, s, len); - term->value.port.node[len] = '\0'; + if (get_atom(&s, term->value.port.node) < 0) return -1; term->value.port.id = get32be(s) & 0x0fffffff; /* 28 bits */; term->value.port.creation = get8(s) & 0x03; break; case ERL_PID_EXT: - if (get8(s) != ERL_ATOM_EXT) return -1; - /* name first */ - len = get16be(s); - if (len > MAXATOMLEN) return -1; - memcpy(term->value.pid.node, s, len); - term->value.pid.node[len] = '\0'; - s += len; + if (get_atom(&s, term->value.pid.node) < 0) return -1; /* now the numbers: num (4), serial (4), creation (1) */ term->value.pid.num = get32be(s) & 0x7fff; /* 15 bits */ term->value.pid.serial = get32be(s) & 0x1fff; /* 13 bits */ diff --git a/lib/erl_interface/src/misc/ei_printterm.c b/lib/erl_interface/src/misc/ei_printterm.c index 5fc6b3542c..620c6e72e2 100644 --- a/lib/erl_interface/src/misc/ei_printterm.c +++ b/lib/erl_interface/src/misc/ei_printterm.c @@ -133,6 +133,8 @@ static int print_term(FILE* fp, ei_x_buff* x, ei_get_type_internal(buf, index, &ty, &n); switch (ty) { case ERL_ATOM_EXT: + case ERL_SMALL_ATOM_EXT: + case ERL_UNICODE_ATOM_EXT: if (ei_decode_atom(buf, index, a) < 0) goto err; doquote = !islower((int)a[0]); diff --git a/lib/erl_interface/src/misc/get_type.c b/lib/erl_interface/src/misc/get_type.c index 2a680d0f94..c9a040fbbd 100644 --- a/lib/erl_interface/src/misc/get_type.c +++ b/lib/erl_interface/src/misc/get_type.c @@ -55,10 +55,12 @@ int ei_get_type(const char *buf, const int *index, int *type, int *len) break; case ERL_SMALL_TUPLE_EXT: + case ERL_SMALL_ATOM_EXT: *len = get8(s); break; case ERL_ATOM_EXT: + case ERL_UNICODE_ATOM_EXT: case ERL_STRING_EXT: *len = get16be(s); break; @@ -114,10 +116,14 @@ int ei_get_type_internal(const char *buf, const int *index, *type = get8(s); switch (*type) { + case ERL_SMALL_ATOM_EXT: + *type = ERL_ATOM_EXT; case ERL_SMALL_TUPLE_EXT: *len = get8(s); break; - + + case ERL_UNICODE_ATOM_EXT: + *type = ERL_ATOM_EXT; case ERL_ATOM_EXT: case ERL_STRING_EXT: *len = get16be(s); diff --git a/lib/erl_interface/src/misc/putget.h b/lib/erl_interface/src/misc/putget.h index 7a43de324b..8b0d4d3404 100644 --- a/lib/erl_interface/src/misc/putget.h +++ b/lib/erl_interface/src/misc/putget.h @@ -105,6 +105,9 @@ ((EI_ULONGLONG)((unsigned char *)(s))[-2] << 8) | \ (EI_ULONGLONG)((unsigned char *)(s))[-1])) +int ei_internal_get_atom(const char** bufp, char* p); +#define get_atom ei_internal_get_atom + typedef union float_ext { double d; EI_ULONGLONG val; -- cgit v1.2.3 From 5d79f55ca441727578d34b78ee0d6d8aa80976ee Mon Sep 17 00:00:00 2001 From: Rickard Green Date: Sat, 5 Jan 2013 03:07:14 +0100 Subject: Implement UTF-8 atom support for jinterface --- .../com/ericsson/otp/erlang/AbstractNode.java | 4 +- .../com/ericsson/otp/erlang/OtpErlangAtom.java | 2 +- .../com/ericsson/otp/erlang/OtpExternal.java | 6 + .../com/ericsson/otp/erlang/OtpInputStream.java | 64 +++++++-- .../com/ericsson/otp/erlang/OtpOutputStream.java | 60 ++++++++- lib/jinterface/test/nc_SUITE.erl | 146 +++++++++++++++------ 6 files changed, 226 insertions(+), 56 deletions(-) diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/AbstractNode.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/AbstractNode.java index 16cb544a16..c76fad5e45 100644 --- a/lib/jinterface/java_src/com/ericsson/otp/erlang/AbstractNode.java +++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/AbstractNode.java @@ -90,6 +90,8 @@ public class AbstractNode { static final int dFlagExportPtrTag = 0x200; // NOT SUPPORTED static final int dFlagBitBinaries = 0x400; static final int dFlagNewFloats = 0x800; + static final int dFlagUnicodeIo = 0x1000; + static final int dFlagUtf8Atoms = 0x10000; int ntype = NTYPE_R6; int proto = 0; // tcp/ip @@ -98,7 +100,7 @@ public class AbstractNode { int creation = 0; int flags = dFlagExtendedReferences | dFlagExtendedPidsPorts | dFlagBitBinaries | dFlagNewFloats | dFlagFunTags - | dflagNewFunTags; + | dflagNewFunTags | dFlagUtf8Atoms; /* initialize hostname and default cookie */ static { diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangAtom.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangAtom.java index ced4dbb8c2..2768edc6fa 100644 --- a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangAtom.java +++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangAtom.java @@ -51,7 +51,7 @@ public class OtpErlangAtom extends OtpErlangObject implements Serializable, "null string value"); } - if (atom.length() > maxAtomLength) { + if (atom.codePointCount(0, atom.length()) > maxAtomLength) { throw new java.lang.IllegalArgumentException("Atom may not exceed " + maxAtomLength + " characters: " + atom); } diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpExternal.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpExternal.java index e70b9a786b..2a4cd4fa2d 100644 --- a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpExternal.java +++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpExternal.java @@ -88,6 +88,12 @@ public class OtpExternal { /** The tag used for old Funs */ public static final int funTag = 117; + /** The tag used for unicode atoms */ + public static final int atomUtf8Tag = 118; + + /** The tag used for small unicode atoms */ + public static final int smallAtomUtf8Tag = 119; + /** The tag used for compressed terms */ public static final int compressedTag = 80; diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpInputStream.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpInputStream.java index ae5f4ee072..c2a79af841 100644 --- a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpInputStream.java +++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpInputStream.java @@ -351,26 +351,64 @@ public class OtpInputStream extends ByteArrayInputStream { */ public String read_atom() throws OtpErlangDecodeException { int tag; - int len; + int len = -1; byte[] strbuf; String atom; tag = read1skip_version(); - if (tag != OtpExternal.atomTag) { - throw new OtpErlangDecodeException( - "wrong tag encountered, expected " + OtpExternal.atomTag - + ", got " + tag); - } + switch (tag) { - len = read2BE(); + case OtpExternal.atomTag: + len = read2BE(); + strbuf = new byte[len]; + this.readN(strbuf); + try { + atom = new String(strbuf, "ISO-8859-1"); + } catch (final java.io.UnsupportedEncodingException e) { + throw new OtpErlangDecodeException( + "Failed to decode ISO-8859-1 atom"); + } + if (atom.length() > OtpExternal.maxAtomLength) { + /* + * Throwing an exception would be better I think, + * but truncation seems to be the way it has + * been done in other parts of OTP... + */ + atom = atom.substring(0, OtpExternal.maxAtomLength); + } + break; - strbuf = new byte[len]; - this.readN(strbuf); - atom = OtpErlangString.newString(strbuf); + case OtpExternal.smallAtomUtf8Tag: + len = read1(); + /* fall through */ + case OtpExternal.atomUtf8Tag: + if (len < 0) { + len = read2BE(); + } + strbuf = new byte[len]; + this.readN(strbuf); + try { + atom = new String(strbuf, "UTF-8"); + } catch (final java.io.UnsupportedEncodingException e) { + throw new OtpErlangDecodeException( + "Failed to decode UTF-8 atom"); + } + if (atom.codePointCount(0, atom.length()) > OtpExternal.maxAtomLength) { + /* + * Throwing an exception would be better I think, + * but truncation seems to be the way it has + * been done in other parts of OTP... + */ + final int[] cps = OtpErlangString.stringToCodePoints(atom); + atom = new String(cps, 0, OtpExternal.maxAtomLength); + } + break; - if (atom.length() > OtpExternal.maxAtomLength) { - atom = atom.substring(0, OtpExternal.maxAtomLength); + default: + throw new OtpErlangDecodeException( + "wrong tag encountered, expected " + OtpExternal.atomTag + + ", or " + OtpExternal.atomUtf8Tag + ", got " + tag); } return atom; @@ -1152,6 +1190,8 @@ public class OtpInputStream extends ByteArrayInputStream { return new OtpErlangLong(this); case OtpExternal.atomTag: + case OtpExternal.smallAtomUtf8Tag: + case OtpExternal.atomUtf8Tag: return new OtpErlangAtom(this); case OtpExternal.floatTag: diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpOutputStream.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpOutputStream.java index 22ebb4688a..10bdf389cd 100644 --- a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpOutputStream.java +++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpOutputStream.java @@ -343,9 +343,63 @@ public class OtpOutputStream extends ByteArrayOutputStream { * the string to write. */ public void write_atom(final String atom) { - write1(OtpExternal.atomTag); - write2BE(atom.length()); - writeN(atom.getBytes()); + String enc_atom; + byte[] bytes; + boolean isLatin1 = true; + + if (atom.codePointCount(0, atom.length()) <= OtpExternal.maxAtomLength) { + enc_atom = atom; + } + else { + /* + * Throwing an exception would be better I think, + * but truncation seems to be the way it has + * been done in other parts of OTP... + */ + enc_atom = new String(OtpErlangString.stringToCodePoints(atom), + 0, OtpExternal.maxAtomLength); + } + + for (int offset = 0; offset < enc_atom.length();) { + final int cp = enc_atom.codePointAt(offset); + if ((cp & ~0xFF) != 0) { + isLatin1 = false; + break; + } + offset += Character.charCount(cp); + } + try { + if (isLatin1) { + bytes = enc_atom.getBytes("ISO-8859-1"); + write1(OtpExternal.atomTag); + write2BE(bytes.length); + } + else { + bytes = enc_atom.getBytes("UTF-8"); + final int length = bytes.length; + if (length < 256) { + write1(OtpExternal.smallAtomUtf8Tag); + write1(length); + } + else { + write1(OtpExternal.atomUtf8Tag); + write2BE(length); + } + } + writeN(bytes); + } catch (final java.io.UnsupportedEncodingException e) { + /* + * Sigh, why didn't the API designer add an + * OtpErlangEncodeException to these encoding + * functions?!? Instead of changing the API we + * write an invalid atom and let it fail for + * whoever trying to decode this... Sigh, + * again... + */ + write1(OtpExternal.smallAtomUtf8Tag); + write1(2); + write2BE(0xffff); /* Invalid UTF-8 */ + } } /** diff --git a/lib/jinterface/test/nc_SUITE.erl b/lib/jinterface/test/nc_SUITE.erl index d5388e54f4..ccc87a6761 100644 --- a/lib/jinterface/test/nc_SUITE.erl +++ b/lib/jinterface/test/nc_SUITE.erl @@ -22,6 +22,15 @@ -include_lib("common_test/include/ct.hrl"). -include("test_server_line.hrl"). +-define(VERSION_MAGIC, 131). + +-define(ATOM_EXT, 100). +-define(REFERENCE_EXT, 101). +-define(PORT_EXT, 102). +-define(PID_EXT, 103). +-define(NEW_REFERENCE_EXT, 114). +-define(ATOM_UTF8_EXT, 118). +-define(SMALL_ATOM_UTF8_EXT, 119). -export([all/0, suite/0,groups/0,init_per_group/2,end_per_group/2, init_per_suite/1, @@ -45,6 +54,10 @@ unicode/1, unicode_list_to_string/1, unicode_string_to_list/1, + utf8_atom/1, + utf8_pid/1, + utf8_port/1, + utf8_ref/1, connect/1]). @@ -58,7 +71,9 @@ all() -> decompress_roundtrip, compress_roundtrip, integer_roundtrip, fun_roundtrip, lists_roundtrip, lists_roundtrip_2, lists_iterator, unicode, - unicode_list_to_string, unicode_string_to_list, connect]. + unicode_list_to_string, unicode_string_to_list, + utf8_atom, utf8_pid, utf8_port, utf8_ref, + connect]. groups() -> []. @@ -448,6 +463,71 @@ unicode_string_to_list(Config) when is_list(Config) -> end, ["unicode"]). +evil_smiley() -> + <<240,159,152,136>>. + +evil_smileys(0) -> + []; +evil_smileys(N) -> + [evil_smiley() | evil_smileys(N-1)]. + +utf8_atom(Config) when is_list(Config) -> + ES = evil_smiley(), + SmallUA = binary_to_term(list_to_binary([?VERSION_MAGIC, + ?SMALL_ATOM_UTF8_EXT, + size(ES), + ES])), + true = is_atom(SmallUA), + NoESs = 300 div size(ES), + ESs = evil_smileys(NoESs), + LargeUA = binary_to_term(list_to_binary([?VERSION_MAGIC, + ?ATOM_UTF8_EXT, + uint16_be(NoESs*size(ES)), + ESs])), + true = is_atom(LargeUA), + erlang:display({atom, SmallUA, LargeUA}), + do_echo([SmallUA, LargeUA], Config). + +utf8_nodenames_ext() -> + H = "@host", + ES = evil_smiley(), + SmallUANodeExt = list_to_binary([?SMALL_ATOM_UTF8_EXT, + size(ES)+length(H), + ES, + H]), + NoESs = 300 div size(ES), + ESs = evil_smileys(NoESs), + LargeUANodeExt = list_to_binary([?ATOM_UTF8_EXT, + uint16_be(NoESs*size(ES)+length(H)), + ESs, + H]), + {SmallUANodeExt, LargeUANodeExt}. + +utf8_pid(Config) when is_list(Config) -> + {SmallUANodeExt, LargeUANodeExt} = utf8_nodenames_ext(), + SmallPid = mk_pid({SmallUANodeExt, 2}, 4711, 4711), + LargePid = mk_pid({LargeUANodeExt, 2}, 4711, 4711), + erlang:display({pid, SmallPid, node(SmallPid)}), + erlang:display({pid, LargePid, node(LargePid)}), + do_echo([SmallPid, LargePid], Config). + +utf8_port(Config) when is_list(Config) -> + {SmallUANodeExt, LargeUANodeExt} = utf8_nodenames_ext(), + SmallPort = mk_port({SmallUANodeExt, 2}, 4711), + erlang:display({port, SmallPort, node(SmallPort)}), + LargePort = mk_port({LargeUANodeExt, 2}, 4711), + erlang:display({port, LargePort, node(LargePort)}), + do_echo([SmallPort, LargePort], Config). + +utf8_ref(Config) when is_list(Config) -> + {SmallUANodeExt, LargeUANodeExt} = utf8_nodenames_ext(), + SmallRef = mk_ref({SmallUANodeExt, 2}, [4711, 4711, 4711]), + erlang:display({ref, SmallRef, node(SmallRef)}), + LargeRef = mk_ref({LargeUANodeExt, 2}, [4711, 4711, 4711]), + erlang:display({ref, LargeRef, node(LargeRef)}), + do_echo([SmallRef, LargeRef], Config). + + %% Lazy list cp_gen(N) -> cp_gen(N, -1, 16#110000). @@ -646,16 +726,6 @@ make_name() -> ++ "-" ++ integer_to_list(B) ++ "-" ++ integer_to_list(C)). - - --define(VERSION_MAGIC, 131). - --define(ATOM_EXT, 100). --define(REFERENCE_EXT, 101). --define(PORT_EXT, 102). --define(PID_EXT, 103). --define(NEW_REFERENCE_EXT, 114). - uint32_be(Uint) when is_integer(Uint), 0 =< Uint, Uint < 1 bsl 32 -> [(Uint bsr 24) band 16#ff, (Uint bsr 16) band 16#ff, @@ -679,72 +749,70 @@ uint8(Uint) -> mk_pid({NodeName, Creation}, Number, Serial) when is_atom(NodeName) -> - mk_pid({atom_to_list(NodeName), Creation}, Number, Serial); -mk_pid({NodeName, Creation}, Number, Serial) -> + <> = term_to_binary(NodeName), + mk_pid({NodeNameExt, Creation}, Number, Serial); +mk_pid({NodeNameExt, Creation}, Number, Serial) -> case catch binary_to_term(list_to_binary([?VERSION_MAGIC, ?PID_EXT, - ?ATOM_EXT, - uint16_be(length(NodeName)), - NodeName, + NodeNameExt, uint32_be(Number), uint32_be(Serial), uint8(Creation)])) of Pid when is_pid(Pid) -> Pid; {'EXIT', {badarg, _}} -> - exit({badarg, mk_pid, [{NodeName, Creation}, Number, Serial]}); + exit({badarg, mk_pid, [{NodeNameExt, Creation}, Number, Serial]}); Other -> exit({unexpected_binary_to_term_result, Other}) end. mk_port({NodeName, Creation}, Number) when is_atom(NodeName) -> - mk_port({atom_to_list(NodeName), Creation}, Number); -mk_port({NodeName, Creation}, Number) -> + <> = term_to_binary(NodeName), + mk_port({NodeNameExt, Creation}, Number); +mk_port({NodeNameExt, Creation}, Number) -> case catch binary_to_term(list_to_binary([?VERSION_MAGIC, ?PORT_EXT, - ?ATOM_EXT, - uint16_be(length(NodeName)), - NodeName, + NodeNameExt, uint32_be(Number), uint8(Creation)])) of Port when is_port(Port) -> Port; {'EXIT', {badarg, _}} -> - exit({badarg, mk_port, [{NodeName, Creation}, Number]}); + exit({badarg, mk_port, [{NodeNameExt, Creation}, Number]}); Other -> exit({unexpected_binary_to_term_result, Other}) end. -mk_ref({NodeName, Creation}, Numbers) when is_atom(NodeName), - is_integer(Creation), - is_list(Numbers) -> - mk_ref({atom_to_list(NodeName), Creation}, Numbers); -mk_ref({NodeName, Creation}, [Number]) when is_list(NodeName), - is_integer(Creation), - is_integer(Number) -> +mk_ref({NodeName, Creation}, [Number] = NL) when is_atom(NodeName), + is_integer(Creation), + is_integer(Number) -> + <> = term_to_binary(NodeName), + mk_ref({NodeNameExt, Creation}, NL); +mk_ref({NodeNameExt, Creation}, [Number]) when is_integer(Creation), + is_integer(Number) -> case catch binary_to_term(list_to_binary([?VERSION_MAGIC, ?REFERENCE_EXT, - ?ATOM_EXT, - uint16_be(length(NodeName)), - NodeName, + NodeNameExt, uint32_be(Number), uint8(Creation)])) of Ref when is_reference(Ref) -> Ref; {'EXIT', {badarg, _}} -> - exit({badarg, mk_ref, [{NodeName, Creation}, [Number]]}); + exit({badarg, mk_ref, [{NodeNameExt, Creation}, [Number]]}); Other -> exit({unexpected_binary_to_term_result, Other}) end; -mk_ref({NodeName, Creation}, Numbers) when is_list(NodeName), +mk_ref({NodeName, Creation}, Numbers) when is_atom(NodeName), is_integer(Creation), is_list(Numbers) -> + <> = term_to_binary(NodeName), + mk_ref({NodeNameExt, Creation}, Numbers); +mk_ref({NodeNameExt, Creation}, Numbers) when is_integer(Creation), + is_list(Numbers) -> case catch binary_to_term(list_to_binary([?VERSION_MAGIC, ?NEW_REFERENCE_EXT, uint16_be(length(Numbers)), - ?ATOM_EXT, - uint16_be(length(NodeName)), - NodeName, + NodeNameExt, uint8(Creation), lists:map(fun (N) -> uint32_be(N) @@ -753,7 +821,7 @@ mk_ref({NodeName, Creation}, Numbers) when is_list(NodeName), Ref when is_reference(Ref) -> Ref; {'EXIT', {badarg, _}} -> - exit({badarg, mk_ref, [{NodeName, Creation}, Numbers]}); + exit({badarg, mk_ref, [{NodeNameExt, Creation}, Numbers]}); Other -> exit({unexpected_binary_to_term_result, Other}) end. -- cgit v1.2.3 From 0dd3b88cdf90283d9c276ee415f985cb764e522f Mon Sep 17 00:00:00 2001 From: Rickard Green Date: Thu, 10 Jan 2013 12:47:46 +0100 Subject: UTF-8 support for distribution --- erts/emulator/beam/atom.c | 164 +++++++++++++++++++++---------- erts/emulator/beam/atom.h | 13 ++- erts/emulator/beam/atom.names | 2 + erts/emulator/beam/beam_load.c | 4 +- erts/emulator/beam/bif.c | 26 ++--- erts/emulator/beam/dist.c | 8 +- erts/emulator/beam/erl_alloc.types | 2 + erts/emulator/beam/erl_bif_ddll.c | 5 +- erts/emulator/beam/erl_bif_info.c | 22 +++-- erts/emulator/beam/erl_bif_port.c | 2 +- erts/emulator/beam/erl_db.c | 2 +- erts/emulator/beam/erl_db_util.c | 3 +- erts/emulator/beam/erl_init.c | 4 +- erts/emulator/beam/erl_unicode.c | 139 +++++++++++++++++++++----- erts/emulator/beam/external.c | 193 +++++++++++++++++++++++-------------- erts/emulator/beam/external.h | 5 +- erts/emulator/beam/global.h | 10 +- erts/emulator/beam/io.c | 28 ++++-- erts/emulator/beam/utils.c | 2 +- lib/kernel/include/dist.hrl | 1 + lib/kernel/src/dist_util.erl | 3 +- 21 files changed, 443 insertions(+), 195 deletions(-) diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c index b41a98f2a2..82dd320ea9 100644 --- a/erts/emulator/beam/atom.c +++ b/erts/emulator/beam/atom.c @@ -162,6 +162,7 @@ atom_alloc(Atom* tmpl) obj->name = atom_text_alloc(tmpl->len); sys_memcpy(obj->name, tmpl->name, tmpl->len); obj->len = tmpl->len; + obj->latin1_chars = tmpl->latin1_chars; obj->slot.index = -1; /* @@ -192,48 +193,6 @@ atom_free(Atom* obj) erts_free(ERTS_ALC_T_ATOM, (void*) obj); } -Eterm -am_atom_put(const char* name, int len) -{ - Atom a; - Eterm ret; - int aix; -#ifdef DEBUG - byte* err_pos; - Uint num_chars; - ASSERT(erts_analyze_utf8(name, len, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK); -#endif - /* - * Silently truncate the atom if it is too long. Overlong atoms - * could occur in situations where we have no good way to return - * an error, such as in the I/O system. (Unfortunately, many - * drivers don't check for errors.) - * - * If an error should be produced for overlong atoms (such in - * list_to_atom/1), the caller should check the length before - * calling this function. - */ - if (len > MAX_ATOM_SZ_LIMIT) { - len = MAX_ATOM_SZ_LIMIT; /*SVERK Urk... */ - } -#ifdef ERTS_ATOM_PUT_OPS_STAT - erts_smp_atomic_inc_nob(&atom_put_ops); -#endif - a.len = len; - a.name = (byte*)name; - atom_read_lock(); - aix = index_get(&erts_atom_table, (void*) &a); - atom_read_unlock(); - if (aix >= 0) - ret = make_atom(aix); - else { - atom_write_lock(); - ret = make_atom(index_put(&erts_atom_table, (void*) &a)); - atom_write_unlock(); - } - return ret; -} - static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp) { byte* dst; @@ -264,19 +223,116 @@ need_convertion: *lenp = dst - conv_buf; } - +/* + * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned! + */ Eterm -am_atom_put2(const byte* name, int len, int is_latin1) +erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) { byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1]; + const byte *text = name; + int tlen = len; + Sint no_latin1_chars; + Atom a; + int aix; - if (is_latin1) { - latin1_to_utf8(utf8_copy, &name, &len); +#ifdef ERTS_ATOM_PUT_OPS_STAT + erts_smp_atomic_inc_nob(&atom_put_ops); +#endif + + if (tlen < 0) { + if (trunc) + tlen = 0; + else + return THE_NON_VALUE; } - return am_atom_put((const char*)name, len); -} + switch (enc) { + case ERTS_ATOM_ENC_7BIT_ASCII: + if (tlen > MAX_ATOM_CHARACTERS) { + if (trunc) + tlen = MAX_ATOM_CHARACTERS; + else + return THE_NON_VALUE; + } +#ifdef DEBUG + for (aix = 0; aix < len; aix++) { + ASSERT((name[aix] & 0x80) == 0); + } +#endif + no_latin1_chars = tlen; + break; + case ERTS_ATOM_ENC_LATIN1: + if (tlen > MAX_ATOM_CHARACTERS) { + if (trunc) + tlen = MAX_ATOM_CHARACTERS; + else + return THE_NON_VALUE; + } + no_latin1_chars = tlen; + latin1_to_utf8(utf8_copy, &text, &tlen); + break; + case ERTS_ATOM_ENC_UTF8: + /* First sanity check; need to verify later */ + if (tlen > MAX_ATOM_SZ_LIMIT && !trunc) + return THE_NON_VALUE; + break; + } + a.len = tlen; + a.name = (byte *) text; + atom_read_lock(); + aix = index_get(&erts_atom_table, (void*) &a); + atom_read_unlock(); + if (aix >= 0) { + /* Already in table no need to verify it */ + return make_atom(aix); + } + + if (enc == ERTS_ATOM_ENC_UTF8) { + /* Need to verify encoding and length */ + byte *err_pos; + Uint no_chars; + switch (erts_analyze_utf8_x((byte *) text, + (Uint) tlen, + &err_pos, + &no_chars, NULL, + &no_latin1_chars, + MAX_ATOM_CHARACTERS)) { + case ERTS_UTF8_OK: + ASSERT(no_chars <= MAX_ATOM_CHARACTERS); + break; + case ERTS_UTF8_OK_MAX_CHARS: + /* Truncated... */ + if (!trunc) + return THE_NON_VALUE; + ASSERT(no_chars == MAX_ATOM_CHARACTERS); + tlen = err_pos - text; + break; + default: + /* Bad utf8... */ + return THE_NON_VALUE; + } + } + + ASSERT(tlen <= MAX_ATOM_SZ_LIMIT); + ASSERT(-1 <= no_latin1_chars && no_latin1_chars <= MAX_ATOM_CHARACTERS); + + a.len = tlen; + a.latin1_chars = (Sint16) no_latin1_chars; + a.name = (byte *) text; + atom_write_lock(); + aix = index_put(&erts_atom_table, (void*) &a); + atom_write_unlock(); + return make_atom(aix); +} + +Eterm +am_atom_put(const char* name, int len) +{ + /* Assumes 7-bit ascii; use erts_atom_put() for other encodings... */ + return erts_atom_put((byte *) name, len, ERTS_ATOM_ENC_7BIT_ASCII, 1); +} int atom_table_size(void) { @@ -318,10 +374,11 @@ erts_atom_get(const char *name, int len, Eterm* ap, int is_latin1) int i; int res; - a.len = len; + a.len = (Sint16) len; a.name = (byte *)name; if (is_latin1) { - latin1_to_utf8(utf8_copy, (const byte**)&a.name, &a.len); + latin1_to_utf8(utf8_copy, (const byte**)&a.name, &len); + a.len = (Sint16) len; } atom_read_lock(); i = index_get(&erts_atom_table, (void*) &a); @@ -384,8 +441,15 @@ init_atom_table(void) for (i = 0; erl_atom_names[i] != 0; i++) { int ix; a.len = strlen(erl_atom_names[i]); + a.latin1_chars = a.len; a.name = (byte*)erl_atom_names[i]; a.slot.index = i; +#ifdef DEBUG + /* Verify 7-bit ascii */ + for (ix = 0; ix < a.len; ix++) { + ASSERT((a.name[ix] & 0x80) == 0); + } +#endif ix = index_put(&erts_atom_table, (void*) &a); atom_text_pos -= a.len; atom_space -= a.len; diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h index 84dd6d8901..f721999a4c 100644 --- a/erts/emulator/beam/atom.h +++ b/erts/emulator/beam/atom.h @@ -47,7 +47,8 @@ */ typedef struct atom { IndexSlot slot; /* MUST BE LOCATED AT TOP OF STRUCT!!! */ - int len; /* length of atom name */ + Sint16 len; /* length of atom name (UTF-8 encoded) */ + Sint16 latin1_chars; /* 0-255 if atom can be encoded in latin1; otherwise, -1 */ int ord0; /* ordinal value of first 3 bytes + 7 bits */ byte* name; /* name of atom */ } Atom; @@ -113,6 +114,12 @@ ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1) #endif +typedef enum { + ERTS_ATOM_ENC_7BIT_ASCII, + ERTS_ATOM_ENC_LATIN1, + ERTS_ATOM_ENC_UTF8 +} ErtsAtomEncoding; + /* * Note, ERTS_IS_ATOM_STR() expects the first argument to be a * 7-bit ASCII string literal. @@ -125,8 +132,8 @@ ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1) int atom_table_size(void); /* number of elements */ int atom_table_sz(void); /* table size in bytes, excluding stored objects */ -Eterm am_atom_put(const char*, int); /* most callers pass plain char*'s */ -Eterm am_atom_put2(const byte*, int, int is_latin1); +Eterm am_atom_put(const char*, int); /* ONLY 7-bit ascii! */ +Eterm erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc); int atom_erase(byte*, int); int atom_static_put(byte*, int); void init_atom_table(void); diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names index afcbd732df..59c9f39e7b 100644 --- a/erts/emulator/beam/atom.names +++ b/erts/emulator/beam/atom.names @@ -17,6 +17,8 @@ # %CopyrightEnd% # +# +# IMPORTANT! All atoms defined here *need* to be in 7-bit ascii! # # File format: # diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c index 7bb964d5aa..8b4135e21d 100644 --- a/erts/emulator/beam/beam_load.c +++ b/erts/emulator/beam/beam_load.c @@ -1230,7 +1230,7 @@ load_atom_table(LoaderState* stp) GetByte(stp, n); GetString(stp, atom, n); - stp->atom[i] = am_atom_put2(atom, n, 1); + stp->atom[i] = erts_atom_put(atom, n, ERTS_ATOM_ENC_LATIN1, 1); } /* @@ -1620,7 +1620,7 @@ read_line_table(LoaderState* stp) GetInt(stp, 2, n); GetString(stp, fname, n); - stp->fname[i] = am_atom_put((char*)fname, n); /*SVERK ? */ + stp->fname[i] = erts_atom_put(fname, n, ERTS_ATOM_ENC_LATIN1, 1); } } diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c index 89157068c0..a0b4a8c049 100644 --- a/erts/emulator/beam/bif.c +++ b/erts/emulator/beam/bif.c @@ -2536,11 +2536,13 @@ BIF_RETTYPE append_element_2(BIF_ALIST_2) BIF_RETTYPE atom_to_list_1(BIF_ALIST_1) { - Eterm do_utf8_to_list(Process*, Uint num, byte *bytes, Uint sz, Uint left, - Uint *num_built, Uint *num_eaten, Eterm tail); /*SVERK */ Atom* ap; Uint num_chars, num_built, num_eaten; + byte* err_pos; Eterm res; +#ifdef DEBUG + int ares; +#endif if (is_not_atom(BIF_ARG_1)) BIF_ERROR(BIF_P, BADARG); @@ -2549,16 +2551,15 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1) ap = atom_tab(atom_val(BIF_ARG_1)); if (ap->len == 0) BIF_RET(NIL); /* the empty atom */ - { - byte* err_pos; - if (erts_analyze_utf8(ap->name, ap->len, &err_pos, &num_chars, NULL) - != ERTS_UTF8_OK) { - BIF_ERROR(BIF_P, BADARG); - } - } + +#ifdef DEBUG + ares = +#endif + erts_analyze_utf8(ap->name, ap->len, &err_pos, &num_chars, NULL); + ASSERT(ares == ERTS_UTF8_OK); - res = do_utf8_to_list(BIF_P, num_chars, ap->name, ap->len, ap->len, - &num_built, &num_eaten, NIL); + res = erts_utf8_to_list(BIF_P, num_chars, ap->name, ap->len, ap->len, + &num_built, &num_eaten, NIL); ASSERT(num_built == num_chars); ASSERT(num_eaten == ap->len); BIF_RET(res); @@ -2582,7 +2583,8 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1) } BIF_ERROR(BIF_P, BADARG); } - res = am_atom_put2((byte*)buf, i, 1); + res = erts_atom_put((byte *) buf, i, ERTS_ATOM_ENC_LATIN1, 1); + ASSERT(is_atom(res)); erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_RET(res); } diff --git a/erts/emulator/beam/dist.c b/erts/emulator/beam/dist.c index 28c4621ff2..8c3bcd1de4 100644 --- a/erts/emulator/beam/dist.c +++ b/erts/emulator/beam/dist.c @@ -1646,7 +1646,7 @@ dsig_send(ErtsDSigData *dsdp, Eterm ctl, Eterm msg, int force_busy) data_size += erts_encode_dist_ext_size(ctl, flags, acmp); if (is_value(msg)) data_size += erts_encode_dist_ext_size(msg, flags, acmp); - erts_finalize_atom_cache_map(acmp); + erts_finalize_atom_cache_map(acmp, flags); dhdr_ext_size = erts_encode_ext_dist_header_size(acmp); data_size += dhdr_ext_size; @@ -1996,7 +1996,8 @@ erts_dist_command(Port *prt, int reds_limit) ASSERT(ob); do { ob->extp = erts_encode_ext_dist_header_finalize(ob->extp, - dep->cache); + dep->cache, + flags); if (!(flags & DFLAG_DIST_HDR_ATOM_CACHE)) *--ob->extp = PASS_THROUGH; /* Old node; 'pass through' needed */ @@ -2040,7 +2041,8 @@ erts_dist_command(Port *prt, int reds_limit) Uint size; oq.first->extp = erts_encode_ext_dist_header_finalize(oq.first->extp, - dep->cache); + dep->cache, + flags); reds += ERTS_PORT_REDS_DIST_CMD_FINALIZE; if (!(flags & DFLAG_DIST_HDR_ATOM_CACHE)) *--oq.first->extp = PASS_THROUGH; /* Old node; 'pass through' diff --git a/erts/emulator/beam/erl_alloc.types b/erts/emulator/beam/erl_alloc.types index 0a4407f009..2b649b589b 100644 --- a/erts/emulator/beam/erl_alloc.types +++ b/erts/emulator/beam/erl_alloc.types @@ -49,6 +49,8 @@ # true after a "+enable X" statement or if it has been passed as a # command line argument to make_alloc_types. The variable X is false # after a "+disable X" statement or if it has never been mentioned. +# +# IMPORTANT! Only use 7-bit ascii text in this file! +if smp +disable threads_no_smp diff --git a/erts/emulator/beam/erl_bif_ddll.c b/erts/emulator/beam/erl_bif_ddll.c index 7f7c975e78..59a53870f3 100644 --- a/erts/emulator/beam/erl_bif_ddll.c +++ b/erts/emulator/beam/erl_bif_ddll.c @@ -1869,7 +1869,10 @@ static Eterm build_load_error_hp(Eterm *hp, int code) static Eterm mkatom(char *str) { - return am_atom_put(str, sys_strlen(str)); + return erts_atom_put((byte *) str, + sys_strlen(str), + ERTS_ATOM_ENC_LATIN1, + 1); } static char *pick_list_or_atom(Eterm name_term) diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c index a3811ccdb0..c910bd0cb6 100755 --- a/erts/emulator/beam/erl_bif_info.c +++ b/erts/emulator/beam/erl_bif_info.c @@ -2296,8 +2296,10 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1) for (i = num_instructions-1; i >= 0; i--) { res = erts_bld_cons(hpp, hszp, erts_bld_tuple(hpp, hszp, 2, - am_atom_put(opc[i].name, - strlen(opc[i].name)), + erts_atom_put(opc[i].name, + strlen(opc[i].name), + ERTS_ATOM_ENC_LATIN1, + 1), erts_bld_uint(hpp, hszp, opc[i].count)), res); @@ -3901,7 +3903,7 @@ static Eterm lcnt_build_lock_stats_term(Eterm **hpp, Uint *szp, erts_lcnt_lock_s timer_ns = stats->timer.ns; timer_n = stats->timer_n; - af = am_atom_put(stats->file, strlen(stats->file)); + af = erts_atom_put(stats->file, strlen(stats->file), ERTS_ATOM_ENC_LATIN1, 1); uil = erts_bld_uint( hpp, szp, line); tloc = erts_bld_tuple(hpp, szp, 2, af, uil); @@ -3938,13 +3940,13 @@ static Eterm lcnt_build_lock_term(Eterm **hpp, Uint *szp, erts_lcnt_lock_t *lock ASSERT(ltype); - type = am_atom_put(ltype, strlen(ltype)); - name = am_atom_put(lock->name, strlen(lock->name)); + type = erts_atom_put(ltype, strlen(ltype), ERTS_ATOM_ENC_LATIN1, 1); + name = erts_atom_put(lock->name, strlen(lock->name), ERTS_ATOM_ENC_LATIN1, 1); if (lock->flag & ERTS_LCNT_LT_ALLOC) { /* use allocator types names as id's for allocator locks */ ltype = (char *) ERTS_ALC_A2AD(signed_val(lock->id)); - id = am_atom_put(ltype, strlen(ltype)); + id = erts_atom_put(ltype, strlen(ltype), ERTS_ATOM_ENC_LATIN1, 1); } else if (lock->flag & ERTS_LCNT_LT_PROCLOCK) { /* use registered names as id's for process locks if available */ proc = erts_proc_lookup(lock->id); @@ -3984,12 +3986,12 @@ static Eterm lcnt_build_result_term(Eterm **hpp, Uint *szp, erts_lcnt_data_t *da dtns = erts_bld_uint( hpp, szp, data->duration.ns); tdt = erts_bld_tuple(hpp, szp, 2, dts, dtns); - adur = am_atom_put(str_duration, strlen(str_duration)); + adur = erts_atom_put(str_duration, strlen(str_duration), ERTS_ATOM_ENC_LATIN1, 1); tdur = erts_bld_tuple(hpp, szp, 2, adur, tdt); /* lock tuple */ - aloc = am_atom_put(str_locks, strlen(str_locks)); + aloc = erts_atom_put(str_locks, strlen(str_locks), ERTS_ATOM_ENC_LATIN1, 1); for (lock = data->current_locks->head; lock != NULL ; lock = lock->next ) { lloc = lcnt_build_lock_term(hpp, szp, lock, lloc); @@ -4125,14 +4127,14 @@ BIF_RETTYPE erts_debug_lock_counters_1(BIF_ALIST_1) static void os_info_init(void) { - Eterm type = am_atom_put(os_type, strlen(os_type)); + Eterm type = erts_atom_put((byte *) os_type, strlen(os_type), ERTS_ATOM_ENC_LATIN1, 1); Eterm flav; int major, minor, build; char* buf = erts_alloc(ERTS_ALC_T_TMP, 1024); /* More than enough */ Eterm* hp; os_flavor(buf, 1024); - flav = am_atom_put(buf, strlen(buf)); + flav = erts_atom_put((byte *) buf, strlen(buf), ERTS_ATOM_ENC_LATIN1, 1); erts_free(ERTS_ALC_T_TMP, (void *) buf); hp = erts_alloc(ERTS_ALC_T_LL_TEMP_TERM, (3+4)*sizeof(Eterm)); os_type_tuple = TUPLE2(hp, type, flav); diff --git a/erts/emulator/beam/erl_bif_port.c b/erts/emulator/beam/erl_bif_port.c index f9009166c0..4b270414cb 100644 --- a/erts/emulator/beam/erl_bif_port.c +++ b/erts/emulator/beam/erl_bif_port.c @@ -68,7 +68,7 @@ BIF_RETTYPE open_port_2(BIF_ALIST_2) } else { str = "einval"; } - BIF_P->fvalue = am_atom_put(str, strlen(str)); + BIF_P->fvalue = erts_atom_put((byte *) str, strlen(str), ERTS_ATOM_ENC_LATIN1, 1); BIF_ERROR(BIF_P, EXC_ERROR); } diff --git a/erts/emulator/beam/erl_db.c b/erts/emulator/beam/erl_db.c index 1ba1048afa..f8a4882ec0 100644 --- a/erts/emulator/beam/erl_db.c +++ b/erts/emulator/beam/erl_db.c @@ -3815,7 +3815,7 @@ erts_ets_colliding_names(Process* p, Eterm name, Uint cnt) while (index >= atom_table_size()) { char tmp[20]; erts_snprintf(tmp, sizeof(tmp), "am%x", atom_table_size()); - am_atom_put(tmp,strlen(tmp)); + erts_atom_put((byte *) tmp, strlen(tmp), ERTS_ATOM_ENC_LATIN1, 1); } list = CONS(hp, make_atom(index), list); hp += 2; diff --git a/erts/emulator/beam/erl_db_util.c b/erts/emulator/beam/erl_db_util.c index 0c9ca83ce4..407a55a3d7 100644 --- a/erts/emulator/beam/erl_db_util.c +++ b/erts/emulator/beam/erl_db_util.c @@ -4768,7 +4768,8 @@ static int match_compact(ErlHeapFragment *expr, DMCErrInfo *err_info) ASSERT(j < x); erts_snprintf(buff+1, sizeof(buff) - 1, "%u", (unsigned) j); /* Yes, writing directly into terms, they ARE off heap */ - *p = am_atom_put(buff, strlen(buff)); + *p = erts_atom_put((byte *) buff, strlen(buff), + ERTS_ATOM_ENC_LATIN1, 1); } ++p; } diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c index 4b90e5394a..369eab5980 100644 --- a/erts/emulator/beam/erl_init.c +++ b/erts/emulator/beam/erl_init.c @@ -344,7 +344,7 @@ erl_first_process_otp(char* modname, void* code, unsigned size, int argc, char** ErlSpawnOpts so; Eterm env; - start_mod = am_atom_put(modname, sys_strlen(modname)); + start_mod = erts_atom_put((byte *) modname, sys_strlen(modname), ERTS_ATOM_ENC_LATIN1, 1); if (erts_find_function(start_mod, am_start, 2, erts_active_code_ix()) == NULL) { erl_exit(5, "No function %s:start/2\n", modname); @@ -441,7 +441,7 @@ load_preloaded(void) i = 0; while ((name = preload_p[i].name) != NULL) { length = preload_p[i].size; - module_name = am_atom_put(name, sys_strlen(name)); + module_name = erts_atom_put((byte *) name, sys_strlen(name), ERTS_ATOM_ENC_LATIN1, 1); if ((code = sys_preload_begin(&preload_p[i])) == 0) erl_exit(1, "Failed to find preloaded code for module %s\n", name); diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index e24b6f1458..6600ce4a4a 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -1154,15 +1154,24 @@ BIF_RETTYPE unicode_characters_to_list_2(BIF_ALIST_2) * When input to characters_to_list is a plain binary and the format is 'unicode', we do * a faster analyze and size count with this function. */ -int erts_analyze_utf8(byte *source, Uint size, - byte **err_pos, Uint *num_chars, int *left) +static ERTS_INLINE int +analyze_utf8(byte *source, Uint size, byte **err_pos, Uint *num_chars, int *left, + Sint *num_latin1_chars, Uint max_chars) { + Uint latin1_count; + int is_latin1; *err_pos = source; + if (num_latin1_chars) { + is_latin1 = 1; + latin1_count = 0; + } *num_chars = 0; while (size) { if (((*source) & ((byte) 0x80)) == 0) { source++; - --size; + --size; + if (num_latin1_chars) + latin1_count++; } else if (((*source) & ((byte) 0xE0)) == 0xC0) { if (size < 2) { return ERTS_UTF8_INCOMPLETE; @@ -1173,6 +1182,11 @@ int erts_analyze_utf8(byte *source, Uint size, } source += 2; size -= 2; + if (num_latin1_chars) { + latin1_count++; + if ((source[0] & ((byte) 0xFC)) != ((byte) 0xC0)) + is_latin1 = 0; + } } else if (((*source) & ((byte) 0xF0)) == 0xE0) { if (size < 3) { return ERTS_UTF8_INCOMPLETE; @@ -1188,6 +1202,8 @@ int erts_analyze_utf8(byte *source, Uint size, } source += 3; size -= 3; + if (num_latin1_chars) + is_latin1 = 0; } else if (((*source) & ((byte) 0xF8)) == 0xF0) { if (size < 4) { return ERTS_UTF8_INCOMPLETE; @@ -1205,22 +1221,41 @@ int erts_analyze_utf8(byte *source, Uint size, } source += 4; size -= 4; + if (num_latin1_chars) + is_latin1 = 0; } else { return ERTS_UTF8_ERROR; } ++(*num_chars); *err_pos = source; + if (max_chars && size > 0 && *num_chars == max_chars) + return ERTS_UTF8_OK_MAX_CHARS; if (left && --(*left) <= 0 && size) { return ERTS_UTF8_ANALYZE_MORE; } } + if (num_latin1_chars) + *num_latin1_chars = is_latin1 ? latin1_count : -1; return ERTS_UTF8_OK; } +int erts_analyze_utf8(byte *source, Uint size, + byte **err_pos, Uint *num_chars, int *left) +{ + return analyze_utf8(source, size, err_pos, num_chars, left, NULL, 0); +} + +int erts_analyze_utf8_x(byte *source, Uint size, + byte **err_pos, Uint *num_chars, int *left, + Sint *num_latin1_chars, Uint max_chars) +{ + return analyze_utf8(source, size, err_pos, num_chars, left, num_latin1_chars, max_chars); +} + /* * No errors should be able to occur - no overlongs, no malformed, no nothing - */ -Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, + */ +static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left, Uint *num_built, Uint *num_eaten, Eterm tail) { @@ -1275,6 +1310,12 @@ Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, return ret; } +Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left, + Uint *num_built, Uint *num_eaten, Eterm tail) +{ + return do_utf8_to_list(p, num, bytes, sz, left, num_built, num_eaten, tail); +} + static int is_candidate(Uint cp) { int index,pos; @@ -1849,14 +1890,14 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2) } static BIF_RETTYPE -binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist) +binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist) { byte* bytes; byte *temp_alloc = NULL; Uint bin_size; if ((bytes = erts_get_aligned_binary_bytes(bin, &temp_alloc)) == 0) { - BIF_ERROR(p, BADARG); + BIF_ERROR(proc, BADARG); } bin_size = binary_size(bin); if (enc == am_latin1) { @@ -1864,11 +1905,16 @@ binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist) if (bin_size > MAX_ATOM_CHARACTERS) { system_limit: erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(p, SYSTEM_LIMIT); + BIF_ERROR(proc, SYSTEM_LIMIT); } if (!must_exist) { - a = am_atom_put2(bytes, bin_size, 1); - erts_free_aligned_binary_bytes(temp_alloc); + a = erts_atom_put((byte *) bytes, + bin_size, + ERTS_ATOM_ENC_LATIN1, + 0); + erts_free_aligned_binary_bytes(temp_alloc); + if (is_non_value(a)) + goto badarg; BIF_RET(a); } else if (erts_atom_get((char *)bytes, bin_size, &a, 1)) { erts_free_aligned_binary_bytes(temp_alloc); @@ -1900,17 +1946,22 @@ binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist) } if (!must_exist) { - res = am_atom_put((char*)bytes, bin_size); + res = erts_atom_put((byte *) bytes, + bin_size, + ERTS_ATOM_ENC_UTF8, + 0); } else if (!erts_atom_get((char*)bytes, bin_size, &res, 0)) { goto badarg; } erts_free_aligned_binary_bytes(temp_alloc); + if (is_non_value(res)) + goto badarg; BIF_RET(res); } else { badarg: erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(p, BADARG); + BIF_ERROR(proc, BADARG); } } @@ -2625,30 +2676,70 @@ BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0) } } -/* Assumes 'dest' has enough room. - */ -int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen) +int erts_utf8_is_latin1_string(const byte *string, int len) +{ + /* Assumes string is encoded in valid UTF-8 */ + int i; + while (i < len) { + if ((string[i] & 0x80) == 0) + i++; + else if (i+1 < len + && (string[i] & 0xFE) == 0xC2 + && (string[i+1] & 0xC0) == 0x80) + i +=2; + else + return 0; + } + return 1; +} + +int erts_utf8_to_latin1(byte* dest, const byte* source, int slen) { + /* + * Assumes source contains valid utf8 that can be encoded as latin1, + * and that dest has enough room. + */ byte* dp = dest; while (slen > 0) { if ((source[0] & 0x80) == 0) { *dp++ = *source++; --slen; } - else if (slen > 1 && - (source[0] & 0xFE) == 0xC2 && - (source[1] & 0xC0) == 0x80) { + else { + ASSERT(slen > 1); + ASSERT((source[0] & 0xFE) == 0xC2); + ASSERT((source[1] & 0xC0) == 0x80); *dp++ = (char) ((source[0] << 6) | (source[1] & 0x3F)); source += 2; slen -= 2; } - else { - /* Just let unconvertable octets through. This should not happen - in a correctly upgraded system */ - *dp++ = *source++; - --slen; - } } return dp - dest; } +int erts_utf8_to_latin1_backwards(byte *dest, const byte *source, int slen) +{ + /* + * Assumes source contains valid utf8 that can be encoded as latin1, + * and that dest has enough room. + */ + int dix = 0; + int six = slen; + while (six > 0) { + six--; + dix--; + if ((source[six] & 0x80) == 0) + dest[dix] = source[six]; + else { + byte c; + ASSERT(six > 0); + ASSERT((source[six] & 0xC0) == 0x80); + ASSERT((source[six-1] & 0xFE) == 0xC2); + c = source[six] & 0x3F; + six--; + c |= source[six] << 6; + dest[dix] = c; + } + } + return -dix; +} diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c index 68edcd0fa6..8c4d9108d4 100644 --- a/erts/emulator/beam/external.c +++ b/erts/emulator/beam/external.c @@ -142,6 +142,7 @@ erts_init_atom_cache_map(ErtsAtomCacheMap *acmp) { if (acmp) { int ix; + acmp->long_atoms = 0; for (ix = 0; ix < ERTS_ATOM_CACHE_SIZE; ix++) acmp->cache[ix].iix = -1; acmp->sz = 0; @@ -154,6 +155,7 @@ erts_reset_atom_cache_map(ErtsAtomCacheMap *acmp) { if (acmp) { int i; + acmp->long_atoms = 0; for (i = 0; i < acmp->sz; i++) { ASSERT(0 <= acmp->cix[i] && acmp->cix[i] < ERTS_ATOM_CACHE_SIZE); acmp->cache[acmp->cix[i]].iix = -1; @@ -175,9 +177,23 @@ erts_destroy_atom_cache_map(ErtsAtomCacheMap *acmp) } static ERTS_INLINE void -insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) +insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom, Uint32 dflags) { - if (acmp && acmp->sz < ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES) { + /* + * If the receiver do not understand utf8 atoms + * and this atom cannot be represented in latin1, + * we are not allowed to cache it. + * + * In this case all atoms are assumed to have + * latin1 encoding in the cache. By refusing it + * in the cache we will instead encode it using + * ATOM_UTF8_EXT/SMALL_ATOM_UTF8_EXT which the + * receiver do not recognize and tear down the + * connection. + */ + if (acmp && acmp->sz < ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES + && ((dflags & DFLAG_UTF8_ATOMS) + || atom_tab(atom_val(atom))->latin1_chars >= 0)) { int ix; ASSERT(acmp->hdr_sz < 0); ix = atom2cix(atom); @@ -190,7 +206,7 @@ insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) } static ERTS_INLINE int -get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) +get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom, Uint32 dflags) { if (!acmp) return -1; @@ -199,7 +215,9 @@ get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) ASSERT(is_atom(atom)); ix = atom2cix(atom); if (acmp->cache[ix].iix < 0) { - ASSERT(acmp->sz == ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES); + ASSERT(acmp->sz == ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES + || (!(dflags & DFLAG_UTF8_ATOMS) + && atom_tab(atom_val(atom))->latin1_chars < 0)); return -1; } else { @@ -210,18 +228,17 @@ get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) } void -erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp) +erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp, Uint32 dflags) { if (acmp) { -#if MAX_ATOM_LENGTH > 255 -#error "This code is not complete; long_atoms info need to be passed to the following stages." - int long_atoms = 0; /* !0 if one or more atoms are long than 255. */ -#endif + int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS); + int long_atoms = 0; /* !0 if one or more atoms are longer than 255. */ int i; int sz; int fix_sz = 1 /* VERSION_MAGIC */ + 1 /* DIST_HEADER */ + + 1 /* dist header flags */ + 1 /* number of internal cache entries */ ; int min_sz; @@ -230,22 +247,23 @@ erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp) min_sz = fix_sz+(2+4)*acmp->sz; sz = fix_sz; for (i = 0; i < acmp->sz; i++) { + Atom *a; Eterm atom; int len; atom = acmp->cache[acmp->cix[i]].atom; ASSERT(is_atom(atom)); - len = atom_tab(atom_val(atom))->len; -#if MAX_ATOM_LENGTH > 255 + a = atom_tab(atom_val(atom)); + len = (int) (utf8_atoms ? a->len : a->latin1_chars); + ASSERT(len >= 0); if (!long_atoms && len > 255) long_atoms = 1; -#endif /* Enough for a new atom cache value */ sz += 1 /* cix */ + 1 /* length */ + len /* text */; } -#if MAX_ATOM_LENGTH > 255 - if (long_atoms) + if (long_atoms) { + acmp->long_atoms = 1; sz += acmp->sz; /* we need 2 bytes per atom for length */ -#endif + } /* Dynamically sized flag field */ sz += ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTES(acmp->sz); if (sz < min_sz) @@ -274,6 +292,7 @@ byte *erts_encode_ext_dist_header_setup(byte *ctl_ext, ErtsAtomCacheMap *acmp) else { int i; byte *ep = ctl_ext; + byte dist_hdr_flags = acmp->long_atoms ? ERTS_DIST_HDR_LONG_ATOMS_FLG : 0; ASSERT(acmp->hdr_sz >= 0); /* * Write cache update instructions. Note that this is a purely @@ -296,28 +315,36 @@ byte *erts_encode_ext_dist_header_setup(byte *ctl_ext, ErtsAtomCacheMap *acmp) } --ep; put_int8(acmp->sz, ep); + --ep; + put_int8(dist_hdr_flags, ep); *--ep = DIST_HEADER; *--ep = VERSION_MAGIC; return ep; } } -byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache) +byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache, Uint32 dflags) { byte *ip; byte instr_buf[(2+4)*ERTS_ATOM_CACHE_SIZE]; int ci, sz; + byte dist_hdr_flags; + int long_atoms; + int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS); register byte *ep = ext; ASSERT(ep[0] == VERSION_MAGIC); if (ep[1] != DIST_HEADER) return ext; + dist_hdr_flags = ep[2]; + long_atoms = ERTS_DIST_HDR_LONG_ATOMS_FLG & ((int) dist_hdr_flags); + /* * Update output atom cache and write the external version of * the dist header. We write the header backwards just * before the actual term(s). */ - ep += 2; + ep += 3; ci = (int) get_int8(ep); ASSERT(0 <= ci && ci < ERTS_ATOM_CACHE_SIZE); ep += 1; @@ -342,12 +369,7 @@ byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache) flgs_bytes = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTES(ci); ASSERT(flgs_bytes <= sizeof(flgs_buf)); -#if MAX_ATOM_LENGTH > 255 - /* long_atoms info needs to be passed from previous stages */ - if (long_atoms) - flgs |= ERTS_DIST_HDR_LONG_ATOMS_FLG; -#endif - flgs = 0; + flgs = (Uint32) dist_hdr_flags; flgs_buf_ix = 0; if ((ci & 1) == 0) used_half_bytes = 2; @@ -382,17 +404,22 @@ byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache) Atom *a; cache->out_arr[cix] = atom; a = atom_tab(atom_val(atom)); - sz = a->len; - ep -= sz; - sys_memcpy((void *) ep, (void *) a->name, sz); -#if MAX_ATOM_LENGTH > 255 + if (utf8_atoms) { + sz = a->len; + ep -= sz; + sys_memcpy((void *) ep, (void *) a->name, sz); + } + else { + ASSERT(0 <= a->latin1_chars && a->latin1_chars <= MAX_ATOM_CHARACTERS); + ep -= a->latin1_chars; + sz = erts_utf8_to_latin1(ep, a->name, a->len); + ASSERT(a->latin1_chars == sz); + } if (long_atoms) { ep -= 2; put_int16(sz, ep); } - else -#endif - { + else { ASSERT(0 <= sz && sz <= 255); --ep; put_int8(sz, ep); @@ -553,6 +580,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, #endif register byte *ep = ext; + int utf8_atoms = (int) (dep->flags & DFLAG_UTF8_ATOMS); edep->heap_size = -1; edep->ext_endp = ext+size; @@ -611,9 +639,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, ERTS_EXT_HDR_FAIL; ep++; if (no_atoms) { -#if MAX_ATOM_LENGTH > 255 int long_atoms = 0; -#endif #ifdef DEBUG byte *flgs_buf = ep; #endif @@ -632,14 +658,8 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, */ byte_ix = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTE_IX(no_atoms); bit_ix = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BIT_IX(no_atoms); - if (flgsp[byte_ix] & (((byte) ERTS_DIST_HDR_LONG_ATOMS_FLG) - << bit_ix)) { -#if MAX_ATOM_LENGTH > 255 + if (flgsp[byte_ix] & (((byte) ERTS_DIST_HDR_LONG_ATOMS_FLG) << bit_ix)) long_atoms = 1; -#else - ERTS_EXT_HDR_FAIL; /* Long atoms not supported yet */ -#endif - } #ifdef DEBUG byte_ix = 0; @@ -707,23 +727,25 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, if (cix >= ERTS_ATOM_CACHE_SIZE) ERTS_EXT_HDR_FAIL; ep++; -#if MAX_ATOM_CHARACTERS > 255 if (long_atoms) { CHKSIZE(2); len = get_int16(ep); ep += 2; } - else -#endif - { + else { CHKSIZE(1); len = get_int8(ep); ep++; } - if (len > MAX_ATOM_CHARACTERS) - ERTS_EXT_HDR_FAIL; /* Too long atom */ CHKSIZE(len); - atom = am_atom_put((char *) ep, len); + atom = erts_atom_put((byte *) ep, + len, + (utf8_atoms + ? ERTS_ATOM_ENC_UTF8 + : ERTS_ATOM_ENC_LATIN1), + 0); + if (is_non_value(atom)) + ERTS_EXT_HDR_FAIL; ep += len; cache->in_arr[cix] = atom; edep->attab.atom[tix] = atom; @@ -1404,7 +1426,8 @@ static byte* enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags) { int iix; - int i, j; + int len; + int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS); ASSERT(is_atom(atom)); @@ -1423,42 +1446,46 @@ enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags) } return ep; } + /* * term_to_binary/1,2 and the initial distribution message * don't use the cache. */ - iix = get_iix_acache_map(acmp, atom); - if (iix < 0) { - i = atom_val(atom); - j = atom_tab(i)->len; - if (dflags & DFLAG_UTF8_ATOMS) { - if (j <= 255) { + + iix = get_iix_acache_map(acmp, atom, dflags); + if (iix < 0) { + Atom *a = atom_tab(atom_val(atom)); + if (utf8_atoms || a->latin1_chars < 0) { + len = a->len; + if (len > 255) { *ep++ = ATOM_UTF8_EXT; - put_int16(j, ep); + put_int16(len, ep); ep += 2; } else { *ep++ = SMALL_ATOM_UTF8_EXT; - put_int8(j, ep); - ep += 2; + put_int8(len, ep); + ep += 1; } - sys_memcpy((char *) ep, (char*)atom_tab(i)->name, (int) j); + sys_memcpy((char *) ep, (char *) a->name, len); } else { - if (j <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) { + if (a->latin1_chars <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) { *ep++ = SMALL_ATOM_EXT; - j = erts_utf8_to_latin1(ep+1, atom_tab(i)->name, j); - put_int8(j, ep); + len = erts_utf8_to_latin1(ep+1, a->name, a->len); + ASSERT(len == a->latin1_chars); + put_int8(len, ep); ep++; } else { *ep++ = ATOM_EXT; - j = erts_utf8_to_latin1(ep+2, atom_tab(i)->name, j); - put_int16(j, ep); + len = erts_utf8_to_latin1(ep+2, a->name, a->len); + ASSERT(len == a->latin1_chars); + put_int16(len, ep); ep += 2; } } - ep += j; + ep += len; return ep; } @@ -1535,7 +1562,15 @@ dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp) goto error; } } else { - *objp = am_atom_put2(ep, len, is_latin1); + Eterm atom = erts_atom_put(ep, + len, + (is_latin1 + ? ERTS_ATOM_ENC_LATIN1 + : ERTS_ATOM_ENC_UTF8), + 0); + if (is_non_value(atom)) + goto error; + *objp = atom; } ep += len; break; @@ -2248,7 +2283,15 @@ dec_term_atom_common: goto error; } } else { - *objp = am_atom_put2(ep, n, is_latin1); + Eterm atom = erts_atom_put(ep, + n, + (is_latin1 + ? ERTS_ATOM_ENC_LATIN1 + : ERTS_ATOM_ENC_UTF8), + 0); + if (is_non_value(atom)) + goto error; + *objp = atom; } ep += n; break; @@ -2917,18 +2960,22 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) } } else { - int alen = atom_tab(atom_val(obj))->len; - result += 1 + 1 + alen; - if (dflags & DFLAG_UTF8_ATOMS) { + Atom *a = atom_tab(atom_val(obj)); + int alen; + if ((dflags & DFLAG_UTF8_ATOMS) || a->latin1_chars < 0) { + alen = a->len; + result += 1 + 1 + alen; if (alen > 255) { result++; /* ATOM_UTF8_EXT (not small) */ } - /*SVERK we use utf8 length which is an over estimation */ - } - else if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS)) { - result++; /* ATOM_EXT (not small) */ } - insert_acache_map(acmp, obj); + else { + alen = a->latin1_chars; + result += 1 + 1 + alen; + if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS)) + result++; /* ATOM_EXT (not small) */ + } + insert_acache_map(acmp, obj, dflags); } break; case SMALL_DEF: diff --git a/erts/emulator/beam/external.h b/erts/emulator/beam/external.h index 50eea62225..ad430117c8 100644 --- a/erts/emulator/beam/external.h +++ b/erts/emulator/beam/external.h @@ -92,6 +92,7 @@ typedef struct cache { typedef struct { int hdr_sz; int sz; + int long_atoms; int cix[ERTS_ATOM_CACHE_SIZE]; struct { Eterm atom; @@ -152,12 +153,12 @@ typedef struct { void erts_init_atom_cache_map(ErtsAtomCacheMap *); void erts_reset_atom_cache_map(ErtsAtomCacheMap *); void erts_destroy_atom_cache_map(ErtsAtomCacheMap *); -void erts_finalize_atom_cache_map(ErtsAtomCacheMap *); +void erts_finalize_atom_cache_map(ErtsAtomCacheMap *, Uint32); Uint erts_encode_ext_dist_header_size(ErtsAtomCacheMap *); Uint erts_encode_ext_dist_header_size(ErtsAtomCacheMap *); byte *erts_encode_ext_dist_header_setup(byte *, ErtsAtomCacheMap *); -byte *erts_encode_ext_dist_header_finalize(byte *, ErtsAtomCache *); +byte *erts_encode_ext_dist_header_finalize(byte *, ErtsAtomCache *, Uint32); Uint erts_encode_dist_ext_size(Eterm, Uint32, ErtsAtomCacheMap *); void erts_encode_dist_ext(Eterm, byte **, Uint32, ErtsAtomCacheMap *); diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h index 1500424d3e..eccdf10c75 100755 --- a/erts/emulator/beam/global.h +++ b/erts/emulator/beam/global.h @@ -1519,17 +1519,25 @@ Sint erts_native_filename_need(Eterm ioterm, int encoding); void erts_copy_utf8_to_utf16_little(byte *target, byte *bytes, int num_chars); int erts_analyze_utf8(byte *source, Uint size, byte **err_pos, Uint *num_chars, int *left); +int erts_analyze_utf8_x(byte *source, Uint size, + byte **err_pos, Uint *num_chars, int *left, + Sint *num_latin1_chars, Uint max_chars); char *erts_convert_filename_to_native(Eterm name, char *statbuf, size_t statbuf_size, ErtsAlcType_t alloc_type, int allow_empty, int allow_atom, Sint *used /* out */); Eterm erts_convert_native_to_filename(Process *p, byte *bytes); -int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen); +Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left, + Uint *num_built, Uint *num_eaten, Eterm tail); +int erts_utf8_is_latin1_string(const byte *string, int len); +int erts_utf8_to_latin1(byte* dest, const byte* source, int slen); +int erts_utf8_to_latin1_backwards(byte* dest, const byte* source, int slen); #define ERTS_UTF8_OK 0 #define ERTS_UTF8_INCOMPLETE 1 #define ERTS_UTF8_ERROR 2 #define ERTS_UTF8_ANALYZE_MORE 3 +#define ERTS_UTF8_OK_MAX_CHARS 4 /* erl_trace.c */ void erts_init_trace(void); diff --git a/erts/emulator/beam/io.c b/erts/emulator/beam/io.c index 60b9238d38..b1eb75bede 100644 --- a/erts/emulator/beam/io.c +++ b/erts/emulator/beam/io.c @@ -646,8 +646,11 @@ erts_open_driver(erts_driver_t* driver, /* Pointer to driver. */ if (IS_TRACED_FL(port, F_TRACE_PORTS)) { trace_port_open(port, - pid, - am_atom_put(port->name, strlen(port->name))); + pid, + erts_atom_put((byte *) port->name, + strlen(port->name), + ERTS_ATOM_ENC_LATIN1, + 1)); } if (driver->start) { @@ -4765,7 +4768,8 @@ int driver_exit(ErlDrvPort ix, int err) return driver_failure_term(ix, am_normal, 0); else { char* err_str = erl_errno_id(err); - Eterm am_err = am_atom_put(err_str, sys_strlen(err_str)); + Eterm am_err = erts_atom_put((byte *) err_str, sys_strlen(err_str), + ERTS_ATOM_ENC_LATIN1, 1); return driver_failure_term(ix, am_err, 0); } } @@ -4778,8 +4782,12 @@ int driver_failure(ErlDrvPort ix, int code) int driver_failure_atom(ErlDrvPort ix, char* string) { - Eterm am = am_atom_put(string, strlen(string)); - return driver_failure_term(ix, am, 0); + return driver_failure_term(ix, + erts_atom_put((byte *) string, + strlen(string), + ERTS_ATOM_ENC_LATIN1, + 1), + 0); } int driver_failure_posix(ErlDrvPort ix, int err) @@ -4796,7 +4804,10 @@ int driver_failure_eof(ErlDrvPort ix) ErlDrvTermData driver_mk_atom(char* string) { - Eterm am = am_atom_put(string, sys_strlen(string)); + Eterm am = erts_atom_put((byte *) string, + sys_strlen(string), + ERTS_ATOM_ENC_LATIN1, + 1); ERTS_SMP_CHK_NO_PROC_LOCKS; return (ErlDrvTermData) am; } @@ -5091,7 +5102,10 @@ init_driver(erts_driver_t *drv, ErlDrvEntry *de, DE_Handle *handle) erts_smp_mtx_init_x(drv->lock, "driver_lock", #if defined(ERTS_ENABLE_LOCK_CHECK) || defined(ERTS_ENABLE_LOCK_COUNT) - am_atom_put(drv->name, sys_strlen(drv->name)) + erts_atom_put((byte *) drv->name, + sys_strlen(drv->name), + ERTS_ATOM_ENC_LATIN1, + 1) #else NIL #endif diff --git a/erts/emulator/beam/utils.c b/erts/emulator/beam/utils.c index 1969fc762c..97b6d01207 100644 --- a/erts/emulator/beam/utils.c +++ b/erts/emulator/beam/utils.c @@ -370,7 +370,7 @@ Eterm erts_bld_atom(Uint **hpp, Uint *szp, char *str) { if (hpp) - return am_atom_put(str, sys_strlen(str)); + return erts_atom_put((byte *) str, sys_strlen(str), ERTS_ATOM_ENC_LATIN1, 1); else return THE_NON_VALUE; } diff --git a/lib/kernel/include/dist.hrl b/lib/kernel/include/dist.hrl index 5b52f6f294..91e13d99a9 100644 --- a/lib/kernel/include/dist.hrl +++ b/lib/kernel/include/dist.hrl @@ -36,3 +36,4 @@ -define(DFLAG_UNICODE_IO,16#1000). -define(DFLAG_DIST_HDR_ATOM_CACHE,16#2000). -define(DFLAG_SMALL_ATOM_TAGS, 16#4000). +-define(DFLAG_UTF8_ATOMS, 16#10000). diff --git a/lib/kernel/src/dist_util.erl b/lib/kernel/src/dist_util.erl index f0d54a2f3e..b1d32322e3 100644 --- a/lib/kernel/src/dist_util.erl +++ b/lib/kernel/src/dist_util.erl @@ -115,7 +115,8 @@ make_this_flags(RequestType, OtherNode) -> ?DFLAG_NEW_FLOATS bor ?DFLAG_UNICODE_IO bor ?DFLAG_DIST_HDR_ATOM_CACHE bor - ?DFLAG_SMALL_ATOM_TAGS). + ?DFLAG_SMALL_ATOM_TAGS bor + ?DFLAG_UTF8_ATOMS). handshake_other_started(#hs_data{request_type=ReqType}=HSData0) -> {PreOtherFlags,Node,Version} = recv_name(HSData0), -- cgit v1.2.3 From e81e9e367eb0f6bae1c818f5351a7118f832c31c Mon Sep 17 00:00:00 2001 From: Sverker Eriksson Date: Wed, 16 Jan 2013 20:44:09 +0100 Subject: atom fixes for NIFs and atom_to_binary --- erts/emulator/beam/erl_nif.c | 24 +++++++++----- erts/emulator/beam/erl_unicode.c | 67 +++++----------------------------------- erts/emulator/beam/global.h | 2 -- 3 files changed, 24 insertions(+), 69 deletions(-) diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c index 185ac75d73..f00d5f86ce 100644 --- a/erts/emulator/beam/erl_nif.c +++ b/erts/emulator/beam/erl_nif.c @@ -743,16 +743,23 @@ int enif_get_atom(ErlNifEnv* env, Eterm atom, char* buf, unsigned len, { Atom* ap; ASSERT(encoding == ERL_NIF_LATIN1); - if (is_not_atom(atom)) { + if (is_not_atom(atom) || len==0) { return 0; } ap = atom_tab(atom_val(atom)); - if (ap->len+1 > len) { + + if (ap->latin1_chars < 0 || ap->latin1_chars >= len) { return 0; } - sys_memcpy(buf, ap->name, ap->len); - buf[ap->len] = '\0'; - return ap->len + 1; + if (ap->latin1_chars == ap->len) { + sys_memcpy(buf, ap->name, ap->len); + } + else { + int dlen = erts_utf8_to_latin1((byte*)buf, ap->name, ap->len); + ASSERT(dlen == ap->latin1_chars); (void)dlen; + } + buf[ap->latin1_chars] = '\0'; + return ap->latin1_chars + 1; } int enif_get_int(ErlNifEnv* env, Eterm term, int* ip) @@ -854,7 +861,10 @@ int enif_get_atom_length(ErlNifEnv* env, Eterm atom, unsigned* len, ASSERT(enc == ERL_NIF_LATIN1); if (is_not_atom(atom)) return 0; ap = atom_tab(atom_val(atom)); - *len = ap->len; + if (ap->latin1_chars < 0) { + return 0; + } + *len = ap->latin1_chars; return 1; } @@ -961,7 +971,7 @@ ERL_NIF_TERM enif_make_atom(ErlNifEnv* env, const char* name) ERL_NIF_TERM enif_make_atom_len(ErlNifEnv* env, const char* name, size_t len) { - return am_atom_put(name, len); + return erts_atom_put((byte*)name, len, ERTS_ATOM_ENC_LATIN1, 1); } int enif_make_existing_atom(ErlNifEnv* env, const char* name, ERL_NIF_TERM* atom, diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index 6600ce4a4a..c00293de89 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -1853,32 +1853,21 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2) ap = atom_tab(atom_val(BIF_ARG_1)); if (BIF_ARG_2 == am_latin1) { - int i; Eterm bin_term; - int bin_size = ap->len; - - for (i = 0; i < ap->len; ) { - if (ap->name[i] < 0x80) i++; - else { - ASSERT(ap->name[i] >= 0xC0); - if (ap->name[i] < 0xE0) { - ASSERT(i+1 < ap->len && (ap->name[i+1] & 0xC0) == 0x80); - i += 2; - bin_size -= 1; - } - else goto error; - } + + if (ap->latin1_chars < 0) { + goto error; } - if (bin_size == ap->len) { + if (ap->latin1_chars == ap->len) { bin_term = new_binary(BIF_P, ap->name, ap->len); } else { byte* bin_p; int dbg_sz; - bin_term = new_binary(BIF_P, 0, bin_size); + bin_term = new_binary(BIF_P, 0, ap->latin1_chars); bin_p = binary_bytes(bin_term); dbg_sz = erts_utf8_to_latin1(bin_p, ap->name, ap->len); - ASSERT(dbg_sz == bin_size); (void)dbg_sz; + ASSERT(dbg_sz == ap->latin1_chars); (void)dbg_sz; } BIF_RET(bin_term); } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) { @@ -2676,23 +2665,6 @@ BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0) } } -int erts_utf8_is_latin1_string(const byte *string, int len) -{ - /* Assumes string is encoded in valid UTF-8 */ - int i; - while (i < len) { - if ((string[i] & 0x80) == 0) - i++; - else if (i+1 < len - && (string[i] & 0xFE) == 0xC2 - && (string[i+1] & 0xC0) == 0x80) - i +=2; - else - return 0; - } - return 1; -} - int erts_utf8_to_latin1(byte* dest, const byte* source, int slen) { /* @@ -2700,6 +2672,7 @@ int erts_utf8_to_latin1(byte* dest, const byte* source, int slen) * and that dest has enough room. */ byte* dp = dest; + while (slen > 0) { if ((source[0] & 0x80) == 0) { *dp++ = *source++; @@ -2717,29 +2690,3 @@ int erts_utf8_to_latin1(byte* dest, const byte* source, int slen) return dp - dest; } -int erts_utf8_to_latin1_backwards(byte *dest, const byte *source, int slen) -{ - /* - * Assumes source contains valid utf8 that can be encoded as latin1, - * and that dest has enough room. - */ - int dix = 0; - int six = slen; - while (six > 0) { - six--; - dix--; - if ((source[six] & 0x80) == 0) - dest[dix] = source[six]; - else { - byte c; - ASSERT(six > 0); - ASSERT((source[six] & 0xC0) == 0x80); - ASSERT((source[six-1] & 0xFE) == 0xC2); - c = source[six] & 0x3F; - six--; - c |= source[six] << 6; - dest[dix] = c; - } - } - return -dix; -} diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h index eccdf10c75..649352ca91 100755 --- a/erts/emulator/beam/global.h +++ b/erts/emulator/beam/global.h @@ -1530,9 +1530,7 @@ char *erts_convert_filename_to_native(Eterm name, char *statbuf, Eterm erts_convert_native_to_filename(Process *p, byte *bytes); Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left, Uint *num_built, Uint *num_eaten, Eterm tail); -int erts_utf8_is_latin1_string(const byte *string, int len); int erts_utf8_to_latin1(byte* dest, const byte* source, int slen); -int erts_utf8_to_latin1_backwards(byte* dest, const byte* source, int slen); #define ERTS_UTF8_OK 0 #define ERTS_UTF8_INCOMPLETE 1 #define ERTS_UTF8_ERROR 2 -- cgit v1.2.3 From 97abb095cd2182d5c3fafd525da4943ef74dc8e5 Mon Sep 17 00:00:00 2001 From: Rickard Green Date: Wed, 16 Jan 2013 23:35:13 +0100 Subject: Add utf8 atom distribution test cases --- erts/emulator/test/distribution_SUITE.erl | 164 ++++++++++++++++++++++++++++-- 1 file changed, 154 insertions(+), 10 deletions(-) diff --git a/erts/emulator/test/distribution_SUITE.erl b/erts/emulator/test/distribution_SUITE.erl index f3a177faf2..aaf6420ccd 100644 --- a/erts/emulator/test/distribution_SUITE.erl +++ b/erts/emulator/test/distribution_SUITE.erl @@ -37,8 +37,10 @@ dist_auto_connect_never/1, dist_auto_connect_once/1, dist_parallel_send/1, atom_roundtrip/1, - atom_roundtrip_r13b/1, + unicode_atom_roundtrip/1, + atom_roundtrip_r15b/1, contended_atom_cache_entry/1, + contended_unicode_atom_cache_entry/1, bad_dist_structure/1, bad_dist_ext_receive/1, bad_dist_ext_process_info/1, @@ -62,8 +64,9 @@ all() -> link_to_dead_new_node, applied_monitor_node, ref_port_roundtrip, nil_roundtrip, stop_dist, {group, trap_bif}, {group, dist_auto_connect}, - dist_parallel_send, atom_roundtrip, atom_roundtrip_r13b, - contended_atom_cache_entry, bad_dist_structure, {group, bad_dist_ext}]. + dist_parallel_send, atom_roundtrip, unicode_atom_roundtrip, atom_roundtrip_r15b, + contended_atom_cache_entry, contended_unicode_atom_cache_entry, + bad_dist_structure, {group, bad_dist_ext}]. groups() -> [{bulk_send, [], [bulk_send_small, bulk_send_big, bulk_send_bigbig]}, @@ -1085,19 +1088,27 @@ atom_roundtrip(Config) when is_list(Config) -> ?line stop_node(Node), ?line ok. -atom_roundtrip_r13b(Config) when is_list(Config) -> - case ?t:is_release_available("r13b") of +atom_roundtrip_r15b(Config) when is_list(Config) -> + case ?t:is_release_available("r15b") of true -> ?line AtomData = atom_data(), ?line verify_atom_data(AtomData), - ?line {ok, Node} = start_node(Config, [], "r13b"), + ?line {ok, Node} = start_node(Config, [], "r15b"), ?line do_atom_roundtrip(Node, AtomData), ?line stop_node(Node), ?line ok; false -> - ?line {skip,"No OTP R13B available"} + ?line {skip,"No OTP R15B available"} end. +unicode_atom_roundtrip(Config) when is_list(Config) -> + ?line AtomData = unicode_atom_data(), + ?line verify_atom_data(AtomData), + ?line {ok, Node} = start_node(Config), + ?line do_atom_roundtrip(Node, AtomData), + ?line stop_node(Node), + ?line ok. + do_atom_roundtrip(Node, AtomData) -> ?line Parent = self(), ?line Proc = spawn_link(Node, fun () -> verify_atom_data_loop(Parent) end), @@ -1133,7 +1144,32 @@ verify_atom_data(AtomData) -> end, AtomData). +uc_atup(ATxt) -> + {string_to_atom(ATxt), ATxt}. + +unicode_atom_data() -> + [uc_atup(lists:seq(16#1f600, 16#1f600+254)), + uc_atup(lists:seq(16#1f600, 16#1f600+63)), + uc_atup(lists:seq(0, 254)), + uc_atup(lists:seq(100, 163)), + uc_atup(lists:seq(200, 354)), + uc_atup(lists:seq(200, 263)), + uc_atup(lists:seq(2000, 2254)), + uc_atup(lists:seq(2000, 2063)), + uc_atup(lists:seq(65500, 65754)), + uc_atup(lists:seq(65500, 65563)) + | lists:map(fun (N) -> + uc_atup(lists:seq(64000+N, 64254+N)) + end, + lists:seq(1, 2000))]. + contended_atom_cache_entry(Config) when is_list(Config) -> + contended_atom_cache_entry_test(Config, latin1). + +contended_unicode_atom_cache_entry(Config) when is_list(Config) -> + contended_atom_cache_entry_test(Config, unicode). + +contended_atom_cache_entry_test(Config, Type) -> ?line TestServer = self(), ?line ProcessPairs = 10, ?line Msgs = 100000, @@ -1147,7 +1183,14 @@ contended_atom_cache_entry(Config) when is_list(Config) -> true), Master = self(), CIX = get_cix(), - TestAtoms = get_conflicting_atoms(CIX, ProcessPairs), + TestAtoms = case Type of + latin1 -> + get_conflicting_atoms(CIX, + ProcessPairs); + unicode -> + get_conflicting_unicode_atoms(CIX, + ProcessPairs) + end, io:format("Testing with the following atoms all using " "cache index ~p:~n ~p~n", [CIX, TestAtoms]), @@ -1159,8 +1202,12 @@ contended_atom_cache_entry(Config) when is_list(Config) -> fun () -> Atom = receive {Ref, txt, ATxt} -> - list_to_atom( - ATxt) + case Type of + latin1 -> + list_to_atom(ATxt); + unicode -> + string_to_atom(ATxt) + end end, receive_ref_atom(Ref, Atom, @@ -1252,6 +1299,20 @@ get_conflicting_atoms(CIX, N) -> get_conflicting_atoms(CIX, N) end. +get_conflicting_unicode_atoms(_CIX, 0) -> + []; +get_conflicting_unicode_atoms(CIX, N) -> + {A, B, C} = now(), + Atom = string_to_atom([16#1f608] ++ "atom" ++ integer_to_list(A*1000000000000 + + B*1000000 + + C)), + case erts_debug:get_internal_state({atom_out_cache_index, Atom}) of + CIX -> + [Atom|get_conflicting_unicode_atoms(CIX, N-1)]; + _ -> + get_conflicting_unicode_atoms(CIX, N) + end. + -define(COOKIE, ''). -define(DOP_LINK, 1). -define(DOP_SEND, 2). @@ -2131,3 +2192,86 @@ repeat(_Fun, 0) -> repeat(Fun, N) -> Fun(), repeat(Fun, N-1). + +string_to_atom(String) -> + Utf8List = string_to_utf8_list(String), + Len = length(Utf8List), + TagLen = case Len < 256 of + true -> [119, Len]; + false -> [118, Len bsr 8, Len band 16#ff] + end, + binary_to_term(list_to_binary([131, TagLen, Utf8List])). + +string_to_utf8_list([]) -> + []; +string_to_utf8_list([CP|CPs]) when is_integer(CP), + 0 =< CP, + CP =< 16#7F -> + [CP | string_to_utf8_list(CPs)]; +string_to_utf8_list([CP|CPs]) when is_integer(CP), + 16#80 =< CP, + CP =< 16#7FF -> + [16#C0 bor (CP bsr 6), + 16#80 bor (16#3F band CP) + | string_to_utf8_list(CPs)]; +string_to_utf8_list([CP|CPs]) when is_integer(CP), + 16#800 =< CP, + CP =< 16#FFFF -> + [16#E0 bor (CP bsr 12), + 16#80 bor (16#3F band (CP bsr 6)), + 16#80 bor (16#3F band CP) + | string_to_utf8_list(CPs)]; +string_to_utf8_list([CP|CPs]) when is_integer(CP), + 16#10000 =< CP, + CP =< 16#10FFFF -> + [16#F0 bor (CP bsr 18), + 16#80 bor (16#3F band (CP bsr 12)), + 16#80 bor (16#3F band (CP bsr 6)), + 16#80 bor (16#3F band CP) + | string_to_utf8_list(CPs)]. + +utf8_list_to_string([]) -> + []; +utf8_list_to_string([B|Bs]) when is_integer(B), + 0 =< B, + B =< 16#7F -> + [B | utf8_list_to_string(Bs)]; +utf8_list_to_string([B0, B1 | Bs]) when is_integer(B0), + 16#C0 =< B0, + B0 =< 16#DF, + is_integer(B1), + 16#80 =< B1, + B1 =< 16#BF -> + [(((B0 band 16#1F) bsl 6) + bor (B1 band 16#3F)) + | utf8_list_to_string(Bs)]; +utf8_list_to_string([B0, B1, B2 | Bs]) when is_integer(B0), + 16#E0 =< B0, + B0 =< 16#EF, + is_integer(B1), + 16#80 =< B1, + B1 =< 16#BF, + is_integer(B2), + 16#80 =< B2, + B2 =< 16#BF -> + [(((B0 band 16#F) bsl 12) + bor ((B1 band 16#3F) bsl 6) + bor (B2 band 16#3F)) + | utf8_list_to_string(Bs)]; +utf8_list_to_string([B0, B1, B2, B3 | Bs]) when is_integer(B0), + 16#F0 =< B0, + B0 =< 16#F7, + is_integer(B1), + 16#80 =< B1, + B1 =< 16#BF, + is_integer(B2), + 16#80 =< B2, + B2 =< 16#BF, + is_integer(B3), + 16#80 =< B3, + B3 =< 16#BF -> + [(((B0 band 16#7) bsl 18) + bor ((B1 band 16#3F) bsl 12) + bor ((B2 band 16#3F) bsl 6) + bor (B3 band 16#3F)) + | utf8_list_to_string(Bs)]. -- cgit v1.2.3 From b553664f54034e8c04ae6f9cc44f16b7f516518b Mon Sep 17 00:00:00 2001 From: Sverker Eriksson Date: Fri, 11 Jan 2013 17:27:29 +0100 Subject: erl_interface: utf8 atoms continued --- lib/erl_interface/doc/src/ei.xml | 59 +++- lib/erl_interface/doc/src/erl_eterm.xml | 18 +- lib/erl_interface/include/ei.h | 35 +- lib/erl_interface/include/erl_interface.h | 40 ++- lib/erl_interface/src/connect/ei_connect.c | 5 +- lib/erl_interface/src/connect/ei_connect_int.h | 2 + lib/erl_interface/src/connect/eirecv.c | 12 +- lib/erl_interface/src/decode/decode_atom.c | 185 +++++++--- lib/erl_interface/src/decode/decode_boolean.c | 8 +- lib/erl_interface/src/decode/decode_fun.c | 6 +- lib/erl_interface/src/decode/decode_pid.c | 3 +- lib/erl_interface/src/decode/decode_port.c | 3 +- lib/erl_interface/src/decode/decode_ref.c | 6 +- lib/erl_interface/src/encode/encode_atom.c | 100 +++++- lib/erl_interface/src/encode/encode_fun.c | 4 +- lib/erl_interface/src/encode/encode_pid.c | 24 +- lib/erl_interface/src/encode/encode_port.c | 23 +- lib/erl_interface/src/encode/encode_ref.c | 24 +- lib/erl_interface/src/legacy/erl_connect.c | 16 +- lib/erl_interface/src/legacy/erl_eterm.c | 158 +++++++-- lib/erl_interface/src/legacy/erl_eterm.h | 4 +- lib/erl_interface/src/legacy/erl_format.c | 20 +- lib/erl_interface/src/legacy/erl_malloc.c | 20 +- lib/erl_interface/src/legacy/erl_marshal.c | 259 ++++++++------ lib/erl_interface/src/legacy/global_whereis.c | 11 +- lib/erl_interface/src/misc/ei_decode_term.c | 13 +- lib/erl_interface/src/misc/ei_printterm.c | 5 +- lib/erl_interface/src/misc/ei_x_encode.c | 21 +- lib/erl_interface/src/misc/get_type.c | 77 +--- lib/erl_interface/src/misc/putget.h | 6 +- lib/erl_interface/src/misc/show_msg.c | 12 +- lib/erl_interface/test/ei_decode_encode_SUITE.erl | 66 +++- .../ei_decode_encode_test.c | 386 ++++++++++++--------- lib/ic/examples/all-against-all/client.c | 1 + lib/ic/examples/c-client/client.c | 1 + lib/ic/examples/c-server/client.c | 1 + 36 files changed, 1062 insertions(+), 572 deletions(-) diff --git a/lib/erl_interface/doc/src/ei.xml b/lib/erl_interface/doc/src/ei.xml index 539e16d837..0b0b1eeb79 100644 --- a/lib/erl_interface/doc/src/ei.xml +++ b/lib/erl_interface/doc/src/ei.xml @@ -82,6 +82,22 @@ function returns the size required (note that for strings an extra byte is needed for the 0 string terminator).

+
+ DATA TYPES + + + enum erlang_char_encoding + +

+ +enum erlang_char_encoding { + ERLANG_ASCII, ERLANG_LATIN1, ERLANG_UTF8, ERLANG_WHATEVER +}; + +

The character encoding used for atoms.

+
+
+
voidei_set_compat_rel(release_number) @@ -225,11 +241,31 @@ Encode an atom

Encodes an atom in the binary format. The parameter - is the name of the atom. Only upto bytes + is the name of the atom in latin1 encoding. Only upto MAXATOMLEN-1 bytes are encoded. The name should be zero-terminated, except for the function.

+ + intei_encode_atom_as(char *buf, int *index, const char *p, enum erlang_char_encoding from_enc, enum erlang_char_encoding to_enc) + intei_encode_atom_len_as(char *buf, int *index, const char *p, int len, enum erlang_char_encoding from_enc, enum erlang_char_encoding to_enc) + intei_x_encode_atom_as(ei_x_buff* x, const char *p, enum erlang_char_encoding from_enc, enum erlang_char_encoding to_enc) + intei_x_encode_atom_len_as(ei_x_buff* x, const char *p, int len, enum erlang_char_encoding from_enc, enum erlang_char_encoding to_enc) + Encode an atom + +

Encodes an atom in the binary format with character encoding + to_enc (latin1 or utf8). + The p parameter is the name of the atom with character encoding + from_enc. + The name must either be zero-terminated or a function variant with a len + parameter must be used.

+

The encoding will fail if the atom is too long or if it can not be represented + with character encoding to_enc.

+

These functions were introduced in R16 release of Erlang/OTP as part of a first step + to support UTF8 atoms. Atoms encoded with ERLANG_UTF8 + can not be decoded by earlier releases than R16.

+
+
intei_encode_binary(char *buf, int *index, const void *p, long len) intei_x_encode_binary(ei_x_buff* x, const void *p, long len) @@ -490,10 +526,29 @@ ei_x_encode_empty_list(&x); Decode an atom

This function decodes an atom from the binary format. The - name of the atom is placed at . There can be at most + null terminated name of the atom is placed at . There can be at most bytes placed in the buffer.

+ + intei_decode_atom_as(const char *buf, int *index, char *p, int plen, enum erlang_char_encoding want, enum erlang_char_encoding* was, enum erlang_char_encoding* result) + Decode an atom + +

This function decodes an atom from the binary format. The + null terminated name of the atom is placed in buffer at p of length + plen bytes.

+

The wanted string encoding is specified by + want. The original encoding used in the + binary format (latin1 or utf8) can be obtained from *was. The actual encoding of the resulting string + (7-bit ascii, latin1 or utf8) can be obtained from *result. Both was and result can be NULL. + *result may differ from want if want is ERLANG_WHATEVER or if + *result turn out to be pure 7-bit ascii (compatible with both latin1 and utf8).

+

This function fails if the atom is too long for the buffer + or if it can not be represented with encoding want.

+

This functions was introduced in R16 release of Erlang/OTP as part of a first step + to support UTF8 atoms.

+
+
intei_decode_binary(const char *buf, int *index, void *p, long *len) Decode a binary diff --git a/lib/erl_interface/doc/src/erl_eterm.xml b/lib/erl_interface/doc/src/erl_eterm.xml index f403618c59..c7840d7813 100644 --- a/lib/erl_interface/doc/src/erl_eterm.xml +++ b/lib/erl_interface/doc/src/erl_eterm.xml @@ -77,10 +77,12 @@

+ A string representing atom . - The length (in characters) of atom t. + + The length (in bytes) of atom t. A pointer to the contents of @@ -92,6 +94,7 @@ The floating point value of . + The Node in pid . The sequence number in pid . @@ -104,6 +107,7 @@ The creation number in port . + The node in port . The first part of the reference number in ref . Use @@ -296,7 +300,7 @@ iohead ::= Binary ETERM *erl_mk_atom(string) Creates an atom - char *string; + const char *string;

Creates an atom.

@@ -305,10 +309,12 @@ iohead ::= Binary

Returns an Erlang term containing an atom. Note that it is the callers responsibility to make sure that contains a valid name for an atom.

-

can be used to retrieve the - atom name (as a string). Note that the string is not - 0-terminated in the atom. returns - the length of the atom name.

+

and + can be used to retrieve the atom name (as a null terminated string). + and returns the length of the atom name.

+

Note that the UTF8 variants were introduced in Erlang/OTP releases R16 + and the string returned by ERL_ATOM_PTR(atom) was not null terminated on older releases.

+
diff --git a/lib/erl_interface/include/ei.h b/lib/erl_interface/include/ei.h index 8f07b24852..20e575f64d 100644 --- a/lib/erl_interface/include/ei.h +++ b/lib/erl_interface/include/ei.h @@ -116,7 +116,8 @@ #define NEW_FLOAT_EXT 'F' #define ERL_ATOM_EXT 'd' #define ERL_SMALL_ATOM_EXT 's' -#define ERL_UNICODE_ATOM_EXT 'v' +#define ERL_ATOM_UTF8_EXT 'v' +#define ERL_SMALL_ATOM_UTF8_EXT 'w' #define ERL_REFERENCE_EXT 'e' #define ERL_NEW_REFERENCE_EXT 'r' #define ERL_PORT_EXT 'f' @@ -185,12 +186,16 @@ extern volatile int __erl_errno; #define EI_MAXHOSTNAMELEN 64 #define EI_MAXALIVELEN 63 #define EI_MAX_COOKIE_SIZE 512 -#define MAXATOMLEN 255 +#define MAXATOMLEN (255 + 1) +#define MAXATOMLEN_UTF8 (255*4 + 1) #define MAXNODELEN EI_MAXALIVELEN+1+EI_MAXHOSTNAMELEN +enum erlang_char_encoding { ERLANG_ASCII, ERLANG_LATIN1, ERLANG_UTF8, ERLANG_WHATEVER }; + /* a pid */ typedef struct { - char node[MAXATOMLEN+1]; + char node[MAXATOMLEN_UTF8]; + enum erlang_char_encoding node_org_enc; unsigned int num; unsigned int serial; unsigned int creation; @@ -198,14 +203,16 @@ typedef struct { /* a port */ typedef struct { - char node[MAXATOMLEN+1]; + char node[MAXATOMLEN_UTF8]; + enum erlang_char_encoding node_org_enc; unsigned int id; unsigned int creation; } erlang_port; /* a ref */ typedef struct { - char node[MAXATOMLEN+1]; + char node[MAXATOMLEN_UTF8]; + enum erlang_char_encoding node_org_enc; int len; unsigned int n[3]; unsigned int creation; @@ -225,15 +232,16 @@ typedef struct { long msgtype; erlang_pid from; erlang_pid to; - char toname[MAXATOMLEN+1]; - char cookie[MAXATOMLEN+1]; + char toname[MAXATOMLEN_UTF8]; + char cookie[MAXATOMLEN_UTF8]; erlang_trace token; } erlang_msg; /* a fun */ typedef struct { long arity; - char module[MAXATOMLEN+1]; + char module[MAXATOMLEN_UTF8]; + enum erlang_char_encoding module_org_enc; char md5[16]; long index; long old_index; @@ -258,7 +266,7 @@ typedef struct { union { long i_val; double d_val; - char atom_name[MAXATOMLEN+1]; + char atom_name[MAXATOMLEN_UTF8]; erlang_pid pid; erlang_port port; erlang_ref ref; @@ -427,9 +435,17 @@ int ei_encode_string_len(char *buf, int *index, const char *p, int len); int ei_x_encode_string(ei_x_buff* x, const char* s); int ei_x_encode_string_len(ei_x_buff* x, const char* s, int len); int ei_encode_atom(char *buf, int *index, const char *p); +int ei_encode_atom_as(char *buf, int *index, const char *p, + enum erlang_char_encoding from, enum erlang_char_encoding to); int ei_encode_atom_len(char *buf, int *index, const char *p, int len); +int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len, + enum erlang_char_encoding from, enum erlang_char_encoding to); int ei_x_encode_atom(ei_x_buff* x, const char* s); +int ei_x_encode_atom_as(ei_x_buff* x, const char* s, + enum erlang_char_encoding from, enum erlang_char_encoding to); int ei_x_encode_atom_len(ei_x_buff* x, const char* s, int len); +int ei_x_encode_atom_len_as(ei_x_buff* x, const char* s, int len, + enum erlang_char_encoding from, enum erlang_char_encoding to); int ei_encode_binary(char *buf, int *index, const void *p, long len); int ei_x_encode_binary(ei_x_buff* x, const void* s, int len); int ei_encode_pid(char *buf, int *index, const erlang_pid *p); @@ -479,6 +495,7 @@ int ei_decode_boolean(const char *buf, int *index, int *p); int ei_decode_char(const char *buf, int *index, char *p); int ei_decode_string(const char *buf, int *index, char *p); int ei_decode_atom(const char *buf, int *index, char *p); +int ei_decode_atom_as(const char *buf, int *index, char *p, int destlen, enum erlang_char_encoding want, enum erlang_char_encoding* was, enum erlang_char_encoding* result); int ei_decode_binary(const char *buf, int *index, void *p, long *len); int ei_decode_fun(const char* buf, int* index, erlang_fun* p); void free_fun(erlang_fun* f); diff --git a/lib/erl_interface/include/erl_interface.h b/lib/erl_interface/include/erl_interface.h index 1c4a94700d..98acc0d71d 100644 --- a/lib/erl_interface/include/erl_interface.h +++ b/lib/erl_interface/include/erl_interface.h @@ -95,19 +95,24 @@ #define ERL_FLOAT_VALUE(x) ((x)->uval.fval.f) -#define ERL_ATOM_PTR(x) ((x)->uval.aval.a) -#define ERL_ATOM_SIZE(x) ((x)->uval.aval.len) +#define ERL_ATOM_PTR(x) erl_atom_ptr_latin1((Erl_Atom_data*) &(x)->uval.aval.d) +#define ERL_ATOM_PTR_UTF8(x) erl_atom_ptr_utf8((Erl_Atom_data*) &(x)->uval.aval.d) +#define ERL_ATOM_SIZE(x) erl_atom_size_latin1((Erl_Atom_data*) &(x)->uval.aval.d) +#define ERL_ATOM_SIZE_UTF8(x) erl_atom_size_utf8((Erl_Atom_data*) &(x)->uval.aval.d) -#define ERL_PID_NODE(x) ((x)->uval.pidval.node) +#define ERL_PID_NODE(x) erl_atom_ptr_latin1((Erl_Atom_data*) &(x)->uval.pidval.node) +#define ERL_PID_NODE_UTF8(x) erl_atom_ptr_utf8((Erl_Atom_data*) &(x)->uval.pidval.node) #define ERL_PID_NUMBER(x) ((x)->uval.pidval.number) #define ERL_PID_SERIAL(x) ((x)->uval.pidval.serial) #define ERL_PID_CREATION(x) ((x)->uval.pidval.creation) -#define ERL_PORT_NODE(x) ((x)->uval.portval.node) +#define ERL_PORT_NODE(x) erl_atom_ptr_latin1((Erl_Atom_data*) &(x)->uval.portval.node) +#define ERL_PORT_NODE_UTF8(x) erl_atom_ptr_utf8((Erl_Atom_data*) &(x)->uval.portval.node) #define ERL_PORT_NUMBER(x) ((x)->uval.portval.number) #define ERL_PORT_CREATION(x) ((x)->uval.portval.creation) -#define ERL_REF_NODE(x) ((x)->uval.refval.node) +#define ERL_REF_NODE(x) erl_atom_ptr_latin1((Erl_Atom_data*) &(x)->uval.refval.node) +#define ERL_REF_NODE_UTF8(x) erl_atom_ptr_utf8((Erl_Atom_data*) &(x)->uval.refval.node) #define ERL_REF_NUMBER(x) ((x)->uval.refval.n[0]) #define ERL_REF_NUMBERS(x) ((x)->uval.refval.n) #define ERL_REF_LEN(x) ((x)->uval.refval.len) @@ -182,15 +187,27 @@ typedef struct { double f; } Erl_Float; +typedef struct { + char *utf8; + int lenU; + char *latin1; + int lenL; +} Erl_Atom_data; + +char* erl_atom_ptr_latin1(Erl_Atom_data*); +char* erl_atom_ptr_utf8(Erl_Atom_data*); +int erl_atom_size_latin1(Erl_Atom_data*); +int erl_atom_size_utf8(Erl_Atom_data*); +char* erl_atom_init_latin1(Erl_Atom_data*, const char*); + typedef struct { Erl_Header h; - int len; - char *a; + Erl_Atom_data d; } Erl_Atom; typedef struct { Erl_Header h; - char * node; + Erl_Atom_data node; unsigned int number; unsigned int serial; unsigned char creation; @@ -198,14 +215,14 @@ typedef struct { typedef struct { Erl_Header h; - char * node; + Erl_Atom_data node; unsigned int number; unsigned char creation; } Erl_Port; typedef struct { Erl_Header h; - char * node; + Erl_Atom_data node; int len; unsigned int n[3]; unsigned char creation; @@ -289,7 +306,7 @@ typedef struct _eterm { } ETERM; -#define MAXREGLEN 255 /* max length of registered (atom) name */ +#define MAXREGLEN (255*4) /* max length of registered (atom) name */ typedef struct { int type; /* one of the message type constants in eiext.h */ @@ -409,6 +426,7 @@ unsigned char erl_ext_type(unsigned char*); /* Note: returned 'char' before R9C unsigned char *erl_peek_ext(unsigned char*,int); int erl_term_len(ETERM*); +int cmp_latin1_vs_utf8(const char* sL, int lenL, const char* sU, int lenU); /* -------------------------------------------------------------------- */ /* Wrappers around ei functions */ diff --git a/lib/erl_interface/src/connect/ei_connect.c b/lib/erl_interface/src/connect/ei_connect.c index 34362b4b9f..a17257795e 100644 --- a/lib/erl_interface/src/connect/ei_connect.c +++ b/lib/erl_interface/src/connect/ei_connect.c @@ -459,6 +459,7 @@ int ei_connect_xinit(ei_cnode* ec, const char *thishostname, /* memmove(&ec->this_ipaddr, thisipaddr, sizeof(ec->this_ipaddr)); */ strcpy(ec->self.node,thisnodename); + ec->self.node_org_enc = ERLANG_LATIN1; ec->self.num = 0; ec->self.serial = 0; ec->self.creation = creation; @@ -1332,7 +1333,9 @@ static int send_name_or_challenge(int fd, char *nodename, | DFLAG_EXTENDED_PIDS_PORTS | DFLAG_FUN_TAGS | DFLAG_NEW_FUN_TAGS - | DFLAG_NEW_FLOATS)); + | DFLAG_NEW_FLOATS + | DFLAG_SMALL_ATOM_TAGS + | DFLAG_UTF8_ATOMS)); if (f_chall) put32be(s, challenge); memcpy(s, nodename, strlen(nodename)); diff --git a/lib/erl_interface/src/connect/ei_connect_int.h b/lib/erl_interface/src/connect/ei_connect_int.h index 3c42b49b82..81c384e38d 100644 --- a/lib/erl_interface/src/connect/ei_connect_int.h +++ b/lib/erl_interface/src/connect/ei_connect_int.h @@ -102,6 +102,8 @@ extern int h_errno; #define DFLAG_NEW_FUN_TAGS 0x80 #define DFLAG_EXTENDED_PIDS_PORTS 0x100 #define DFLAG_NEW_FLOATS 0x800 +#define DFLAG_SMALL_ATOM_TAGS 0x4000 +#define DFLAG_UTF8_ATOMS 0x10000 ei_cnode *ei_fd_to_cnode(int fd); int ei_distversion(int fd); diff --git a/lib/erl_interface/src/connect/eirecv.c b/lib/erl_interface/src/connect/eirecv.c index 86852f947d..075f78e3d2 100644 --- a/lib/erl_interface/src/connect/eirecv.c +++ b/lib/erl_interface/src/connect/eirecv.c @@ -108,7 +108,7 @@ ei_recv_internal (int fd, switch (msg->msgtype) { case ERL_SEND: /* { SEND, Cookie, ToPid } */ if (ei_tracelevel >= 4) show_this_msg = 1; - if (ei_decode_atom(header,&index,msg->cookie) + if (ei_decode_atom_as(header,&index,msg->cookie,sizeof(msg->cookie),ERLANG_UTF8,NULL,NULL) || ei_decode_pid(header,&index,&msg->to)) { erl_errno = EIO; @@ -120,8 +120,8 @@ ei_recv_internal (int fd, case ERL_REG_SEND: /* { REG_SEND, From, Cookie, ToName } */ if (ei_tracelevel >= 4) show_this_msg = 1; if (ei_decode_pid(header,&index,&msg->from) - || ei_decode_atom(header,&index,msg->cookie) - || ei_decode_atom(header,&index,msg->toname)) + || ei_decode_atom_as(header,&index,msg->cookie,sizeof(msg->cookie),ERLANG_UTF8,NULL,NULL) + || ei_decode_atom_as(header,&index,msg->toname,sizeof(msg->toname),ERLANG_UTF8,NULL,NULL)) { erl_errno = EIO; return -1; @@ -157,7 +157,7 @@ ei_recv_internal (int fd, case ERL_SEND_TT: /* { SEND_TT, Cookie, ToPid, TraceToken } */ if (ei_tracelevel >= 4) show_this_msg = 1; - if (ei_decode_atom(header,&index,msg->cookie) + if (ei_decode_atom_as(header,&index,msg->cookie,sizeof(msg->cookie),ERLANG_UTF8,NULL,NULL) || ei_decode_pid(header,&index,&msg->to) || ei_decode_trace(header,&index,&msg->token)) { @@ -171,8 +171,8 @@ ei_recv_internal (int fd, case ERL_REG_SEND_TT: /* { REG_SEND_TT, From, Cookie, ToName, TraceToken } */ if (ei_tracelevel >= 4) show_this_msg = 1; if (ei_decode_pid(header,&index,&msg->from) - || ei_decode_atom(header,&index,msg->cookie) - || ei_decode_atom(header,&index,msg->toname) + || ei_decode_atom_as(header,&index,msg->cookie,sizeof(msg->cookie),ERLANG_UTF8,NULL,NULL) + || ei_decode_atom_as(header,&index,msg->toname,sizeof(msg->toname),ERLANG_UTF8,NULL,NULL) || ei_decode_trace(header,&index,&msg->token)) { erl_errno = EIO; diff --git a/lib/erl_interface/src/decode/decode_atom.c b/lib/erl_interface/src/decode/decode_atom.c index 84edf1766a..2ada418243 100644 --- a/lib/erl_interface/src/decode/decode_atom.c +++ b/lib/erl_interface/src/decode/decode_atom.c @@ -21,76 +21,155 @@ #include "eiext.h" #include "putget.h" -static int utf8_to_latin1(char* dest, const char* source, unsigned len); int ei_decode_atom(const char *buf, int *index, char *p) { - const char *s = buf + *index; - const char *s0 = s; - int len; - - switch (get8(s)) { - case ERL_ATOM_EXT: - len = get16be(s); - if (len > MAXATOMLEN) return -1; - if (p) { - memmove(p,s,len); - p[len] = (char)0; - } - break; - - case ERL_SMALL_ATOM_EXT: - len = get8(s); - if (p) { - memmove(p,s,len); - p[len] = (char)0; - } - break; - - case ERL_UNICODE_ATOM_EXT: - len = get16be(s); - - if (len > 2*MAXATOMLEN) return -1; - - if (p && utf8_to_latin1(p, s, len) < 0) return -1; - break; - - default: - return -1; - } - - s += len; - *index += s-s0; - return 0; + return ei_decode_atom_as(buf, index, p, MAXATOMLEN, ERLANG_LATIN1, NULL, NULL); } -int ei_internal_get_atom(const char** bufp, char* p) +int ei_decode_atom_as(const char *buf, int *index, char* p, int destlen, + enum erlang_char_encoding want_enc, + enum erlang_char_encoding* was_encp, + enum erlang_char_encoding* res_encp) { - int ix = 0; - if (ei_decode_atom(*bufp, &ix, p) < 0) return -1; - *bufp += ix; + const char *s = buf + *index; + const char *s0 = s; + int len; + enum erlang_char_encoding got_enc; + + switch (get8(s)) { + case ERL_ATOM_EXT: + len = get16be(s); + got_enc = ERLANG_LATIN1; + break; + case ERL_SMALL_ATOM_EXT: + len = get8(s); + got_enc = ERLANG_LATIN1; + break; + case ERL_ATOM_UTF8_EXT: + len = get16be(s); + got_enc = ERLANG_UTF8; + break; + case ERL_SMALL_ATOM_UTF8_EXT: + len = get8(s); + got_enc = ERLANG_UTF8; + break; + default: + return -1; + } + + if (want_enc == got_enc || want_enc == ERLANG_WHATEVER || want_enc == ERLANG_ASCII) { + int i, found_non_ascii = 0; + if (len >= destlen) + return -1; + for (i=0; i 0 && dest < dest_end) { - if ((source[0] & 0x80) == 0) { - *dest++ = *source++; + while (slen > 0) { + if (dst >= dst_end) return -1; + if ((src[0] & 0x80) == 0) { + if (dst_start) { + *dst = *src; + } + ++dst; + ++src; --slen; } else if (slen > 1 && - (source[0] & 0xFE) == 0xC2 && - (source[1] & 0xC0) == 0x80) { - *dest++ = (char) ((source[0] << 6) | (source[1] & 0x3F)); - source += 2; + (src[0] & 0xFE) == 0xC2 && + (src[1] & 0xC0) == 0x80) { + if (dst_start) { + *dst = (char) ((src[0] << 6) | (src[1] & 0x3F)); + } + ++dst; + src += 2; slen -= 2; + found_non_ascii = 1; } else return -1; } - *dest = 0; + if (res_encp) { + *res_encp = found_non_ascii ? ERLANG_LATIN1 : ERLANG_ASCII; + } + return dst - dst_start; +} + +int latin1_to_utf8(char* dst, const char* src, int slen, int destlen, + enum erlang_char_encoding* res_encp) +{ + const char* const src_end = src + slen; + const char* const dst_start = dst; + const char* const dst_end = dst + destlen; + int found_non_ascii = 0; + + while (src < src_end) { + if (dst >= dst_end) return -1; + if ((src[0] & 0x80) == 0) { + if (dst_start) { + *dst = *src; + } + ++dst; + } + else { + if (dst_start) { + unsigned char ch = *src; + dst[0] = 0xC0 | (ch >> 6); + dst[1] = 0x80 | (ch & 0x3F); + } + dst += 2; + found_non_ascii = 1; + } + ++src; + } + if (res_encp) { + *res_encp = found_non_ascii ? ERLANG_UTF8 : ERLANG_ASCII; + } + return dst - dst_start; +} + + + +int ei_internal_get_atom(const char** bufp, char* p, + enum erlang_char_encoding* was_encp) +{ + int ix = 0; + if (ei_decode_atom_as(*bufp, &ix, p, MAXATOMLEN_UTF8, ERLANG_UTF8, was_encp, NULL) < 0) + return -1; + *bufp += ix; return 0; } + diff --git a/lib/erl_interface/src/decode/decode_boolean.c b/lib/erl_interface/src/decode/decode_boolean.c index 0a7a06f1d4..f20690249b 100644 --- a/lib/erl_interface/src/decode/decode_boolean.c +++ b/lib/erl_interface/src/decode/decode_boolean.c @@ -24,12 +24,11 @@ /* c non-zero -> erlang "true" atom, otherwise "false" */ int ei_decode_boolean(const char *buf, int *index, int *p) { - const char *s = buf + *index; - const char *s0 = s; - char tbuf[MAXATOMLEN+1]; + char tbuf[6]; int t; - if (get_atom(&s, tbuf) < 0) return -1; + if (ei_decode_atom_as(buf, index, tbuf, sizeof(tbuf), ERLANG_ASCII, NULL, NULL) < 0) + return -1; if (memcmp(tbuf, "true", 5) == 0) t = 1; @@ -39,7 +38,6 @@ int ei_decode_boolean(const char *buf, int *index, int *p) return -1; if (p) *p = t; - *index += s-s0; return 0; } diff --git a/lib/erl_interface/src/decode/decode_fun.c b/lib/erl_interface/src/decode/decode_fun.c index 64fb9e86d8..7bbef5db44 100644 --- a/lib/erl_interface/src/decode/decode_fun.c +++ b/lib/erl_interface/src/decode/decode_fun.c @@ -42,7 +42,8 @@ int ei_decode_fun(const char *buf, int *index, erlang_fun *p) if (ei_decode_pid(s, &ix, (p == NULL ? (erlang_pid*)NULL : &p->pid)) < 0) return -1; /* then the module (atom) */ - if (ei_decode_atom(s, &ix, (p == NULL ? (char*)NULL : p->module)) < 0) + if (ei_decode_atom_as(s, &ix, (p == NULL ? (char*)NULL : p->module), + MAXATOMLEN_UTF8, ERLANG_UTF8, &p->module_org_enc, NULL) < 0) return -1; /* then the index */ if (ei_decode_long(s, &ix, (p == NULL ? (long*)NULL : &p->index)) < 0) @@ -84,7 +85,8 @@ int ei_decode_fun(const char *buf, int *index, erlang_fun *p) if (p != NULL) p->n_free_vars = i; /* then the module (atom) */ ix = 0; - if (ei_decode_atom(s, &ix, (p == NULL ? (char*)NULL : p->module)) < 0) + if (ei_decode_atom_as(s, &ix, (p == NULL ? (char*)NULL : p->module), + MAXATOMLEN_UTF8, ERLANG_UTF8, &p->module_org_enc, NULL) < 0) return -1; /* then the old_index */ if (ei_decode_long(s, &ix, (p == NULL ? (long*)NULL : &p->old_index)) < 0) diff --git a/lib/erl_interface/src/decode/decode_pid.c b/lib/erl_interface/src/decode/decode_pid.c index a762ae499e..e79952195d 100644 --- a/lib/erl_interface/src/decode/decode_pid.c +++ b/lib/erl_interface/src/decode/decode_pid.c @@ -26,12 +26,11 @@ int ei_decode_pid(const char *buf, int *index, erlang_pid *p) { const char *s = buf + *index; const char *s0 = s; - int len; if (get8(s) != ERL_PID_EXT) return -1; /* first the nodename */ - if (get_atom(&s, p->node) < 0) return -1; + if (get_atom(&s, p->node, &p->node_org_enc) < 0) return -1; /* now the numbers: num (4), serial (4), creation (1) */ if (p) { diff --git a/lib/erl_interface/src/decode/decode_port.c b/lib/erl_interface/src/decode/decode_port.c index 6eb2bc9197..5fd96b51a4 100644 --- a/lib/erl_interface/src/decode/decode_port.c +++ b/lib/erl_interface/src/decode/decode_port.c @@ -25,12 +25,11 @@ int ei_decode_port(const char *buf, int *index, erlang_port *p) { const char *s = buf + *index; const char *s0 = s; - int len; if (get8(s) != ERL_PORT_EXT) return -1; /* first the nodename */ - if (get_atom(&s, p->node) < 0) return -1; + if (get_atom(&s, p->node, &p->node_org_enc) < 0) return -1; /* now the numbers: num (4), creation (1) */ if (p) { diff --git a/lib/erl_interface/src/decode/decode_ref.c b/lib/erl_interface/src/decode/decode_ref.c index df3c30777b..7294e5d239 100644 --- a/lib/erl_interface/src/decode/decode_ref.c +++ b/lib/erl_interface/src/decode/decode_ref.c @@ -26,13 +26,13 @@ int ei_decode_ref(const char *buf, int *index, erlang_ref *p) { const char *s = buf + *index; const char *s0 = s; - int count, len, i; + int count, i; switch (get8(s)) { case ERL_REFERENCE_EXT: /* nodename */ - if (get_atom(&s, p->node) < 0) return -1; + if (get_atom(&s, p->node, &p->node_org_enc) < 0) return -1; /* now the numbers: num (4), creation (1) */ if (p) { @@ -53,7 +53,7 @@ int ei_decode_ref(const char *buf, int *index, erlang_ref *p) if (p) p->len = count; /* then the nodename */ - if (get_atom(&s, p->node) < 0) return -1; + if (get_atom(&s, p->node, &p->node_org_enc) < 0) return -1; /* creation */ if (p) { diff --git a/lib/erl_interface/src/encode/encode_atom.c b/lib/erl_interface/src/encode/encode_atom.c index 6f41f045e0..a3d7c4c759 100644 --- a/lib/erl_interface/src/encode/encode_atom.c +++ b/lib/erl_interface/src/encode/encode_atom.c @@ -26,25 +26,95 @@ int ei_encode_atom(char *buf, int *index, const char *p) { size_t len = strlen(p); - if (len >= INT_MAX) return -1; - return ei_encode_atom_len(buf, index, p, len); + if (len >= MAXATOMLEN) + len = MAXATOMLEN - 1; + return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, ERLANG_LATIN1); } int ei_encode_atom_len(char *buf, int *index, const char *p, int len) +{ + /* This function is documented to truncate at MAXATOMLEN (256) */ + if (len >= MAXATOMLEN) + len = MAXATOMLEN - 1; + return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, ERLANG_LATIN1); +} + +int ei_encode_atom_as(char *buf, int *index, const char *p, + enum erlang_char_encoding from_enc, + enum erlang_char_encoding to_enc) +{ + return ei_encode_atom_len_as(buf, index, p, strlen(p), from_enc, to_enc); +} + +int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len, + enum erlang_char_encoding from_enc, + enum erlang_char_encoding to_enc) { char *s = buf + *index; char *s0 = s; + int offs; - /* This function is documented to truncate at MAXATOMLEN (256) */ - if (len > MAXATOMLEN) - len = MAXATOMLEN; + if (from_enc == ERLANG_LATIN1 && len >= MAXATOMLEN) { + return -1; + } - if (!buf) s += 3; - else { - put8(s,ERL_ATOM_EXT); - put16be(s,len); + switch(to_enc) { + case ERLANG_LATIN1: + if (buf) { + put8(s,ERL_ATOM_EXT); + switch (from_enc) { + case ERLANG_UTF8: + len = utf8_to_latin1(s+2, p, len, MAXATOMLEN-1, NULL); + if (len < 0) return -1; + break; + case ERLANG_ASCII: + case ERLANG_LATIN1: + memcpy(s+2, p, len); + break; + default: + return -1; + } + put16be(s,len); + } + else { + s += 3; + if (from_enc == ERLANG_UTF8) { + len = utf8_to_latin1(NULL, p, len, MAXATOMLEN-1, NULL); + if (len < 0) return -1; + } + } + break; + + case ERLANG_UTF8: + offs = 1 + 1; + switch (from_enc) { + case ERLANG_LATIN1: + if (len >= 256/2) offs++; + len = latin1_to_utf8((buf ? s+offs : NULL), p, len, MAXATOMLEN_UTF8-1, NULL); + break; + case ERLANG_ASCII: + case ERLANG_UTF8: + if (len >= 256) offs++; + if (buf) memcpy(s+offs, p, len); + break; + default: + return -1; + } + if (buf) { + if (offs == 2) { + put8(s, ERL_SMALL_ATOM_UTF8_EXT); + put8(s, len); + } + else { + put8(s, ERL_ATOM_UTF8_EXT); + put16be(s, len); + } + } + else s+= offs; + break; - memmove(s,p,len); /* unterminated string */ + default: + return -1; } s += len; @@ -53,3 +123,13 @@ int ei_encode_atom_len(char *buf, int *index, const char *p, int len) return 0; } +int +ei_internal_put_atom(char** bufp, const char* p, int slen, + enum erlang_char_encoding to_enc) +{ + int ix = 0; + if (ei_encode_atom_len_as(*bufp, &ix, p, slen, ERLANG_UTF8, to_enc) < 0) + return -1; + *bufp += ix; + return 0; +} diff --git a/lib/erl_interface/src/encode/encode_fun.c b/lib/erl_interface/src/encode/encode_fun.c index 54ee2083d6..4daee32648 100644 --- a/lib/erl_interface/src/encode/encode_fun.c +++ b/lib/erl_interface/src/encode/encode_fun.c @@ -35,7 +35,7 @@ int ei_encode_fun(char *buf, int *index, const erlang_fun *p) ix += sizeof(char) + 4; if (ei_encode_pid(buf, &ix, &p->pid) < 0) return -1; - if (ei_encode_atom(buf, &ix, p->module) < 0) + if (ei_encode_atom_as(buf, &ix, p->module, ERLANG_UTF8, p->module_org_enc) < 0) return -1; if (ei_encode_long(buf, &ix, p->index) < 0) return -1; @@ -60,7 +60,7 @@ int ei_encode_fun(char *buf, int *index, const erlang_fun *p) } else size_p = NULL; ix += 1 + 4 + 1 + sizeof(p->md5) + 4 + 4; - if (ei_encode_atom(buf, &ix, p->module) < 0) + if (ei_encode_atom_as(buf, &ix, p->module, ERLANG_UTF8, p->module_org_enc) < 0) return -1; if (ei_encode_long(buf, &ix, p->old_index) < 0) return -1; diff --git a/lib/erl_interface/src/encode/encode_pid.c b/lib/erl_interface/src/encode/encode_pid.c index ee7f235c17..0cf3ef4efb 100644 --- a/lib/erl_interface/src/encode/encode_pid.c +++ b/lib/erl_interface/src/encode/encode_pid.c @@ -24,29 +24,23 @@ int ei_encode_pid(char *buf, int *index, const erlang_pid *p) { char *s = buf + *index; - char *s0 = s; - int len = strlen(p->node); - - if (!buf) s += 13 + len; - else { - put8(s,ERL_PID_EXT); - /* first the nodename */ - put8(s,ERL_ATOM_EXT); + ++(*index); /* skip ERL_PID_EXT */ + if (ei_encode_atom_len_as(buf, index, p->node, strlen(p->node), ERLANG_UTF8, p->node_org_enc) < 0) + return -1; + + if (buf) { + put8(s,ERL_PID_EXT); - put16be(s,len); - - memmove(s, p->node, len); - s += len; + s = buf + *index; /* now the integers */ put32be(s,p->num & 0x7fff); /* 15 bits */ put32be(s,p->serial & 0x1fff); /* 13 bits */ put8(s,(p->creation & 0x03)); /* 2 bits */ } - - *index += s-s0; - + + *index += 4 + 4 + 1; return 0; } diff --git a/lib/erl_interface/src/encode/encode_port.c b/lib/erl_interface/src/encode/encode_port.c index fbbb33182e..2bf9e26d78 100644 --- a/lib/erl_interface/src/encode/encode_port.c +++ b/lib/erl_interface/src/encode/encode_port.c @@ -24,28 +24,23 @@ int ei_encode_port(char *buf, int *index, const erlang_port *p) { char *s = buf + *index; - char *s0 = s; - int len = strlen(p->node); - - if (!buf) s += 9 + len; - else { - put8(s,ERL_PORT_EXT); - /* first the nodename */ - put8(s,ERL_ATOM_EXT); + ++(*index); /* skip ERL_PORT_EXT */ + if (ei_encode_atom_len_as(buf, index, p->node, strlen(p->node), ERLANG_UTF8, + p->node_org_enc) < 0) { + return -1; + } + if (buf) { + put8(s,ERL_PORT_EXT); - put16be(s,len); - - memmove(s, p->node, len); - s += len; + s = buf + *index; /* now the integers */ put32be(s,p->id & 0x0fffffff /* 28 bits */); put8(s,(p->creation & 0x03)); } - *index += s-s0; - + *index += 4 + 1; return 0; } diff --git a/lib/erl_interface/src/encode/encode_ref.c b/lib/erl_interface/src/encode/encode_ref.c index 292b452864..e8b3173315 100644 --- a/lib/erl_interface/src/encode/encode_ref.c +++ b/lib/erl_interface/src/encode/encode_ref.c @@ -24,36 +24,32 @@ int ei_encode_ref(char *buf, int *index, const erlang_ref *p) { char *s = buf + *index; - char *s0 = s; - int len = strlen(p->node); int i; + (*index) += 1 + 2; /* skip to node atom */ + if (ei_encode_atom_len_as(buf, index, p->node, strlen(p->node), ERLANG_UTF8, + p->node_org_enc) < 0) { + return -1; + } + /* Always encode as an extended reference; all participating parties are now expected to be able to decode extended references. */ - if (!buf) s += 1 + 2 + (3+len) + p->len*4 + 1; - else { + if (buf) { put8(s,ERL_NEW_REFERENCE_EXT); /* first, number of integers */ put16be(s, p->len); /* then the nodename */ - put8(s,ERL_ATOM_EXT); - - put16be(s,len); - - memmove(s, p->node, len); - s += len; + s = buf + *index; /* now the integers */ put8(s,(p->creation & 0x03)); for (i = 0; i < p->len; i++) put32be(s,p->n[i]); - - } - - *index += s-s0; + } + *index += p->len*4 + 1; return 0; } diff --git a/lib/erl_interface/src/legacy/erl_connect.c b/lib/erl_interface/src/legacy/erl_connect.c index 41d4fa3138..be83fa8469 100644 --- a/lib/erl_interface/src/legacy/erl_connect.c +++ b/lib/erl_interface/src/legacy/erl_connect.c @@ -247,9 +247,15 @@ int erl_send(int fd, ETERM *to ,ETERM *msg) erl_errno = EINVAL; return -1; } - - strncpy(topid.node, (char *)ERL_PID_NODE(to), sizeof(topid.node)); - topid.node[sizeof(topid.node)-1] = '\0'; + + if (to->uval.pidval.node.latin1) { + strcpy(topid.node, to->uval.pidval.node.latin1); + topid.node_org_enc = ERLANG_LATIN1; + } + else { + strcpy(topid.node, to->uval.pidval.node.utf8); + topid.node_org_enc = ERLANG_UTF8; + } topid.num = ERL_PID_NUMBER(to); topid.serial = ERL_PID_SERIAL(to); topid.creation = ERL_PID_CREATION(to); @@ -263,7 +269,7 @@ static int erl_do_receive_msg(int fd, ei_x_buff* x, ErlMessage* emsg) erlang_msg msg; int r; - msg.from.node[0] = msg.to.node[0] = '\0'; + msg.from.node[0] = msg.to.node[0] = msg.toname[0] = '\0'; r = ei_do_receive_msg(fd, 0, &msg, x, 0); if (r == ERL_MSG) { @@ -299,7 +305,7 @@ static int erl_do_receive_msg(int fd, ei_x_buff* x, ErlMessage* emsg) emsg->to = erl_mk_pid(msg.to.node, msg.to.num, msg.to.serial, msg.to.creation); else emsg->to = NULL; - memcpy(emsg->to_name, msg.toname, MAXATOMLEN+1); + strcpy(emsg->to_name, msg.toname); return r; } diff --git a/lib/erl_interface/src/legacy/erl_eterm.c b/lib/erl_interface/src/legacy/erl_eterm.c index 8d559f0f55..aa0fd5ddcf 100644 --- a/lib/erl_interface/src/legacy/erl_eterm.c +++ b/lib/erl_interface/src/legacy/erl_eterm.c @@ -36,6 +36,7 @@ #include "erl_error.h" #include "erl_internal.h" #include "ei_internal.h" +#include "putget.h" #define ERL_IS_BYTE(x) (ERL_IS_INTEGER(x) && (ERL_INT_VALUE(x) & ~0xFF) == 0) @@ -142,9 +143,7 @@ ETERM *erl_mk_atom (const char *s) ep = erl_alloc_eterm(ERL_ATOM); ERL_COUNT(ep) = 1; - ERL_ATOM_SIZE(ep) = strlen(s); - if ((ERL_ATOM_PTR(ep) = strsave(s)) == NULL) - { + if (erl_atom_init_latin1(&ep->uval.aval.d, s) == NULL) { erl_free_term(ep); erl_errno = ENOMEM; return NULL; @@ -152,6 +151,65 @@ ETERM *erl_mk_atom (const char *s) return ep; } +char* erl_atom_ptr_latin1(Erl_Atom_data* a) +{ + if (a->latin1 == NULL) { + enum erlang_char_encoding enc; + a->lenL = utf8_to_latin1(NULL, a->utf8, a->lenU, a->lenU, &enc); + if (a->lenL < 0) { + a->lenL = 0; + return NULL; + } + if (enc == ERLANG_ASCII) { + a->latin1 = a->utf8; + } + else { + a->latin1 = malloc(a->lenL+1); + utf8_to_latin1(a->latin1, a->utf8, a->lenU, a->lenL, NULL); + a->latin1[a->lenL] = '\0'; + } + } + return a->latin1; +} + +char* erl_atom_ptr_utf8(Erl_Atom_data* a) +{ + if (a->utf8 == NULL) { + int dlen = a->lenL * 2; /* over estimation */ + a->utf8 = malloc(dlen + 1); + a->lenU = latin1_to_utf8(a->utf8, a->latin1, a->lenL, dlen, NULL); + a->utf8[a->lenU] = '\0'; + } + return a->utf8; + +} +int erl_atom_size_latin1(Erl_Atom_data* a) +{ + if (a->latin1 == NULL) { + erl_atom_ptr_latin1(a); + } + return a->lenL; +} +int erl_atom_size_utf8(Erl_Atom_data* a) +{ + if (a->utf8 == NULL) { + erl_atom_ptr_utf8(a); + } + return a->lenU; +} +char* erl_atom_init_latin1(Erl_Atom_data* a, const char* s) +{ + a->lenL = strlen(s); + if ((a->latin1 = strsave(s)) == NULL) + { + return NULL; + } + a->utf8 = NULL; + a->lenU = 0; + return a->latin1; +} + + /* * Given a string as input, creates a list. */ @@ -208,12 +266,19 @@ ETERM *erl_mk_pid(const char *node, ep = erl_alloc_eterm(ERL_PID); ERL_COUNT(ep) = 1; - if ((ERL_PID_NODE(ep) = strsave(node)) == NULL) + if (erl_atom_init_latin1(&ep->uval.pidval.node, node) == NULL) { erl_free_term(ep); erl_errno = ENOMEM; return NULL; } + erl_mk_pid_helper(ep, number, serial, creation); + return ep; +} + +void erl_mk_pid_helper(ETERM *ep, unsigned int number, + unsigned int serial, unsigned char creation) +{ ERL_PID_NUMBER(ep) = number & 0x7fff; /* 15 bits */ if (ei_internal_use_r9_pids_ports()) { ERL_PID_SERIAL(ep) = serial & 0x07; /* 3 bits */ @@ -222,7 +287,6 @@ ETERM *erl_mk_pid(const char *node, ERL_PID_SERIAL(ep) = serial & 0x1fff; /* 13 bits */ } ERL_PID_CREATION(ep) = creation & 0x03; /* 2 bits */ - return ep; } /* @@ -239,12 +303,18 @@ ETERM *erl_mk_port(const char *node, ep = erl_alloc_eterm(ERL_PORT); ERL_COUNT(ep) = 1; - if ((ERL_PORT_NODE(ep) = strsave(node)) == NULL) + if (erl_atom_init_latin1(&ep->uval.portval.node, node) == NULL) { erl_free_term(ep); erl_errno = ENOMEM; return NULL; } + erl_mk_port_helper(ep, number, creation); + return ep; +} + +void erl_mk_port_helper(ETERM* ep, unsigned number, unsigned char creation) +{ if (ei_internal_use_r9_pids_ports()) { ERL_PORT_NUMBER(ep) = number & 0x3ffff; /* 18 bits */ } @@ -252,29 +322,29 @@ ETERM *erl_mk_port(const char *node, ERL_PORT_NUMBER(ep) = number & 0x0fffffff; /* 18 bits */ } ERL_PORT_CREATION(ep) = creation & 0x03; /* 2 bits */ - return ep; } /* * Create any kind of reference. */ -ETERM *__erl_mk_reference (const char *node, +ETERM *__erl_mk_reference (ETERM* t, + const char *node, size_t len, unsigned int n[], unsigned char creation) { - ETERM * t; - - if (node == NULL) return NULL; - - t = erl_alloc_eterm(ERL_REF); - ERL_COUNT(t) = 1; - - if ((ERL_REF_NODE(t) = strsave(node)) == NULL) - { - erl_free_term(t); - erl_errno = ENOMEM; - return NULL; + if (t == NULL) { + if (node == NULL) return NULL; + + t = erl_alloc_eterm(ERL_REF); + ERL_COUNT(t) = 1; + + if (erl_atom_init_latin1(&t->uval.refval.node, node) == NULL) + { + erl_free_term(t); + erl_errno = ENOMEM; + return NULL; + } } ERL_REF_LEN(t) = len; ERL_REF_NUMBERS(t)[0] = n[0] & 0x3ffff; /* 18 bits */ @@ -294,7 +364,7 @@ ETERM *erl_mk_ref (const char *node, { unsigned int n[3] = {0, 0, 0}; n[0] = number; - return __erl_mk_reference(node, 1, n, creation); + return __erl_mk_reference(NULL, node, 1, n, creation); } /* @@ -307,7 +377,7 @@ erl_mk_long_ref (const char *node, { unsigned int n[3] = {0, 0, 0}; n[0] = n3; n[1] = n2; n[2] = n1; - return __erl_mk_reference(node, 3, n, creation); + return __erl_mk_reference(NULL, node, 3, n, creation); } /* @@ -758,6 +828,28 @@ int erl_iolist_length (const ETERM* term) return -1; } +static int erl_atom_copy(Erl_Atom_data* dst, const Erl_Atom_data* src) +{ + if (src->latin1 == src->utf8) { + dst->latin1 = dst->utf8 = strsave(src->latin1); + dst->lenL = dst->lenU = strlen(src->latin1); + } + else if (src->latin1) { + dst->latin1 = strsave(src->latin1); + dst->lenL = strlen(src->latin1); + dst->utf8 = NULL; + dst->lenU = 0; + } + else { + dst->utf8 = strsave(src->utf8); + dst->lenU = strlen(src->utf8); + dst->latin1 = NULL; + dst->lenL = 0; + } + return (dst->latin1 != NULL || dst->utf8 == NULL); +} + + /* * Return a brand NEW COPY of an ETERM. */ @@ -796,9 +888,7 @@ ETERM *erl_copy_term(const ETERM *ep) ERL_FLOAT_VALUE(cp) = ERL_FLOAT_VALUE(ep); break; case ERL_ATOM: - ERL_ATOM_SIZE(cp) = ERL_ATOM_SIZE(ep); - ERL_ATOM_PTR(cp) = strsave(ERL_ATOM_PTR(ep)); - if (ERL_ATOM_PTR(cp) == NULL) + if (!erl_atom_copy(&cp->uval.aval.d, &ep->uval.aval.d)) { erl_free_term(cp); erl_errno = ENOMEM; @@ -810,17 +900,17 @@ ETERM *erl_copy_term(const ETERM *ep) name and plug in. Somewhat ugly (also done with port and ref below). */ memcpy(&cp->uval.pidval, &ep->uval.pidval, sizeof(Erl_Pid)); - ERL_PID_NODE(cp) = strsave(ERL_PID_NODE(ep)); + erl_atom_copy(&cp->uval.pidval.node, &ep->uval.pidval.node); ERL_COUNT(cp) = 1; break; case ERL_PORT: memcpy(&cp->uval.portval, &ep->uval.portval, sizeof(Erl_Port)); - ERL_PORT_NODE(cp) = strsave(ERL_PORT_NODE(ep)); + erl_atom_copy(&cp->uval.portval.node, &ep->uval.portval.node); ERL_COUNT(cp) = 1; break; case ERL_REF: memcpy(&cp->uval.refval, &ep->uval.refval, sizeof(Erl_Ref)); - ERL_REF_NODE(cp) = strsave(ERL_REF_NODE(ep)); + erl_atom_copy(&cp->uval.refval.node, &ep->uval.refval.node); ERL_COUNT(cp) = 1; break; case ERL_LIST: @@ -883,29 +973,29 @@ int erl_print_term(FILE *fp, const ETERM *ep) j = i = doquote = 0; switch(ERL_TYPE(ep)) { - case ERL_ATOM: + case ERL_ATOM: { + char* adata = ERL_ATOM_PTR(ep); /* FIXME: what if some weird locale is in use? */ - if (!islower((int)ERL_ATOM_PTR(ep)[0])) + if (!islower(adata[0])) doquote = 1; for (i = 0; !doquote && i < ERL_ATOM_SIZE(ep); i++) { - doquote = !(isalnum((int)ERL_ATOM_PTR(ep)[i]) - || (ERL_ATOM_PTR(ep)[i] == '_')); + doquote = !(isalnum(adata[i]) || (adata[i] == '_')); } if (doquote) { putc('\'', fp); ch_written++; } - fputs(ERL_ATOM_PTR(ep), fp); + fputs(adata, fp); ch_written += ERL_ATOM_SIZE(ep); if (doquote) { putc('\'', fp); ch_written++; } break; - + } case ERL_VARIABLE: if (!isupper((int)ERL_VAR_NAME(ep)[0])) { doquote = 1; diff --git a/lib/erl_interface/src/legacy/erl_eterm.h b/lib/erl_interface/src/legacy/erl_eterm.h index 41b008f04f..2e8129d9cd 100644 --- a/lib/erl_interface/src/legacy/erl_eterm.h +++ b/lib/erl_interface/src/legacy/erl_eterm.h @@ -55,7 +55,9 @@ typedef struct _heapmark { } Erl_HeapMark; -ETERM * __erl_mk_reference(const char *, size_t, unsigned int n[], unsigned char); +void erl_mk_port_helper(ETERM* ep, unsigned number, unsigned char creation); +void erl_mk_pid_helper(ETERM*, unsigned,unsigned, unsigned char); +ETERM * __erl_mk_reference(ETERM*, const char *, size_t, unsigned int n[], unsigned char); int erl_current_fix_desc(void); #endif /* _ERL_ETERM_H */ diff --git a/lib/erl_interface/src/legacy/erl_format.c b/lib/erl_interface/src/legacy/erl_format.c index dc85806c36..533241e396 100644 --- a/lib/erl_interface/src/legacy/erl_format.c +++ b/lib/erl_interface/src/legacy/erl_format.c @@ -574,10 +574,22 @@ static int ematch(ETERM *p, ETERM *t) switch (type_p) { - case ERL_ATOM: - return p->uval.aval.len == t->uval.aval.len && - memcmp(p->uval.aval.a, t->uval.aval.a, p->uval.aval.len) == 0; - + case ERL_ATOM: { + Erl_Atom_data* pa = &p->uval.aval.d; + Erl_Atom_data* ta = &t->uval.aval.d; + if (pa->utf8 && ta->utf8) { + return pa->lenU == ta->lenU && memcmp(pa->utf8, ta->utf8, pa->lenU)==0; + } + else if (pa->latin1 && ta->latin1) { + return pa->lenL == ta->lenL && memcmp(pa->latin1, ta->latin1, pa->lenL)==0; + } + else if (pa->latin1) { + return cmp_latin1_vs_utf8(pa->latin1, pa->lenL, ta->utf8, ta->lenU)==0; + } + else { + return cmp_latin1_vs_utf8(ta->latin1, ta->lenL, pa->utf8, pa->lenU)==0; + } + } case ERL_VARIABLE: if (strcmp(p->uval.vval.name, "_") == 0) /* anon. variable */ return ERL_TRUE; diff --git a/lib/erl_interface/src/legacy/erl_malloc.c b/lib/erl_interface/src/legacy/erl_malloc.c index f51a6c69b3..d09239e02d 100644 --- a/lib/erl_interface/src/legacy/erl_malloc.c +++ b/lib/erl_interface/src/legacy/erl_malloc.c @@ -112,6 +112,18 @@ do { \ (ptr) = NULL; \ } while (0) +static void erl_atom_free(Erl_Atom_data* p) +{ + erl_free(p->latin1); + if (p->utf8 != p->latin1) { + erl_free(p->utf8); + } + p->latin1 = NULL; + p->utf8 = NULL; + p->lenL = 0; + p->lenU = 0; +} + static void _erl_free_term (ETERM *ep, int external, int compound) { restart: @@ -122,7 +134,7 @@ restart: switch(ERL_TYPE(ep)) { case ERL_ATOM: - FREE_AND_CLEAR(ERL_ATOM_PTR(ep)); + erl_atom_free(&ep->uval.aval.d); break; case ERL_VARIABLE: FREE_AND_CLEAR(ERL_VAR_NAME(ep)); @@ -161,13 +173,13 @@ restart: FREE_AND_CLEAR(ERL_BIN_PTR(ep)); break; case ERL_PID: - FREE_AND_CLEAR(ERL_PID_NODE(ep)); + erl_atom_free(&ep->uval.pidval.node); break; case ERL_PORT: - FREE_AND_CLEAR(ERL_PORT_NODE(ep)); + erl_atom_free(&ep->uval.portval.node); break; case ERL_REF: - FREE_AND_CLEAR(ERL_REF_NODE(ep)); + erl_atom_free(&ep->uval.refval.node); break; case ERL_EMPTY_LIST: case ERL_INTEGER: diff --git a/lib/erl_interface/src/legacy/erl_marshal.c b/lib/erl_interface/src/legacy/erl_marshal.c index 775d7e82ca..884e9d421b 100644 --- a/lib/erl_interface/src/legacy/erl_marshal.c +++ b/lib/erl_interface/src/legacy/erl_marshal.c @@ -44,6 +44,9 @@ int erl_fp_compare(unsigned *a, unsigned *b); static void erl_long_to_fp(long l, unsigned *d); #endif +static int cmpbytes(unsigned char* s1,int l1,unsigned char* s2,int l2); +static int cmpatoms(unsigned char* s1, int l1, unsigned char tag1, unsigned char* s2, int l2, unsigned char tag2); + /* Used when comparing two encoded byte arrays */ /* this global data is ok (from threading point of view) since it is * initialized once and never changed @@ -111,8 +114,9 @@ void erl_init_marshal(void) cmp_array[ERL_SMALL_BIG_EXT] = ERL_NUM_CMP; cmp_array[ERL_LARGE_BIG_EXT] = ERL_NUM_CMP; cmp_array[ERL_ATOM_EXT] = ERL_ATOM_CMP; + cmp_array[ERL_ATOM_UTF8_EXT] = ERL_ATOM_CMP; cmp_array[ERL_SMALL_ATOM_EXT] = ERL_ATOM_CMP; - cmp_array[ERL_UNICODE_ATOM_EXT] = ERL_ATOM_CMP; + cmp_array[ERL_SMALL_ATOM_UTF8_EXT] = ERL_ATOM_CMP; cmp_array[ERL_REFERENCE_EXT] = ERL_REF_CMP; cmp_array[ERL_NEW_REFERENCE_EXT] = ERL_REF_CMP; cmp_array[ERL_FUN_EXT] = ERL_FUN_CMP; @@ -162,6 +166,21 @@ static int erl_length_x(const ETERM *ep) { *============================================================== */ +static void encode_atom(Erl_Atom_data* a, unsigned char **ext) +{ + int ix = 0; + if (a->latin1) { + ei_encode_atom_len_as((char*)*ext, &ix, a->latin1, a->lenL, + ERLANG_LATIN1, ERLANG_LATIN1); + } + else if (ei_encode_atom_len_as((char*)*ext, &ix, a->utf8, a->lenU, + ERLANG_UTF8, ERLANG_LATIN1) < 0) { + ei_encode_atom_len_as((char*)*ext, &ix, a->utf8, a->lenU, + ERLANG_UTF8, ERLANG_UTF8); + } + *ext += ix; +} + /* * The actual ENCODE engine. * Returns 0 on success, otherwise 1. @@ -176,12 +195,7 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist) switch(ERL_TYPE(ep)) { case ERL_ATOM: - i = ep->uval.aval.len; - *(*ext)++ = ERL_ATOM_EXT; - *(*ext)++ = (i >>8) &0xff; - *(*ext)++ = i &0xff; - memcpy((void *) *ext, (const void *) ep->uval.aval.a, i); - *ext += i; + encode_atom(&ep->uval.aval.d, ext); return 0; case ERL_INTEGER: @@ -292,12 +306,7 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist) case ERL_PID: *(*ext)++ = ERL_PID_EXT; /* First poke in node as an atom */ - i = strlen((char *)ERL_PID_NODE(ep)); - *(*ext)++ = ERL_ATOM_EXT; - *(*ext)++ = (i >>8) &0xff; - *(*ext)++ = i &0xff; - memcpy(*ext, ERL_PID_NODE(ep), i); - *ext += i; + encode_atom(&ep->uval.pidval.node, ext); /* And then fill in the integer fields */ i = ERL_PID_NUMBER(ep); *(*ext)++ = (i >> 24) &0xff; @@ -325,11 +334,8 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist) *(*ext)++ = (len >> 8) &0xff; *(*ext)++ = len &0xff; - *(*ext)++ = ERL_ATOM_EXT; - *(*ext)++ = (i >> 8) &0xff; - *(*ext)++ = i &0xff; - memcpy(*ext, ERL_REF_NODE(ep), i); - *ext += i; + encode_atom(&ep->uval.refval.node, ext); + *(*ext)++ = ERL_REF_CREATION(ep); /* Then the integer fields */ for (j = 0; j < ERL_REF_LEN(ep); j++) { @@ -344,12 +350,7 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist) case ERL_PORT: *(*ext)++ = ERL_PORT_EXT; /* First poke in node as an atom */ - i = strlen((char *)ERL_PORT_NODE(ep)); - *(*ext)++ = ERL_ATOM_EXT; - *(*ext)++ = (i >>8) &0xff; - *(*ext)++ = i &0xff; - memcpy(*ext, ERL_PORT_NODE(ep), i); - *ext += i; + encode_atom(&ep->uval.portval.node, ext); /* Then the integer fields */ i = ERL_PORT_NUMBER(ep); *(*ext)++ = (i >> 24) &0xff; @@ -500,6 +501,16 @@ int erl_term_len(ETERM *ep) return 1+erl_term_len_helper(ep, 4); } +static int atom_len_helper(Erl_Atom_data* a) +{ + if (erl_atom_ptr_latin1(a)) { + return 1 + 2 + a->lenL; /* ERL_ATOM_EXT */ + } + else { + return 1 + 1 + (a->lenU > 255) + a->lenU; + } +} + static int erl_term_len_helper(ETERM *ep, int dist) { int len = 0; @@ -511,8 +522,7 @@ static int erl_term_len_helper(ETERM *ep, int dist) if (ep) { switch (ERL_TYPE(ep)) { case ERL_ATOM: - i = ep->uval.aval.len; - len = i + 3; + len = atom_len_helper(&ep->uval.aval.d); break; case ERL_INTEGER: @@ -544,20 +554,15 @@ static int erl_term_len_helper(ETERM *ep, int dist) break; case ERL_PID: - /* 1 + N + 4 + 4 + 1 where N = 3 + strlen */ - i = strlen((char *)ERL_PID_NODE(ep)); - len = 13 + i; + len = 1 + atom_len_helper(&ep->uval.pidval.node) + 4 + 4 + 1; break; case ERL_REF: - i = strlen((char *)ERL_REF_NODE(ep)); - len = 1 + 2 + (i+3) + 1 + ERL_REF_LEN(ep) * 4; + len = 1 + 2 + atom_len_helper(&ep->uval.refval.node) + 1 + ERL_REF_LEN(ep) * 4; break; case ERL_PORT: - /* 1 + N + 4 + 1 where N = 3 + strlen */ - i = strlen((char *)ERL_PORT_NODE(ep)); - len = 9 + i; + len = 1 + atom_len_helper(&ep->uval.portval.node) + 4 + 1; break; case ERL_EMPTY_LIST: @@ -651,11 +656,33 @@ int erl_encode_buf(ETERM *ep, unsigned char **ext) } /* erl_encode_buf */ -static int read_atom(unsigned char** ext, char* dst) +static int read_atom(unsigned char** ext, Erl_Atom_data* a) { + char buf[MAXATOMLEN_UTF8]; int offs = 0; - int ret = ei_decode_atom((char*)*ext, &offs, dst); + enum erlang_char_encoding enc; + int ret = ei_decode_atom_as((char*)*ext, &offs, buf, MAXATOMLEN_UTF8, + ERLANG_WHATEVER, NULL, &enc); *ext += offs; + + if (ret == 0) { + int i = strlen(buf); + char* clone = erl_malloc(i+1); + memcpy(clone, buf, i+1); + + a->latin1 = NULL; + a->lenL = 0; + a->utf8 = NULL; + a->lenU = 0; + if (enc == ERLANG_LATIN1 || enc == ERLANG_ASCII) { + a->latin1 = clone; + a->lenL = i; + } + if (enc == ERLANG_UTF8 || enc == ERLANG_ASCII) { + a->utf8 = clone; + a->lenU = i; + } + } return ret; } @@ -665,7 +692,6 @@ static int read_atom(unsigned char** ext, char* dst) */ static ETERM *erl_decode_it(unsigned char **ext) { - char atom_buf[MAXATOMLEN+1]; char *cp; ETERM *ep,*tp,*np; unsigned int u,sign; @@ -765,127 +791,89 @@ static ETERM *erl_decode_it(unsigned char **ext) case ERL_ATOM_EXT: case ERL_SMALL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: + ERL_TYPE(ep) = ERL_ATOM; --(*ext); - if (read_atom(ext, atom_buf) < 0) return NULL; - - i = strlen(atom_buf); - ep->uval.aval.len = i; - ep->uval.aval.a = (char *) erl_malloc(i+1); - memcpy(ep->uval.aval.a, atom_buf, i+1); + if (read_atom(ext, &ep->uval.aval.d) < 0) return NULL; return ep; case ERL_PID_EXT: - erl_free_term(ep); - { /* Why not use the constructors? */ - char* node = atom_buf; + { unsigned int number, serial; unsigned char creation; - ETERM *eterm_p; - if (read_atom(ext, node) < 0) return NULL; + ERL_TYPE(ep) = ERL_PID; + if (read_atom(ext, &ep->uval.pidval.node) < 0) return NULL; /* get the integers */ -#if 0 - /* FIXME: Remove code or whatever.... - Ints on the wire are big-endian (== network byte order) - so use ntoh[sl]. (But some are little-endian! Arrrgh!) - Also, the libc authors can be expected to optimize them - heavily. However, the marshalling makes no guarantees - about alignments -- so it won't work at all. */ - number = ntohl(*((unsigned int *)*ext)++); - serial = ntohl(*((unsigned int *)*ext)++); -#else number = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | ((*ext)[2]) << 8 | ((*ext)[3]); *ext += 4; serial = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | ((*ext)[2]) << 8 | ((*ext)[3]); *ext += 4; -#endif creation = *(*ext)++; - eterm_p = erl_mk_pid(node, number, serial, creation); - return eterm_p; + erl_mk_pid_helper(ep, number, serial, creation); + return ep; } case ERL_REFERENCE_EXT: - erl_free_term(ep); { - char* node = atom_buf; - unsigned int number; + unsigned int n[3] = {0, 0, 0}; unsigned char creation; - ETERM *eterm_p; - if (read_atom(ext, node) < 0) return NULL; + ERL_TYPE(ep) = ERL_REF; + if (read_atom(ext, &ep->uval.refval.node) < 0) return NULL; /* get the integers */ -#if 0 - number = ntohl(*((unsigned int *)*ext)++); -#else - number = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | + n[0] = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | ((*ext)[2]) << 8 | ((*ext)[3]); *ext += 4; -#endif creation = *(*ext)++; - eterm_p = erl_mk_ref(node, number, creation); - return eterm_p; + __erl_mk_reference(ep, NULL, 1, n, creation); + return ep; } case ERL_NEW_REFERENCE_EXT: - erl_free_term(ep); { - char* node = atom_buf; size_t cnt, i; unsigned int n[3]; unsigned char creation; - ETERM *eterm_p; -#if 0 - cnt = ntohs(*((unsigned short *)*ext)++); -#else + ERL_TYPE(ep) = ERL_REF; cnt = ((*ext)[0] << 8) | (*ext)[1]; *ext += 2; -#endif - if (read_atom(ext, node) < 0) return NULL; + if (read_atom(ext, &ep->uval.refval.node) < 0) return NULL; /* get the integers */ creation = *(*ext)++; for(i = 0; i < cnt; i++) { -#if 0 - n[i] = ntohl(*((unsigned int *)*ext)++); -#else n[i] = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | ((*ext)[2]) << 8 | ((*ext)[3]); *ext += 4; -#endif } - eterm_p = __erl_mk_reference(node, cnt, n, creation); - return eterm_p; + __erl_mk_reference(ep, NULL, cnt, n, creation); + return ep; } case ERL_PORT_EXT: - erl_free_term(ep); { - char* node = atom_buf; unsigned int number; unsigned char creation; - ETERM *eterm_p; - if (read_atom(ext, node) < 0) return NULL; + ERL_TYPE(ep) = ERL_PORT; + if (read_atom(ext, &ep->uval.portval.node) < 0) return NULL; /* get the integers */ -#if 0 - number = ntohl(*((unsigned int *)*ext)++); -#else number = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | ((*ext)[2]) << 8 | ((*ext)[3]); *ext += 4; -#endif creation = *(*ext)++; - eterm_p = erl_mk_port(node, number, creation); - return eterm_p; + erl_mk_port_helper(ep, number, creation); + return ep; } case ERL_NIL_EXT: @@ -1120,8 +1108,9 @@ unsigned char erl_ext_type(unsigned char *ext) case ERL_INTEGER_EXT: return ERL_INTEGER; case ERL_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: case ERL_SMALL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: return ERL_ATOM; case ERL_PID_EXT: return ERL_PID; @@ -1173,8 +1162,9 @@ int erl_ext_size(unsigned char *t) case ERL_SMALL_INTEGER_EXT: case ERL_INTEGER_EXT: case ERL_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: case ERL_SMALL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: case ERL_PID_EXT: case ERL_PORT_EXT: case ERL_REFERENCE_EXT: @@ -1221,12 +1211,13 @@ static int jump_atom(unsigned char** ext) switch (*e++) { case ERL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: len = (e[0] << 8) | e[1]; e += (len + 2); break; case ERL_SMALL_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: len = e[0]; e += (len + 1); break; @@ -1259,8 +1250,9 @@ static int jump(unsigned char **ext) *ext += 1; break; case ERL_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: case ERL_SMALL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: jump_atom(ext); break; case ERL_PID_EXT: @@ -1426,6 +1418,58 @@ static int cmpbytes(unsigned char* s1,int l1,unsigned char* s2,int l2) } /* cmpbytes */ +#define tag2enc(T) ((T)==ERL_ATOM_EXT || (T)==ERL_SMALL_ATOM_EXT ? ERLANG_LATIN1 : ERLANG_UTF8) + +static int cmpatoms(unsigned char* s1, int l1, unsigned char tag1, + unsigned char* s2, int l2, unsigned char tag2) +{ + enum erlang_char_encoding enc1 = tag2enc(tag1); + enum erlang_char_encoding enc2 = tag2enc(tag2); + + if (enc1 == enc2) { + return cmpbytes(s1, l1,s2,l2); + } + + if (enc1 == ERLANG_LATIN1) { + return cmp_latin1_vs_utf8((char*)s1, l1, (char*)s2, l2); + } + else { + return -cmp_latin1_vs_utf8((char*)s2, l2, (char*)s1, l1); + } +} + +int cmp_latin1_vs_utf8(const char* strL, int lenL, const char* strU, int lenU) +{ + unsigned char* sL = (unsigned char*)strL; + unsigned char* sU = (unsigned char*)strU; + unsigned char* sL_end = sL + lenL; + unsigned char* sU_end = sU + lenU; + + while(sL < sL_end && sU < sU_end) { + unsigned char UasL; + if (*sL >= 0x80) { + if (*sU < 0xC4 && (sU+1) < sU_end) { + UasL = ((sU[0] & 0x3) << 6) | (sU[1] & 0x3F); + } + else return -1; + } + else { + UasL = *sU; + } + if (*sL < UasL) return -1; + if (*sL > UasL) return 1; + + sL++; + if (*sU < 0x80) sU++; + else if (*sU < 0xE0) sU += 2; + else if (*sU < 0xF0) sU += 3; + else /*if (*sU < 0xF8)*/ sU += 4; + } + + return (sU >= sU_end) - (sL >= sL_end); /* -1, 0 or 1 */ +} + + #define CMP_EXT_ERROR_CODE 4711 #define CMP_EXT_INT32_BE(AP, BP) \ @@ -1561,6 +1605,7 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2) int min, ret,i,j,k; double ff1, ff2; unsigned char *tmp1, *tmp2; + unsigned char tag1, tag2; if ( ((*e1)[0] == ERL_STRING_EXT) && ((*e2)[0] == ERL_LIST_EXT) ) { return cmp_string_list(e1, e2); @@ -1568,9 +1613,10 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2) return -cmp_string_list(e2, e1); } - *e2 += 1; + tag1 = *(*e1)++; + tag2 = *(*e2)++; i = j = 0; - switch (*(*e1)++) + switch (tag1) { case ERL_SMALL_INTEGER_EXT: if (**e1 < **e2) ret = -1; @@ -1590,14 +1636,15 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2) *e1 += 4; *e2 += 4; return ret; case ERL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: i = (**e1) << 8; (*e1)++; j = (**e2) << 8; (*e2)++; /*fall through*/ case ERL_SMALL_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: i |= (**e1); (*e1)++; j |= (**e2); (*e2)++; - ret = cmpbytes(*e1, i, *e2, j); + ret = cmpatoms(*e1, i, tag1, *e2, j, tag2); *e1 += i; *e2 += j; return ret; diff --git a/lib/erl_interface/src/legacy/global_whereis.c b/lib/erl_interface/src/legacy/global_whereis.c index 2afb193504..e6c556d907 100644 --- a/lib/erl_interface/src/legacy/global_whereis.c +++ b/lib/erl_interface/src/legacy/global_whereis.c @@ -85,7 +85,16 @@ ETERM *erl_global_whereis(int fd, const char *name, char *node) opid = erl_decode((unsigned char*)buf); /* extract the nodename for the caller */ - if (node) strcpy(node,epid.node); + if (node) { + char* node_str = ERL_PID_NODE(opid); + if (node_str) { + strcpy(node, node_str); + } + else { + erl_free_term(opid); + return NULL; + } + } return opid; } diff --git a/lib/erl_interface/src/misc/ei_decode_term.c b/lib/erl_interface/src/misc/ei_decode_term.c index 6773f90bfc..65afee89cc 100644 --- a/lib/erl_interface/src/misc/ei_decode_term.c +++ b/lib/erl_interface/src/misc/ei_decode_term.c @@ -32,7 +32,7 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term) { const char* s = buf + *index, * s0 = s; - int len, i, n, sign; + int i, n, sign; char c; if (term == NULL) return -1; @@ -48,12 +48,13 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term) case NEW_FLOAT_EXT: return ei_decode_double(buf, index, &term->value.d_val); case ERL_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: case ERL_SMALL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: return ei_decode_atom(buf, index, term->value.atom_name); case ERL_REFERENCE_EXT: /* first the nodename */ - if (get_atom(&s, term->value.ref.node) < 0) return -1; + if (get_atom(&s, term->value.ref.node, &term->value.ref.node_org_enc) < 0) return -1; /* now the numbers: num (4), creation (1) */ term->value.ref.n[0] = get32be(s); term->value.ref.len = 1; @@ -63,7 +64,7 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term) /* first the integer count */ term->value.ref.len = get16be(s); /* then the nodename */ - if (get_atom(&s, term->value.ref.node) < 0) return -1; + if (get_atom(&s, term->value.ref.node, &term->value.ref.node_org_enc) < 0) return -1; /* creation */ term->value.ref.creation = get8(s) & 0x03; /* finally the id integers */ @@ -75,12 +76,12 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term) } break; case ERL_PORT_EXT: - if (get_atom(&s, term->value.port.node) < 0) return -1; + if (get_atom(&s, term->value.port.node, &term->value.port.node_org_enc) < 0) return -1; term->value.port.id = get32be(s) & 0x0fffffff; /* 28 bits */; term->value.port.creation = get8(s) & 0x03; break; case ERL_PID_EXT: - if (get_atom(&s, term->value.pid.node) < 0) return -1; + if (get_atom(&s, term->value.pid.node, &term->value.port.node_org_enc) < 0) return -1; /* now the numbers: num (4), serial (4), creation (1) */ term->value.pid.num = get32be(s) & 0x7fff; /* 15 bits */ term->value.pid.serial = get32be(s) & 0x1fff; /* 13 bits */ diff --git a/lib/erl_interface/src/misc/ei_printterm.c b/lib/erl_interface/src/misc/ei_printterm.c index 620c6e72e2..91fe73e68c 100644 --- a/lib/erl_interface/src/misc/ei_printterm.c +++ b/lib/erl_interface/src/misc/ei_printterm.c @@ -132,9 +132,10 @@ static int print_term(FILE* fp, ei_x_buff* x, doquote = 0; ei_get_type_internal(buf, index, &ty, &n); switch (ty) { - case ERL_ATOM_EXT: + case ERL_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: case ERL_SMALL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: if (ei_decode_atom(buf, index, a) < 0) goto err; doquote = !islower((int)a[0]); diff --git a/lib/erl_interface/src/misc/ei_x_encode.c b/lib/erl_interface/src/misc/ei_x_encode.c index fa1e26ccbb..44dcff7664 100644 --- a/lib/erl_interface/src/misc/ei_x_encode.c +++ b/lib/erl_interface/src/misc/ei_x_encode.c @@ -197,18 +197,33 @@ int ei_x_encode_tuple_header(ei_x_buff* x, long n) int ei_x_encode_atom(ei_x_buff* x, const char* s) { - return ei_x_encode_atom_len(x, s, strlen(s)); + return ei_x_encode_atom_len_as(x, s, strlen(s), ERLANG_LATIN1, ERLANG_LATIN1); } int ei_x_encode_atom_len(ei_x_buff* x, const char* s, int len) +{ + return ei_x_encode_atom_len_as(x, s, len, ERLANG_LATIN1, ERLANG_LATIN1); +} + +int ei_x_encode_atom_as(ei_x_buff* x, const char* s, + enum erlang_char_encoding from_enc, + enum erlang_char_encoding to_enc) +{ + return ei_x_encode_atom_len_as(x, s, strlen(s), from_enc, to_enc); +} + +int ei_x_encode_atom_len_as(ei_x_buff* x, const char* s, int len, + enum erlang_char_encoding from_enc, + enum erlang_char_encoding to_enc) { int i = x->index; - ei_encode_atom_len(NULL, &i, s, len); + ei_encode_atom_len_as(NULL, &i, s, len, from_enc, to_enc); if (!x_fix_buff(x, i)) return -1; - return ei_encode_atom_len(x->buff, &x->index, s, len); + return ei_encode_atom_len_as(x->buff, &x->index, s, len, from_enc, to_enc); } + int ei_x_encode_pid(ei_x_buff* x, const erlang_pid* pid) { int i = x->index; diff --git a/lib/erl_interface/src/misc/get_type.c b/lib/erl_interface/src/misc/get_type.c index c9a040fbbd..54465196b0 100644 --- a/lib/erl_interface/src/misc/get_type.c +++ b/lib/erl_interface/src/misc/get_type.c @@ -33,80 +33,6 @@ int ei_get_type(const char *buf, const int *index, int *type, int *len) return ei_get_type_internal(buf, index, type, len); } -#if 0 -int ei_get_type(const char *buf, const int *index, int *type, int *len) -{ - const char *s = buf + *index; - int itype = get8(s); /* Internal type */ - - *len = 0; - - switch (*type) { - - case ERL_SMALL_INTEGER_EXT: - case ERL_INTEGER_EXT: - case ERL_SMALL_BIG_EXT: - case ERL_LARGE_BIG_EXT: - *type = EI_TYPE_INTEGER; - break; - - case ERL_FLOAT_EXT: - *type = EI_TYPE_FLOAT; - break; - - case ERL_SMALL_TUPLE_EXT: - case ERL_SMALL_ATOM_EXT: - *len = get8(s); - break; - - case ERL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: - case ERL_STRING_EXT: - *len = get16be(s); - break; - - case ERL_LARGE_TUPLE_EXT: - case ERL_LIST_EXT: - case ERL_BINARY_EXT: - *len = get32be(s); - break; - - case ERL_SMALL_BIG_EXT: - *len = (get8(s)+1)/2; /* big arity */ - break; - - case ERL_LARGE_BIG_EXT: - *len = (get32be(s)+1)/2; /* big arity */ - break; - - case ERL_BINARY_EXT: - *type = EI_TYPE_BINARY; - break; - - case ERL_PID_EXT: - *type = EI_TYPE_PID; - break; - - case ERL_PORT_EXT: - *type = EI_TYPE_PORT; - break; - - case ERL_REFERENCE_EXT: - case ERL_NEW_REFERENCE_EXT: - *type = EI_TYPE_REF; - break; - - default: - break; - } - - /* leave index unchanged */ - return 0; -} -#endif - - -/* Old definition of function above */ int ei_get_type_internal(const char *buf, const int *index, int *type, int *len) @@ -117,12 +43,13 @@ int ei_get_type_internal(const char *buf, const int *index, switch (*type) { case ERL_SMALL_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: *type = ERL_ATOM_EXT; case ERL_SMALL_TUPLE_EXT: *len = get8(s); break; - case ERL_UNICODE_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: *type = ERL_ATOM_EXT; case ERL_ATOM_EXT: case ERL_STRING_EXT: diff --git a/lib/erl_interface/src/misc/putget.h b/lib/erl_interface/src/misc/putget.h index 8b0d4d3404..77ae168f8c 100644 --- a/lib/erl_interface/src/misc/putget.h +++ b/lib/erl_interface/src/misc/putget.h @@ -105,8 +105,12 @@ ((EI_ULONGLONG)((unsigned char *)(s))[-2] << 8) | \ (EI_ULONGLONG)((unsigned char *)(s))[-1])) -int ei_internal_get_atom(const char** bufp, char* p); +int utf8_to_latin1(char* dst, const char* src, int slen, int destlen, enum erlang_char_encoding* res_encp); +int latin1_to_utf8(char* dst, const char* src, int slen, int destlen, enum erlang_char_encoding* res_encp); +int ei_internal_get_atom(const char** bufp, char* p, enum erlang_char_encoding*); +int ei_internal_put_atom(char** bufp, const char* p, int slen, enum erlang_char_encoding); #define get_atom ei_internal_get_atom +#define put_atom ei_internal_put_atom typedef union float_ext { double d; diff --git a/lib/erl_interface/src/misc/show_msg.c b/lib/erl_interface/src/misc/show_msg.c index 194296798b..ca46b15aff 100644 --- a/lib/erl_interface/src/misc/show_msg.c +++ b/lib/erl_interface/src/misc/show_msg.c @@ -132,13 +132,13 @@ int ei_show_sendmsg(FILE *stream, const char *header, const char *msgbuf) switch (msg.msgtype) { case ERL_SEND: - if (ei_decode_atom(header,&index,msg.cookie) + if (ei_decode_atom_as(header,&index,msg.cookie,sizeof(msg.cookie),ERLANG_UTF8,NULL,NULL) || ei_decode_pid(header,&index,&msg.to)) return -1; mbuf = msgbuf; break; case ERL_SEND_TT: - if (ei_decode_atom(header,&index,msg.cookie) + if (ei_decode_atom_as(header,&index,msg.cookie,sizeof(msg.cookie),ERLANG_UTF8,NULL,NULL) || ei_decode_pid(header,&index,&msg.to) || ei_decode_trace(header,&index,&msg.token)) return -1; mbuf = msgbuf; @@ -146,15 +146,15 @@ int ei_show_sendmsg(FILE *stream, const char *header, const char *msgbuf) case ERL_REG_SEND: if (ei_decode_pid(header,&index,&msg.from) - || ei_decode_atom(header,&index,msg.cookie) - || ei_decode_atom(header,&index,msg.toname)) return -1; + || ei_decode_atom_as(header,&index,msg.cookie,sizeof(msg.cookie),ERLANG_UTF8,NULL,NULL) + || ei_decode_atom_as(header,&index,msg.toname,sizeof(msg.toname),ERLANG_UTF8,NULL,NULL)) return -1; mbuf = msgbuf; break; case ERL_REG_SEND_TT: if (ei_decode_pid(header,&index,&msg.from) - || ei_decode_atom(header,&index,msg.cookie) - || ei_decode_atom(header,&index,msg.toname) + || ei_decode_atom_as(header,&index,msg.cookie,sizeof(msg.cookie),ERLANG_UTF8,NULL,NULL) + || ei_decode_atom_as(header,&index,msg.toname,sizeof(msg.toname),ERLANG_UTF8,NULL,NULL) || ei_decode_trace(header,&index,&msg.token)) return -1; mbuf = msgbuf; break; diff --git a/lib/erl_interface/test/ei_decode_encode_SUITE.erl b/lib/erl_interface/test/ei_decode_encode_SUITE.erl index 85cb62239b..e8ae7a6f81 100644 --- a/lib/erl_interface/test/ei_decode_encode_SUITE.erl +++ b/lib/erl_interface/test/ei_decode_encode_SUITE.erl @@ -118,6 +118,9 @@ test_ei_decode_encode(Config) when is_list(Config) -> ?line send_rec(P, OXPort), ?line send_rec(P, OXRef), + %% Unicode atoms + [send_rec(P, Atom) || Atom <- unicode_atom_data()], + ?line runner:recv_eot(P), ok. @@ -127,7 +130,7 @@ test_ei_decode_encode(Config) when is_list(Config) -> % We read two packets for each test, the ei_decode_encode and ei_x_decode_encode version.... send_rec(P, Term) when is_port(P) -> - ?t:format("Testing: ~p~n", [Term]), + %%?t:format("Testing: ~p~n", [Term]), P ! {self(), {command, term_to_binary(Term)}}, {_B,Term} = get_buf_and_term(P). @@ -146,7 +149,7 @@ get_buf_and_term(P) -> _ -> B1 = list_to_binary([131,B]), % No magic, add T = binary_to_term(B1), - io:format("~w\n~w\n(got no magic)\n",[B,T]), + %io:format("~w\n~w\n(got no magic)\n",[B,T]), {B,T} end. @@ -160,7 +163,7 @@ get_binary(P) -> case runner:get_term(P) of {bytes,L} -> B = list_to_binary(L), - io:format("~w\n",[L]), + %%io:format("~w\n",[L]), % For strange reasons <<131>> show up as <>.... % io:format("~w\n",[B]), B; @@ -305,3 +308,60 @@ mk_ref({NodeName, Creation}, Numbers) when is_list(NodeName), exit({unexpected_binary_to_term_result, Other}) end. + + +unicode_atom_data() -> + [uc_atup(lists:seq(16#1f600, 16#1f600+254)), + uc_atup(lists:seq(16#1f600, 16#1f600+63)), + uc_atup(lists:seq(1, 255)), + uc_atup(lists:seq(100, 163)), + uc_atup(lists:seq(200, 354)), + uc_atup(lists:seq(200, 263)), + uc_atup(lists:seq(2000, 2254)), + uc_atup(lists:seq(2000, 2063)), + uc_atup(lists:seq(65500, 65754)), + uc_atup(lists:seq(65500, 65563)) + | lists:map(fun (N) -> + uc_atup(lists:seq(64000+N, 64254+N)) + end, + lists:seq(1, 2000))]. + +uc_atup(ATxt) -> + string_to_atom(ATxt). + +string_to_atom(String) -> + Utf8List = string_to_utf8_list(String), + Len = length(Utf8List), + TagLen = case Len < 256 of + true -> [119, Len]; + false -> [118, Len bsr 8, Len band 16#ff] + end, + binary_to_term(list_to_binary([131, TagLen, Utf8List])). + +string_to_utf8_list([]) -> + []; +string_to_utf8_list([CP|CPs]) when is_integer(CP), + 0 =< CP, + CP =< 16#7F -> + [CP | string_to_utf8_list(CPs)]; +string_to_utf8_list([CP|CPs]) when is_integer(CP), + 16#80 =< CP, + CP =< 16#7FF -> + [16#C0 bor (CP bsr 6), + 16#80 bor (16#3F band CP) + | string_to_utf8_list(CPs)]; +string_to_utf8_list([CP|CPs]) when is_integer(CP), + 16#800 =< CP, + CP =< 16#FFFF -> + [16#E0 bor (CP bsr 12), + 16#80 bor (16#3F band (CP bsr 6)), + 16#80 bor (16#3F band CP) + | string_to_utf8_list(CPs)]; +string_to_utf8_list([CP|CPs]) when is_integer(CP), + 16#10000 =< CP, + CP =< 16#10FFFF -> + [16#F0 bor (CP bsr 18), + 16#80 bor (16#3F band (CP bsr 12)), + 16#80 bor (16#3F band (CP bsr 6)), + 16#80 bor (16#3F band CP) + | string_to_utf8_list(CPs)]. diff --git a/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c b/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c index 406f02ecfb..194ce9057b 100644 --- a/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c +++ b/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c @@ -29,171 +29,222 @@ * Author: kent@erix.ericsson.se */ -#define EI_DECODE_ENCODE(FUNC,TYPE) \ - { \ - char *buf; \ - char buf2[1024]; \ - TYPE p; \ - int size1 = 0; \ - int size2 = 0; \ - int size3 = 0; \ - int err; \ - ei_x_buff arg; \ -\ - message("ei_decode_" #FUNC ", arg is type " #TYPE); \ - buf = read_packet(NULL); \ - err = ei_decode_ ## FUNC(buf+1, &size1, &p); \ - if (err != 0) { \ - if (err != -1) { \ - fail("decode returned non zero but not -1"); \ - } else { \ - fail("decode returned non zero"); \ - } \ - return; \ - } \ - if (size1 < 1) { \ - fail("size is < 1"); \ - return; \ - } \ -\ - message("ei_encode_" #FUNC " buf is NULL, arg is type " #TYPE); \ - err = ei_encode_ ## FUNC(NULL, &size2, &p); \ - if (err != 0) { \ - if (err != -1) { \ - fail("size calculation returned non zero but not -1"); \ - return; \ - } else { \ - fail("size calculation returned non zero"); \ - return; \ - } \ - } \ - if (size1 != size2) { \ - message("size1 = %d, size2 = %d\n",size1,size2); \ - fail("decode and encode size differs when buf is NULL"); \ - return; \ - } \ - message("ei_encode_" #FUNC ", arg is type " #TYPE); \ - err = ei_encode_ ## FUNC(buf2, &size3, &p); \ - if (err != 0) { \ - if (err != -1) { \ - fail("returned non zero but not -1"); \ - } else { \ - fail("returned non zero"); \ - } \ - return; \ - } \ - if (size1 != size3) { \ - message("size1 = %d, size2 = %d\n",size1,size3); \ - fail("decode and encode size differs"); \ - return; \ - } \ - send_buffer(buf2, size1); \ -\ - message("ei_x_encode_" #FUNC ", arg is type " #TYPE); \ - ei_x_new(&arg); \ - err = ei_x_encode_ ## FUNC(&arg, &p); \ - if (err != 0) { \ - if (err != -1) { \ - fail("returned non zero but not -1"); \ - } else { \ - fail("returned non zero"); \ - } \ - ei_x_free(&arg); \ - return; \ - } \ - if (arg.index < 1) { \ - fail("size is < 1"); \ - ei_x_free(&arg); \ - return; \ - } \ - send_buffer(arg.buff, arg.index); \ - ei_x_free(&arg); \ - } - -#define EI_DECODE_ENCODE_BIG(FUNC,TYPE) \ - { \ - char *buf; \ - char buf2[2048]; \ - TYPE *p; \ - int size1 = 0; \ - int size2 = 0; \ - int size3 = 0; \ - int err, index = 0, len, type; \ - ei_x_buff arg; \ -\ - message("ei_decode_" #FUNC ", arg is type " #TYPE); \ - buf = read_packet(NULL); \ - ei_get_type(buf+1, &index, &type, &len); \ - p = ei_alloc_big(len); \ - err = ei_decode_ ## FUNC(buf+1, &size1, p); \ - if (err != 0) { \ - if (err != -1) { \ - fail("decode returned non zero but not -1"); \ - } else { \ - fail("decode returned non zero"); \ - } \ - return; \ - } \ - if (size1 < 1) { \ - fail("size is < 1"); \ - return; \ - } \ -\ - message("ei_encode_" #FUNC " buf is NULL, arg is type " #TYPE); \ - err = ei_encode_ ## FUNC(NULL, &size2, p); \ - if (err != 0) { \ - if (err != -1) { \ - fail("size calculation returned non zero but not -1"); \ - return; \ - } else { \ - fail("size calculation returned non zero"); \ - return; \ - } \ - } \ - if (size1 != size2) { \ - message("size1 = %d, size2 = %d\n",size1,size2); \ - fail("decode and encode size differs when buf is NULL"); \ - return; \ - } \ - message("ei_encode_" #FUNC ", arg is type " #TYPE); \ - err = ei_encode_ ## FUNC(buf2, &size3, p); \ - if (err != 0) { \ - if (err != -1) { \ - fail("returned non zero but not -1"); \ - } else { \ - fail("returned non zero"); \ - } \ - return; \ - } \ - if (size1 != size3) { \ - message("size1 = %d, size2 = %d\n",size1,size3); \ - fail("decode and encode size differs"); \ - return; \ - } \ - send_buffer(buf2, size1); \ -\ - message("ei_x_encode_" #FUNC ", arg is type " #TYPE); \ - ei_x_new(&arg); \ - err = ei_x_encode_ ## FUNC(&arg, p); \ - if (err != 0) { \ - if (err != -1) { \ - fail("returned non zero but not -1"); \ - } else { \ - fail("returned non zero"); \ - } \ - ei_x_free(&arg); \ - return; \ - } \ - if (arg.index < 1) { \ - fail("size is < 1"); \ - ei_x_free(&arg); \ - return; \ - } \ - send_buffer(arg.buff, arg.index); \ - ei_x_free(&arg); \ - ei_free_big(p); \ - } +/*#define MESSAGE(FMT,A1,A2) message(FMT,A1,A2)*/ +#define MESSAGE(FMT,A1,A2) +typedef int decodeFT(const char *buf, int *index, void*); +typedef int encodeFT(char *buf, int *index, void*); +typedef int x_encodeFT(ei_x_buff*, void*); + +struct Type { + char* name; + char* type; + decodeFT* ei_decode_fp; + encodeFT* ei_encode_fp; + x_encodeFT* ei_x_encode_fp; +}; + +typedef struct +{ + char name[MAXATOMLEN_UTF8]; + enum erlang_char_encoding enc; +}my_atom; + +int ei_decode_my_atom(const char *buf, int *index, my_atom* a) +{ + return ei_decode_atom_as(buf, index, a->name, sizeof(a->name), ERLANG_UTF8, &a->enc, NULL); +} +int ei_encode_my_atom(char *buf, int *index, my_atom* a) +{ + return ei_encode_atom_as(buf, index, a->name, ERLANG_UTF8, a->enc); +} +int ei_x_encode_my_atom(ei_x_buff* x, my_atom* a) +{ + return ei_x_encode_atom_as(x, a->name, ERLANG_UTF8, a->enc); +} + +void decode_encode(struct Type* t, void* obj) +{ + char *buf; + char buf2[1024]; + int size1 = 0; + int size2 = 0; + int size3 = 0; + int err; + ei_x_buff arg; + + MESSAGE("ei_decode_%s, arg is type %s", t->name, t->type); + buf = read_packet(NULL); + err = t->ei_decode_fp(buf+1, &size1, obj); + if (err != 0) { + if (err != -1) { + fail("decode returned non zero but not -1"); + } else { + fail("decode returned non zero"); + } + return; + } + if (size1 < 1) { + fail("size is < 1"); + return; + } + + MESSAGE("ei_encode_%s buf is NULL, arg is type %s", t->name, t->type); + err = t->ei_encode_fp(NULL, &size2, obj); + if (err != 0) { + if (err != -1) { + fail("size calculation returned non zero but not -1"); + return; + } else { + fail("size calculation returned non zero"); + return; + } + } + if (size1 != size2) { + MESSAGE("size1 = %d, size2 = %d\n",size1,size2); + fail("decode and encode size differs when buf is NULL"); + return; + } + MESSAGE("ei_encode_%s, arg is type %s", t->name, t->type); + err = t->ei_encode_fp(buf2, &size3, obj); + if (err != 0) { + if (err != -1) { + fail("returned non zero but not -1"); + } else { + fail("returned non zero"); + } + return; + } + if (size1 != size3) { + MESSAGE("size1 = %d, size2 = %d\n",size1,size3); + fail("decode and encode size differs"); + return; + } + send_buffer(buf2, size1); + + MESSAGE("ei_x_encode_%s, arg is type %s", t->name, t->type); + ei_x_new(&arg); + err = t->ei_x_encode_fp(&arg, obj); + if (err != 0) { + if (err != -1) { + fail("returned non zero but not -1"); + } else { + fail("returned non zero"); + } + ei_x_free(&arg); + return; + } + if (arg.index < 1) { + fail("size is < 1"); + ei_x_free(&arg); + return; + } + send_buffer(arg.buff, arg.index); + ei_x_free(&arg); +} + + +#define EI_DECODE_ENCODE(TYPE, ERLANG_TYPE) { \ + struct Type type_struct = {#TYPE, #ERLANG_TYPE, \ + (decodeFT*)ei_decode_##TYPE, \ + (encodeFT*)ei_encode_##TYPE, \ + (x_encodeFT*)ei_x_encode_##TYPE }; \ + ERLANG_TYPE type_obj; \ + decode_encode(&type_struct, &type_obj); \ + } + + +void decode_encode_big(struct Type* t) +{ + char *buf; + char buf2[2048]; + void *p; /* (TYPE*) */ + int size1 = 0; + int size2 = 0; + int size3 = 0; + int err, index = 0, len, type; + ei_x_buff arg; + + MESSAGE("ei_decode_%s, arg is type %s", t->name, t->type); + buf = read_packet(NULL); + ei_get_type(buf+1, &index, &type, &len); + p = ei_alloc_big(len); + err = t->ei_decode_fp(buf+1, &size1, p); + if (err != 0) { + if (err != -1) { + fail("decode returned non zero but not -1"); + } else { + fail("decode returned non zero"); + } + return; + } + if (size1 < 1) { + fail("size is < 1"); + return; + } + + MESSAGE("ei_encode_%s buf is NULL, arg is type %s", t->name, t->type); + err = t->ei_encode_fp(NULL, &size2, p); + if (err != 0) { + if (err != -1) { + fail("size calculation returned non zero but not -1"); + return; + } else { + fail("size calculation returned non zero"); + return; + } + } + if (size1 != size2) { + MESSAGE("size1 = %d, size2 = %d\n",size1,size2); + fail("decode and encode size differs when buf is NULL"); + return; + } + MESSAGE("ei_encode_%s, arg is type %s", t->name, t->type); + err = t->ei_encode_fp(buf2, &size3, p); + if (err != 0) { + if (err != -1) { + fail("returned non zero but not -1"); + } else { + fail("returned non zero"); + } + return; + } + if (size1 != size3) { + MESSAGE("size1 = %d, size2 = %d\n",size1,size3); + fail("decode and encode size differs"); + return; + } + send_buffer(buf2, size1); + + MESSAGE("ei_x_encode_%s, arg is type %s", t->name, t->type); + ei_x_new(&arg); + err = t->ei_x_encode_fp(&arg, p); + if (err != 0) { + if (err != -1) { + fail("returned non zero but not -1"); + } else { + fail("returned non zero"); + } + ei_x_free(&arg); + return; + } + if (arg.index < 1) { + fail("size is < 1"); + ei_x_free(&arg); + return; + } + send_buffer(arg.buff, arg.index); + ei_x_free(&arg); + ei_free_big(p); +} + +#define EI_DECODE_ENCODE_BIG(TYPE, ERLANG_TYPE) { \ + struct Type type_struct = {#TYPE, #ERLANG_TYPE, \ + (decodeFT*)ei_decode_##TYPE, \ + (encodeFT*)ei_encode_##TYPE, \ + (x_encodeFT*)ei_x_encode_##TYPE }; \ + decode_encode_big(&type_struct); \ + } @@ -201,6 +252,8 @@ TESTCASE(test_ei_decode_encode) { + int i; + EI_DECODE_ENCODE(fun , erlang_fun); EI_DECODE_ENCODE(pid , erlang_pid); EI_DECODE_ENCODE(port , erlang_port); @@ -223,6 +276,11 @@ TESTCASE(test_ei_decode_encode) EI_DECODE_ENCODE(port , erlang_port); EI_DECODE_ENCODE(ref , erlang_ref); + /* Unicode atoms */ + for (i=0; i<2010; i++) { + EI_DECODE_ENCODE(my_atom , my_atom); + } + report(1); } diff --git a/lib/ic/examples/all-against-all/client.c b/lib/ic/examples/all-against-all/client.c index e0a52b142d..5dece9cfa6 100644 --- a/lib/ic/examples/all-against-all/client.c +++ b/lib/ic/examples/all-against-all/client.c @@ -88,6 +88,7 @@ int main(){ /* Initiating pid*/ strcpy(pid.node,client_node); + pid.node_org_enc = ERLANG_LATIN1; pid.num = 99; pid.serial = 0; pid.creation = 0; diff --git a/lib/ic/examples/c-client/client.c b/lib/ic/examples/c-client/client.c index 816477cf15..5b11510ce3 100644 --- a/lib/ic/examples/c-client/client.c +++ b/lib/ic/examples/c-client/client.c @@ -64,6 +64,7 @@ int main() /* Initiating pid*/ strcpy(pid.node,CLNODE); + pid.node_org_enc = ERLANG_LATIN1; pid.num = 99; pid.serial = 0; pid.creation = 0; diff --git a/lib/ic/examples/c-server/client.c b/lib/ic/examples/c-server/client.c index fa570089b5..605e41ddb1 100644 --- a/lib/ic/examples/c-server/client.c +++ b/lib/ic/examples/c-server/client.c @@ -58,6 +58,7 @@ int main() /* Initiating pid*/ strcpy(pid.node, CLNODE); + pid.node_org_enc = ERLANG_LATIN1; pid.num = 99; pid.serial = 0; pid.creation = 0; -- cgit v1.2.3 From a912b3c6f4759a6a8e60fc4ea559c19edb02448c Mon Sep 17 00:00:00 2001 From: Rickard Green Date: Thu, 17 Jan 2013 21:09:46 +0100 Subject: test case --- erts/emulator/test/distribution_SUITE.erl | 197 ++++++++++++++++++++++++++---- 1 file changed, 175 insertions(+), 22 deletions(-) diff --git a/erts/emulator/test/distribution_SUITE.erl b/erts/emulator/test/distribution_SUITE.erl index aaf6420ccd..e7bd8d19c3 100644 --- a/erts/emulator/test/distribution_SUITE.erl +++ b/erts/emulator/test/distribution_SUITE.erl @@ -18,7 +18,17 @@ %% -module(distribution_SUITE). --compile(r13). +-compile(r15). + +-define(VERSION_MAGIC, 131). + +-define(ATOM_EXT, 100). +-define(REFERENCE_EXT, 101). +-define(PORT_EXT, 102). +-define(PID_EXT, 103). +-define(NEW_REFERENCE_EXT, 114). +-define(ATOM_UTF8_EXT, 118). +-define(SMALL_ATOM_UTF8_EXT, 119). %% Tests distribution and the tcp driver. @@ -1139,27 +1149,66 @@ atom_data() -> lists:seq(1, 2000)). verify_atom_data(AtomData) -> - lists:foreach(fun ({Atom, AtomTxt}) -> - AtomTxt = atom_to_list(Atom) + lists:foreach(fun ({Atom, AtomTxt}) when is_atom(Atom) -> + AtomTxt = atom_to_list(Atom); + ({PPR, AtomTxt}) -> + % Pid, Port, or Ref + AtomTxt = atom_to_list(node(PPR)) end, AtomData). -uc_atup(ATxt) -> - {string_to_atom(ATxt), ATxt}. +uc_atom_tup(ATxt) -> + Atom = string_to_atom(ATxt), + ATxt = atom_to_list(Atom), + {Atom, ATxt}. + +uc_pid_tup(ATxt) -> + ATxtExt = string_to_atom_ext(ATxt), + Pid = mk_pid({ATxtExt, 1}, 4711,17), + true = is_pid(Pid), + Atom = node(Pid), + true = is_atom(Atom), + ATxt = atom_to_list(Atom), + {Pid, ATxt}. + +uc_port_tup(ATxt) -> + ATxtExt = string_to_atom_ext(ATxt), + Port = mk_port({ATxtExt, 2}, 4711), + true = is_port(Port), + Atom = node(Port), + true = is_atom(Atom), + ATxt = atom_to_list(Atom), + {Port, ATxt}. + +uc_ref_tup(ATxt) -> + ATxtExt = string_to_atom_ext(ATxt), + Ref = mk_ref({ATxtExt, 3}, [4711,17, 4711]), + true = is_reference(Ref), + Atom = node(Ref), + true = is_atom(Atom), + ATxt = atom_to_list(Atom), + {Ref, ATxt}. + unicode_atom_data() -> - [uc_atup(lists:seq(16#1f600, 16#1f600+254)), - uc_atup(lists:seq(16#1f600, 16#1f600+63)), - uc_atup(lists:seq(0, 254)), - uc_atup(lists:seq(100, 163)), - uc_atup(lists:seq(200, 354)), - uc_atup(lists:seq(200, 263)), - uc_atup(lists:seq(2000, 2254)), - uc_atup(lists:seq(2000, 2063)), - uc_atup(lists:seq(65500, 65754)), - uc_atup(lists:seq(65500, 65563)) + [uc_pid_tup(lists:seq(16#1f600, 16#1f600+249) ++ "@host"), + uc_pid_tup(lists:seq(16#1f600, 16#1f600+30) ++ "@host"), + uc_port_tup(lists:seq(16#1f600, 16#1f600+249) ++ "@host"), + uc_port_tup(lists:seq(16#1f600, 16#1f600+30) ++ "@host"), + uc_ref_tup(lists:seq(16#1f600, 16#1f600+249) ++ "@host"), + uc_ref_tup(lists:seq(16#1f600, 16#1f600+30) ++ "@host"), + uc_atom_tup(lists:seq(16#1f600, 16#1f600+254)), + uc_atom_tup(lists:seq(16#1f600, 16#1f600+63)), + uc_atom_tup(lists:seq(0, 254)), + uc_atom_tup(lists:seq(100, 163)), + uc_atom_tup(lists:seq(200, 354)), + uc_atom_tup(lists:seq(200, 263)), + uc_atom_tup(lists:seq(2000, 2254)), + uc_atom_tup(lists:seq(2000, 2063)), + uc_atom_tup(lists:seq(65500, 65754)), + uc_atom_tup(lists:seq(65500, 65563)) | lists:map(fun (N) -> - uc_atup(lists:seq(64000+N, 64254+N)) + uc_atom_tup(lists:seq(64000+N, 64254+N)) end, lists:seq(1, 2000))]. @@ -2193,14 +2242,19 @@ repeat(Fun, N) -> Fun(), repeat(Fun, N-1). -string_to_atom(String) -> +string_to_atom_ext(String) -> Utf8List = string_to_utf8_list(String), Len = length(Utf8List), - TagLen = case Len < 256 of - true -> [119, Len]; - false -> [118, Len bsr 8, Len band 16#ff] - end, - binary_to_term(list_to_binary([131, TagLen, Utf8List])). + case Len < 256 of + true -> + [?SMALL_ATOM_UTF8_EXT, Len | Utf8List]; + false -> + [?ATOM_UTF8_EXT, Len bsr 8, Len band 16#ff | Utf8List] + end. + +string_to_atom(String) -> + binary_to_term(list_to_binary([?VERSION_MAGIC + | string_to_atom_ext(String)])). string_to_utf8_list([]) -> []; @@ -2275,3 +2329,102 @@ utf8_list_to_string([B0, B1, B2, B3 | Bs]) when is_integer(B0), bor ((B2 band 16#3F) bsl 6) bor (B3 band 16#3F)) | utf8_list_to_string(Bs)]. + +mk_pid({NodeName, Creation}, Number, Serial) when is_atom(NodeName) -> + <> = term_to_binary(NodeName), + mk_pid({NodeNameExt, Creation}, Number, Serial); +mk_pid({NodeNameExt, Creation}, Number, Serial) -> + case catch binary_to_term(list_to_binary([?VERSION_MAGIC, + ?PID_EXT, + NodeNameExt, + uint32_be(Number), + uint32_be(Serial), + uint8(Creation)])) of + Pid when is_pid(Pid) -> + Pid; + {'EXIT', {badarg, _}} -> + exit({badarg, mk_pid, [{NodeNameExt, Creation}, Number, Serial]}); + Other -> + exit({unexpected_binary_to_term_result, Other}) + end. + +mk_port({NodeName, Creation}, Number) when is_atom(NodeName) -> + <> = term_to_binary(NodeName), + mk_port({NodeNameExt, Creation}, Number); +mk_port({NodeNameExt, Creation}, Number) -> + case catch binary_to_term(list_to_binary([?VERSION_MAGIC, + ?PORT_EXT, + NodeNameExt, + uint32_be(Number), + uint8(Creation)])) of + Port when is_port(Port) -> + Port; + {'EXIT', {badarg, _}} -> + exit({badarg, mk_port, [{NodeNameExt, Creation}, Number]}); + Other -> + exit({unexpected_binary_to_term_result, Other}) + end. + +mk_ref({NodeName, Creation}, [Number] = NL) when is_atom(NodeName), + is_integer(Creation), + is_integer(Number) -> + <> = term_to_binary(NodeName), + mk_ref({NodeNameExt, Creation}, NL); +mk_ref({NodeNameExt, Creation}, [Number]) when is_integer(Creation), + is_integer(Number) -> + case catch binary_to_term(list_to_binary([?VERSION_MAGIC, + ?REFERENCE_EXT, + NodeNameExt, + uint32_be(Number), + uint8(Creation)])) of + Ref when is_reference(Ref) -> + Ref; + {'EXIT', {badarg, _}} -> + exit({badarg, mk_ref, [{NodeNameExt, Creation}, [Number]]}); + Other -> + exit({unexpected_binary_to_term_result, Other}) + end; +mk_ref({NodeName, Creation}, Numbers) when is_atom(NodeName), + is_integer(Creation), + is_list(Numbers) -> + <> = term_to_binary(NodeName), + mk_ref({NodeNameExt, Creation}, Numbers); +mk_ref({NodeNameExt, Creation}, Numbers) when is_integer(Creation), + is_list(Numbers) -> + case catch binary_to_term(list_to_binary([?VERSION_MAGIC, + ?NEW_REFERENCE_EXT, + uint16_be(length(Numbers)), + NodeNameExt, + uint8(Creation), + lists:map(fun (N) -> + uint32_be(N) + end, + Numbers)])) of + Ref when is_reference(Ref) -> + Ref; + {'EXIT', {badarg, _}} -> + exit({badarg, mk_ref, [{NodeNameExt, Creation}, Numbers]}); + Other -> + exit({unexpected_binary_to_term_result, Other}) + end. + + +uint32_be(Uint) when is_integer(Uint), 0 =< Uint, Uint < 1 bsl 32 -> + [(Uint bsr 24) band 16#ff, + (Uint bsr 16) band 16#ff, + (Uint bsr 8) band 16#ff, + Uint band 16#ff]; +uint32_be(Uint) -> + exit({badarg, uint32_be, [Uint]}). + + +uint16_be(Uint) when is_integer(Uint), 0 =< Uint, Uint < 1 bsl 16 -> + [(Uint bsr 8) band 16#ff, + Uint band 16#ff]; +uint16_be(Uint) -> + exit({badarg, uint16_be, [Uint]}). + +uint8(Uint) when is_integer(Uint), 0 =< Uint, Uint < 1 bsl 8 -> + Uint band 16#ff; +uint8(Uint) -> + exit({badarg, uint8, [Uint]}). -- cgit v1.2.3 From b15688d40d5147c1122aaad3b82495fbbc4dede8 Mon Sep 17 00:00:00 2001 From: Rickard Green Date: Sat, 19 Jan 2013 00:45:16 +0100 Subject: UTF-8 atom documentation --- erts/doc/src/erl_dist_protocol.xml | 288 ++++++++++++++++++++- erts/doc/src/erl_ext_dist.xml | 121 ++++++++- erts/doc/src/erlang.xml | 10 +- lib/kernel/internal_doc/distribution_handshake.txt | 216 +--------------- 4 files changed, 398 insertions(+), 237 deletions(-) diff --git a/erts/doc/src/erl_dist_protocol.xml b/erts/doc/src/erl_dist_protocol.xml index 6c725fc82d..0252187be5 100644 --- a/erts/doc/src/erl_dist_protocol.xml +++ b/erts/doc/src/erl_dist_protocol.xml @@ -547,13 +547,289 @@ If Result > 0, the packet only consists of [119, Result]. --> - +
- Handshake -

- The handshake is discussed in detail in the internal documentation for - the kernel (Erlang) application. -

+ Distribution Handshake +

+ This section describes the distribution handshake protocol + introduced in the OTP-R6 release of Erlang/OTP. This + description was previously located in + $ERL_TOP/lib/kernel/internal_doc/distribution_handshake.txt, + and has more or less been copied and "formatted" here. It has been + more or less unchanged since the year 1999, but the handshake + should not have changed much since then either. +

+
+ General +

+ The TCP/IP distribution uses a handshake which expects a + connection based protocol, i.e. the protocol does not include + any authentication after the handshake procedure. +

+

+ This is not entirely safe, as it is vulnerable against takeover + attacks, but it is a tradeoff between fair safety and performance. +

+

+ The cookies are never sent in cleartext and the handshake procedure + expects the client (called A) to be the first one to prove that it can + generate a sufficient digest. The digest is generated with the + MD5 message digest algorithm and the challenges are expected to be very + random numbers. +

+
+
+ Definitions +

+ A challenge is a 32 bit integer number in big endian order. Below the function + gen_challenge() returns a random 32 bit integer used as a challenge. +

+

+ A digest is a (16 bytes) MD5 hash of the Challenge (as text) concatenated + with the cookie (as text). Below, the function gen_digest(Challenge, Cookie) + generates a digest as described above. +

+

+ An out_cookie is the cookie used in outgoing communication to a certain node, + so that A's out_cookie for B should correspond with B's in_cookie for A and + the other way around. A's out_cookie for B and A's in_cookie for B need NOT + be the same. Below the function out_cookie(Node) returns the current + node's out_cookie for Node. +

+

+ An in_cookie is the cookie expected to be used by another node when + communicating with us, so that A's in_cookie for B corresponds with B's + out_cookie for A. Below the function in_cookie(Node) returns the current + node's in_cookie for Node. +

+

+ The cookies are text strings that can be viewed as passwords. +

+

+ Every message in the handshake starts with a 16 bit big endian integer + which contains the length of the message (not counting the two initial bytes). + In erlang this corresponds to the gen_tcp option {packet, 2}. Note that after + the handshake, the distribution switches to 4 byte packet headers. +

+ +
+
+ The Handshake in Detail +

+ Imagine two nodes, node A, which initiates the handshake and node B, which + accepts the connection. +

+ + 1) connect/accept +

A connects to B via TCP/IP and B accepts the connection.

+ 2) send_name/receive_name +

A sends an initial identification to B. B receives the message. + The message looks like this (every "square" being one byte and the packet + header removed): +

+
++---+--------+--------+-----+-----+-----+-----+-----+-----+-...-+-----+
+|'n'|Version0|Version1|Flag0|Flag1|Flag2|Flag3|Name0|Name1| ... |NameN|
++---+--------+--------+-----+-----+-----+-----+-----+-----+-... +-----+
+
+

+ The 'n' is just a message tag. + Version0 and Version1 is the distribution version selected by node A, + based on information from EPMD. (16 bit big endian) + Flag0 ... Flag3 are capability flags, the capabilities defined in + $ERL_TOP/lib/kernel/include/dist.hrl. + (32 bit big endian) + Name0 ... NameN is the full nodename of A, as a string of bytes (the + packet length denotes how long it is). +

+ 3) recv_status/send_status +

B sends a status message to A, which indicates + if the connection is allowed. The following status codes are defined:

+ + ok + The handshake will continue. + ok_simultaneous + The handshake will continue, but A is informed that B + has another ongoing connection attempt that will be + shut down (simultaneous connect where A's name is + greater than B's name, compared literally). + nok + The handshake will not continue, as B already has an ongoing handshake + which it itself has initiated. (simultaneous connect where B's name is + greater than A's). + not_allowed + The connection is disallowed for some (unspecified) security + reason. + alive + A connection to the node is already active, which either means + that node A is confused or that the TCP connection breakdown + of a previous node with this name has not yet reached node B. + See 3B below. + +

This is the format of the status message:

+
++---+-------+-------+-...-+-------+
+|'s'|Status0|Status1| ... |StatusN|
++---+-------+-------+-...-+-------+
+
+

+ 's' is the message tag Status0 ... StatusN is the status as a string (not terminated) +

+
+ 3B) send_status/recv_status +

If status was 'alive', node A will answer with + another status message containing either 'true' which means that the + connection should continue (The old connection from this node is broken), or + 'false', which simply means that the connection should be closed, the + connection attempt was a mistake.

+ 4) recv_challenge/send_challenge +

If the status was ok or ok_simultaneous, + The handshake continues with B sending A another message, the challenge. + The challenge contains the same type of information as the "name" message + initially sent from A to B, with the addition of a 32 bit challenge:

+
++---+--------+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-...-+-----+
+|'n'|Version0|Version1|Flag0|Flag1|Flag2|Flag3|Chal0|Chal1|Chal2|Chal3|Name0|Name1| ... |NameN|
++---+--------+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-... +-----+
+
+

+ Where Chal0 ... Chal3 is the challenge as a 32 bit big endian integer + and the other fields are B's version, flags and full nodename. +

+ 5) send_challenge_reply/recv_challenge_reply +

Now A has generated a digest and its own challenge. Those are + sent together in a package to B:

+
++---+-----+-----+-----+-----+-----+-----+-----+-----+-...-+------+
+|'r'|Chal0|Chal1|Chal2|Chal3|Dige0|Dige1|Dige2|Dige3| ... |Dige15|
++---+-----+-----+-----+-----+-----+-----+-----+-----+-...-+------+
+
+

+ Where 'r' is the tag, Chal0 ... Chal3 is A's challenge for B to handle and + Dige0 ... Dige15 is the digest that A constructed from the challenge B sent + in the previous step. +

+ 6) recv_challenge_ack/send_challenge_ack +

B checks that the digest received from A is correct and generates a + digest from the challenge received from A. The digest is then sent to A. The + message looks like this:

+
++---+-----+-----+-----+-----+-...-+------+
+|'a'|Dige0|Dige1|Dige2|Dige3| ... |Dige15|
++---+-----+-----+-----+-----+-...-+------+
+
+

+ Where 'a' is the tag and Dige0 ... Dige15 is the digest calculated by B + for A's challenge.

+ 7) +

A checks the digest from B and the connection is up.

+
+
+
+ Semigraphic View +
+A (initiator)						B (acceptor)
+
+TCP connect ----------------------------------------->	
+							TCP accept
+
+send_name   ----------------------------------------->
+							recv_name
+
+	    <----------------------------------------	send_status
+recv_status
+(if status was 'alive'
+ send_status - - - - - - - - - - - - - - - - - - - ->
+							recv_status)
+							ChB = gen_challenge()
+		          (ChB)
+	    <----------------------------------------	send_challenge
+recv_challenge
+
+ChA = gen_challenge(),
+OCA = out_cookie(B),
+DiA = gen_digest(ChB,OCA)
+			  (ChA, DiA)
+send_challenge_reply -------------------------------->
+							recv_challenge_reply
+							ICB = in_cookie(A),
+							check:
+							DiA == gen_digest
+								(ChB, ICB) ?
+							- if OK:
+	    						 OCB = out_cookie(A),
+							 DiB = gen_digest
+			(DiB)					(ChA, OCB)
+	    <-----------------------------------------	 send_challenge_ack
+recv_challenge_ack					 DONE
+ICA = in_cookie(B),                                     - else
+check:                                                   CLOSE
+DiB == gen_digest(ChA,ICA) ?
+- if OK
+ DONE
+- else
+ CLOSE
+
+
+ +
+ The Currently Defined Distribution Flags +

+ Currently (OTP-R16) the following capability flags are defined: +

+
+%% The node should be published and part of the global namespace
+-define(DFLAG_PUBLISHED,1).
+
+%% The node implements an atom cache (obsolete)
+-define(DFLAG_ATOM_CACHE,2).
+
+%% The node implements extended (3 * 32 bits) references. This is
+%% required today. If not present connection will be refused.
+-define(DFLAG_EXTENDED_REFERENCES,4).
+
+%% The node implements distributed process monitoring.
+-define(DFLAG_DIST_MONITOR,8).
+
+%% The node uses separate tag for fun's (lambdas) in the distribution protocol.
+-define(DFLAG_FUN_TAGS,16#10).
+
+%% The node implements distributed named process monitoring.
+-define(DFLAG_DIST_MONITOR_NAME,16#20).
+
+%% The (hidden) node implements atom cache (obsolete)
+-define(DFLAG_HIDDEN_ATOM_CACHE,16#40).
+
+%% The node understand new fun-tags
+-define(DFLAG_NEW_FUN_TAGS,16#80).
+
+%% The node is capable of handling extended pids and ports. This is
+%% required today. If not present connection will be refused.
+-define(DFLAG_EXTENDED_PIDS_PORTS,16#100).
+
+%%
+-define(DFLAG_EXPORT_PTR_TAG,16#200).
+
+%%
+-define(DFLAG_BIT_BINARIES,16#400).
+
+%% The node understands new float format
+-define(DFLAG_NEW_FLOATS,16#800).
+
+%%
+-define(DFLAG_UNICODE_IO,16#1000).
+
+%% The node implements atom cache in distribution header.
+-define(DFLAG_DIST_HDR_ATOM_CACHE,16#2000).
+
+%% The node understand the SMALL_ATOM_EXT tag
+-define(DFLAG_SMALL_ATOM_TAGS, 16#4000).
+
+%% The node understand UTF-8 encoded atoms
+-define(DFLAG_UTF8_ATOMS, 16#10000).
+
+
+
diff --git a/erts/doc/src/erl_ext_dist.xml b/erts/doc/src/erl_ext_dist.xml index fd2da2cfe3..28afea8b29 100644 --- a/erts/doc/src/erl_ext_dist.xml +++ b/erts/doc/src/erl_ext_dist.xml @@ -119,10 +119,39 @@ Data + + +

As of ERTS version 5.10 (OTP-R16) support + for UTF-8 encoded atoms has been introduced in the external format. + However, only characters that can be encoded using Latin1 (ISO-8859-1) + are currently supported in atoms. The support for UTF-8 encoded atoms + in the external format has been implemented in order to be able to support + all Unicode characters in atoms in some future release. Full + support for Unicode atoms will not happen before OTP-R18, and might + be introduced even later than that. Until full Unicode support for + atoms has been introduced, it is an error to pass atoms containing + characters that cannot be encoded in Latin1, and the behavior is + undefined.

+

When the + DFLAG_UTF8_ATOMS + distribution flag has been exchanged between both nodes in the + distribution handshake, + all atoms in the distribution header will be encoded in UTF-8; otherwise, + all atoms in the distribution header will be encoded in Latin1. The two + new tags ATOM_UTF8_EXT, and + SMALL_ATOM_UTF8_EXT + will only be used if the DFLAG_UTF8_ATOMS distribution flag has + been exchanged between nodes, or if an atom containing characters + that cannot be encoded in Latin1 is encountered. +

+

The maximum number of allowed characters in an atom is 255. In the + UTF-8 case each character may need 4 bytes to be encoded. +

+
-
- + +
Distribution header

As of erts version 5.7.2 the old atom cache protocol was @@ -219,8 +248,7 @@

The least significant bit in that half byte is the LongAtoms flag. If it is set, 2 bytes are used for atom lengths instead of - 1 byte in the distribution header. However, the current emulator - cannot handle long atoms, so it will currently always be 0. + 1 byte in the distribution header.

After the Flags field follow the AtomCacheRefs. The @@ -247,15 +275,25 @@

InternalSegmentIndex together with the SegmentIndex completely identify the location of an atom cache entry in the - atom cache. Length is number of one byte characters that - the atom text consists of. Length is a two byte big endian integer + atom cache. Length is number of bytes that AtomText + consists of. Length is a two byte big endian integer if the LongAtoms flag has been set, otherwise a one byte - integer. Subsequent CachedAtomRefs with the same + integer. When the + DFLAG_UTF8_ATOMS + distribution flag has been exchanged between both nodes in the + distribution handshake, + characters in AtomText is encoded in UTF-8; otherwise, + encoded in Latin1. Subsequent CachedAtomRefs with the same SegmentIndex and InternalSegmentIndex as this NewAtomCacheRef will refer to this atom until a new NewAtomCacheRef with the same SegmentIndex and InternalSegmentIndex appear.

+

+ For more information on encoding of atoms, see + note on UTF-8 encoded atoms + in the beginning of this document. +

If the NewCacheEntryFlag for the next AtomCacheRef has not been set, a CachedAtomRef on the following format @@ -383,9 +421,9 @@

An atom is stored with a 2 byte unsigned length in big-endian order, - followed by Len numbers of 8 bit characters that forms the - AtomName. - Note: The maximum allowed value for Len is 255. + followed by Len numbers of 8 bit Latin1 characters that forms + the AtomName. + Note: The maximum allowed value for Len is 255.

@@ -754,12 +792,14 @@

An atom is stored with a 1 byte unsigned length, - followed by Len numbers of 8 bit characters that + followed by Len numbers of 8 bit Latin1 characters that forms the AtomName. Longer atoms can be represented by ATOM_EXT. Note the SMALL_ATOM_EXT was introduced in erts version 5.7.2 and - require a small atom distribution flag exchanged in the distribution - handshake. + require an exchange of the + DFLAG_SMALL_ATOM_TAGS + distribution flag in the + distribution handshake.

@@ -1007,7 +1047,62 @@ This term is used in minor version 1 of the external format.

+
+ + ATOM_UTF8_EXT + + + + 1 + 2 + Len + + + 118 + Len + AtomName + +
+

+ An atom is stored with a 2 byte unsigned length in big-endian order, + followed by Len bytes containing the AtomName encoded + in UTF-8. +

+

+ For more information on encoding of atoms, see + note on UTF-8 encoded atoms + in the beginning of this document. +

+
+
+ + SMALL_ATOM_UTF8_EXT + + + + 1 + 1 + Len + + + 119 + Len + AtomName + +
+

+ An atom is stored with a 1 byte unsigned length, + followed by Len bytes containing the AtomName encoded + in UTF-8. Longer atoms encoded in UTF-8 can be represented using + ATOM_UTF8_EXT. +

+

+ For more information on encoding of atoms, see + note on UTF-8 encoded atoms + in the beginning of this document. +

+
diff --git a/erts/doc/src/erlang.xml b/erts/doc/src/erlang.xml index 5002c48ca1..0077c0096c 100644 --- a/erts/doc/src/erlang.xml +++ b/erts/doc/src/erlang.xml @@ -277,7 +277,9 @@ the binary contains Unicode characters greater than 16#FF. In a future release, such Unicode characters might be allowed and binary_to_atom(Binary, utf8) - will not fail in that case.

+ will not fail in that case. For more information on Unicode support in atoms + see note on UTF-8 encoded atoms + in the chapter about the external term format in the ERTS User's Guide.

 > binary_to_atom(<<"Erlang">>, latin1).
@@ -1647,9 +1649,11 @@ os_prompt% 

Returns the atom whose text representation is String.

String may only contain ISO-latin-1 - characterns (i.e. numbers below 256) as the current + characters (i.e. numbers below 256) as the current implementation does not allow unicode characters >= 256 in - atoms.

+ atoms. For more information on Unicode support in atoms + see note on UTF-8 encoded atoms + in the chapter about the external term format in the ERTS User's Guide.

 > list_to_atom("Erlang").
 'Erlang'
diff --git a/lib/kernel/internal_doc/distribution_handshake.txt b/lib/kernel/internal_doc/distribution_handshake.txt index 6a3ee22ed3..d00c4ceb02 100644 --- a/lib/kernel/internal_doc/distribution_handshake.txt +++ b/lib/kernel/internal_doc/distribution_handshake.txt @@ -1,215 +1 @@ -HOW THE DISTRIBUTION HANDSHAKE WORKS ------------------------------------- - -This document describes the distribution handshake introduced in -the R6 release of Erlang/OTP. - -GENERAL -------- - -The TCP/IP distribution uses a handshake which expects a -connection based protocol, i.e. the protocol does not include -any authentication after the handshake procedure. - -This is not entirely safe, as it is vulnerable against takeover -attacks, but it is a tradeoff between fair safety and performance. - -The cookies are never sent in cleartext and the handshake procedure -expects the client (called A) to be the first one to prove that it can -generate a sufficient digest. The digest is generated with the -MD5 message digest algorithm and the challenges are expected to be very -random numbers. - -DEFINITIONS ------------ - -A challenge is a 32 bit integer number in big endian order. Below the function -gen_challenge() returns a random 32 bit integer used as a challenge. - -A digest is a (16 bytes) MD5 hash of [the Challenge (as text) concatenated -with the cookie (as text)]. Below, the function gen_digest(Challenge, Cookie) -generates a digest as described above. - -An out_cookie is the cookie used in outgoing communication to a certain node, -so that A's out_cookie for B should correspond with B's in_cookie for A and -the other way around. A's out_cookie for B and A's in_cookie for B need *NOT* -be the same. Below the function out_cookie(Node) returns the current -node's out_cookie for Node. - -An in_cookie is the cookie expected to be used by another node when -communicating with us, so that A's in_cookie for B corresponds with B's -out_cookie for A. Below the function in_cookie(Node) returns the current -node's in_cookie for Node. - -The cookies are text strings that can be viewed as passwords. - -Every message in the handshake starts with a 16 bit big endian integer -which contains the length of the message (not counting the two initial bytes). -In erlang this corresponds to the gen_tcp option {packet, 2}. Note that after -the handshake, the distribution switches to 4 byte packet headers. - -THE HANDSHAKE IN DETAIL ------------------------ - -Imagine two nodes, node A, which initiates the handshake and node B, which -accepts the connection. - -1) connect/accept: A connects to B via TCP/IP and B accepts the connection. - -2) send_name/receive_name: A sends an initial identification to B. -B receives the message. The message looks -like this (every "square" being one byte and the packet header removed): - -+---+--------+--------+-----+-----+-----+-----+-----+-----+-...-+-----+ -|'n'|Version0|Version1|Flag0|Flag1|Flag2|Flag3|Name0|Name1| ... |NameN| -+---+--------+--------+-----+-----+-----+-----+-----+-----+-... +-----+ - -The 'n' is just a message tag, -Version0 & Version1 is the distribution version selected by node A, - based on information from EPMD. (16 bit big endian) -Flag0 ... Flag3 are capability flags, the capabilities defined in dist.hrl. - (32 bit big endian) -Name0 ... NameN is the full nodename of A, as a string of bytes (the - packet length denotes how long it is). - -3) recv_status/send_status: B sends a status message to A, which indicates -if the connection is allowed. Four different status codes are defined: -ok: The handshake will continue. -ok_simultaneous: The handshake will continue, but A is informed that B - has another ongoing connection attempt that will be - shut down (simultaneous connect where A's name is - greater than B's name, compared literally), -nok: The handshake will not continue, as B already has an ongoing handshake - which it itself has initiated. (simultaneous connect where B's name is - greater than A's) -not_allowed: The connection is disallowed for some (unspecified) security - reason. -alive: A connection to the node is already active, which either means - that node A is confused or that the TCP connection breakdown - of a previous node with this name has not yet reached node B. - See 3B below. - -This is the format of the status message: - -+---+-------+-------+-...-+-------+ -|'s'|Status0|Status1| ... |StatusN| -+---+-------+-------+-...-+-------+ - -'s' is the message tag -Status0 ... StatusN is the status as a string (not terminated) - -3B) send_status/recv_status: If status was 'alive', node A will answer with -another status message containing either 'true' which means that the -connection should continue (The old connection from this node is broken), or -'false', which simply means that the connection should be closed, the -connection attempt was a mistake. - -4) recv_challenge/send_challenge: If the status was 'ok' or 'ok_simultaneous', -The handshake continues with B sending A another message, the challenge. -The challenge contains the same type of information as the "name" message -initially sent from A to B, with the addition of a 32 bit challenge: - -+---+--------+--------+-----+-----+-----+-----+-----+-----+-----+-----+--- -|'n'|Version0|Version1|Flag0|Flag1|Flag2|Flag3|Chal0|Chal1|Chal2|Chal3| -+---+--------+--------+-----+-----+-----+-----+-----+-----+-----+-----+--- - ------+-----+-...-+-----+ - Name0|Name1| ... |NameN| - ------+-----+-... +-----+ - -Where Chal0 ... Chal3 is the challenge as a 32 bit big endian integer -and the other fields are B's version, flags and full nodename. - -5) send_challenge_reply/recv_challenge_reply: Now A has generated -a digest and its own challenge. Those are sent together in a package -to B: - -+---+-----+-----+-----+-----+-----+-----+-----+-----+-...-+------+ -|'r'|Chal0|Chal1|Chal2|Chal3|Dige0|Dige1|Dige2|Dige3| ... |Dige15| -+---+-----+-----+-----+-----+-----+-----+-----+-----+-...-+------+ - -Where 'r' is the tag, Chal0 ... Chal3 is A's challenge for B to handle and -Dige0 ... Dige15 is the digest that A constructed from the challenge B sent -in the previous step. - -6) recv_challenge_ack/send_challenge_ack: B checks that the digest received -from A is correct and generates a digest from the challenge received from -A. The digest is then sent to A. The message looks like this: - -+---+-----+-----+-----+-----+-...-+------+ -|'a'|Dige0|Dige1|Dige2|Dige3| ... |Dige15| -+---+-----+-----+-----+-----+-...-+------+ - -Where 'a' is the tag and Dige0 ... Dige15 is the digest calculated by B -for A's challenge. - -7) A checks the digest from B and the connection is up. - -SEMIGRAPHIC VIEW ----------------- - -A (initiator) B (acceptor) - -TCP connect -----------------------------------------> - TCP accept - -send_name -----------------------------------------> - recv_name - - <---------------------------------------- send_status -recv_status -(if status was 'alive' - send_status - - - - - - - - - - - - - - - - - - - -> - recv_status) - ChB = gen_challenge() - (ChB) - <---------------------------------------- send_challenge -recv_challenge - -ChA = gen_challenge(), -OCA = out_cookie(B), -DiA = gen_digest(ChB,OCA) - (ChA, DiA) -send_challenge_reply --------------------------------> - recv_challenge_reply - ICB = in_cookie(A), - check: - DiA == gen_digest - (ChB, ICB) ? - - if OK: - OCB = out_cookie(A), - DiB = gen_digest - (DiB) (ChA, OCB) - <----------------------------------------- send_challenge_ack -recv_challenge_ack DONE -ICA = in_cookie(B), - else -check: CLOSE -DiB == gen_digest(ChA,ICA) ? -- if OK - DONE -- else - CLOSE - - -THE CURRENTLY DEFINED FLAGS ---------------------------- -Currently the following capability flags are defined: - -%% The node should be published and part of the global namespace --define(DFLAG_PUBLISHED,1). - -%% The node implements an atom cache --define(DFLAG_ATOM_CACHE,2). - -%% The node implements extended (3 * 32 bits) references --define(DFLAG_EXTENDED_REFERENCES,4). - -%% The node implements distributed process monitoring. --define(DFLAG_DIST_MONITOR,8). - -%% The node uses separate tag for fun's (lambdas) in the distribution protocol. --define(DFLAG_FUN_TAGS,16). - -An R6 erlang node implements all of the above, while a C or Java node only -implements DFLAG_EXTENDED_REFERENCES. - -Last modified 1999-11-08 -- Patrik Nyblom, OTP +This information has been moved to the "Distribution Protocol" chapter of "ERTS User's Guide". -- cgit v1.2.3 From 6347bc07dac9f958b4ac3cb751dabae08d350b8b Mon Sep 17 00:00:00 2001 From: Rickard Green Date: Sat, 19 Jan 2013 00:46:21 +0100 Subject: Fix merge conflict with hasse --- lib/stdlib/src/io_lib.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/stdlib/src/io_lib.erl b/lib/stdlib/src/io_lib.erl index 0f1f417d01..513d904c39 100644 --- a/lib/stdlib/src/io_lib.erl +++ b/lib/stdlib/src/io_lib.erl @@ -276,7 +276,7 @@ write_atom(Atom) -> Chars = atom_to_list(Atom), case quote_atom(Atom, Chars) of true -> - write_unicode_string(Chars, $'); %' + write_string(Chars, $'); %' false -> Chars end. -- cgit v1.2.3 From 5ee4c7136ce8f311e3d3384ae0feb29bcbff6e85 Mon Sep 17 00:00:00 2001 From: Rickard Green Date: Mon, 21 Jan 2013 15:44:33 +0100 Subject: workaround... --- erts/emulator/test/distribution_SUITE.erl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/erts/emulator/test/distribution_SUITE.erl b/erts/emulator/test/distribution_SUITE.erl index e7bd8d19c3..7edd92d56f 100644 --- a/erts/emulator/test/distribution_SUITE.erl +++ b/erts/emulator/test/distribution_SUITE.erl @@ -1240,9 +1240,9 @@ contended_atom_cache_entry_test(Config, Type) -> get_conflicting_unicode_atoms(CIX, ProcessPairs) end, - io:format("Testing with the following atoms all using " - "cache index ~p:~n ~p~n", - [CIX, TestAtoms]), +% io:format("Testing with the following atoms all using " +% "cache index ~p:~n ~p~n", +% [CIX, TestAtoms]), Ps = lists:map( fun (A) -> Ref = make_ref(), -- cgit v1.2.3 From 3aa60cc472bc330dbe9360eb27a1f340b7e23dc6 Mon Sep 17 00:00:00 2001 From: Rickard Green Date: Tue, 22 Jan 2013 18:35:35 +0100 Subject: Add UTF-8 node name support for epmd --- erts/epmd/src/epmd_cli.c | 13 ++- erts/epmd/src/epmd_int.h | 20 +++- erts/epmd/src/epmd_srv.c | 215 ++++++++++++++++++++++++++++++++++++------ erts/epmd/test/epmd_SUITE.erl | 47 ++++++++- 4 files changed, 254 insertions(+), 41 deletions(-) diff --git a/erts/epmd/src/epmd_cli.c b/erts/epmd/src/epmd_cli.c index 74408e3ebe..1d4de64b63 100644 --- a/erts/epmd/src/epmd_cli.c +++ b/erts/epmd/src/epmd_cli.c @@ -22,6 +22,7 @@ #endif #include "epmd.h" /* Renamed from 'epmd_r4.h' */ #include "epmd_int.h" +#include "erl_printf.h" /* erts_snprintf */ /* forward declarations */ @@ -114,16 +115,18 @@ void epmd_call(EpmdVars *g,int what) epmd_cleanup_exit(g,1); } j = ntohl(i); - if (!g->silent) - printf("epmd: up and running on port %d with data:\n", j); + if (!g->silent) { + rval = erts_snprintf(buf, OUTBUF_SIZE, + "epmd: up and running on port %d with data:\n", j); + write(1, buf, rval); + } while(1) { - if ((rval = read(fd,buf,1)) <= 0) { + if ((rval = read(fd,buf,OUTBUF_SIZE)) <= 0) { close(fd); epmd_cleanup_exit(g,0); } - buf[rval] = '\0'; if (!g->silent) - printf("%s",buf); + write(1, buf, rval); /* Potentially UTF-8 encoded */ } } diff --git a/erts/epmd/src/epmd_int.h b/erts/epmd/src/epmd_int.h index 14d05c3f19..b25412c905 100644 --- a/erts/epmd/src/epmd_int.h +++ b/erts/epmd/src/epmd_int.h @@ -226,13 +226,25 @@ #define MAX_UNREG_COUNT 1000 #define DEBUG_MAX_UNREG_COUNT 5 -/* Maximum length of a node name == atom name */ -#define MAXSYMLEN 255 +/* + * Maximum length of a node name == atom name + * 255 characters; UTF-8 encoded -> max 255*4 + */ +#define MAXSYMLEN (255*4) #define MAX_LISTEN_SOCKETS 16 -#define INBUF_SIZE 1024 -#define OUTBUF_SIZE 1024 +/* + * Largest request: ALIVE2_REQ + * 2 + 13 + 2*MAXSYMLEN + * Largest response: PORT2_RESP + * 2 + 14 + 2*MAXSYMLEN + * + * That is, 3*MAXSYMLEN should be large enough + */ + +#define INBUF_SIZE (3*MAXSYMLEN) +#define OUTBUF_SIZE (3*MAXSYMLEN) #define get_int16(s) ((((unsigned char*) (s))[0] << 8) | \ (((unsigned char*) (s))[1])) diff --git a/erts/epmd/src/epmd_srv.c b/erts/epmd/src/epmd_srv.c index 36565b7438..2a74c4955e 100644 --- a/erts/epmd/src/epmd_srv.c +++ b/erts/epmd/src/epmd_srv.c @@ -73,7 +73,7 @@ static int conn_open(EpmdVars*,int); static int conn_close_fd(EpmdVars*,int); static void node_init(EpmdVars*); -static Node *node_reg2(EpmdVars*,char*, int, int, unsigned char, unsigned char, int, int, int, char*); +static Node *node_reg2(EpmdVars*, int, char*, int, int, unsigned char, unsigned char, int, int, int, char*); static int node_unreg(EpmdVars*,char*); static int node_unreg_sock(EpmdVars*,int); @@ -81,6 +81,113 @@ static int reply(EpmdVars*,int,char *,int); static void dbg_print_buf(EpmdVars*,char *,int); static void print_names(EpmdVars*); +static int is_same_str(char *x, char *y) +{ + int i = 0; + /* + * Using strcmp() == 0 is probably ok, but just to be sure, + * since we got UTF-8 strings, we do it ourselves. + * + * We assume null-terminated correctly encoded UTF-8. + */ + while (x[i] == y[i]) { + if (x[i] == '\0') + return 1; + i++; + } + return 0; +} + +static int copy_str(char *x, char *y) +{ + int i = 0; + /* + * Using strcpy() is probably ok, but just to be sure, + * since we got UTF-8 strings, we do it ourselves. + * + * We assume null-terminated correctly encoded UTF-8. + */ + while (1) { + x[i] = y[i]; + if (y[i] == '\0') + return i; + i++; + } +} + +static int length_str(char *x) +{ + int i = 0; + /* + * Using strlen is probably ok, but just to be sure, + * since we got UTF-8 strings, we do it ourselves. + * + * We assume null-terminated correctly encoded UTF-8. + */ + while (x[i]) + i++; + return i; +} + +static int verify_utf8(const char *src, int sz, int null_term) +{ + unsigned char *source = (unsigned char *) src; + int size = sz; + int num_chars = 0; + while (size) { + if (null_term && (*source) == 0) + return num_chars; + if (((*source) & ((unsigned char) 0x80)) == 0) { + source++; + --size; + } else if (((*source) & ((unsigned char) 0xE0)) == 0xC0) { + if (size < 2) + return -1; + if (((source[1] & ((unsigned char) 0xC0)) != 0x80) || + ((*source) < 0xC2) /* overlong */) { + return -1; + } + source += 2; + size -= 2; + } else if (((*source) & ((unsigned char) 0xF0)) == 0xE0) { + if (size < 3) + return -1; + if (((source[1] & ((unsigned char) 0xC0)) != 0x80) || + ((source[2] & ((unsigned char) 0xC0)) != 0x80) || + (((*source) == 0xE0) && (source[1] < 0xA0)) /* overlong */ ) { + return -1; + } + if ((((*source) & ((unsigned char) 0xF)) == 0xD) && + ((source[1] & 0x20) != 0)) { + return -1; + } + source += 3; + size -= 3; + } else if (((*source) & ((unsigned char) 0xF8)) == 0xF0) { + if (size < 4) + return -1; + if (((source[1] & ((unsigned char) 0xC0)) != 0x80) || + ((source[2] & ((unsigned char) 0xC0)) != 0x80) || + ((source[3] & ((unsigned char) 0xC0)) != 0x80) || + (((*source) == 0xF0) && (source[1] < 0x90)) /* overlong */) { + return -1; + } + if ((((*source) & ((unsigned char)0x7)) > 0x4U) || + ((((*source) & ((unsigned char)0x7)) == 0x4U) && + ((source[1] & ((unsigned char)0x3F)) > 0xFU))) { + return -1; + } + source += 4; + size -= 4; + } else { + return -1; + } + ++num_chars; + } + return num_chars; +} + + static EPMD_INLINE void select_fd_set(EpmdVars* g, int fd) { FD_SET(fd, &g->orig_read_mask); @@ -525,10 +632,11 @@ static void do_request(g, fd, s, buf, bsize) } name = &buf[11]; name[namelen]='\000'; + extra = &buf[11+namelen+2]; extra[extralen]='\000'; wbuf[0] = EPMD_ALIVE2_RESP; - if ((node = node_reg2(g, name, fd, eport, nodetype, protocol, + if ((node = node_reg2(g, namelen, name, fd, eport, nodetype, protocol, highvsn, lowvsn, extralen, extra)) == NULL) { wbuf[1] = 1; /* error */ put_int16(99, wbuf+2); @@ -573,22 +681,28 @@ static void do_request(g, fd, s, buf, bsize) { char *name = &buf[1]; /* Points to node name */ + int nsz; Node *node; - + + nsz = verify_utf8(name, bsize, 0); + if (nsz < 1 || 255 < nsz) { + dbg_printf(g,0,"invalid node name in PORT2_REQ"); + return; + } + wbuf[0] = EPMD_PORT2_RESP; for (node = g->nodes.reg; node; node = node->next) { int offset; - if (strcmp(node->symname, name) == 0) { + if (is_same_str(node->symname, name)) { wbuf[1] = 0; /* ok */ put_int16(node->port,wbuf+2); wbuf[4] = node->nodetype; wbuf[5] = node->protocol; put_int16(node->highvsn,wbuf+6); put_int16(node->lowvsn,wbuf+8); - put_int16(strlen(node->symname),wbuf+10); + put_int16(length_str(node->symname),wbuf+10); offset = 12; - strcpy(wbuf + offset,node->symname); - offset += strlen(node->symname); + offset += copy_str(wbuf + offset,node->symname); put_int16(node->extralen,wbuf + offset); offset += 2; memcpy(wbuf + offset,node->extra,node->extralen); @@ -629,15 +743,22 @@ static void do_request(g, fd, s, buf, bsize) for (node = g->nodes.reg; node; node = node->next) { - int len; + int len = 0; + int r; /* CAREFUL!!! These are parsed by "erl_epmd.erl" so a slight change in syntax will break < OTP R3A */ - erts_snprintf(wbuf, sizeof(wbuf), "name %s at port %d\n",node->symname, node->port); - len = strlen(wbuf); + len += copy_str(&wbuf[len], "name "); + len += copy_str(&wbuf[len], node->symname); + r = erts_snprintf(&wbuf[len], sizeof(wbuf)-len, + " at port %d\n", node->port); + if (r < 0) + goto failed_names_resp; + len += r; if (reply(g, fd, wbuf, len) != len) { + failed_names_resp: dbg_tty_printf(g,1,"failed to send NAMES_RESP"); return; } @@ -665,16 +786,22 @@ static void do_request(g, fd, s, buf, bsize) for (node = g->nodes.reg; node; node = node->next) { - int len; + int len = 0, r; /* CAREFUL!!! These are parsed by "erl_epmd.erl" so a slight change in syntax will break < OTP R3A */ - erts_snprintf(wbuf, sizeof(wbuf), "active name <%s> at port %d, fd = %d\n", - node->symname, node->port, node->fd); - len = strlen(wbuf) + 1; - if (reply(g, fd,wbuf,len) != len) + len += copy_str(&wbuf[len], "active name <"); + len += copy_str(&wbuf[len], node->symname); + r = erts_snprintf(&wbuf[len], sizeof(wbuf)-len, + "> at port %d, fd = %d\n", + node->port, node->fd); + if (r < 0) + goto failed_dump_resp; + len += r + 1; + if (reply(g, fd,wbuf,len) != len) { + failed_dump_resp: dbg_tty_printf(g,1,"failed to send DUMP_RESP"); return; } @@ -682,16 +809,22 @@ static void do_request(g, fd, s, buf, bsize) for (node = g->nodes.unreg; node; node = node->next) { - int len; + int len = 0, r; /* CAREFUL!!! These are parsed by "erl_epmd.erl" so a slight change in syntax will break < OTP R3A */ - erts_snprintf(wbuf, sizeof(wbuf), "old/unused name <%s>, port = %d, fd = %d \n", - node->symname,node->port, node->fd); - len = strlen(wbuf) + 1; - if (reply(g, fd,wbuf,len) != len) + len += copy_str(&wbuf[len], "old/unused name <"); + len += copy_str(&wbuf[len], node->symname); + r = erts_snprintf(&wbuf[len], sizeof(wbuf)-len, + ">, port = %d, fd = %d \n", + node->port, node->fd); + if (r < 0) + goto failed_dump_resp2; + len += r + 1; + if (reply(g, fd,wbuf,len) != len) { + failed_dump_resp2: dbg_tty_printf(g,1,"failed to send DUMP_RESP"); return; } @@ -933,7 +1066,7 @@ static int node_unreg(EpmdVars *g,char *name) Node *node = g->nodes.reg; /* Point to first node */ for (; node; prev = &node->next, node = node->next) - if (strcmp(node->symname, name) == 0) + if (is_same_str(node->symname, name)) { dbg_tty_printf(g,1,"unregistering '%s:%d', port %d", node->symname, node->creation, node->port); @@ -1013,6 +1146,7 @@ static int node_unreg_sock(EpmdVars *g,int fd) */ static Node *node_reg2(EpmdVars *g, + int namelen, char* name, int fd, int port, @@ -1025,6 +1159,7 @@ static Node *node_reg2(EpmdVars *g, { Node *prev; /* Point to previous node or NULL */ Node *node; /* Point to first node */ + int sz; /* Can be NULL; means old style */ if (extra == NULL) @@ -1032,21 +1167,47 @@ static Node *node_reg2(EpmdVars *g, /* Fail if node name is too long */ - if (strlen(name) > MAXSYMLEN) + + if (namelen > MAXSYMLEN) { - dbg_printf(g,0,"node name is too long (%d) %s", strlen(name), name); + too_long_name: + dbg_printf(g,0,"node name is too long (%d) %s", namelen, name); return NULL; } + + sz = verify_utf8(name, namelen, 0); + if (sz > 255) + goto too_long_name; + + if (sz < 0) { + dbg_printf(g,0,"invalid node name encoding"); + return NULL; + } + if (extralen > MAXSYMLEN) { - dbg_printf(g,0,"extra data is too long (%d) %s", strlen(name), name); +#if 0 + too_long_extra: +#endif + dbg_printf(g,0,"extra data is too long (%d) %s", extralen, extra); return NULL; } +#if 0 /* Should we require valid utf8 here? */ + sz = verify_utf8(extra, extralen, 0); + if (sz > 255) + goto too_long_extra; + + if (sz < 0) { + dbg_printf(g,0,"invalid extra data encoding"); + return NULL; + } +#endif + /* Fail if it is already registered */ for (node = g->nodes.reg; node; node = node->next) - if (strcmp(node->symname, name) == 0) + if (is_same_str(node->symname, name)) { dbg_printf(g,0,"node name already occupied %s", name); return NULL; @@ -1058,7 +1219,7 @@ static Node *node_reg2(EpmdVars *g, prev = NULL; for (node = g->nodes.unreg; node; prev = node, node = node->next) - if (strcmp(node->symname, name) == 0) + if (is_same_str(node->symname, name)) { dbg_tty_printf(g,1,"reusing slot with same name '%s'", node->symname); @@ -1126,7 +1287,7 @@ static Node *node_reg2(EpmdVars *g, node->lowvsn = lowvsn; node->extralen = extralen; memcpy(node->extra,extra,extralen); - strcpy(node->symname,name); + copy_str(node->symname,name); select_fd_set(g, fd); if (highvsn == 0) { diff --git a/erts/epmd/test/epmd_SUITE.erl b/erts/epmd/test/epmd_SUITE.erl index fd9969ae2b..fc0abef400 100644 --- a/erts/epmd/test/epmd_SUITE.erl +++ b/erts/epmd/test/epmd_SUITE.erl @@ -45,6 +45,8 @@ register_names_1/1, register_names_2/1, register_duplicate_name/1, + unicode_name/1, + long_unicode_name/1, get_port_nr/1, slow_get_port_nr/1, unregister_others_name_1/1, @@ -107,7 +109,8 @@ suite() -> [{ct_hooks,[ts_install_cth]}]. all() -> [register_name, register_names_1, register_names_2, - register_duplicate_name, get_port_nr, slow_get_port_nr, + register_duplicate_name, unicode_name, long_unicode_name, + get_port_nr, slow_get_port_nr, unregister_others_name_1, unregister_others_name_2, register_overflow, name_with_null_inside, name_null_terminated, stupid_names_req, no_data, @@ -197,6 +200,37 @@ register_duplicate_name(Config) when is_list(Config) -> ?line ok = close(Sock), % Unregister ok. +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +unicode_name(doc) -> + ["Check that we can register and lookup a unicode name"]; +unicode_name(suite) -> + []; +unicode_name(Config) when is_list(Config) -> + ok = epmdrun(), + NodeName = [16#1f608], + {ok,Sock} = register_node_v2(4711, 72, 0, 5, 5, NodeName, []), + {ok,NodeInfo} = port_please_v2(NodeName), + NodeName = NodeInfo#node_info.node_name, + ok = close(Sock), + ok. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +long_unicode_name(doc) -> + ["Check that we can register and lookup a long unicode name"]; +long_unicode_name(suite) -> + []; +long_unicode_name(Config) when is_list(Config) -> + ok = epmdrun(), + BaseChar = 16#1f600, + NodeName = lists:seq(BaseChar, BaseChar+200), % will be 800 bytes long + {ok,Sock} = register_node_v2(4711, 72, 0, 5, 5, NodeName, []), + {ok,NodeInfo} = port_please_v2(NodeName), + NodeName = NodeInfo#node_info.node_name, + ok = close(Sock), + ok. + % Internal function to register a node name, no close, i.e. unregister register_node(Name) -> @@ -205,9 +239,10 @@ register_node(Name,Port) -> register_node_v2(Port,$M,0,5,5,Name,""). register_node_v2(Port, NodeType, Prot, HVsn, LVsn, Name, Extra) -> + Utf8Name = unicode:characters_to_binary(Name), Req = [?EPMD_ALIVE2_REQ, put16(Port), NodeType, Prot, put16(HVsn), put16(LVsn), - size16(Name), Name, + put16(size(Utf8Name)), binary_to_list(Utf8Name), size16(Extra), Extra], case send_req(Req) of {ok,Sock} -> @@ -226,7 +261,8 @@ register_node_v2(Port, NodeType, Prot, HVsn, LVsn, Name, Extra) -> % Internal function to fetch information about a node port_please_v2(Name) -> - case send_req([?EPMD_PORT_PLEASE2_REQ, Name]) of + case send_req([?EPMD_PORT_PLEASE2_REQ, + binary_to_list(unicode:characters_to_binary(Name))]) of {ok,Sock} -> case recv_until_sock_closes(Sock) of {ok, Resp} -> @@ -247,7 +283,7 @@ parse_port2_resp(Resp) -> ELen:16,Extra:ELen/binary>> when Res =:= 0 -> {ok, #node_info{port=Port,node_type=NodeType,prot=Prot, hvsn=HVsn,lvsn=LVsn, - node_name=binary_to_list(NodeName), + node_name=unicode:characters_to_list(NodeName), extra=binary_to_list(Extra)}}; _Other -> test_server:format("invalid port2 resp: ~p~n", @@ -737,7 +773,7 @@ buffer_overrun_2(doc) -> ["Test security vulnerability in fake extra lengths in alive2_req"]; buffer_overrun_2(Config) when is_list(Config) -> ?line ok = epmdrun(), - ?line [false | Rest] = [hostile2(N) || N <- lists:seq(255,10000)], + ?line [false | Rest] = [hostile2(N) || N <- lists:seq(255*4,10000)], ?line true = alltrue(Rest), ok. hostile(N) -> @@ -880,6 +916,7 @@ no_live_killing(Config) when is_list(Config) -> ?line close(Sock3), ok. + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Terminate all tests with killing epmd. -- cgit v1.2.3 From 8eb544073fe243a8935a54f83f9c9f1f7478e3c5 Mon Sep 17 00:00:00 2001 From: Sverker Eriksson Date: Tue, 22 Jan 2013 17:20:17 +0100 Subject: erts: Fix bug in analyze_utf8 causing faulty latin1 detection --- erts/emulator/beam/erl_unicode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index c00293de89..883405d066 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -1180,13 +1180,13 @@ analyze_utf8(byte *source, Uint size, byte **err_pos, Uint *num_chars, int *left ((*source) < 0xC2) /* overlong */) { return ERTS_UTF8_ERROR; } - source += 2; - size -= 2; if (num_latin1_chars) { latin1_count++; if ((source[0] & ((byte) 0xFC)) != ((byte) 0xC0)) is_latin1 = 0; } + source += 2; + size -= 2; } else if (((*source) & ((byte) 0xF0)) == 0xE0) { if (size < 3) { return ERTS_UTF8_INCOMPLETE; -- cgit v1.2.3 From 1f4765cca4874fa92fcfad888fbe6d5f2fbf74d1 Mon Sep 17 00:00:00 2001 From: Sverker Eriksson Date: Tue, 22 Jan 2013 19:25:36 +0100 Subject: erl_interface: even more utf8 atom stuff --- lib/erl_interface/doc/src/ei.xml | 12 +++-- lib/erl_interface/src/connect/ei_connect.c | 2 +- lib/erl_interface/src/encode/encode_atom.c | 59 +++++++++++++++++++- lib/erl_interface/src/legacy/erl_connect.c | 2 +- lib/erl_interface/src/misc/ei_format.c | 4 +- lib/erl_interface/src/misc/ei_printterm.c | 2 +- lib/erl_interface/src/misc/show_msg.c | 2 +- lib/erl_interface/src/prog/ei_fake_prog.c | 6 +++ lib/erl_interface/test/ei_decode_encode_SUITE.erl | 63 +++++++++++----------- .../ei_decode_encode_test.c | 16 ++++-- 10 files changed, 122 insertions(+), 46 deletions(-) diff --git a/lib/erl_interface/doc/src/ei.xml b/lib/erl_interface/doc/src/ei.xml index 0b0b1eeb79..e9c7c644b5 100644 --- a/lib/erl_interface/doc/src/ei.xml +++ b/lib/erl_interface/doc/src/ei.xml @@ -94,7 +94,11 @@ enum erlang_char_encoding { ERLANG_ASCII, ERLANG_LATIN1, ERLANG_UTF8, ERLANG_WHATEVER }; -

The character encoding used for atoms.

+

The character encoding used for atoms. ERLANG_ASCII represents 7-bit ASCII. + Latin1 and UTF8 are different extensions of 7-bit ASCII. All 7-bit ASCII characters + are valid Latin1 and UTF8 characters. ASCII and Latin1 both represent each character + by one byte. A UTF8 character can consist of one to four bytes. ERLANG_WHATEVER + is not an encoding but rather used as a wildcard.

@@ -256,11 +260,11 @@ enum erlang_char_encoding {

Encodes an atom in the binary format with character encoding to_enc (latin1 or utf8). The p parameter is the name of the atom with character encoding - from_enc. + from_enc (ascii, latin1 or utf8). The name must either be zero-terminated or a function variant with a len parameter must be used.

-

The encoding will fail if the atom is too long or if it can not be represented - with character encoding to_enc.

+

The encoding will fail if p is not a valid string in encoding from_enc, + if the string is too long or if it can not be represented with character encoding to_enc.

These functions were introduced in R16 release of Erlang/OTP as part of a first step to support UTF8 atoms. Atoms encoded with ERLANG_UTF8 can not be decoded by earlier releases than R16.

diff --git a/lib/erl_interface/src/connect/ei_connect.c b/lib/erl_interface/src/connect/ei_connect.c index a17257795e..4421bbb7fe 100644 --- a/lib/erl_interface/src/connect/ei_connect.c +++ b/lib/erl_interface/src/connect/ei_connect.c @@ -1071,7 +1071,7 @@ int ei_rpc(ei_cnode* ec, int fd, char *mod, char *fun, int i, index; ei_term t; erlang_msg msg; - char rex[MAXATOMLEN+1]; + char rex[MAXATOMLEN]; if (ei_rpc_to(ec, fd, mod, fun, inbuf, inbuflen) < 0) { return -1; diff --git a/lib/erl_interface/src/encode/encode_atom.c b/lib/erl_interface/src/encode/encode_atom.c index a3d7c4c759..8bbe962396 100644 --- a/lib/erl_interface/src/encode/encode_atom.c +++ b/lib/erl_interface/src/encode/encode_atom.c @@ -22,6 +22,11 @@ #include "eiext.h" #include "putget.h" + +static int copy_ascii_atom(char* dst, const char* src, int slen); +static int copy_utf8_atom(char* dst, const char* src, int slen); + + int ei_encode_atom(char *buf, int *index, const char *p) { size_t len = strlen(p); @@ -54,7 +59,8 @@ int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len, char *s0 = s; int offs; - if (from_enc == ERLANG_LATIN1 && len >= MAXATOMLEN) { + if (len >= MAXATOMLEN && (from_enc == ERLANG_LATIN1 || + from_enc == ERLANG_ASCII)) { return -1; } @@ -68,6 +74,8 @@ int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len, if (len < 0) return -1; break; case ERLANG_ASCII: + if (copy_ascii_atom(s+2, p, len) < 0) return -1; + break; case ERLANG_LATIN1: memcpy(s+2, p, len); break; @@ -93,9 +101,11 @@ int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len, len = latin1_to_utf8((buf ? s+offs : NULL), p, len, MAXATOMLEN_UTF8-1, NULL); break; case ERLANG_ASCII: + if (buf && copy_ascii_atom(s+offs, p, len) < 0) return -1; + break; case ERLANG_UTF8: if (len >= 256) offs++; - if (buf) memcpy(s+offs, p, len); + if (buf && copy_utf8_atom(s+offs, p, len) < 0) return -1; break; default: return -1; @@ -133,3 +143,48 @@ ei_internal_put_atom(char** bufp, const char* p, int slen, *bufp += ix; return 0; } + + +int copy_ascii_atom(char* dst, const char* src, int slen) +{ + while (slen > 0) { + if ((src[0] & 0x80) != 0) return -1; + *dst++ = *src++; + slen--; + } + return 0; +} + +int copy_utf8_atom(char* dst, const char* src, int slen) +{ + int num_chars = 0; + + while (slen > 0) { + if (++num_chars >= MAXATOMLEN) return -1; + if ((src[0] & 0x80) != 0) { + if ((src[0] & 0xE0) == 0xC0) { + if (slen < 2 || (src[1] & 0xC0) != 0x80) return -1; + *dst++ = *src++; + slen--; + } + else if ((src[0] & 0xF0) == 0xE0) { + if (slen < 3 || (src[1] & 0xC0) != 0x80 || (src[2] & 0xC0) != 0x80) return -1; + *dst++ = *src++; + *dst++ = *src++; + slen -= 2; + } + else if ((src[0] & 0xF8) == 0xF0) { + if (slen < 4 || (src[1] & 0xC0) != 0x80 || (src[2] & 0xC0) != 0x80 || (src[3] & 0xC0) != 0x80) return -1; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + slen -= 3; + } + else return -1; + } + *dst++ = *src++; + slen--; + } + return 0; +} + diff --git a/lib/erl_interface/src/legacy/erl_connect.c b/lib/erl_interface/src/legacy/erl_connect.c index be83fa8469..f82704ea8b 100644 --- a/lib/erl_interface/src/legacy/erl_connect.c +++ b/lib/erl_interface/src/legacy/erl_connect.c @@ -125,7 +125,7 @@ static ei_cnode erl_if_ec; int erl_connect_init(int this_node_number, char *cookie, short creation) { - char nn[MAXATOMLEN+1]; + char nn[MAXATOMLEN]; sprintf(nn, "c%d", this_node_number); diff --git a/lib/erl_interface/src/misc/ei_format.c b/lib/erl_interface/src/misc/ei_format.c index 281a192535..b5f11e618e 100644 --- a/lib/erl_interface/src/misc/ei_format.c +++ b/lib/erl_interface/src/misc/ei_format.c @@ -139,8 +139,8 @@ static int patom(const char** fmt, ei_x_buff* x) --(*fmt); len = *fmt - start; /* FIXME why truncate atom name and not fail?! */ - if (len > MAXATOMLEN) - len = MAXATOMLEN; + if (len >= MAXATOMLEN) + len = MAXATOMLEN-1; return ei_x_encode_atom_len(x, start, len); } diff --git a/lib/erl_interface/src/misc/ei_printterm.c b/lib/erl_interface/src/misc/ei_printterm.c index 91fe73e68c..f3003a6172 100644 --- a/lib/erl_interface/src/misc/ei_printterm.c +++ b/lib/erl_interface/src/misc/ei_printterm.c @@ -115,7 +115,7 @@ static int print_term(FILE* fp, ei_x_buff* x, const char* buf, int* index) { int i, doquote, n, m, ty, r; - char a[MAXATOMLEN+1], *p; + char a[MAXATOMLEN], *p; int ch_written = 0; /* counter of written chars */ erlang_pid pid; erlang_port port; diff --git a/lib/erl_interface/src/misc/show_msg.c b/lib/erl_interface/src/misc/show_msg.c index ca46b15aff..33b09643ca 100644 --- a/lib/erl_interface/src/misc/show_msg.c +++ b/lib/erl_interface/src/misc/show_msg.c @@ -457,7 +457,7 @@ static void show_term(const char *termbuf, int *index, FILE *stream) break; case ERL_FUN_EXT: { - char atom[MAXATOMLEN+1]; + char atom[MAXATOMLEN]; long idx; long uniq; const char* s = termbuf + *index, * s0 = s; diff --git a/lib/erl_interface/src/prog/ei_fake_prog.c b/lib/erl_interface/src/prog/ei_fake_prog.c index 68eb537211..34101a2851 100644 --- a/lib/erl_interface/src/prog/ei_fake_prog.c +++ b/lib/erl_interface/src/prog/ei_fake_prog.c @@ -96,6 +96,7 @@ int main(void) EI_ULONGLONG *ulonglongp = (EI_ULONGLONG*)NULL; EI_ULONGLONG ulonglongx = 0; #endif + enum erlang_char_encoding enc; intx = erl_errno; @@ -148,9 +149,13 @@ int main(void) ei_x_encode_string(&eix, charp); ei_x_encode_string_len(&eix, charp, intx); ei_encode_atom(charp, intp, charp); + ei_encode_atom_as(charp, intp, charp, ERLANG_LATIN1, ERLANG_UTF8); ei_encode_atom_len(charp, intp, charp, intx); + ei_encode_atom_len_as(charp, intp, charp, intx, ERLANG_ASCII, ERLANG_LATIN1); ei_x_encode_atom(&eix, charp); + ei_x_encode_atom_as(&eix, charp, ERLANG_LATIN1, ERLANG_UTF8); ei_x_encode_atom_len(&eix, charp, intx); + ei_x_encode_atom_len_as(&eix, charp, intx, ERLANG_LATIN1, ERLANG_UTF8); ei_encode_binary(charp, intp, (void *)0, longx); ei_x_encode_binary(&eix, (void*)0, intx); ei_encode_pid(charp, intp, &epid); @@ -181,6 +186,7 @@ int main(void) ei_decode_char(charp, intp, charp); ei_decode_string(charp, intp, charp); ei_decode_atom(charp, intp, charp); + ei_decode_atom_as(charp, intp, charp, MAXATOMLEN_UTF8, ERLANG_WHATEVER, &enc, &enc); ei_decode_binary(charp, intp, (void *)0, longp); ei_decode_fun(charp, intp, &efun); free_fun(&efun); diff --git a/lib/erl_interface/test/ei_decode_encode_SUITE.erl b/lib/erl_interface/test/ei_decode_encode_SUITE.erl index e8ae7a6f81..0c98b494ec 100644 --- a/lib/erl_interface/test/ei_decode_encode_SUITE.erl +++ b/lib/erl_interface/test/ei_decode_encode_SUITE.erl @@ -119,8 +119,12 @@ test_ei_decode_encode(Config) when is_list(Config) -> ?line send_rec(P, OXRef), %% Unicode atoms - [send_rec(P, Atom) || Atom <- unicode_atom_data()], - + [begin send_rec(P, Atom), + send_rec(P, mk_pid({Atom,1}, 23434, 3434)), + send_rec(P, mk_port({Atom,1}, 2343434)), + send_rec(P, mk_ref({Atom,1}, [262143, 8723648, 24097245])), + void + end || Atom <- unicode_atom_data()], ?line runner:recv_eot(P), ok. @@ -229,38 +233,36 @@ uint8(Uint) -> mk_pid({NodeName, Creation}, Number, Serial) when is_atom(NodeName) -> - mk_pid({atom_to_list(NodeName), Creation}, Number, Serial); -mk_pid({NodeName, Creation}, Number, Serial) -> + <> = term_to_binary(NodeName), + mk_pid({NodeNameExt, Creation}, Number, Serial); +mk_pid({NodeNameExt, Creation}, Number, Serial) -> case catch binary_to_term(list_to_binary([?VERSION_MAGIC, ?PID_EXT, - ?ATOM_EXT, - uint16_be(length(NodeName)), - NodeName, + NodeNameExt, uint32_be(Number), uint32_be(Serial), uint8(Creation)])) of Pid when is_pid(Pid) -> Pid; {'EXIT', {badarg, _}} -> - exit({badarg, mk_pid, [{NodeName, Creation}, Number, Serial]}); + exit({badarg, mk_pid, [{NodeNameExt, Creation}, Number, Serial]}); Other -> exit({unexpected_binary_to_term_result, Other}) end. mk_port({NodeName, Creation}, Number) when is_atom(NodeName) -> - mk_port({atom_to_list(NodeName), Creation}, Number); -mk_port({NodeName, Creation}, Number) -> + <> = term_to_binary(NodeName), + mk_port({NodeNameExt, Creation}, Number); +mk_port({NodeNameExt, Creation}, Number) -> case catch binary_to_term(list_to_binary([?VERSION_MAGIC, ?PORT_EXT, - ?ATOM_EXT, - uint16_be(length(NodeName)), - NodeName, + NodeNameExt, uint32_be(Number), uint8(Creation)])) of Port when is_port(Port) -> Port; {'EXIT', {badarg, _}} -> - exit({badarg, mk_port, [{NodeName, Creation}, Number]}); + exit({badarg, mk_port, [{NodeNameExt, Creation}, Number]}); Other -> exit({unexpected_binary_to_term_result, Other}) end. @@ -268,33 +270,30 @@ mk_port({NodeName, Creation}, Number) -> mk_ref({NodeName, Creation}, Numbers) when is_atom(NodeName), is_integer(Creation), is_list(Numbers) -> - mk_ref({atom_to_list(NodeName), Creation}, Numbers); -mk_ref({NodeName, Creation}, [Number]) when is_list(NodeName), - is_integer(Creation), - is_integer(Number) -> + <> = term_to_binary(NodeName), + mk_ref({NodeNameExt, Creation}, Numbers); +mk_ref({NodeNameExt, Creation}, [Number]) when is_binary(NodeNameExt), + is_integer(Creation), + is_integer(Number) -> case catch binary_to_term(list_to_binary([?VERSION_MAGIC, ?REFERENCE_EXT, - ?ATOM_EXT, - uint16_be(length(NodeName)), - NodeName, + NodeNameExt, uint32_be(Number), uint8(Creation)])) of Ref when is_reference(Ref) -> Ref; {'EXIT', {badarg, _}} -> - exit({badarg, mk_ref, [{NodeName, Creation}, [Number]]}); + exit({badarg, mk_ref, [{NodeNameExt, Creation}, [Number]]}); Other -> exit({unexpected_binary_to_term_result, Other}) end; -mk_ref({NodeName, Creation}, Numbers) when is_list(NodeName), - is_integer(Creation), - is_list(Numbers) -> +mk_ref({NodeNameExt, Creation}, Numbers) when is_binary(NodeNameExt), + is_integer(Creation), + is_list(Numbers) -> case catch binary_to_term(list_to_binary([?VERSION_MAGIC, ?NEW_REFERENCE_EXT, uint16_be(length(Numbers)), - ?ATOM_EXT, - uint16_be(length(NodeName)), - NodeName, + NodeNameExt, uint8(Creation), lists:map(fun (N) -> uint32_be(N) @@ -303,7 +302,7 @@ mk_ref({NodeName, Creation}, Numbers) when is_list(NodeName), Ref when is_reference(Ref) -> Ref; {'EXIT', {badarg, _}} -> - exit({badarg, mk_ref, [{NodeName, Creation}, Numbers]}); + exit({badarg, mk_ref, [{NodeNameExt, Creation}, Numbers]}); Other -> exit({unexpected_binary_to_term_result, Other}) end. @@ -322,9 +321,11 @@ unicode_atom_data() -> uc_atup(lists:seq(65500, 65754)), uc_atup(lists:seq(65500, 65563)) | lists:map(fun (N) -> - uc_atup(lists:seq(64000+N, 64254+N)) + Pow2 = (1 bsl N), + uc_atup(lists:seq(Pow2 - 127, Pow2 + 127)) end, - lists:seq(1, 2000))]. + lists:seq(7, 20)) + ]. uc_atup(ATxt) -> string_to_atom(ATxt). diff --git a/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c b/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c index 194ce9057b..e57663f984 100644 --- a/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c +++ b/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c @@ -63,10 +63,12 @@ int ei_x_encode_my_atom(ei_x_buff* x, my_atom* a) return ei_x_encode_atom_as(x, a->name, ERLANG_UTF8, a->enc); } +#define BUFSZ 2000 + void decode_encode(struct Type* t, void* obj) { char *buf; - char buf2[1024]; + char buf2[BUFSZ]; int size1 = 0; int size2 = 0; int size3 = 0; @@ -89,6 +91,11 @@ void decode_encode(struct Type* t, void* obj) return; } + if (size1 > BUFSZ) { + fail("size is > BUFSZ"); + return; + } + MESSAGE("ei_encode_%s buf is NULL, arg is type %s", t->name, t->type); err = t->ei_encode_fp(NULL, &size2, obj); if (err != 0) { @@ -277,8 +284,11 @@ TESTCASE(test_ei_decode_encode) EI_DECODE_ENCODE(ref , erlang_ref); /* Unicode atoms */ - for (i=0; i<2010; i++) { - EI_DECODE_ENCODE(my_atom , my_atom); + for (i=0; i<24; i++) { + EI_DECODE_ENCODE(my_atom, my_atom); + EI_DECODE_ENCODE(pid, erlang_pid); + EI_DECODE_ENCODE(port, erlang_port); + EI_DECODE_ENCODE(ref, erlang_ref); } report(1); -- cgit v1.2.3 From 5d2aaef89f2e1dce2147ac56a105f129390f31a3 Mon Sep 17 00:00:00 2001 From: Sverker Eriksson Date: Wed, 23 Jan 2013 12:06:15 +0100 Subject: erts: Testcase doing unicode atom printout with ~w --- erts/emulator/test/distribution_SUITE.erl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/erts/emulator/test/distribution_SUITE.erl b/erts/emulator/test/distribution_SUITE.erl index 7edd92d56f..101007c288 100644 --- a/erts/emulator/test/distribution_SUITE.erl +++ b/erts/emulator/test/distribution_SUITE.erl @@ -1240,9 +1240,9 @@ contended_atom_cache_entry_test(Config, Type) -> get_conflicting_unicode_atoms(CIX, ProcessPairs) end, -% io:format("Testing with the following atoms all using " -% "cache index ~p:~n ~p~n", -% [CIX, TestAtoms]), + io:format("Testing with the following atoms all using " + "cache index ~p:~n ~w~n", + [CIX, TestAtoms]), Ps = lists:map( fun (A) -> Ref = make_ref(), -- cgit v1.2.3 From c596e17cf3d69cf5e10d28ee2a8ee35162786da1 Mon Sep 17 00:00:00 2001 From: Sverker Eriksson Date: Wed, 23 Jan 2013 16:04:38 +0100 Subject: erl_interface: Changed erlang_char_encoding interface to allow bitwise-or'd combinations. --- lib/erl_interface/doc/src/ei.xml | 13 +++++++------ lib/erl_interface/include/ei.h | 7 ++++++- lib/erl_interface/src/decode/decode_atom.c | 2 +- lib/erl_interface/src/encode/encode_atom.c | 3 +-- lib/erl_interface/src/legacy/erl_marshal.c | 6 +++--- 5 files changed, 18 insertions(+), 13 deletions(-) diff --git a/lib/erl_interface/doc/src/ei.xml b/lib/erl_interface/doc/src/ei.xml index e9c7c644b5..117c787da6 100644 --- a/lib/erl_interface/doc/src/ei.xml +++ b/lib/erl_interface/doc/src/ei.xml @@ -91,14 +91,13 @@

enum erlang_char_encoding { - ERLANG_ASCII, ERLANG_LATIN1, ERLANG_UTF8, ERLANG_WHATEVER + ERLANG_ASCII, ERLANG_LATIN1, ERLANG_UTF8 };

The character encoding used for atoms. ERLANG_ASCII represents 7-bit ASCII. Latin1 and UTF8 are different extensions of 7-bit ASCII. All 7-bit ASCII characters are valid Latin1 and UTF8 characters. ASCII and Latin1 both represent each character - by one byte. A UTF8 character can consist of one to four bytes. ERLANG_WHATEVER - is not an encoding but rather used as a wildcard.

+ by one byte. A UTF8 character can consist of one to four bytes.

@@ -545,11 +544,13 @@ ei_x_encode_empty_list(&x); want
. The original encoding used in the binary format (latin1 or utf8) can be obtained from *was. The actual encoding of the resulting string (7-bit ascii, latin1 or utf8) can be obtained from *result. Both was and result can be NULL. - *result may differ from want if want is ERLANG_WHATEVER or if - *result turn out to be pure 7-bit ascii (compatible with both latin1 and utf8).

+ + *result may differ from want if want is a bitwise-or'd combination like + ERLANG_LATIN1|ERLANG_UTF8 or if *result turn out to be pure 7-bit ascii + (compatible with both latin1 and utf8).

This function fails if the atom is too long for the buffer or if it can not be represented with encoding want.

-

This functions was introduced in R16 release of Erlang/OTP as part of a first step +

This function was introduced in R16 release of Erlang/OTP as part of a first step to support UTF8 atoms.

diff --git a/lib/erl_interface/include/ei.h b/lib/erl_interface/include/ei.h index 20e575f64d..2278a28adb 100644 --- a/lib/erl_interface/include/ei.h +++ b/lib/erl_interface/include/ei.h @@ -190,7 +190,12 @@ extern volatile int __erl_errno; #define MAXATOMLEN_UTF8 (255*4 + 1) #define MAXNODELEN EI_MAXALIVELEN+1+EI_MAXHOSTNAMELEN -enum erlang_char_encoding { ERLANG_ASCII, ERLANG_LATIN1, ERLANG_UTF8, ERLANG_WHATEVER }; +enum erlang_char_encoding { + ERLANG_ASCII = 1, + ERLANG_LATIN1 = 2, + ERLANG_UTF8 = 4, + ERLANG_ANY = ERLANG_ASCII|ERLANG_LATIN1|ERLANG_UTF8 +}; /* a pid */ typedef struct { diff --git a/lib/erl_interface/src/decode/decode_atom.c b/lib/erl_interface/src/decode/decode_atom.c index 2ada418243..556c400cb3 100644 --- a/lib/erl_interface/src/decode/decode_atom.c +++ b/lib/erl_interface/src/decode/decode_atom.c @@ -58,7 +58,7 @@ int ei_decode_atom_as(const char *buf, int *index, char* p, int destlen, return -1; } - if (want_enc == got_enc || want_enc == ERLANG_WHATEVER || want_enc == ERLANG_ASCII) { + if ((want_enc & got_enc) || want_enc == ERLANG_ASCII) { int i, found_non_ascii = 0; if (len >= destlen) return -1; diff --git a/lib/erl_interface/src/encode/encode_atom.c b/lib/erl_interface/src/encode/encode_atom.c index 8bbe962396..044f17cb60 100644 --- a/lib/erl_interface/src/encode/encode_atom.c +++ b/lib/erl_interface/src/encode/encode_atom.c @@ -59,8 +59,7 @@ int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len, char *s0 = s; int offs; - if (len >= MAXATOMLEN && (from_enc == ERLANG_LATIN1 || - from_enc == ERLANG_ASCII)) { + if (len >= MAXATOMLEN && (from_enc & (ERLANG_LATIN1|ERLANG_ASCII))) { return -1; } diff --git a/lib/erl_interface/src/legacy/erl_marshal.c b/lib/erl_interface/src/legacy/erl_marshal.c index 884e9d421b..4c45cebb02 100644 --- a/lib/erl_interface/src/legacy/erl_marshal.c +++ b/lib/erl_interface/src/legacy/erl_marshal.c @@ -662,7 +662,7 @@ static int read_atom(unsigned char** ext, Erl_Atom_data* a) int offs = 0; enum erlang_char_encoding enc; int ret = ei_decode_atom_as((char*)*ext, &offs, buf, MAXATOMLEN_UTF8, - ERLANG_WHATEVER, NULL, &enc); + ERLANG_LATIN1|ERLANG_UTF8, NULL, &enc); *ext += offs; if (ret == 0) { @@ -674,11 +674,11 @@ static int read_atom(unsigned char** ext, Erl_Atom_data* a) a->lenL = 0; a->utf8 = NULL; a->lenU = 0; - if (enc == ERLANG_LATIN1 || enc == ERLANG_ASCII) { + if (enc & (ERLANG_LATIN1 | ERLANG_ASCII)) { a->latin1 = clone; a->lenL = i; } - if (enc == ERLANG_UTF8 || enc == ERLANG_ASCII) { + if (enc & (ERLANG_UTF8 | ERLANG_ASCII)) { a->utf8 = clone; a->lenU = i; } -- cgit v1.2.3 From d6e3e256b850050b7a86323b2948009d5fcc30a9 Mon Sep 17 00:00:00 2001 From: Lukas Larsson Date: Wed, 23 Jan 2013 10:17:29 +0100 Subject: erl_interface: Fix bug when transcoding atoms from and to UTF8 --- lib/erl_interface/src/decode/decode_atom.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/erl_interface/src/decode/decode_atom.c b/lib/erl_interface/src/decode/decode_atom.c index 556c400cb3..9779ad3f35 100644 --- a/lib/erl_interface/src/decode/decode_atom.c +++ b/lib/erl_interface/src/decode/decode_atom.c @@ -76,8 +76,8 @@ int ei_decode_atom_as(const char *buf, int *index, char* p, int destlen, } else { int plen = (got_enc == ERLANG_LATIN1) ? - utf8_to_latin1(p, s, len, destlen-1, res_encp) : - latin1_to_utf8(p, s, len, destlen-1, res_encp); + latin1_to_utf8(p, s, len, destlen-1, res_encp) : + utf8_to_latin1(p, s, len, destlen-1, res_encp); if (plen < 0) return -1; if (p) p[plen] = 0; } -- cgit v1.2.3