From 0dd3b88cdf90283d9c276ee415f985cb764e522f Mon Sep 17 00:00:00 2001 From: Rickard Green Date: Thu, 10 Jan 2013 12:47:46 +0100 Subject: UTF-8 support for distribution --- erts/emulator/beam/atom.c | 164 +++++++++++++++++++++---------- erts/emulator/beam/atom.h | 13 ++- erts/emulator/beam/atom.names | 2 + erts/emulator/beam/beam_load.c | 4 +- erts/emulator/beam/bif.c | 26 ++--- erts/emulator/beam/dist.c | 8 +- erts/emulator/beam/erl_alloc.types | 2 + erts/emulator/beam/erl_bif_ddll.c | 5 +- erts/emulator/beam/erl_bif_info.c | 22 +++-- erts/emulator/beam/erl_bif_port.c | 2 +- erts/emulator/beam/erl_db.c | 2 +- erts/emulator/beam/erl_db_util.c | 3 +- erts/emulator/beam/erl_init.c | 4 +- erts/emulator/beam/erl_unicode.c | 139 +++++++++++++++++++++----- erts/emulator/beam/external.c | 193 +++++++++++++++++++++++-------------- erts/emulator/beam/external.h | 5 +- erts/emulator/beam/global.h | 10 +- erts/emulator/beam/io.c | 28 ++++-- erts/emulator/beam/utils.c | 2 +- 19 files changed, 440 insertions(+), 194 deletions(-) (limited to 'erts/emulator/beam') diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c index b41a98f2a2..82dd320ea9 100644 --- a/erts/emulator/beam/atom.c +++ b/erts/emulator/beam/atom.c @@ -162,6 +162,7 @@ atom_alloc(Atom* tmpl) obj->name = atom_text_alloc(tmpl->len); sys_memcpy(obj->name, tmpl->name, tmpl->len); obj->len = tmpl->len; + obj->latin1_chars = tmpl->latin1_chars; obj->slot.index = -1; /* @@ -192,48 +193,6 @@ atom_free(Atom* obj) erts_free(ERTS_ALC_T_ATOM, (void*) obj); } -Eterm -am_atom_put(const char* name, int len) -{ - Atom a; - Eterm ret; - int aix; -#ifdef DEBUG - byte* err_pos; - Uint num_chars; - ASSERT(erts_analyze_utf8(name, len, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK); -#endif - /* - * Silently truncate the atom if it is too long. Overlong atoms - * could occur in situations where we have no good way to return - * an error, such as in the I/O system. (Unfortunately, many - * drivers don't check for errors.) - * - * If an error should be produced for overlong atoms (such in - * list_to_atom/1), the caller should check the length before - * calling this function. - */ - if (len > MAX_ATOM_SZ_LIMIT) { - len = MAX_ATOM_SZ_LIMIT; /*SVERK Urk... */ - } -#ifdef ERTS_ATOM_PUT_OPS_STAT - erts_smp_atomic_inc_nob(&atom_put_ops); -#endif - a.len = len; - a.name = (byte*)name; - atom_read_lock(); - aix = index_get(&erts_atom_table, (void*) &a); - atom_read_unlock(); - if (aix >= 0) - ret = make_atom(aix); - else { - atom_write_lock(); - ret = make_atom(index_put(&erts_atom_table, (void*) &a)); - atom_write_unlock(); - } - return ret; -} - static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp) { byte* dst; @@ -264,19 +223,116 @@ need_convertion: *lenp = dst - conv_buf; } - +/* + * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned! + */ Eterm -am_atom_put2(const byte* name, int len, int is_latin1) +erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) { byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1]; + const byte *text = name; + int tlen = len; + Sint no_latin1_chars; + Atom a; + int aix; - if (is_latin1) { - latin1_to_utf8(utf8_copy, &name, &len); +#ifdef ERTS_ATOM_PUT_OPS_STAT + erts_smp_atomic_inc_nob(&atom_put_ops); +#endif + + if (tlen < 0) { + if (trunc) + tlen = 0; + else + return THE_NON_VALUE; } - return am_atom_put((const char*)name, len); -} + switch (enc) { + case ERTS_ATOM_ENC_7BIT_ASCII: + if (tlen > MAX_ATOM_CHARACTERS) { + if (trunc) + tlen = MAX_ATOM_CHARACTERS; + else + return THE_NON_VALUE; + } +#ifdef DEBUG + for (aix = 0; aix < len; aix++) { + ASSERT((name[aix] & 0x80) == 0); + } +#endif + no_latin1_chars = tlen; + break; + case ERTS_ATOM_ENC_LATIN1: + if (tlen > MAX_ATOM_CHARACTERS) { + if (trunc) + tlen = MAX_ATOM_CHARACTERS; + else + return THE_NON_VALUE; + } + no_latin1_chars = tlen; + latin1_to_utf8(utf8_copy, &text, &tlen); + break; + case ERTS_ATOM_ENC_UTF8: + /* First sanity check; need to verify later */ + if (tlen > MAX_ATOM_SZ_LIMIT && !trunc) + return THE_NON_VALUE; + break; + } + a.len = tlen; + a.name = (byte *) text; + atom_read_lock(); + aix = index_get(&erts_atom_table, (void*) &a); + atom_read_unlock(); + if (aix >= 0) { + /* Already in table no need to verify it */ + return make_atom(aix); + } + + if (enc == ERTS_ATOM_ENC_UTF8) { + /* Need to verify encoding and length */ + byte *err_pos; + Uint no_chars; + switch (erts_analyze_utf8_x((byte *) text, + (Uint) tlen, + &err_pos, + &no_chars, NULL, + &no_latin1_chars, + MAX_ATOM_CHARACTERS)) { + case ERTS_UTF8_OK: + ASSERT(no_chars <= MAX_ATOM_CHARACTERS); + break; + case ERTS_UTF8_OK_MAX_CHARS: + /* Truncated... */ + if (!trunc) + return THE_NON_VALUE; + ASSERT(no_chars == MAX_ATOM_CHARACTERS); + tlen = err_pos - text; + break; + default: + /* Bad utf8... */ + return THE_NON_VALUE; + } + } + + ASSERT(tlen <= MAX_ATOM_SZ_LIMIT); + ASSERT(-1 <= no_latin1_chars && no_latin1_chars <= MAX_ATOM_CHARACTERS); + + a.len = tlen; + a.latin1_chars = (Sint16) no_latin1_chars; + a.name = (byte *) text; + atom_write_lock(); + aix = index_put(&erts_atom_table, (void*) &a); + atom_write_unlock(); + return make_atom(aix); +} + +Eterm +am_atom_put(const char* name, int len) +{ + /* Assumes 7-bit ascii; use erts_atom_put() for other encodings... */ + return erts_atom_put((byte *) name, len, ERTS_ATOM_ENC_7BIT_ASCII, 1); +} int atom_table_size(void) { @@ -318,10 +374,11 @@ erts_atom_get(const char *name, int len, Eterm* ap, int is_latin1) int i; int res; - a.len = len; + a.len = (Sint16) len; a.name = (byte *)name; if (is_latin1) { - latin1_to_utf8(utf8_copy, (const byte**)&a.name, &a.len); + latin1_to_utf8(utf8_copy, (const byte**)&a.name, &len); + a.len = (Sint16) len; } atom_read_lock(); i = index_get(&erts_atom_table, (void*) &a); @@ -384,8 +441,15 @@ init_atom_table(void) for (i = 0; erl_atom_names[i] != 0; i++) { int ix; a.len = strlen(erl_atom_names[i]); + a.latin1_chars = a.len; a.name = (byte*)erl_atom_names[i]; a.slot.index = i; +#ifdef DEBUG + /* Verify 7-bit ascii */ + for (ix = 0; ix < a.len; ix++) { + ASSERT((a.name[ix] & 0x80) == 0); + } +#endif ix = index_put(&erts_atom_table, (void*) &a); atom_text_pos -= a.len; atom_space -= a.len; diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h index 84dd6d8901..f721999a4c 100644 --- a/erts/emulator/beam/atom.h +++ b/erts/emulator/beam/atom.h @@ -47,7 +47,8 @@ */ typedef struct atom { IndexSlot slot; /* MUST BE LOCATED AT TOP OF STRUCT!!! */ - int len; /* length of atom name */ + Sint16 len; /* length of atom name (UTF-8 encoded) */ + Sint16 latin1_chars; /* 0-255 if atom can be encoded in latin1; otherwise, -1 */ int ord0; /* ordinal value of first 3 bytes + 7 bits */ byte* name; /* name of atom */ } Atom; @@ -113,6 +114,12 @@ ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1) #endif +typedef enum { + ERTS_ATOM_ENC_7BIT_ASCII, + ERTS_ATOM_ENC_LATIN1, + ERTS_ATOM_ENC_UTF8 +} ErtsAtomEncoding; + /* * Note, ERTS_IS_ATOM_STR() expects the first argument to be a * 7-bit ASCII string literal. @@ -125,8 +132,8 @@ ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1) int atom_table_size(void); /* number of elements */ int atom_table_sz(void); /* table size in bytes, excluding stored objects */ -Eterm am_atom_put(const char*, int); /* most callers pass plain char*'s */ -Eterm am_atom_put2(const byte*, int, int is_latin1); +Eterm am_atom_put(const char*, int); /* ONLY 7-bit ascii! */ +Eterm erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc); int atom_erase(byte*, int); int atom_static_put(byte*, int); void init_atom_table(void); diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names index afcbd732df..59c9f39e7b 100644 --- a/erts/emulator/beam/atom.names +++ b/erts/emulator/beam/atom.names @@ -17,6 +17,8 @@ # %CopyrightEnd% # +# +# IMPORTANT! All atoms defined here *need* to be in 7-bit ascii! # # File format: # diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c index 7bb964d5aa..8b4135e21d 100644 --- a/erts/emulator/beam/beam_load.c +++ b/erts/emulator/beam/beam_load.c @@ -1230,7 +1230,7 @@ load_atom_table(LoaderState* stp) GetByte(stp, n); GetString(stp, atom, n); - stp->atom[i] = am_atom_put2(atom, n, 1); + stp->atom[i] = erts_atom_put(atom, n, ERTS_ATOM_ENC_LATIN1, 1); } /* @@ -1620,7 +1620,7 @@ read_line_table(LoaderState* stp) GetInt(stp, 2, n); GetString(stp, fname, n); - stp->fname[i] = am_atom_put((char*)fname, n); /*SVERK ? */ + stp->fname[i] = erts_atom_put(fname, n, ERTS_ATOM_ENC_LATIN1, 1); } } diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c index 89157068c0..a0b4a8c049 100644 --- a/erts/emulator/beam/bif.c +++ b/erts/emulator/beam/bif.c @@ -2536,11 +2536,13 @@ BIF_RETTYPE append_element_2(BIF_ALIST_2) BIF_RETTYPE atom_to_list_1(BIF_ALIST_1) { - Eterm do_utf8_to_list(Process*, Uint num, byte *bytes, Uint sz, Uint left, - Uint *num_built, Uint *num_eaten, Eterm tail); /*SVERK */ Atom* ap; Uint num_chars, num_built, num_eaten; + byte* err_pos; Eterm res; +#ifdef DEBUG + int ares; +#endif if (is_not_atom(BIF_ARG_1)) BIF_ERROR(BIF_P, BADARG); @@ -2549,16 +2551,15 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1) ap = atom_tab(atom_val(BIF_ARG_1)); if (ap->len == 0) BIF_RET(NIL); /* the empty atom */ - { - byte* err_pos; - if (erts_analyze_utf8(ap->name, ap->len, &err_pos, &num_chars, NULL) - != ERTS_UTF8_OK) { - BIF_ERROR(BIF_P, BADARG); - } - } + +#ifdef DEBUG + ares = +#endif + erts_analyze_utf8(ap->name, ap->len, &err_pos, &num_chars, NULL); + ASSERT(ares == ERTS_UTF8_OK); - res = do_utf8_to_list(BIF_P, num_chars, ap->name, ap->len, ap->len, - &num_built, &num_eaten, NIL); + res = erts_utf8_to_list(BIF_P, num_chars, ap->name, ap->len, ap->len, + &num_built, &num_eaten, NIL); ASSERT(num_built == num_chars); ASSERT(num_eaten == ap->len); BIF_RET(res); @@ -2582,7 +2583,8 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1) } BIF_ERROR(BIF_P, BADARG); } - res = am_atom_put2((byte*)buf, i, 1); + res = erts_atom_put((byte *) buf, i, ERTS_ATOM_ENC_LATIN1, 1); + ASSERT(is_atom(res)); erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_RET(res); } diff --git a/erts/emulator/beam/dist.c b/erts/emulator/beam/dist.c index 28c4621ff2..8c3bcd1de4 100644 --- a/erts/emulator/beam/dist.c +++ b/erts/emulator/beam/dist.c @@ -1646,7 +1646,7 @@ dsig_send(ErtsDSigData *dsdp, Eterm ctl, Eterm msg, int force_busy) data_size += erts_encode_dist_ext_size(ctl, flags, acmp); if (is_value(msg)) data_size += erts_encode_dist_ext_size(msg, flags, acmp); - erts_finalize_atom_cache_map(acmp); + erts_finalize_atom_cache_map(acmp, flags); dhdr_ext_size = erts_encode_ext_dist_header_size(acmp); data_size += dhdr_ext_size; @@ -1996,7 +1996,8 @@ erts_dist_command(Port *prt, int reds_limit) ASSERT(ob); do { ob->extp = erts_encode_ext_dist_header_finalize(ob->extp, - dep->cache); + dep->cache, + flags); if (!(flags & DFLAG_DIST_HDR_ATOM_CACHE)) *--ob->extp = PASS_THROUGH; /* Old node; 'pass through' needed */ @@ -2040,7 +2041,8 @@ erts_dist_command(Port *prt, int reds_limit) Uint size; oq.first->extp = erts_encode_ext_dist_header_finalize(oq.first->extp, - dep->cache); + dep->cache, + flags); reds += ERTS_PORT_REDS_DIST_CMD_FINALIZE; if (!(flags & DFLAG_DIST_HDR_ATOM_CACHE)) *--oq.first->extp = PASS_THROUGH; /* Old node; 'pass through' diff --git a/erts/emulator/beam/erl_alloc.types b/erts/emulator/beam/erl_alloc.types index 0a4407f009..2b649b589b 100644 --- a/erts/emulator/beam/erl_alloc.types +++ b/erts/emulator/beam/erl_alloc.types @@ -49,6 +49,8 @@ # true after a "+enable X" statement or if it has been passed as a # command line argument to make_alloc_types. The variable X is false # after a "+disable X" statement or if it has never been mentioned. +# +# IMPORTANT! Only use 7-bit ascii text in this file! +if smp +disable threads_no_smp diff --git a/erts/emulator/beam/erl_bif_ddll.c b/erts/emulator/beam/erl_bif_ddll.c index 7f7c975e78..59a53870f3 100644 --- a/erts/emulator/beam/erl_bif_ddll.c +++ b/erts/emulator/beam/erl_bif_ddll.c @@ -1869,7 +1869,10 @@ static Eterm build_load_error_hp(Eterm *hp, int code) static Eterm mkatom(char *str) { - return am_atom_put(str, sys_strlen(str)); + return erts_atom_put((byte *) str, + sys_strlen(str), + ERTS_ATOM_ENC_LATIN1, + 1); } static char *pick_list_or_atom(Eterm name_term) diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c index a3811ccdb0..c910bd0cb6 100755 --- a/erts/emulator/beam/erl_bif_info.c +++ b/erts/emulator/beam/erl_bif_info.c @@ -2296,8 +2296,10 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1) for (i = num_instructions-1; i >= 0; i--) { res = erts_bld_cons(hpp, hszp, erts_bld_tuple(hpp, hszp, 2, - am_atom_put(opc[i].name, - strlen(opc[i].name)), + erts_atom_put(opc[i].name, + strlen(opc[i].name), + ERTS_ATOM_ENC_LATIN1, + 1), erts_bld_uint(hpp, hszp, opc[i].count)), res); @@ -3901,7 +3903,7 @@ static Eterm lcnt_build_lock_stats_term(Eterm **hpp, Uint *szp, erts_lcnt_lock_s timer_ns = stats->timer.ns; timer_n = stats->timer_n; - af = am_atom_put(stats->file, strlen(stats->file)); + af = erts_atom_put(stats->file, strlen(stats->file), ERTS_ATOM_ENC_LATIN1, 1); uil = erts_bld_uint( hpp, szp, line); tloc = erts_bld_tuple(hpp, szp, 2, af, uil); @@ -3938,13 +3940,13 @@ static Eterm lcnt_build_lock_term(Eterm **hpp, Uint *szp, erts_lcnt_lock_t *lock ASSERT(ltype); - type = am_atom_put(ltype, strlen(ltype)); - name = am_atom_put(lock->name, strlen(lock->name)); + type = erts_atom_put(ltype, strlen(ltype), ERTS_ATOM_ENC_LATIN1, 1); + name = erts_atom_put(lock->name, strlen(lock->name), ERTS_ATOM_ENC_LATIN1, 1); if (lock->flag & ERTS_LCNT_LT_ALLOC) { /* use allocator types names as id's for allocator locks */ ltype = (char *) ERTS_ALC_A2AD(signed_val(lock->id)); - id = am_atom_put(ltype, strlen(ltype)); + id = erts_atom_put(ltype, strlen(ltype), ERTS_ATOM_ENC_LATIN1, 1); } else if (lock->flag & ERTS_LCNT_LT_PROCLOCK) { /* use registered names as id's for process locks if available */ proc = erts_proc_lookup(lock->id); @@ -3984,12 +3986,12 @@ static Eterm lcnt_build_result_term(Eterm **hpp, Uint *szp, erts_lcnt_data_t *da dtns = erts_bld_uint( hpp, szp, data->duration.ns); tdt = erts_bld_tuple(hpp, szp, 2, dts, dtns); - adur = am_atom_put(str_duration, strlen(str_duration)); + adur = erts_atom_put(str_duration, strlen(str_duration), ERTS_ATOM_ENC_LATIN1, 1); tdur = erts_bld_tuple(hpp, szp, 2, adur, tdt); /* lock tuple */ - aloc = am_atom_put(str_locks, strlen(str_locks)); + aloc = erts_atom_put(str_locks, strlen(str_locks), ERTS_ATOM_ENC_LATIN1, 1); for (lock = data->current_locks->head; lock != NULL ; lock = lock->next ) { lloc = lcnt_build_lock_term(hpp, szp, lock, lloc); @@ -4125,14 +4127,14 @@ BIF_RETTYPE erts_debug_lock_counters_1(BIF_ALIST_1) static void os_info_init(void) { - Eterm type = am_atom_put(os_type, strlen(os_type)); + Eterm type = erts_atom_put((byte *) os_type, strlen(os_type), ERTS_ATOM_ENC_LATIN1, 1); Eterm flav; int major, minor, build; char* buf = erts_alloc(ERTS_ALC_T_TMP, 1024); /* More than enough */ Eterm* hp; os_flavor(buf, 1024); - flav = am_atom_put(buf, strlen(buf)); + flav = erts_atom_put((byte *) buf, strlen(buf), ERTS_ATOM_ENC_LATIN1, 1); erts_free(ERTS_ALC_T_TMP, (void *) buf); hp = erts_alloc(ERTS_ALC_T_LL_TEMP_TERM, (3+4)*sizeof(Eterm)); os_type_tuple = TUPLE2(hp, type, flav); diff --git a/erts/emulator/beam/erl_bif_port.c b/erts/emulator/beam/erl_bif_port.c index f9009166c0..4b270414cb 100644 --- a/erts/emulator/beam/erl_bif_port.c +++ b/erts/emulator/beam/erl_bif_port.c @@ -68,7 +68,7 @@ BIF_RETTYPE open_port_2(BIF_ALIST_2) } else { str = "einval"; } - BIF_P->fvalue = am_atom_put(str, strlen(str)); + BIF_P->fvalue = erts_atom_put((byte *) str, strlen(str), ERTS_ATOM_ENC_LATIN1, 1); BIF_ERROR(BIF_P, EXC_ERROR); } diff --git a/erts/emulator/beam/erl_db.c b/erts/emulator/beam/erl_db.c index 1ba1048afa..f8a4882ec0 100644 --- a/erts/emulator/beam/erl_db.c +++ b/erts/emulator/beam/erl_db.c @@ -3815,7 +3815,7 @@ erts_ets_colliding_names(Process* p, Eterm name, Uint cnt) while (index >= atom_table_size()) { char tmp[20]; erts_snprintf(tmp, sizeof(tmp), "am%x", atom_table_size()); - am_atom_put(tmp,strlen(tmp)); + erts_atom_put((byte *) tmp, strlen(tmp), ERTS_ATOM_ENC_LATIN1, 1); } list = CONS(hp, make_atom(index), list); hp += 2; diff --git a/erts/emulator/beam/erl_db_util.c b/erts/emulator/beam/erl_db_util.c index 0c9ca83ce4..407a55a3d7 100644 --- a/erts/emulator/beam/erl_db_util.c +++ b/erts/emulator/beam/erl_db_util.c @@ -4768,7 +4768,8 @@ static int match_compact(ErlHeapFragment *expr, DMCErrInfo *err_info) ASSERT(j < x); erts_snprintf(buff+1, sizeof(buff) - 1, "%u", (unsigned) j); /* Yes, writing directly into terms, they ARE off heap */ - *p = am_atom_put(buff, strlen(buff)); + *p = erts_atom_put((byte *) buff, strlen(buff), + ERTS_ATOM_ENC_LATIN1, 1); } ++p; } diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c index 4b90e5394a..369eab5980 100644 --- a/erts/emulator/beam/erl_init.c +++ b/erts/emulator/beam/erl_init.c @@ -344,7 +344,7 @@ erl_first_process_otp(char* modname, void* code, unsigned size, int argc, char** ErlSpawnOpts so; Eterm env; - start_mod = am_atom_put(modname, sys_strlen(modname)); + start_mod = erts_atom_put((byte *) modname, sys_strlen(modname), ERTS_ATOM_ENC_LATIN1, 1); if (erts_find_function(start_mod, am_start, 2, erts_active_code_ix()) == NULL) { erl_exit(5, "No function %s:start/2\n", modname); @@ -441,7 +441,7 @@ load_preloaded(void) i = 0; while ((name = preload_p[i].name) != NULL) { length = preload_p[i].size; - module_name = am_atom_put(name, sys_strlen(name)); + module_name = erts_atom_put((byte *) name, sys_strlen(name), ERTS_ATOM_ENC_LATIN1, 1); if ((code = sys_preload_begin(&preload_p[i])) == 0) erl_exit(1, "Failed to find preloaded code for module %s\n", name); diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index e24b6f1458..6600ce4a4a 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -1154,15 +1154,24 @@ BIF_RETTYPE unicode_characters_to_list_2(BIF_ALIST_2) * When input to characters_to_list is a plain binary and the format is 'unicode', we do * a faster analyze and size count with this function. */ -int erts_analyze_utf8(byte *source, Uint size, - byte **err_pos, Uint *num_chars, int *left) +static ERTS_INLINE int +analyze_utf8(byte *source, Uint size, byte **err_pos, Uint *num_chars, int *left, + Sint *num_latin1_chars, Uint max_chars) { + Uint latin1_count; + int is_latin1; *err_pos = source; + if (num_latin1_chars) { + is_latin1 = 1; + latin1_count = 0; + } *num_chars = 0; while (size) { if (((*source) & ((byte) 0x80)) == 0) { source++; - --size; + --size; + if (num_latin1_chars) + latin1_count++; } else if (((*source) & ((byte) 0xE0)) == 0xC0) { if (size < 2) { return ERTS_UTF8_INCOMPLETE; @@ -1173,6 +1182,11 @@ int erts_analyze_utf8(byte *source, Uint size, } source += 2; size -= 2; + if (num_latin1_chars) { + latin1_count++; + if ((source[0] & ((byte) 0xFC)) != ((byte) 0xC0)) + is_latin1 = 0; + } } else if (((*source) & ((byte) 0xF0)) == 0xE0) { if (size < 3) { return ERTS_UTF8_INCOMPLETE; @@ -1188,6 +1202,8 @@ int erts_analyze_utf8(byte *source, Uint size, } source += 3; size -= 3; + if (num_latin1_chars) + is_latin1 = 0; } else if (((*source) & ((byte) 0xF8)) == 0xF0) { if (size < 4) { return ERTS_UTF8_INCOMPLETE; @@ -1205,22 +1221,41 @@ int erts_analyze_utf8(byte *source, Uint size, } source += 4; size -= 4; + if (num_latin1_chars) + is_latin1 = 0; } else { return ERTS_UTF8_ERROR; } ++(*num_chars); *err_pos = source; + if (max_chars && size > 0 && *num_chars == max_chars) + return ERTS_UTF8_OK_MAX_CHARS; if (left && --(*left) <= 0 && size) { return ERTS_UTF8_ANALYZE_MORE; } } + if (num_latin1_chars) + *num_latin1_chars = is_latin1 ? latin1_count : -1; return ERTS_UTF8_OK; } +int erts_analyze_utf8(byte *source, Uint size, + byte **err_pos, Uint *num_chars, int *left) +{ + return analyze_utf8(source, size, err_pos, num_chars, left, NULL, 0); +} + +int erts_analyze_utf8_x(byte *source, Uint size, + byte **err_pos, Uint *num_chars, int *left, + Sint *num_latin1_chars, Uint max_chars) +{ + return analyze_utf8(source, size, err_pos, num_chars, left, num_latin1_chars, max_chars); +} + /* * No errors should be able to occur - no overlongs, no malformed, no nothing - */ -Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, + */ +static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left, Uint *num_built, Uint *num_eaten, Eterm tail) { @@ -1275,6 +1310,12 @@ Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, return ret; } +Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left, + Uint *num_built, Uint *num_eaten, Eterm tail) +{ + return do_utf8_to_list(p, num, bytes, sz, left, num_built, num_eaten, tail); +} + static int is_candidate(Uint cp) { int index,pos; @@ -1849,14 +1890,14 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2) } static BIF_RETTYPE -binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist) +binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist) { byte* bytes; byte *temp_alloc = NULL; Uint bin_size; if ((bytes = erts_get_aligned_binary_bytes(bin, &temp_alloc)) == 0) { - BIF_ERROR(p, BADARG); + BIF_ERROR(proc, BADARG); } bin_size = binary_size(bin); if (enc == am_latin1) { @@ -1864,11 +1905,16 @@ binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist) if (bin_size > MAX_ATOM_CHARACTERS) { system_limit: erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(p, SYSTEM_LIMIT); + BIF_ERROR(proc, SYSTEM_LIMIT); } if (!must_exist) { - a = am_atom_put2(bytes, bin_size, 1); - erts_free_aligned_binary_bytes(temp_alloc); + a = erts_atom_put((byte *) bytes, + bin_size, + ERTS_ATOM_ENC_LATIN1, + 0); + erts_free_aligned_binary_bytes(temp_alloc); + if (is_non_value(a)) + goto badarg; BIF_RET(a); } else if (erts_atom_get((char *)bytes, bin_size, &a, 1)) { erts_free_aligned_binary_bytes(temp_alloc); @@ -1900,17 +1946,22 @@ binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist) } if (!must_exist) { - res = am_atom_put((char*)bytes, bin_size); + res = erts_atom_put((byte *) bytes, + bin_size, + ERTS_ATOM_ENC_UTF8, + 0); } else if (!erts_atom_get((char*)bytes, bin_size, &res, 0)) { goto badarg; } erts_free_aligned_binary_bytes(temp_alloc); + if (is_non_value(res)) + goto badarg; BIF_RET(res); } else { badarg: erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(p, BADARG); + BIF_ERROR(proc, BADARG); } } @@ -2625,30 +2676,70 @@ BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0) } } -/* Assumes 'dest' has enough room. - */ -int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen) +int erts_utf8_is_latin1_string(const byte *string, int len) +{ + /* Assumes string is encoded in valid UTF-8 */ + int i; + while (i < len) { + if ((string[i] & 0x80) == 0) + i++; + else if (i+1 < len + && (string[i] & 0xFE) == 0xC2 + && (string[i+1] & 0xC0) == 0x80) + i +=2; + else + return 0; + } + return 1; +} + +int erts_utf8_to_latin1(byte* dest, const byte* source, int slen) { + /* + * Assumes source contains valid utf8 that can be encoded as latin1, + * and that dest has enough room. + */ byte* dp = dest; while (slen > 0) { if ((source[0] & 0x80) == 0) { *dp++ = *source++; --slen; } - else if (slen > 1 && - (source[0] & 0xFE) == 0xC2 && - (source[1] & 0xC0) == 0x80) { + else { + ASSERT(slen > 1); + ASSERT((source[0] & 0xFE) == 0xC2); + ASSERT((source[1] & 0xC0) == 0x80); *dp++ = (char) ((source[0] << 6) | (source[1] & 0x3F)); source += 2; slen -= 2; } - else { - /* Just let unconvertable octets through. This should not happen - in a correctly upgraded system */ - *dp++ = *source++; - --slen; - } } return dp - dest; } +int erts_utf8_to_latin1_backwards(byte *dest, const byte *source, int slen) +{ + /* + * Assumes source contains valid utf8 that can be encoded as latin1, + * and that dest has enough room. + */ + int dix = 0; + int six = slen; + while (six > 0) { + six--; + dix--; + if ((source[six] & 0x80) == 0) + dest[dix] = source[six]; + else { + byte c; + ASSERT(six > 0); + ASSERT((source[six] & 0xC0) == 0x80); + ASSERT((source[six-1] & 0xFE) == 0xC2); + c = source[six] & 0x3F; + six--; + c |= source[six] << 6; + dest[dix] = c; + } + } + return -dix; +} diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c index 68edcd0fa6..8c4d9108d4 100644 --- a/erts/emulator/beam/external.c +++ b/erts/emulator/beam/external.c @@ -142,6 +142,7 @@ erts_init_atom_cache_map(ErtsAtomCacheMap *acmp) { if (acmp) { int ix; + acmp->long_atoms = 0; for (ix = 0; ix < ERTS_ATOM_CACHE_SIZE; ix++) acmp->cache[ix].iix = -1; acmp->sz = 0; @@ -154,6 +155,7 @@ erts_reset_atom_cache_map(ErtsAtomCacheMap *acmp) { if (acmp) { int i; + acmp->long_atoms = 0; for (i = 0; i < acmp->sz; i++) { ASSERT(0 <= acmp->cix[i] && acmp->cix[i] < ERTS_ATOM_CACHE_SIZE); acmp->cache[acmp->cix[i]].iix = -1; @@ -175,9 +177,23 @@ erts_destroy_atom_cache_map(ErtsAtomCacheMap *acmp) } static ERTS_INLINE void -insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) +insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom, Uint32 dflags) { - if (acmp && acmp->sz < ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES) { + /* + * If the receiver do not understand utf8 atoms + * and this atom cannot be represented in latin1, + * we are not allowed to cache it. + * + * In this case all atoms are assumed to have + * latin1 encoding in the cache. By refusing it + * in the cache we will instead encode it using + * ATOM_UTF8_EXT/SMALL_ATOM_UTF8_EXT which the + * receiver do not recognize and tear down the + * connection. + */ + if (acmp && acmp->sz < ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES + && ((dflags & DFLAG_UTF8_ATOMS) + || atom_tab(atom_val(atom))->latin1_chars >= 0)) { int ix; ASSERT(acmp->hdr_sz < 0); ix = atom2cix(atom); @@ -190,7 +206,7 @@ insert_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) } static ERTS_INLINE int -get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) +get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom, Uint32 dflags) { if (!acmp) return -1; @@ -199,7 +215,9 @@ get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) ASSERT(is_atom(atom)); ix = atom2cix(atom); if (acmp->cache[ix].iix < 0) { - ASSERT(acmp->sz == ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES); + ASSERT(acmp->sz == ERTS_MAX_INTERNAL_ATOM_CACHE_ENTRIES + || (!(dflags & DFLAG_UTF8_ATOMS) + && atom_tab(atom_val(atom))->latin1_chars < 0)); return -1; } else { @@ -210,18 +228,17 @@ get_iix_acache_map(ErtsAtomCacheMap *acmp, Eterm atom) } void -erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp) +erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp, Uint32 dflags) { if (acmp) { -#if MAX_ATOM_LENGTH > 255 -#error "This code is not complete; long_atoms info need to be passed to the following stages." - int long_atoms = 0; /* !0 if one or more atoms are long than 255. */ -#endif + int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS); + int long_atoms = 0; /* !0 if one or more atoms are longer than 255. */ int i; int sz; int fix_sz = 1 /* VERSION_MAGIC */ + 1 /* DIST_HEADER */ + + 1 /* dist header flags */ + 1 /* number of internal cache entries */ ; int min_sz; @@ -230,22 +247,23 @@ erts_finalize_atom_cache_map(ErtsAtomCacheMap *acmp) min_sz = fix_sz+(2+4)*acmp->sz; sz = fix_sz; for (i = 0; i < acmp->sz; i++) { + Atom *a; Eterm atom; int len; atom = acmp->cache[acmp->cix[i]].atom; ASSERT(is_atom(atom)); - len = atom_tab(atom_val(atom))->len; -#if MAX_ATOM_LENGTH > 255 + a = atom_tab(atom_val(atom)); + len = (int) (utf8_atoms ? a->len : a->latin1_chars); + ASSERT(len >= 0); if (!long_atoms && len > 255) long_atoms = 1; -#endif /* Enough for a new atom cache value */ sz += 1 /* cix */ + 1 /* length */ + len /* text */; } -#if MAX_ATOM_LENGTH > 255 - if (long_atoms) + if (long_atoms) { + acmp->long_atoms = 1; sz += acmp->sz; /* we need 2 bytes per atom for length */ -#endif + } /* Dynamically sized flag field */ sz += ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTES(acmp->sz); if (sz < min_sz) @@ -274,6 +292,7 @@ byte *erts_encode_ext_dist_header_setup(byte *ctl_ext, ErtsAtomCacheMap *acmp) else { int i; byte *ep = ctl_ext; + byte dist_hdr_flags = acmp->long_atoms ? ERTS_DIST_HDR_LONG_ATOMS_FLG : 0; ASSERT(acmp->hdr_sz >= 0); /* * Write cache update instructions. Note that this is a purely @@ -296,28 +315,36 @@ byte *erts_encode_ext_dist_header_setup(byte *ctl_ext, ErtsAtomCacheMap *acmp) } --ep; put_int8(acmp->sz, ep); + --ep; + put_int8(dist_hdr_flags, ep); *--ep = DIST_HEADER; *--ep = VERSION_MAGIC; return ep; } } -byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache) +byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache, Uint32 dflags) { byte *ip; byte instr_buf[(2+4)*ERTS_ATOM_CACHE_SIZE]; int ci, sz; + byte dist_hdr_flags; + int long_atoms; + int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS); register byte *ep = ext; ASSERT(ep[0] == VERSION_MAGIC); if (ep[1] != DIST_HEADER) return ext; + dist_hdr_flags = ep[2]; + long_atoms = ERTS_DIST_HDR_LONG_ATOMS_FLG & ((int) dist_hdr_flags); + /* * Update output atom cache and write the external version of * the dist header. We write the header backwards just * before the actual term(s). */ - ep += 2; + ep += 3; ci = (int) get_int8(ep); ASSERT(0 <= ci && ci < ERTS_ATOM_CACHE_SIZE); ep += 1; @@ -342,12 +369,7 @@ byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache) flgs_bytes = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTES(ci); ASSERT(flgs_bytes <= sizeof(flgs_buf)); -#if MAX_ATOM_LENGTH > 255 - /* long_atoms info needs to be passed from previous stages */ - if (long_atoms) - flgs |= ERTS_DIST_HDR_LONG_ATOMS_FLG; -#endif - flgs = 0; + flgs = (Uint32) dist_hdr_flags; flgs_buf_ix = 0; if ((ci & 1) == 0) used_half_bytes = 2; @@ -382,17 +404,22 @@ byte *erts_encode_ext_dist_header_finalize(byte *ext, ErtsAtomCache *cache) Atom *a; cache->out_arr[cix] = atom; a = atom_tab(atom_val(atom)); - sz = a->len; - ep -= sz; - sys_memcpy((void *) ep, (void *) a->name, sz); -#if MAX_ATOM_LENGTH > 255 + if (utf8_atoms) { + sz = a->len; + ep -= sz; + sys_memcpy((void *) ep, (void *) a->name, sz); + } + else { + ASSERT(0 <= a->latin1_chars && a->latin1_chars <= MAX_ATOM_CHARACTERS); + ep -= a->latin1_chars; + sz = erts_utf8_to_latin1(ep, a->name, a->len); + ASSERT(a->latin1_chars == sz); + } if (long_atoms) { ep -= 2; put_int16(sz, ep); } - else -#endif - { + else { ASSERT(0 <= sz && sz <= 255); --ep; put_int8(sz, ep); @@ -553,6 +580,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, #endif register byte *ep = ext; + int utf8_atoms = (int) (dep->flags & DFLAG_UTF8_ATOMS); edep->heap_size = -1; edep->ext_endp = ext+size; @@ -611,9 +639,7 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, ERTS_EXT_HDR_FAIL; ep++; if (no_atoms) { -#if MAX_ATOM_LENGTH > 255 int long_atoms = 0; -#endif #ifdef DEBUG byte *flgs_buf = ep; #endif @@ -632,14 +658,8 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, */ byte_ix = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BYTE_IX(no_atoms); bit_ix = ERTS_DIST_HDR_ATOM_CACHE_FLAG_BIT_IX(no_atoms); - if (flgsp[byte_ix] & (((byte) ERTS_DIST_HDR_LONG_ATOMS_FLG) - << bit_ix)) { -#if MAX_ATOM_LENGTH > 255 + if (flgsp[byte_ix] & (((byte) ERTS_DIST_HDR_LONG_ATOMS_FLG) << bit_ix)) long_atoms = 1; -#else - ERTS_EXT_HDR_FAIL; /* Long atoms not supported yet */ -#endif - } #ifdef DEBUG byte_ix = 0; @@ -707,23 +727,25 @@ erts_prepare_dist_ext(ErtsDistExternal *edep, if (cix >= ERTS_ATOM_CACHE_SIZE) ERTS_EXT_HDR_FAIL; ep++; -#if MAX_ATOM_CHARACTERS > 255 if (long_atoms) { CHKSIZE(2); len = get_int16(ep); ep += 2; } - else -#endif - { + else { CHKSIZE(1); len = get_int8(ep); ep++; } - if (len > MAX_ATOM_CHARACTERS) - ERTS_EXT_HDR_FAIL; /* Too long atom */ CHKSIZE(len); - atom = am_atom_put((char *) ep, len); + atom = erts_atom_put((byte *) ep, + len, + (utf8_atoms + ? ERTS_ATOM_ENC_UTF8 + : ERTS_ATOM_ENC_LATIN1), + 0); + if (is_non_value(atom)) + ERTS_EXT_HDR_FAIL; ep += len; cache->in_arr[cix] = atom; edep->attab.atom[tix] = atom; @@ -1404,7 +1426,8 @@ static byte* enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags) { int iix; - int i, j; + int len; + int utf8_atoms = (int) (dflags & DFLAG_UTF8_ATOMS); ASSERT(is_atom(atom)); @@ -1423,42 +1446,46 @@ enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags) } return ep; } + /* * term_to_binary/1,2 and the initial distribution message * don't use the cache. */ - iix = get_iix_acache_map(acmp, atom); - if (iix < 0) { - i = atom_val(atom); - j = atom_tab(i)->len; - if (dflags & DFLAG_UTF8_ATOMS) { - if (j <= 255) { + + iix = get_iix_acache_map(acmp, atom, dflags); + if (iix < 0) { + Atom *a = atom_tab(atom_val(atom)); + if (utf8_atoms || a->latin1_chars < 0) { + len = a->len; + if (len > 255) { *ep++ = ATOM_UTF8_EXT; - put_int16(j, ep); + put_int16(len, ep); ep += 2; } else { *ep++ = SMALL_ATOM_UTF8_EXT; - put_int8(j, ep); - ep += 2; + put_int8(len, ep); + ep += 1; } - sys_memcpy((char *) ep, (char*)atom_tab(i)->name, (int) j); + sys_memcpy((char *) ep, (char *) a->name, len); } else { - if (j <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) { + if (a->latin1_chars <= 255 && (dflags & DFLAG_SMALL_ATOM_TAGS)) { *ep++ = SMALL_ATOM_EXT; - j = erts_utf8_to_latin1(ep+1, atom_tab(i)->name, j); - put_int8(j, ep); + len = erts_utf8_to_latin1(ep+1, a->name, a->len); + ASSERT(len == a->latin1_chars); + put_int8(len, ep); ep++; } else { *ep++ = ATOM_EXT; - j = erts_utf8_to_latin1(ep+2, atom_tab(i)->name, j); - put_int16(j, ep); + len = erts_utf8_to_latin1(ep+2, a->name, a->len); + ASSERT(len == a->latin1_chars); + put_int16(len, ep); ep += 2; } } - ep += j; + ep += len; return ep; } @@ -1535,7 +1562,15 @@ dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp) goto error; } } else { - *objp = am_atom_put2(ep, len, is_latin1); + Eterm atom = erts_atom_put(ep, + len, + (is_latin1 + ? ERTS_ATOM_ENC_LATIN1 + : ERTS_ATOM_ENC_UTF8), + 0); + if (is_non_value(atom)) + goto error; + *objp = atom; } ep += len; break; @@ -2248,7 +2283,15 @@ dec_term_atom_common: goto error; } } else { - *objp = am_atom_put2(ep, n, is_latin1); + Eterm atom = erts_atom_put(ep, + n, + (is_latin1 + ? ERTS_ATOM_ENC_LATIN1 + : ERTS_ATOM_ENC_UTF8), + 0); + if (is_non_value(atom)) + goto error; + *objp = atom; } ep += n; break; @@ -2917,18 +2960,22 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) } } else { - int alen = atom_tab(atom_val(obj))->len; - result += 1 + 1 + alen; - if (dflags & DFLAG_UTF8_ATOMS) { + Atom *a = atom_tab(atom_val(obj)); + int alen; + if ((dflags & DFLAG_UTF8_ATOMS) || a->latin1_chars < 0) { + alen = a->len; + result += 1 + 1 + alen; if (alen > 255) { result++; /* ATOM_UTF8_EXT (not small) */ } - /*SVERK we use utf8 length which is an over estimation */ - } - else if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS)) { - result++; /* ATOM_EXT (not small) */ } - insert_acache_map(acmp, obj); + else { + alen = a->latin1_chars; + result += 1 + 1 + alen; + if (alen > 255 || !(dflags & DFLAG_SMALL_ATOM_TAGS)) + result++; /* ATOM_EXT (not small) */ + } + insert_acache_map(acmp, obj, dflags); } break; case SMALL_DEF: diff --git a/erts/emulator/beam/external.h b/erts/emulator/beam/external.h index 50eea62225..ad430117c8 100644 --- a/erts/emulator/beam/external.h +++ b/erts/emulator/beam/external.h @@ -92,6 +92,7 @@ typedef struct cache { typedef struct { int hdr_sz; int sz; + int long_atoms; int cix[ERTS_ATOM_CACHE_SIZE]; struct { Eterm atom; @@ -152,12 +153,12 @@ typedef struct { void erts_init_atom_cache_map(ErtsAtomCacheMap *); void erts_reset_atom_cache_map(ErtsAtomCacheMap *); void erts_destroy_atom_cache_map(ErtsAtomCacheMap *); -void erts_finalize_atom_cache_map(ErtsAtomCacheMap *); +void erts_finalize_atom_cache_map(ErtsAtomCacheMap *, Uint32); Uint erts_encode_ext_dist_header_size(ErtsAtomCacheMap *); Uint erts_encode_ext_dist_header_size(ErtsAtomCacheMap *); byte *erts_encode_ext_dist_header_setup(byte *, ErtsAtomCacheMap *); -byte *erts_encode_ext_dist_header_finalize(byte *, ErtsAtomCache *); +byte *erts_encode_ext_dist_header_finalize(byte *, ErtsAtomCache *, Uint32); Uint erts_encode_dist_ext_size(Eterm, Uint32, ErtsAtomCacheMap *); void erts_encode_dist_ext(Eterm, byte **, Uint32, ErtsAtomCacheMap *); diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h index 1500424d3e..eccdf10c75 100755 --- a/erts/emulator/beam/global.h +++ b/erts/emulator/beam/global.h @@ -1519,17 +1519,25 @@ Sint erts_native_filename_need(Eterm ioterm, int encoding); void erts_copy_utf8_to_utf16_little(byte *target, byte *bytes, int num_chars); int erts_analyze_utf8(byte *source, Uint size, byte **err_pos, Uint *num_chars, int *left); +int erts_analyze_utf8_x(byte *source, Uint size, + byte **err_pos, Uint *num_chars, int *left, + Sint *num_latin1_chars, Uint max_chars); char *erts_convert_filename_to_native(Eterm name, char *statbuf, size_t statbuf_size, ErtsAlcType_t alloc_type, int allow_empty, int allow_atom, Sint *used /* out */); Eterm erts_convert_native_to_filename(Process *p, byte *bytes); -int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen); +Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left, + Uint *num_built, Uint *num_eaten, Eterm tail); +int erts_utf8_is_latin1_string(const byte *string, int len); +int erts_utf8_to_latin1(byte* dest, const byte* source, int slen); +int erts_utf8_to_latin1_backwards(byte* dest, const byte* source, int slen); #define ERTS_UTF8_OK 0 #define ERTS_UTF8_INCOMPLETE 1 #define ERTS_UTF8_ERROR 2 #define ERTS_UTF8_ANALYZE_MORE 3 +#define ERTS_UTF8_OK_MAX_CHARS 4 /* erl_trace.c */ void erts_init_trace(void); diff --git a/erts/emulator/beam/io.c b/erts/emulator/beam/io.c index 60b9238d38..b1eb75bede 100644 --- a/erts/emulator/beam/io.c +++ b/erts/emulator/beam/io.c @@ -646,8 +646,11 @@ erts_open_driver(erts_driver_t* driver, /* Pointer to driver. */ if (IS_TRACED_FL(port, F_TRACE_PORTS)) { trace_port_open(port, - pid, - am_atom_put(port->name, strlen(port->name))); + pid, + erts_atom_put((byte *) port->name, + strlen(port->name), + ERTS_ATOM_ENC_LATIN1, + 1)); } if (driver->start) { @@ -4765,7 +4768,8 @@ int driver_exit(ErlDrvPort ix, int err) return driver_failure_term(ix, am_normal, 0); else { char* err_str = erl_errno_id(err); - Eterm am_err = am_atom_put(err_str, sys_strlen(err_str)); + Eterm am_err = erts_atom_put((byte *) err_str, sys_strlen(err_str), + ERTS_ATOM_ENC_LATIN1, 1); return driver_failure_term(ix, am_err, 0); } } @@ -4778,8 +4782,12 @@ int driver_failure(ErlDrvPort ix, int code) int driver_failure_atom(ErlDrvPort ix, char* string) { - Eterm am = am_atom_put(string, strlen(string)); - return driver_failure_term(ix, am, 0); + return driver_failure_term(ix, + erts_atom_put((byte *) string, + strlen(string), + ERTS_ATOM_ENC_LATIN1, + 1), + 0); } int driver_failure_posix(ErlDrvPort ix, int err) @@ -4796,7 +4804,10 @@ int driver_failure_eof(ErlDrvPort ix) ErlDrvTermData driver_mk_atom(char* string) { - Eterm am = am_atom_put(string, sys_strlen(string)); + Eterm am = erts_atom_put((byte *) string, + sys_strlen(string), + ERTS_ATOM_ENC_LATIN1, + 1); ERTS_SMP_CHK_NO_PROC_LOCKS; return (ErlDrvTermData) am; } @@ -5091,7 +5102,10 @@ init_driver(erts_driver_t *drv, ErlDrvEntry *de, DE_Handle *handle) erts_smp_mtx_init_x(drv->lock, "driver_lock", #if defined(ERTS_ENABLE_LOCK_CHECK) || defined(ERTS_ENABLE_LOCK_COUNT) - am_atom_put(drv->name, sys_strlen(drv->name)) + erts_atom_put((byte *) drv->name, + sys_strlen(drv->name), + ERTS_ATOM_ENC_LATIN1, + 1) #else NIL #endif diff --git a/erts/emulator/beam/utils.c b/erts/emulator/beam/utils.c index 1969fc762c..97b6d01207 100644 --- a/erts/emulator/beam/utils.c +++ b/erts/emulator/beam/utils.c @@ -370,7 +370,7 @@ Eterm erts_bld_atom(Uint **hpp, Uint *szp, char *str) { if (hpp) - return am_atom_put(str, sys_strlen(str)); + return erts_atom_put((byte *) str, sys_strlen(str), ERTS_ATOM_ENC_LATIN1, 1); else return THE_NON_VALUE; } -- cgit v1.2.3