From 0dd3b88cdf90283d9c276ee415f985cb764e522f Mon Sep 17 00:00:00 2001 From: Rickard Green Date: Thu, 10 Jan 2013 12:47:46 +0100 Subject: UTF-8 support for distribution --- erts/emulator/beam/atom.c | 164 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 114 insertions(+), 50 deletions(-) (limited to 'erts/emulator/beam/atom.c') diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c index b41a98f2a2..82dd320ea9 100644 --- a/erts/emulator/beam/atom.c +++ b/erts/emulator/beam/atom.c @@ -162,6 +162,7 @@ atom_alloc(Atom* tmpl) obj->name = atom_text_alloc(tmpl->len); sys_memcpy(obj->name, tmpl->name, tmpl->len); obj->len = tmpl->len; + obj->latin1_chars = tmpl->latin1_chars; obj->slot.index = -1; /* @@ -192,48 +193,6 @@ atom_free(Atom* obj) erts_free(ERTS_ALC_T_ATOM, (void*) obj); } -Eterm -am_atom_put(const char* name, int len) -{ - Atom a; - Eterm ret; - int aix; -#ifdef DEBUG - byte* err_pos; - Uint num_chars; - ASSERT(erts_analyze_utf8(name, len, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK); -#endif - /* - * Silently truncate the atom if it is too long. Overlong atoms - * could occur in situations where we have no good way to return - * an error, such as in the I/O system. (Unfortunately, many - * drivers don't check for errors.) - * - * If an error should be produced for overlong atoms (such in - * list_to_atom/1), the caller should check the length before - * calling this function. - */ - if (len > MAX_ATOM_SZ_LIMIT) { - len = MAX_ATOM_SZ_LIMIT; /*SVERK Urk... */ - } -#ifdef ERTS_ATOM_PUT_OPS_STAT - erts_smp_atomic_inc_nob(&atom_put_ops); -#endif - a.len = len; - a.name = (byte*)name; - atom_read_lock(); - aix = index_get(&erts_atom_table, (void*) &a); - atom_read_unlock(); - if (aix >= 0) - ret = make_atom(aix); - else { - atom_write_lock(); - ret = make_atom(index_put(&erts_atom_table, (void*) &a)); - atom_write_unlock(); - } - return ret; -} - static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp) { byte* dst; @@ -264,19 +223,116 @@ need_convertion: *lenp = dst - conv_buf; } - +/* + * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned! + */ Eterm -am_atom_put2(const byte* name, int len, int is_latin1) +erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) { byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1]; + const byte *text = name; + int tlen = len; + Sint no_latin1_chars; + Atom a; + int aix; - if (is_latin1) { - latin1_to_utf8(utf8_copy, &name, &len); +#ifdef ERTS_ATOM_PUT_OPS_STAT + erts_smp_atomic_inc_nob(&atom_put_ops); +#endif + + if (tlen < 0) { + if (trunc) + tlen = 0; + else + return THE_NON_VALUE; } - return am_atom_put((const char*)name, len); -} + switch (enc) { + case ERTS_ATOM_ENC_7BIT_ASCII: + if (tlen > MAX_ATOM_CHARACTERS) { + if (trunc) + tlen = MAX_ATOM_CHARACTERS; + else + return THE_NON_VALUE; + } +#ifdef DEBUG + for (aix = 0; aix < len; aix++) { + ASSERT((name[aix] & 0x80) == 0); + } +#endif + no_latin1_chars = tlen; + break; + case ERTS_ATOM_ENC_LATIN1: + if (tlen > MAX_ATOM_CHARACTERS) { + if (trunc) + tlen = MAX_ATOM_CHARACTERS; + else + return THE_NON_VALUE; + } + no_latin1_chars = tlen; + latin1_to_utf8(utf8_copy, &text, &tlen); + break; + case ERTS_ATOM_ENC_UTF8: + /* First sanity check; need to verify later */ + if (tlen > MAX_ATOM_SZ_LIMIT && !trunc) + return THE_NON_VALUE; + break; + } + a.len = tlen; + a.name = (byte *) text; + atom_read_lock(); + aix = index_get(&erts_atom_table, (void*) &a); + atom_read_unlock(); + if (aix >= 0) { + /* Already in table no need to verify it */ + return make_atom(aix); + } + + if (enc == ERTS_ATOM_ENC_UTF8) { + /* Need to verify encoding and length */ + byte *err_pos; + Uint no_chars; + switch (erts_analyze_utf8_x((byte *) text, + (Uint) tlen, + &err_pos, + &no_chars, NULL, + &no_latin1_chars, + MAX_ATOM_CHARACTERS)) { + case ERTS_UTF8_OK: + ASSERT(no_chars <= MAX_ATOM_CHARACTERS); + break; + case ERTS_UTF8_OK_MAX_CHARS: + /* Truncated... */ + if (!trunc) + return THE_NON_VALUE; + ASSERT(no_chars == MAX_ATOM_CHARACTERS); + tlen = err_pos - text; + break; + default: + /* Bad utf8... */ + return THE_NON_VALUE; + } + } + + ASSERT(tlen <= MAX_ATOM_SZ_LIMIT); + ASSERT(-1 <= no_latin1_chars && no_latin1_chars <= MAX_ATOM_CHARACTERS); + + a.len = tlen; + a.latin1_chars = (Sint16) no_latin1_chars; + a.name = (byte *) text; + atom_write_lock(); + aix = index_put(&erts_atom_table, (void*) &a); + atom_write_unlock(); + return make_atom(aix); +} + +Eterm +am_atom_put(const char* name, int len) +{ + /* Assumes 7-bit ascii; use erts_atom_put() for other encodings... */ + return erts_atom_put((byte *) name, len, ERTS_ATOM_ENC_7BIT_ASCII, 1); +} int atom_table_size(void) { @@ -318,10 +374,11 @@ erts_atom_get(const char *name, int len, Eterm* ap, int is_latin1) int i; int res; - a.len = len; + a.len = (Sint16) len; a.name = (byte *)name; if (is_latin1) { - latin1_to_utf8(utf8_copy, (const byte**)&a.name, &a.len); + latin1_to_utf8(utf8_copy, (const byte**)&a.name, &len); + a.len = (Sint16) len; } atom_read_lock(); i = index_get(&erts_atom_table, (void*) &a); @@ -384,8 +441,15 @@ init_atom_table(void) for (i = 0; erl_atom_names[i] != 0; i++) { int ix; a.len = strlen(erl_atom_names[i]); + a.latin1_chars = a.len; a.name = (byte*)erl_atom_names[i]; a.slot.index = i; +#ifdef DEBUG + /* Verify 7-bit ascii */ + for (ix = 0; ix < a.len; ix++) { + ASSERT((a.name[ix] & 0x80) == 0); + } +#endif ix = index_put(&erts_atom_table, (void*) &a); atom_text_pos -= a.len; atom_space -= a.len; -- cgit v1.2.3