diff options
Diffstat (limited to 'erts/emulator/beam/atom.c')
-rw-r--r-- | erts/emulator/beam/atom.c | 181 |
1 files changed, 152 insertions, 29 deletions
diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c index d7c7f117cf..84d2d5e3ed 100644 --- a/erts/emulator/beam/atom.c +++ b/erts/emulator/beam/atom.c @@ -1,7 +1,7 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 1996-2011. All Rights Reserved. + * Copyright Ericsson AB 1996-2013. All Rights Reserved. * * The contents of this file are subject to the Erlang Public License, * Version 1.1, (the "License"); you may not use this file except in @@ -111,7 +111,7 @@ atom_text_alloc(int bytes) { byte *res; - ASSERT(bytes <= MAX_ATOM_LENGTH); + ASSERT(bytes <= MAX_ATOM_SZ_LIMIT); if (atom_text_pos + bytes >= atom_text_end) { more_atom_space(); } @@ -132,9 +132,17 @@ atom_hash(Atom* obj) byte* p = obj->name; int len = obj->len; HashValue h = 0, g; + byte v; while(len--) { - h = (h << 4) + *p++; + v = *p++; + /* latin1 clutch for r16 */ + if ((v & 0xFE) == 0xC2 && (*p & 0xC0) == 0x80) { + v = (v << 6) | (*p & 0x3F); + p++; len--; + } + /* normal hashpjw follows for v */ + h = (h << 4) + v; if ((g = h & 0xf0000000)) { h ^= (g >> 24); h ^= g; @@ -162,6 +170,7 @@ atom_alloc(Atom* tmpl) obj->name = atom_text_alloc(tmpl->len); sys_memcpy(obj->name, tmpl->name, tmpl->len); obj->len = tmpl->len; + obj->latin1_chars = tmpl->latin1_chars; obj->slot.index = -1; /* @@ -192,44 +201,146 @@ atom_free(Atom* obj) erts_free(ERTS_ALC_T_ATOM, (void*) obj); } +static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp) +{ + byte* dst; + const byte* src = *srcp; + int i, len = *lenp; + + for (i=0 ; i < len; ++i) { + if (src[i] & 0x80) { + goto need_convertion; + } + } + return; + +need_convertion: + sys_memcpy(conv_buf, src, i); + dst = conv_buf + i; + for ( ; i < len; ++i) { + unsigned char chr = src[i]; + if (!(chr & 0x80)) { + *dst++ = chr; + } + else { + *dst++ = 0xC0 | (chr >> 6); + *dst++ = 0x80 | (chr & 0x3F); + } + } + *srcp = conv_buf; + *lenp = dst - conv_buf; +} + +/* + * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned! + */ Eterm -am_atom_put(const char* name, int len) +erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) { + byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1]; + const byte *text = name; + int tlen = len; + Sint no_latin1_chars; Atom a; - Eterm ret; int aix; - /* - * Silently truncate the atom if it is too long. Overlong atoms - * could occur in situations where we have no good way to return - * an error, such as in the I/O system. (Unfortunately, many - * drivers don't check for errors.) - * - * If an error should be produced for overlong atoms (such in - * list_to_atom/1), the caller should check the length before - * calling this function. - */ - if (len > MAX_ATOM_LENGTH) { - len = MAX_ATOM_LENGTH; - } #ifdef ERTS_ATOM_PUT_OPS_STAT erts_smp_atomic_inc_nob(&atom_put_ops); #endif - a.len = len; - a.name = (byte*)name; + + if (tlen < 0) { + if (trunc) + tlen = 0; + else + return THE_NON_VALUE; + } + + switch (enc) { + case ERTS_ATOM_ENC_7BIT_ASCII: + if (tlen > MAX_ATOM_CHARACTERS) { + if (trunc) + tlen = MAX_ATOM_CHARACTERS; + else + return THE_NON_VALUE; + } +#ifdef DEBUG + for (aix = 0; aix < len; aix++) { + ASSERT((name[aix] & 0x80) == 0); + } +#endif + no_latin1_chars = tlen; + break; + case ERTS_ATOM_ENC_LATIN1: + if (tlen > MAX_ATOM_CHARACTERS) { + if (trunc) + tlen = MAX_ATOM_CHARACTERS; + else + return THE_NON_VALUE; + } + no_latin1_chars = tlen; + latin1_to_utf8(utf8_copy, &text, &tlen); + break; + case ERTS_ATOM_ENC_UTF8: + /* First sanity check; need to verify later */ + if (tlen > MAX_ATOM_SZ_LIMIT && !trunc) + return THE_NON_VALUE; + break; + } + + a.len = tlen; + a.name = (byte *) text; atom_read_lock(); aix = index_get(&erts_atom_table, (void*) &a); atom_read_unlock(); - if (aix >= 0) - ret = make_atom(aix); - else { - atom_write_lock(); - ret = make_atom(index_put(&erts_atom_table, (void*) &a)); - atom_write_unlock(); + if (aix >= 0) { + /* Already in table no need to verify it */ + return make_atom(aix); } - return ret; + + if (enc == ERTS_ATOM_ENC_UTF8) { + /* Need to verify encoding and length */ + byte *err_pos; + Uint no_chars; + switch (erts_analyze_utf8_x((byte *) text, + (Uint) tlen, + &err_pos, + &no_chars, NULL, + &no_latin1_chars, + MAX_ATOM_CHARACTERS)) { + case ERTS_UTF8_OK: + ASSERT(no_chars <= MAX_ATOM_CHARACTERS); + break; + case ERTS_UTF8_OK_MAX_CHARS: + /* Truncated... */ + if (!trunc) + return THE_NON_VALUE; + ASSERT(no_chars == MAX_ATOM_CHARACTERS); + tlen = err_pos - text; + break; + default: + /* Bad utf8... */ + return THE_NON_VALUE; + } + } + + ASSERT(tlen <= MAX_ATOM_SZ_LIMIT); + ASSERT(-1 <= no_latin1_chars && no_latin1_chars <= MAX_ATOM_CHARACTERS); + + a.len = tlen; + a.latin1_chars = (Sint16) no_latin1_chars; + a.name = (byte *) text; + atom_write_lock(); + aix = index_put(&erts_atom_table, (void*) &a); + atom_write_unlock(); + return make_atom(aix); } +Eterm +am_atom_put(const char* name, int len) +{ + /* Assumes 7-bit ascii; use erts_atom_put() for other encodings... */ + return erts_atom_put((byte *) name, len, ERTS_ATOM_ENC_7BIT_ASCII, 1); +} int atom_table_size(void) { @@ -264,14 +375,19 @@ int atom_table_sz(void) } int -erts_atom_get(const char *name, int len, Eterm* ap) +erts_atom_get(const char *name, int len, Eterm* ap, ErtsAtomEncoding enc) { + byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1]; Atom a; int i; int res; - a.len = len; + a.len = (Sint16) len; a.name = (byte *)name; + if (enc == ERTS_ATOM_ENC_LATIN1) { + latin1_to_utf8(utf8_copy, (const byte**)&a.name, &len); + a.len = (Sint16) len; + } atom_read_lock(); i = index_get(&erts_atom_table, (void*) &a); res = i < 0 ? 0 : (*ap = make_atom(i), 1); @@ -333,8 +449,15 @@ init_atom_table(void) for (i = 0; erl_atom_names[i] != 0; i++) { int ix; a.len = strlen(erl_atom_names[i]); + a.latin1_chars = a.len; a.name = (byte*)erl_atom_names[i]; a.slot.index = i; +#ifdef DEBUG + /* Verify 7-bit ascii */ + for (ix = 0; ix < a.len; ix++) { + ASSERT((a.name[ix] & 0x80) == 0); + } +#endif ix = index_put(&erts_atom_table, (void*) &a); atom_text_pos -= a.len; atom_space -= a.len; |