aboutsummaryrefslogtreecommitdiffstats
path: root/erts/emulator/beam/atom.c
diff options
context:
space:
mode:
authorRickard Green <[email protected]>2013-01-10 12:47:46 +0100
committerRickard Green <[email protected]>2013-01-16 17:16:52 +0100
commit0dd3b88cdf90283d9c276ee415f985cb764e522f (patch)
tree1584d76d9960339a03c04412ef7919473e7b2efc /erts/emulator/beam/atom.c
parent5d79f55ca441727578d34b78ee0d6d8aa80976ee (diff)
downloadotp-0dd3b88cdf90283d9c276ee415f985cb764e522f.tar.gz
otp-0dd3b88cdf90283d9c276ee415f985cb764e522f.tar.bz2
otp-0dd3b88cdf90283d9c276ee415f985cb764e522f.zip
UTF-8 support for distribution
Diffstat (limited to 'erts/emulator/beam/atom.c')
-rw-r--r--erts/emulator/beam/atom.c164
1 files changed, 114 insertions, 50 deletions
diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c
index b41a98f2a2..82dd320ea9 100644
--- a/erts/emulator/beam/atom.c
+++ b/erts/emulator/beam/atom.c
@@ -162,6 +162,7 @@ atom_alloc(Atom* tmpl)
obj->name = atom_text_alloc(tmpl->len);
sys_memcpy(obj->name, tmpl->name, tmpl->len);
obj->len = tmpl->len;
+ obj->latin1_chars = tmpl->latin1_chars;
obj->slot.index = -1;
/*
@@ -192,48 +193,6 @@ atom_free(Atom* obj)
erts_free(ERTS_ALC_T_ATOM, (void*) obj);
}
-Eterm
-am_atom_put(const char* name, int len)
-{
- Atom a;
- Eterm ret;
- int aix;
-#ifdef DEBUG
- byte* err_pos;
- Uint num_chars;
- ASSERT(erts_analyze_utf8(name, len, &err_pos, &num_chars, NULL) == ERTS_UTF8_OK);
-#endif
- /*
- * Silently truncate the atom if it is too long. Overlong atoms
- * could occur in situations where we have no good way to return
- * an error, such as in the I/O system. (Unfortunately, many
- * drivers don't check for errors.)
- *
- * If an error should be produced for overlong atoms (such in
- * list_to_atom/1), the caller should check the length before
- * calling this function.
- */
- if (len > MAX_ATOM_SZ_LIMIT) {
- len = MAX_ATOM_SZ_LIMIT; /*SVERK Urk... */
- }
-#ifdef ERTS_ATOM_PUT_OPS_STAT
- erts_smp_atomic_inc_nob(&atom_put_ops);
-#endif
- a.len = len;
- a.name = (byte*)name;
- atom_read_lock();
- aix = index_get(&erts_atom_table, (void*) &a);
- atom_read_unlock();
- if (aix >= 0)
- ret = make_atom(aix);
- else {
- atom_write_lock();
- ret = make_atom(index_put(&erts_atom_table, (void*) &a));
- atom_write_unlock();
- }
- return ret;
-}
-
static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp)
{
byte* dst;
@@ -264,19 +223,116 @@ need_convertion:
*lenp = dst - conv_buf;
}
-
+/*
+ * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned!
+ */
Eterm
-am_atom_put2(const byte* name, int len, int is_latin1)
+erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
{
byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
+ const byte *text = name;
+ int tlen = len;
+ Sint no_latin1_chars;
+ Atom a;
+ int aix;
- if (is_latin1) {
- latin1_to_utf8(utf8_copy, &name, &len);
+#ifdef ERTS_ATOM_PUT_OPS_STAT
+ erts_smp_atomic_inc_nob(&atom_put_ops);
+#endif
+
+ if (tlen < 0) {
+ if (trunc)
+ tlen = 0;
+ else
+ return THE_NON_VALUE;
}
- return am_atom_put((const char*)name, len);
-}
+ switch (enc) {
+ case ERTS_ATOM_ENC_7BIT_ASCII:
+ if (tlen > MAX_ATOM_CHARACTERS) {
+ if (trunc)
+ tlen = MAX_ATOM_CHARACTERS;
+ else
+ return THE_NON_VALUE;
+ }
+#ifdef DEBUG
+ for (aix = 0; aix < len; aix++) {
+ ASSERT((name[aix] & 0x80) == 0);
+ }
+#endif
+ no_latin1_chars = tlen;
+ break;
+ case ERTS_ATOM_ENC_LATIN1:
+ if (tlen > MAX_ATOM_CHARACTERS) {
+ if (trunc)
+ tlen = MAX_ATOM_CHARACTERS;
+ else
+ return THE_NON_VALUE;
+ }
+ no_latin1_chars = tlen;
+ latin1_to_utf8(utf8_copy, &text, &tlen);
+ break;
+ case ERTS_ATOM_ENC_UTF8:
+ /* First sanity check; need to verify later */
+ if (tlen > MAX_ATOM_SZ_LIMIT && !trunc)
+ return THE_NON_VALUE;
+ break;
+ }
+ a.len = tlen;
+ a.name = (byte *) text;
+ atom_read_lock();
+ aix = index_get(&erts_atom_table, (void*) &a);
+ atom_read_unlock();
+ if (aix >= 0) {
+ /* Already in table no need to verify it */
+ return make_atom(aix);
+ }
+
+ if (enc == ERTS_ATOM_ENC_UTF8) {
+ /* Need to verify encoding and length */
+ byte *err_pos;
+ Uint no_chars;
+ switch (erts_analyze_utf8_x((byte *) text,
+ (Uint) tlen,
+ &err_pos,
+ &no_chars, NULL,
+ &no_latin1_chars,
+ MAX_ATOM_CHARACTERS)) {
+ case ERTS_UTF8_OK:
+ ASSERT(no_chars <= MAX_ATOM_CHARACTERS);
+ break;
+ case ERTS_UTF8_OK_MAX_CHARS:
+ /* Truncated... */
+ if (!trunc)
+ return THE_NON_VALUE;
+ ASSERT(no_chars == MAX_ATOM_CHARACTERS);
+ tlen = err_pos - text;
+ break;
+ default:
+ /* Bad utf8... */
+ return THE_NON_VALUE;
+ }
+ }
+
+ ASSERT(tlen <= MAX_ATOM_SZ_LIMIT);
+ ASSERT(-1 <= no_latin1_chars && no_latin1_chars <= MAX_ATOM_CHARACTERS);
+
+ a.len = tlen;
+ a.latin1_chars = (Sint16) no_latin1_chars;
+ a.name = (byte *) text;
+ atom_write_lock();
+ aix = index_put(&erts_atom_table, (void*) &a);
+ atom_write_unlock();
+ return make_atom(aix);
+}
+
+Eterm
+am_atom_put(const char* name, int len)
+{
+ /* Assumes 7-bit ascii; use erts_atom_put() for other encodings... */
+ return erts_atom_put((byte *) name, len, ERTS_ATOM_ENC_7BIT_ASCII, 1);
+}
int atom_table_size(void)
{
@@ -318,10 +374,11 @@ erts_atom_get(const char *name, int len, Eterm* ap, int is_latin1)
int i;
int res;
- a.len = len;
+ a.len = (Sint16) len;
a.name = (byte *)name;
if (is_latin1) {
- latin1_to_utf8(utf8_copy, (const byte**)&a.name, &a.len);
+ latin1_to_utf8(utf8_copy, (const byte**)&a.name, &len);
+ a.len = (Sint16) len;
}
atom_read_lock();
i = index_get(&erts_atom_table, (void*) &a);
@@ -384,8 +441,15 @@ init_atom_table(void)
for (i = 0; erl_atom_names[i] != 0; i++) {
int ix;
a.len = strlen(erl_atom_names[i]);
+ a.latin1_chars = a.len;
a.name = (byte*)erl_atom_names[i];
a.slot.index = i;
+#ifdef DEBUG
+ /* Verify 7-bit ascii */
+ for (ix = 0; ix < a.len; ix++) {
+ ASSERT((a.name[ix] & 0x80) == 0);
+ }
+#endif
ix = index_put(&erts_atom_table, (void*) &a);
atom_text_pos -= a.len;
atom_space -= a.len;