aboutsummaryrefslogtreecommitdiffstats
path: root/erts/emulator/beam/atom.c
diff options
context:
space:
mode:
authorSverker Eriksson <[email protected]>2013-01-23 18:09:35 +0100
committerSverker Eriksson <[email protected]>2013-01-23 18:09:35 +0100
commitb8e623410d1c22fe6d5fdeb8ccb0b2305533f033 (patch)
tree708d64e36e18b61ae1801c02ec3aeef42a697be3 /erts/emulator/beam/atom.c
parente99df74bee7c245ec76678e336fcd09d4b51a089 (diff)
parentd6e3e256b850050b7a86323b2948009d5fcc30a9 (diff)
downloadotp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.tar.gz
otp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.tar.bz2
otp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.zip
Merge branch 'sverk/r16/utf8-atoms'
* sverk/r16/utf8-atoms: erl_interface: Fix bug when transcoding atoms from and to UTF8 erl_interface: Changed erlang_char_encoding interface erts: Testcase doing unicode atom printout with ~w erl_interface: even more utf8 atom stuff erts: Fix bug in analyze_utf8 causing faulty latin1 detection Add UTF-8 node name support for epmd workaround... Fix merge conflict with hasse UTF-8 atom documentation test case erl_interface: utf8 atoms continued Add utf8 atom distribution test cases atom fixes for NIFs and atom_to_binary UTF-8 support for distribution Implement UTF-8 atom support for jinterface erl_interface: Enable decode of unicode atoms stdlib: Fix printing of unicode atoms erts: Change internal representation of atoms to utf8 erts: Refactor rename DFLAG(S)_INTERNAL_TAGS for conformity Conflicts: erts/emulator/beam/io.c OTP-10753
Diffstat (limited to 'erts/emulator/beam/atom.c')
-rw-r--r--erts/emulator/beam/atom.c169
1 files changed, 142 insertions, 27 deletions
diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c
index d7c7f117cf..82dd320ea9 100644
--- a/erts/emulator/beam/atom.c
+++ b/erts/emulator/beam/atom.c
@@ -111,7 +111,7 @@ atom_text_alloc(int bytes)
{
byte *res;
- ASSERT(bytes <= MAX_ATOM_LENGTH);
+ ASSERT(bytes <= MAX_ATOM_SZ_LIMIT);
if (atom_text_pos + bytes >= atom_text_end) {
more_atom_space();
}
@@ -162,6 +162,7 @@ atom_alloc(Atom* tmpl)
obj->name = atom_text_alloc(tmpl->len);
sys_memcpy(obj->name, tmpl->name, tmpl->len);
obj->len = tmpl->len;
+ obj->latin1_chars = tmpl->latin1_chars;
obj->slot.index = -1;
/*
@@ -192,44 +193,146 @@ atom_free(Atom* obj)
erts_free(ERTS_ALC_T_ATOM, (void*) obj);
}
+static void latin1_to_utf8(byte* conv_buf, const byte** srcp, int* lenp)
+{
+ byte* dst;
+ const byte* src = *srcp;
+ int i, len = *lenp;
+
+ for (i=0 ; i < len; ++i) {
+ if (src[i] & 0x80) {
+ goto need_convertion;
+ }
+ }
+ return;
+
+need_convertion:
+ sys_memcpy(conv_buf, src, i);
+ dst = conv_buf + i;
+ for ( ; i < len; ++i) {
+ unsigned char chr = src[i];
+ if (!(chr & 0x80)) {
+ *dst++ = chr;
+ }
+ else {
+ *dst++ = 0xC0 | (chr >> 6);
+ *dst++ = 0x80 | (chr & 0x3F);
+ }
+ }
+ *srcp = conv_buf;
+ *lenp = dst - conv_buf;
+}
+
+/*
+ * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned!
+ */
Eterm
-am_atom_put(const char* name, int len)
+erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc)
{
+ byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
+ const byte *text = name;
+ int tlen = len;
+ Sint no_latin1_chars;
Atom a;
- Eterm ret;
int aix;
- /*
- * Silently truncate the atom if it is too long. Overlong atoms
- * could occur in situations where we have no good way to return
- * an error, such as in the I/O system. (Unfortunately, many
- * drivers don't check for errors.)
- *
- * If an error should be produced for overlong atoms (such in
- * list_to_atom/1), the caller should check the length before
- * calling this function.
- */
- if (len > MAX_ATOM_LENGTH) {
- len = MAX_ATOM_LENGTH;
- }
#ifdef ERTS_ATOM_PUT_OPS_STAT
erts_smp_atomic_inc_nob(&atom_put_ops);
#endif
- a.len = len;
- a.name = (byte*)name;
+
+ if (tlen < 0) {
+ if (trunc)
+ tlen = 0;
+ else
+ return THE_NON_VALUE;
+ }
+
+ switch (enc) {
+ case ERTS_ATOM_ENC_7BIT_ASCII:
+ if (tlen > MAX_ATOM_CHARACTERS) {
+ if (trunc)
+ tlen = MAX_ATOM_CHARACTERS;
+ else
+ return THE_NON_VALUE;
+ }
+#ifdef DEBUG
+ for (aix = 0; aix < len; aix++) {
+ ASSERT((name[aix] & 0x80) == 0);
+ }
+#endif
+ no_latin1_chars = tlen;
+ break;
+ case ERTS_ATOM_ENC_LATIN1:
+ if (tlen > MAX_ATOM_CHARACTERS) {
+ if (trunc)
+ tlen = MAX_ATOM_CHARACTERS;
+ else
+ return THE_NON_VALUE;
+ }
+ no_latin1_chars = tlen;
+ latin1_to_utf8(utf8_copy, &text, &tlen);
+ break;
+ case ERTS_ATOM_ENC_UTF8:
+ /* First sanity check; need to verify later */
+ if (tlen > MAX_ATOM_SZ_LIMIT && !trunc)
+ return THE_NON_VALUE;
+ break;
+ }
+
+ a.len = tlen;
+ a.name = (byte *) text;
atom_read_lock();
aix = index_get(&erts_atom_table, (void*) &a);
atom_read_unlock();
- if (aix >= 0)
- ret = make_atom(aix);
- else {
- atom_write_lock();
- ret = make_atom(index_put(&erts_atom_table, (void*) &a));
- atom_write_unlock();
+ if (aix >= 0) {
+ /* Already in table no need to verify it */
+ return make_atom(aix);
}
- return ret;
+
+ if (enc == ERTS_ATOM_ENC_UTF8) {
+ /* Need to verify encoding and length */
+ byte *err_pos;
+ Uint no_chars;
+ switch (erts_analyze_utf8_x((byte *) text,
+ (Uint) tlen,
+ &err_pos,
+ &no_chars, NULL,
+ &no_latin1_chars,
+ MAX_ATOM_CHARACTERS)) {
+ case ERTS_UTF8_OK:
+ ASSERT(no_chars <= MAX_ATOM_CHARACTERS);
+ break;
+ case ERTS_UTF8_OK_MAX_CHARS:
+ /* Truncated... */
+ if (!trunc)
+ return THE_NON_VALUE;
+ ASSERT(no_chars == MAX_ATOM_CHARACTERS);
+ tlen = err_pos - text;
+ break;
+ default:
+ /* Bad utf8... */
+ return THE_NON_VALUE;
+ }
+ }
+
+ ASSERT(tlen <= MAX_ATOM_SZ_LIMIT);
+ ASSERT(-1 <= no_latin1_chars && no_latin1_chars <= MAX_ATOM_CHARACTERS);
+
+ a.len = tlen;
+ a.latin1_chars = (Sint16) no_latin1_chars;
+ a.name = (byte *) text;
+ atom_write_lock();
+ aix = index_put(&erts_atom_table, (void*) &a);
+ atom_write_unlock();
+ return make_atom(aix);
}
+Eterm
+am_atom_put(const char* name, int len)
+{
+ /* Assumes 7-bit ascii; use erts_atom_put() for other encodings... */
+ return erts_atom_put((byte *) name, len, ERTS_ATOM_ENC_7BIT_ASCII, 1);
+}
int atom_table_size(void)
{
@@ -264,14 +367,19 @@ int atom_table_sz(void)
}
int
-erts_atom_get(const char *name, int len, Eterm* ap)
+erts_atom_get(const char *name, int len, Eterm* ap, int is_latin1)
{
+ byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1];
Atom a;
int i;
int res;
- a.len = len;
+ a.len = (Sint16) len;
a.name = (byte *)name;
+ if (is_latin1) {
+ latin1_to_utf8(utf8_copy, (const byte**)&a.name, &len);
+ a.len = (Sint16) len;
+ }
atom_read_lock();
i = index_get(&erts_atom_table, (void*) &a);
res = i < 0 ? 0 : (*ap = make_atom(i), 1);
@@ -333,8 +441,15 @@ init_atom_table(void)
for (i = 0; erl_atom_names[i] != 0; i++) {
int ix;
a.len = strlen(erl_atom_names[i]);
+ a.latin1_chars = a.len;
a.name = (byte*)erl_atom_names[i];
a.slot.index = i;
+#ifdef DEBUG
+ /* Verify 7-bit ascii */
+ for (ix = 0; ix < a.len; ix++) {
+ ASSERT((a.name[ix] & 0x80) == 0);
+ }
+#endif
ix = index_put(&erts_atom_table, (void*) &a);
atom_text_pos -= a.len;
atom_space -= a.len;