about | summary | refs | log | tree | commit | diff | stats
path: root/lib/erl_interface/src/encode
diff options
context:
space:
mode:
author: Sverker Eriksson <[email protected]> 2013-01-23 18:09:35 +0100
committer: Sverker Eriksson <[email protected]> 2013-01-23 18:09:35 +0100
commit: b8e623410d1c22fe6d5fdeb8ccb0b2305533f033 (patch)
tree: 708d64e36e18b61ae1801c02ec3aeef42a697be3 /lib/erl_interface/src/encode
parent: e99df74bee7c245ec76678e336fcd09d4b51a089 (diff)
parent: d6e3e256b850050b7a86323b2948009d5fcc30a9 (diff)
download: otp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.tar.gz
otp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.tar.bz2
otp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.zip
Merge branch 'sverk/r16/utf8-atoms'
* sverk/r16/utf8-atoms:
  erl_interface: Fix bug when transcoding atoms from and to UTF8
  erl_interface: Changed erlang_char_encoding interface
  erts: Testcase doing unicode atom printout with ~w
  erl_interface: even more utf8 atom stuff
  erts: Fix bug in analyze_utf8 causing faulty latin1 detection
  Add UTF-8 node name support for epmd
  workaround...
  Fix merge conflict with hasse
  UTF-8 atom documentation
  test case
  erl_interface: utf8 atoms continued
  Add utf8 atom distribution test cases
  atom fixes for NIFs and atom_to_binary
  UTF-8 support for distribution
  Implement UTF-8 atom support for jinterface
  erl_interface: Enable decode of unicode atoms
  stdlib: Fix printing of unicode atoms
  erts: Change internal representation of atoms to utf8
  erts: Refactor rename DFLAG(S)_INTERNAL_TAGS for conformity

Conflicts:
  erts/emulator/beam/io.c

OTP-10753
Diffstat (limited to 'lib/erl_interface/src/encode')
-rw-r--r--  lib/erl_interface/src/encode/encode_atom.c  154
-rw-r--r--  lib/erl_interface/src/encode/encode_fun.c  4
-rw-r--r--  lib/erl_interface/src/encode/encode_pid.c  24
-rw-r--r--  lib/erl_interface/src/encode/encode_port.c  23
-rw-r--r--  lib/erl_interface/src/encode/encode_ref.c  24
5 files changed, 174 insertions, 55 deletions
diff --git a/lib/erl_interface/src/encode/encode_atom.c b/lib/erl_interface/src/encode/encode_atom.c
index 6f41f045e0..044f17cb60 100644
--- a/lib/erl_interface/src/encode/encode_atom.c
+++ b/lib/erl_interface/src/encode/encode_atom.c
@@ -22,29 +22,108 @@
#include "eiext.h"
#include "putget.h"
+
+static int copy_ascii_atom(char* dst, const char* src, int slen);
+static int copy_utf8_atom(char* dst, const char* src, int slen);
+
+
int ei_encode_atom(char *buf, int *index, const char *p)
{
size_t len = strlen(p);
- if (len >= INT_MAX) return -1;
- return ei_encode_atom_len(buf, index, p, len);
+ if (len >= MAXATOMLEN)
+ len = MAXATOMLEN - 1;
+ return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, ERLANG_LATIN1);
}
int ei_encode_atom_len(char *buf, int *index, const char *p, int len)
{
+ /* This function is documented to truncate at MAXATOMLEN (256) */
+ if (len >= MAXATOMLEN)
+ len = MAXATOMLEN - 1;
+ return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, ERLANG_LATIN1);
+}
+
+int ei_encode_atom_as(char *buf, int *index, const char *p,
+ enum erlang_char_encoding from_enc,
+ enum erlang_char_encoding to_enc)
+{
+ return ei_encode_atom_len_as(buf, index, p, strlen(p), from_enc, to_enc);
+}
+
+int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len,
+ enum erlang_char_encoding from_enc,
+ enum erlang_char_encoding to_enc)
+{
char *s = buf + *index;
char *s0 = s;
+ int offs;
- /* This function is documented to truncate at MAXATOMLEN (256) */
- if (len > MAXATOMLEN)
- len = MAXATOMLEN;
+ if (len >= MAXATOMLEN && (from_enc & (ERLANG_LATIN1|ERLANG_ASCII))) {
+ return -1;
+ }
- if (!buf) s += 3;
- else {
- put8(s,ERL_ATOM_EXT);
- put16be(s,len);
+ switch(to_enc) {
+ case ERLANG_LATIN1:
+ if (buf) {
+ put8(s,ERL_ATOM_EXT);
+ switch (from_enc) {
+ case ERLANG_UTF8:
+ len = utf8_to_latin1(s+2, p, len, MAXATOMLEN-1, NULL);
+ if (len < 0) return -1;
+ break;
+ case ERLANG_ASCII:
+ if (copy_ascii_atom(s+2, p, len) < 0) return -1;
+ break;
+ case ERLANG_LATIN1:
+ memcpy(s+2, p, len);
+ break;
+ default:
+ return -1;
+ }
+ put16be(s,len);
+ }
+ else {
+ s += 3;
+ if (from_enc == ERLANG_UTF8) {
+ len = utf8_to_latin1(NULL, p, len, MAXATOMLEN-1, NULL);
+ if (len < 0) return -1;
+ }
+ }
+ break;
+
+ case ERLANG_UTF8:
+ offs = 1 + 1;
+ switch (from_enc) {
+ case ERLANG_LATIN1:
+ if (len >= 256/2) offs++;
+ len = latin1_to_utf8((buf ? s+offs : NULL), p, len, MAXATOMLEN_UTF8-1, NULL);
+ break;
+ case ERLANG_ASCII:
+ if (buf && copy_ascii_atom(s+offs, p, len) < 0) return -1;
+ break;
+ case ERLANG_UTF8:
+ if (len >= 256) offs++;
+ if (buf && copy_utf8_atom(s+offs, p, len) < 0) return -1;
+ break;
+ default:
+ return -1;
+ }
+ if (buf) {
+ if (offs == 2) {
+ put8(s, ERL_SMALL_ATOM_UTF8_EXT);
+ put8(s, len);
+ }
+ else {
+ put8(s, ERL_ATOM_UTF8_EXT);
+ put16be(s, len);
+ }
+ }
+ else s+= offs;
+ break;
- memmove(s,p,len); /* unterminated string */
+ default:
+ return -1;
}
s += len;
@@ -53,3 +132,58 @@ int ei_encode_atom_len(char *buf, int *index, const char *p, int len)
return 0;
}
+int
+ei_internal_put_atom(char** bufp, const char* p, int slen,
+ enum erlang_char_encoding to_enc)
+{
+ int ix = 0;
+ if (ei_encode_atom_len_as(*bufp, &ix, p, slen, ERLANG_UTF8, to_enc) < 0)
+ return -1;
+ *bufp += ix;
+ return 0;
+}
+
+
+int copy_ascii_atom(char* dst, const char* src, int slen)
+{
+ while (slen > 0) {
+ if ((src[0] & 0x80) != 0) return -1;
+ *dst++ = *src++;
+ slen--;
+ }
+ return 0;
+}
+
+int copy_utf8_atom(char* dst, const char* src, int slen)
+{
+ int num_chars = 0;
+
+ while (slen > 0) {
+ if (++num_chars >= MAXATOMLEN) return -1;
+ if ((src[0] & 0x80) != 0) {
+ if ((src[0] & 0xE0) == 0xC0) {
+ if (slen < 2 || (src[1] & 0xC0) != 0x80) return -1;
+ *dst++ = *src++;
+ slen--;
+ }
+ else if ((src[0] & 0xF0) == 0xE0) {
+ if (slen < 3 || (src[1] & 0xC0) != 0x80 || (src[2] & 0xC0) != 0x80) return -1;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ slen -= 2;
+ }
+ else if ((src[0] & 0xF8) == 0xF0) {
+ if (slen < 4 || (src[1] & 0xC0) != 0x80 || (src[2] & 0xC0) != 0x80 || (src[3] & 0xC0) != 0x80) return -1;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ slen -= 3;
+ }
+ else return -1;
+ }
+ *dst++ = *src++;
+ slen--;
+ }
+ return 0;
+}
+
diff --git a/lib/erl_interface/src/encode/encode_fun.c b/lib/erl_interface/src/encode/encode_fun.c
index 54ee2083d6..4daee32648 100644
--- a/lib/erl_interface/src/encode/encode_fun.c
+++ b/lib/erl_interface/src/encode/encode_fun.c
@@ -35,7 +35,7 @@ int ei_encode_fun(char *buf, int *index, const erlang_fun *p)
ix += sizeof(char) + 4;
if (ei_encode_pid(buf, &ix, &p->pid) < 0)
return -1;
- if (ei_encode_atom(buf, &ix, p->module) < 0)
+ if (ei_encode_atom_as(buf, &ix, p->module, ERLANG_UTF8, p->module_org_enc) < 0)
return -1;
if (ei_encode_long(buf, &ix, p->index) < 0)
return -1;
@@ -60,7 +60,7 @@ int ei_encode_fun(char *buf, int *index, const erlang_fun *p)
} else
size_p = NULL;
ix += 1 + 4 + 1 + sizeof(p->md5) + 4 + 4;
- if (ei_encode_atom(buf, &ix, p->module) < 0)
+ if (ei_encode_atom_as(buf, &ix, p->module, ERLANG_UTF8, p->module_org_enc) < 0)
return -1;
if (ei_encode_long(buf, &ix, p->old_index) < 0)
return -1;
diff --git a/lib/erl_interface/src/encode/encode_pid.c b/lib/erl_interface/src/encode/encode_pid.c
index ee7f235c17..0cf3ef4efb 100644
--- a/lib/erl_interface/src/encode/encode_pid.c
+++ b/lib/erl_interface/src/encode/encode_pid.c
@@ -24,29 +24,23 @@
int ei_encode_pid(char *buf, int *index, const erlang_pid *p)
{
char *s = buf + *index;
- char *s0 = s;
- int len = strlen(p->node);
-
- if (!buf) s += 13 + len;
- else {
- put8(s,ERL_PID_EXT);
- /* first the nodename */
- put8(s,ERL_ATOM_EXT);
+ ++(*index); /* skip ERL_PID_EXT */
+ if (ei_encode_atom_len_as(buf, index, p->node, strlen(p->node), ERLANG_UTF8, p->node_org_enc) < 0)
+ return -1;
+
+ if (buf) {
+ put8(s,ERL_PID_EXT);
- put16be(s,len);
-
- memmove(s, p->node, len);
- s += len;
+ s = buf + *index;
/* now the integers */
put32be(s,p->num & 0x7fff); /* 15 bits */
put32be(s,p->serial & 0x1fff); /* 13 bits */
put8(s,(p->creation & 0x03)); /* 2 bits */
}
-
- *index += s-s0;
-
+
+ *index += 4 + 4 + 1;
return 0;
}
diff --git a/lib/erl_interface/src/encode/encode_port.c b/lib/erl_interface/src/encode/encode_port.c
index fbbb33182e..2bf9e26d78 100644
--- a/lib/erl_interface/src/encode/encode_port.c
+++ b/lib/erl_interface/src/encode/encode_port.c
@@ -24,28 +24,23 @@
int ei_encode_port(char *buf, int *index, const erlang_port *p)
{
char *s = buf + *index;
- char *s0 = s;
- int len = strlen(p->node);
-
- if (!buf) s += 9 + len;
- else {
- put8(s,ERL_PORT_EXT);
- /* first the nodename */
- put8(s,ERL_ATOM_EXT);
+ ++(*index); /* skip ERL_PORT_EXT */
+ if (ei_encode_atom_len_as(buf, index, p->node, strlen(p->node), ERLANG_UTF8,
+ p->node_org_enc) < 0) {
+ return -1;
+ }
+ if (buf) {
+ put8(s,ERL_PORT_EXT);
- put16be(s,len);
-
- memmove(s, p->node, len);
- s += len;
+ s = buf + *index;
/* now the integers */
put32be(s,p->id & 0x0fffffff /* 28 bits */);
put8(s,(p->creation & 0x03));
}
- *index += s-s0;
-
+ *index += 4 + 1;
return 0;
}
diff --git a/lib/erl_interface/src/encode/encode_ref.c b/lib/erl_interface/src/encode/encode_ref.c
index 292b452864..e8b3173315 100644
--- a/lib/erl_interface/src/encode/encode_ref.c
+++ b/lib/erl_interface/src/encode/encode_ref.c
@@ -24,36 +24,32 @@
int ei_encode_ref(char *buf, int *index, const erlang_ref *p)
{
char *s = buf + *index;
- char *s0 = s;
- int len = strlen(p->node);
int i;
+ (*index) += 1 + 2; /* skip to node atom */
+ if (ei_encode_atom_len_as(buf, index, p->node, strlen(p->node), ERLANG_UTF8,
+ p->node_org_enc) < 0) {
+ return -1;
+ }
+
/* Always encode as an extended reference; all participating parties
are now expected to be able to decode extended references. */
- if (!buf) s += 1 + 2 + (3+len) + p->len*4 + 1;
- else {
+ if (buf) {
put8(s,ERL_NEW_REFERENCE_EXT);
/* first, number of integers */
put16be(s, p->len);
/* then the nodename */
- put8(s,ERL_ATOM_EXT);
-
- put16be(s,len);
-
- memmove(s, p->node, len);
- s += len;
+ s = buf + *index;
/* now the integers */
put8(s,(p->creation & 0x03));
for (i = 0; i < p->len; i++)
put32be(s,p->n[i]);
-
- }
-
- *index += s-s0;
+ }
+ *index += p->len*4 + 1;
return 0;
}