aboutsummaryrefslogtreecommitdiffstats
path: root/lib/erl_interface/src/legacy/erl_marshal.c
diff options
context:
space:
mode:
authorSverker Eriksson <[email protected]>2013-01-11 17:27:29 +0100
committerSverker Eriksson <[email protected]>2013-01-18 15:04:03 +0100
commitb553664f54034e8c04ae6f9cc44f16b7f516518b (patch)
tree1522c655fea9aa52476e997aa26f1512d3ecada4 /lib/erl_interface/src/legacy/erl_marshal.c
parent97abb095cd2182d5c3fafd525da4943ef74dc8e5 (diff)
downloadotp-b553664f54034e8c04ae6f9cc44f16b7f516518b.tar.gz
otp-b553664f54034e8c04ae6f9cc44f16b7f516518b.tar.bz2
otp-b553664f54034e8c04ae6f9cc44f16b7f516518b.zip
erl_interface: utf8 atoms continued
Diffstat (limited to 'lib/erl_interface/src/legacy/erl_marshal.c')
-rw-r--r--lib/erl_interface/src/legacy/erl_marshal.c259
1 files changed, 153 insertions, 106 deletions
diff --git a/lib/erl_interface/src/legacy/erl_marshal.c b/lib/erl_interface/src/legacy/erl_marshal.c
index 775d7e82ca..884e9d421b 100644
--- a/lib/erl_interface/src/legacy/erl_marshal.c
+++ b/lib/erl_interface/src/legacy/erl_marshal.c
@@ -44,6 +44,9 @@ int erl_fp_compare(unsigned *a, unsigned *b);
static void erl_long_to_fp(long l, unsigned *d);
#endif
+static int cmpbytes(unsigned char* s1,int l1,unsigned char* s2,int l2);
+static int cmpatoms(unsigned char* s1, int l1, unsigned char tag1, unsigned char* s2, int l2, unsigned char tag2);
+
/* Used when comparing two encoded byte arrays */
/* this global data is ok (from threading point of view) since it is
* initialized once and never changed
@@ -111,8 +114,9 @@ void erl_init_marshal(void)
cmp_array[ERL_SMALL_BIG_EXT] = ERL_NUM_CMP;
cmp_array[ERL_LARGE_BIG_EXT] = ERL_NUM_CMP;
cmp_array[ERL_ATOM_EXT] = ERL_ATOM_CMP;
+ cmp_array[ERL_ATOM_UTF8_EXT] = ERL_ATOM_CMP;
cmp_array[ERL_SMALL_ATOM_EXT] = ERL_ATOM_CMP;
- cmp_array[ERL_UNICODE_ATOM_EXT] = ERL_ATOM_CMP;
+ cmp_array[ERL_SMALL_ATOM_UTF8_EXT] = ERL_ATOM_CMP;
cmp_array[ERL_REFERENCE_EXT] = ERL_REF_CMP;
cmp_array[ERL_NEW_REFERENCE_EXT] = ERL_REF_CMP;
cmp_array[ERL_FUN_EXT] = ERL_FUN_CMP;
@@ -162,6 +166,21 @@ static int erl_length_x(const ETERM *ep) {
*==============================================================
*/
+static void encode_atom(Erl_Atom_data* a, unsigned char **ext)
+{
+ int ix = 0;
+ if (a->latin1) {
+ ei_encode_atom_len_as((char*)*ext, &ix, a->latin1, a->lenL,
+ ERLANG_LATIN1, ERLANG_LATIN1);
+ }
+ else if (ei_encode_atom_len_as((char*)*ext, &ix, a->utf8, a->lenU,
+ ERLANG_UTF8, ERLANG_LATIN1) < 0) {
+ ei_encode_atom_len_as((char*)*ext, &ix, a->utf8, a->lenU,
+ ERLANG_UTF8, ERLANG_UTF8);
+ }
+ *ext += ix;
+}
+
/*
* The actual ENCODE engine.
* Returns 0 on success, otherwise 1.
@@ -176,12 +195,7 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist)
switch(ERL_TYPE(ep))
{
case ERL_ATOM:
- i = ep->uval.aval.len;
- *(*ext)++ = ERL_ATOM_EXT;
- *(*ext)++ = (i >>8) &0xff;
- *(*ext)++ = i &0xff;
- memcpy((void *) *ext, (const void *) ep->uval.aval.a, i);
- *ext += i;
+ encode_atom(&ep->uval.aval.d, ext);
return 0;
case ERL_INTEGER:
@@ -292,12 +306,7 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist)
case ERL_PID:
*(*ext)++ = ERL_PID_EXT;
/* First poke in node as an atom */
- i = strlen((char *)ERL_PID_NODE(ep));
- *(*ext)++ = ERL_ATOM_EXT;
- *(*ext)++ = (i >>8) &0xff;
- *(*ext)++ = i &0xff;
- memcpy(*ext, ERL_PID_NODE(ep), i);
- *ext += i;
+ encode_atom(&ep->uval.pidval.node, ext);
/* And then fill in the integer fields */
i = ERL_PID_NUMBER(ep);
*(*ext)++ = (i >> 24) &0xff;
@@ -325,11 +334,8 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist)
*(*ext)++ = (len >> 8) &0xff;
*(*ext)++ = len &0xff;
- *(*ext)++ = ERL_ATOM_EXT;
- *(*ext)++ = (i >> 8) &0xff;
- *(*ext)++ = i &0xff;
- memcpy(*ext, ERL_REF_NODE(ep), i);
- *ext += i;
+ encode_atom(&ep->uval.refval.node, ext);
+
*(*ext)++ = ERL_REF_CREATION(ep);
/* Then the integer fields */
for (j = 0; j < ERL_REF_LEN(ep); j++) {
@@ -344,12 +350,7 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist)
case ERL_PORT:
*(*ext)++ = ERL_PORT_EXT;
/* First poke in node as an atom */
- i = strlen((char *)ERL_PORT_NODE(ep));
- *(*ext)++ = ERL_ATOM_EXT;
- *(*ext)++ = (i >>8) &0xff;
- *(*ext)++ = i &0xff;
- memcpy(*ext, ERL_PORT_NODE(ep), i);
- *ext += i;
+ encode_atom(&ep->uval.portval.node, ext);
/* Then the integer fields */
i = ERL_PORT_NUMBER(ep);
*(*ext)++ = (i >> 24) &0xff;
@@ -500,6 +501,16 @@ int erl_term_len(ETERM *ep)
return 1+erl_term_len_helper(ep, 4);
}
+static int atom_len_helper(Erl_Atom_data* a)
+{
+ if (erl_atom_ptr_latin1(a)) {
+ return 1 + 2 + a->lenL; /* ERL_ATOM_EXT */
+ }
+ else {
+ return 1 + 1 + (a->lenU > 255) + a->lenU;
+ }
+}
+
static int erl_term_len_helper(ETERM *ep, int dist)
{
int len = 0;
@@ -511,8 +522,7 @@ static int erl_term_len_helper(ETERM *ep, int dist)
if (ep) {
switch (ERL_TYPE(ep)) {
case ERL_ATOM:
- i = ep->uval.aval.len;
- len = i + 3;
+ len = atom_len_helper(&ep->uval.aval.d);
break;
case ERL_INTEGER:
@@ -544,20 +554,15 @@ static int erl_term_len_helper(ETERM *ep, int dist)
break;
case ERL_PID:
- /* 1 + N + 4 + 4 + 1 where N = 3 + strlen */
- i = strlen((char *)ERL_PID_NODE(ep));
- len = 13 + i;
+ len = 1 + atom_len_helper(&ep->uval.pidval.node) + 4 + 4 + 1;
break;
case ERL_REF:
- i = strlen((char *)ERL_REF_NODE(ep));
- len = 1 + 2 + (i+3) + 1 + ERL_REF_LEN(ep) * 4;
+ len = 1 + 2 + atom_len_helper(&ep->uval.refval.node) + 1 + ERL_REF_LEN(ep) * 4;
break;
case ERL_PORT:
- /* 1 + N + 4 + 1 where N = 3 + strlen */
- i = strlen((char *)ERL_PORT_NODE(ep));
- len = 9 + i;
+ len = 1 + atom_len_helper(&ep->uval.portval.node) + 4 + 1;
break;
case ERL_EMPTY_LIST:
@@ -651,11 +656,33 @@ int erl_encode_buf(ETERM *ep, unsigned char **ext)
} /* erl_encode_buf */
-static int read_atom(unsigned char** ext, char* dst)
+static int read_atom(unsigned char** ext, Erl_Atom_data* a)
{
+ char buf[MAXATOMLEN_UTF8];
int offs = 0;
- int ret = ei_decode_atom((char*)*ext, &offs, dst);
+ enum erlang_char_encoding enc;
+ int ret = ei_decode_atom_as((char*)*ext, &offs, buf, MAXATOMLEN_UTF8,
+ ERLANG_WHATEVER, NULL, &enc);
*ext += offs;
+
+ if (ret == 0) {
+ int i = strlen(buf);
+ char* clone = erl_malloc(i+1);
+ memcpy(clone, buf, i+1);
+
+ a->latin1 = NULL;
+ a->lenL = 0;
+ a->utf8 = NULL;
+ a->lenU = 0;
+ if (enc == ERLANG_LATIN1 || enc == ERLANG_ASCII) {
+ a->latin1 = clone;
+ a->lenL = i;
+ }
+ if (enc == ERLANG_UTF8 || enc == ERLANG_ASCII) {
+ a->utf8 = clone;
+ a->lenU = i;
+ }
+ }
return ret;
}
@@ -665,7 +692,6 @@ static int read_atom(unsigned char** ext, char* dst)
*/
static ETERM *erl_decode_it(unsigned char **ext)
{
- char atom_buf[MAXATOMLEN+1];
char *cp;
ETERM *ep,*tp,*np;
unsigned int u,sign;
@@ -765,127 +791,89 @@ static ETERM *erl_decode_it(unsigned char **ext)
case ERL_ATOM_EXT:
case ERL_SMALL_ATOM_EXT:
- case ERL_UNICODE_ATOM_EXT:
+ case ERL_ATOM_UTF8_EXT:
+ case ERL_SMALL_ATOM_UTF8_EXT:
+
ERL_TYPE(ep) = ERL_ATOM;
--(*ext);
- if (read_atom(ext, atom_buf) < 0) return NULL;
-
- i = strlen(atom_buf);
- ep->uval.aval.len = i;
- ep->uval.aval.a = (char *) erl_malloc(i+1);
- memcpy(ep->uval.aval.a, atom_buf, i+1);
+ if (read_atom(ext, &ep->uval.aval.d) < 0) return NULL;
return ep;
case ERL_PID_EXT:
- erl_free_term(ep);
- { /* Why not use the constructors? */
- char* node = atom_buf;
+ {
unsigned int number, serial;
unsigned char creation;
- ETERM *eterm_p;
- if (read_atom(ext, node) < 0) return NULL;
+ ERL_TYPE(ep) = ERL_PID;
+ if (read_atom(ext, &ep->uval.pidval.node) < 0) return NULL;
/* get the integers */
-#if 0
- /* FIXME: Remove code or whatever....
- Ints on the wire are big-endian (== network byte order)
- so use ntoh[sl]. (But some are little-endian! Arrrgh!)
- Also, the libc authors can be expected to optimize them
- heavily. However, the marshalling makes no guarantees
- about alignments -- so it won't work at all. */
- number = ntohl(*((unsigned int *)*ext)++);
- serial = ntohl(*((unsigned int *)*ext)++);
-#else
number = ((*ext)[0] << 24) | ((*ext)[1]) << 16 |
((*ext)[2]) << 8 | ((*ext)[3]);
*ext += 4;
serial = ((*ext)[0] << 24) | ((*ext)[1]) << 16 |
((*ext)[2]) << 8 | ((*ext)[3]);
*ext += 4;
-#endif
creation = *(*ext)++;
- eterm_p = erl_mk_pid(node, number, serial, creation);
- return eterm_p;
+ erl_mk_pid_helper(ep, number, serial, creation);
+ return ep;
}
case ERL_REFERENCE_EXT:
- erl_free_term(ep);
{
- char* node = atom_buf;
- unsigned int number;
+ unsigned int n[3] = {0, 0, 0};
unsigned char creation;
- ETERM *eterm_p;
- if (read_atom(ext, node) < 0) return NULL;
+ ERL_TYPE(ep) = ERL_REF;
+ if (read_atom(ext, &ep->uval.refval.node) < 0) return NULL;
/* get the integers */
-#if 0
- number = ntohl(*((unsigned int *)*ext)++);
-#else
- number = ((*ext)[0] << 24) | ((*ext)[1]) << 16 |
+ n[0] = ((*ext)[0] << 24) | ((*ext)[1]) << 16 |
((*ext)[2]) << 8 | ((*ext)[3]);
*ext += 4;
-#endif
creation = *(*ext)++;
- eterm_p = erl_mk_ref(node, number, creation);
- return eterm_p;
+ __erl_mk_reference(ep, NULL, 1, n, creation);
+ return ep;
}
case ERL_NEW_REFERENCE_EXT:
- erl_free_term(ep);
{
- char* node = atom_buf;
size_t cnt, i;
unsigned int n[3];
unsigned char creation;
- ETERM *eterm_p;
-#if 0
- cnt = ntohs(*((unsigned short *)*ext)++);
-#else
+ ERL_TYPE(ep) = ERL_REF;
cnt = ((*ext)[0] << 8) | (*ext)[1];
*ext += 2;
-#endif
- if (read_atom(ext, node) < 0) return NULL;
+ if (read_atom(ext, &ep->uval.refval.node) < 0) return NULL;
/* get the integers */
creation = *(*ext)++;
for(i = 0; i < cnt; i++)
{
-#if 0
- n[i] = ntohl(*((unsigned int *)*ext)++);
-#else
n[i] = ((*ext)[0] << 24) | ((*ext)[1]) << 16 |
((*ext)[2]) << 8 | ((*ext)[3]);
*ext += 4;
-#endif
}
- eterm_p = __erl_mk_reference(node, cnt, n, creation);
- return eterm_p;
+ __erl_mk_reference(ep, NULL, cnt, n, creation);
+ return ep;
}
case ERL_PORT_EXT:
- erl_free_term(ep);
{
- char* node = atom_buf;
unsigned int number;
unsigned char creation;
- ETERM *eterm_p;
- if (read_atom(ext, node) < 0) return NULL;
+ ERL_TYPE(ep) = ERL_PORT;
+ if (read_atom(ext, &ep->uval.portval.node) < 0) return NULL;
/* get the integers */
-#if 0
- number = ntohl(*((unsigned int *)*ext)++);
-#else
number = ((*ext)[0] << 24) | ((*ext)[1]) << 16 |
((*ext)[2]) << 8 | ((*ext)[3]);
*ext += 4;
-#endif
creation = *(*ext)++;
- eterm_p = erl_mk_port(node, number, creation);
- return eterm_p;
+ erl_mk_port_helper(ep, number, creation);
+ return ep;
}
case ERL_NIL_EXT:
@@ -1120,8 +1108,9 @@ unsigned char erl_ext_type(unsigned char *ext)
case ERL_INTEGER_EXT:
return ERL_INTEGER;
case ERL_ATOM_EXT:
+ case ERL_ATOM_UTF8_EXT:
case ERL_SMALL_ATOM_EXT:
- case ERL_UNICODE_ATOM_EXT:
+ case ERL_SMALL_ATOM_UTF8_EXT:
return ERL_ATOM;
case ERL_PID_EXT:
return ERL_PID;
@@ -1173,8 +1162,9 @@ int erl_ext_size(unsigned char *t)
case ERL_SMALL_INTEGER_EXT:
case ERL_INTEGER_EXT:
case ERL_ATOM_EXT:
+ case ERL_ATOM_UTF8_EXT:
case ERL_SMALL_ATOM_EXT:
- case ERL_UNICODE_ATOM_EXT:
+ case ERL_SMALL_ATOM_UTF8_EXT:
case ERL_PID_EXT:
case ERL_PORT_EXT:
case ERL_REFERENCE_EXT:
@@ -1221,12 +1211,13 @@ static int jump_atom(unsigned char** ext)
switch (*e++) {
case ERL_ATOM_EXT:
- case ERL_UNICODE_ATOM_EXT:
+ case ERL_ATOM_UTF8_EXT:
len = (e[0] << 8) | e[1];
e += (len + 2);
break;
case ERL_SMALL_ATOM_EXT:
+ case ERL_SMALL_ATOM_UTF8_EXT:
len = e[0];
e += (len + 1);
break;
@@ -1259,8 +1250,9 @@ static int jump(unsigned char **ext)
*ext += 1;
break;
case ERL_ATOM_EXT:
+ case ERL_ATOM_UTF8_EXT:
case ERL_SMALL_ATOM_EXT:
- case ERL_UNICODE_ATOM_EXT:
+ case ERL_SMALL_ATOM_UTF8_EXT:
jump_atom(ext);
break;
case ERL_PID_EXT:
@@ -1426,6 +1418,58 @@ static int cmpbytes(unsigned char* s1,int l1,unsigned char* s2,int l2)
} /* cmpbytes */
+#define tag2enc(T) ((T)==ERL_ATOM_EXT || (T)==ERL_SMALL_ATOM_EXT ? ERLANG_LATIN1 : ERLANG_UTF8)
+
+static int cmpatoms(unsigned char* s1, int l1, unsigned char tag1,
+ unsigned char* s2, int l2, unsigned char tag2)
+{
+ enum erlang_char_encoding enc1 = tag2enc(tag1);
+ enum erlang_char_encoding enc2 = tag2enc(tag2);
+
+ if (enc1 == enc2) {
+ return cmpbytes(s1, l1,s2,l2);
+ }
+
+ if (enc1 == ERLANG_LATIN1) {
+ return cmp_latin1_vs_utf8((char*)s1, l1, (char*)s2, l2);
+ }
+ else {
+ return -cmp_latin1_vs_utf8((char*)s2, l2, (char*)s1, l1);
+ }
+}
+
+int cmp_latin1_vs_utf8(const char* strL, int lenL, const char* strU, int lenU)
+{
+ unsigned char* sL = (unsigned char*)strL;
+ unsigned char* sU = (unsigned char*)strU;
+ unsigned char* sL_end = sL + lenL;
+ unsigned char* sU_end = sU + lenU;
+
+ while(sL < sL_end && sU < sU_end) {
+ unsigned char UasL;
+ if (*sL >= 0x80) {
+ if (*sU < 0xC4 && (sU+1) < sU_end) {
+ UasL = ((sU[0] & 0x3) << 6) | (sU[1] & 0x3F);
+ }
+ else return -1;
+ }
+ else {
+ UasL = *sU;
+ }
+ if (*sL < UasL) return -1;
+ if (*sL > UasL) return 1;
+
+ sL++;
+ if (*sU < 0x80) sU++;
+ else if (*sU < 0xE0) sU += 2;
+ else if (*sU < 0xF0) sU += 3;
+ else /*if (*sU < 0xF8)*/ sU += 4;
+ }
+
+ return (sU >= sU_end) - (sL >= sL_end); /* -1, 0 or 1 */
+}
+
+
#define CMP_EXT_ERROR_CODE 4711
#define CMP_EXT_INT32_BE(AP, BP) \
@@ -1561,6 +1605,7 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2)
int min, ret,i,j,k;
double ff1, ff2;
unsigned char *tmp1, *tmp2;
+ unsigned char tag1, tag2;
if ( ((*e1)[0] == ERL_STRING_EXT) && ((*e2)[0] == ERL_LIST_EXT) ) {
return cmp_string_list(e1, e2);
@@ -1568,9 +1613,10 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2)
return -cmp_string_list(e2, e1);
}
- *e2 += 1;
+ tag1 = *(*e1)++;
+ tag2 = *(*e2)++;
i = j = 0;
- switch (*(*e1)++)
+ switch (tag1)
{
case ERL_SMALL_INTEGER_EXT:
if (**e1 < **e2) ret = -1;
@@ -1590,14 +1636,15 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2)
*e1 += 4; *e2 += 4;
return ret;
case ERL_ATOM_EXT:
- case ERL_UNICODE_ATOM_EXT:
+ case ERL_ATOM_UTF8_EXT:
i = (**e1) << 8; (*e1)++;
j = (**e2) << 8; (*e2)++;
/*fall through*/
case ERL_SMALL_ATOM_EXT:
+ case ERL_SMALL_ATOM_UTF8_EXT:
i |= (**e1); (*e1)++;
j |= (**e2); (*e2)++;
- ret = cmpbytes(*e1, i, *e2, j);
+ ret = cmpatoms(*e1, i, tag1, *e2, j, tag2);
*e1 += i;
*e2 += j;
return ret;