From b553664f54034e8c04ae6f9cc44f16b7f516518b Mon Sep 17 00:00:00 2001 From: Sverker Eriksson Date: Fri, 11 Jan 2013 17:27:29 +0100 Subject: erl_interface: utf8 atoms continued --- lib/erl_interface/src/legacy/erl_marshal.c | 259 +++++++++++++++++------------ 1 file changed, 153 insertions(+), 106 deletions(-) (limited to 'lib/erl_interface/src/legacy/erl_marshal.c') diff --git a/lib/erl_interface/src/legacy/erl_marshal.c b/lib/erl_interface/src/legacy/erl_marshal.c index 775d7e82ca..884e9d421b 100644 --- a/lib/erl_interface/src/legacy/erl_marshal.c +++ b/lib/erl_interface/src/legacy/erl_marshal.c @@ -44,6 +44,9 @@ int erl_fp_compare(unsigned *a, unsigned *b); static void erl_long_to_fp(long l, unsigned *d); #endif +static int cmpbytes(unsigned char* s1,int l1,unsigned char* s2,int l2); +static int cmpatoms(unsigned char* s1, int l1, unsigned char tag1, unsigned char* s2, int l2, unsigned char tag2); + /* Used when comparing two encoded byte arrays */ /* this global data is ok (from threading point of view) since it is * initialized once and never changed @@ -111,8 +114,9 @@ void erl_init_marshal(void) cmp_array[ERL_SMALL_BIG_EXT] = ERL_NUM_CMP; cmp_array[ERL_LARGE_BIG_EXT] = ERL_NUM_CMP; cmp_array[ERL_ATOM_EXT] = ERL_ATOM_CMP; + cmp_array[ERL_ATOM_UTF8_EXT] = ERL_ATOM_CMP; cmp_array[ERL_SMALL_ATOM_EXT] = ERL_ATOM_CMP; - cmp_array[ERL_UNICODE_ATOM_EXT] = ERL_ATOM_CMP; + cmp_array[ERL_SMALL_ATOM_UTF8_EXT] = ERL_ATOM_CMP; cmp_array[ERL_REFERENCE_EXT] = ERL_REF_CMP; cmp_array[ERL_NEW_REFERENCE_EXT] = ERL_REF_CMP; cmp_array[ERL_FUN_EXT] = ERL_FUN_CMP; @@ -162,6 +166,21 @@ static int erl_length_x(const ETERM *ep) { *============================================================== */ +static void encode_atom(Erl_Atom_data* a, unsigned char **ext) +{ + int ix = 0; + if (a->latin1) { + ei_encode_atom_len_as((char*)*ext, &ix, a->latin1, a->lenL, + ERLANG_LATIN1, ERLANG_LATIN1); + } + else if (ei_encode_atom_len_as((char*)*ext, &ix, a->utf8, a->lenU, + ERLANG_UTF8, ERLANG_LATIN1) < 0) { + ei_encode_atom_len_as((char*)*ext, &ix, a->utf8, a->lenU, + ERLANG_UTF8, ERLANG_UTF8); + } + *ext += ix; +} + /* * The actual ENCODE engine. * Returns 0 on success, otherwise 1. @@ -176,12 +195,7 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist) switch(ERL_TYPE(ep)) { case ERL_ATOM: - i = ep->uval.aval.len; - *(*ext)++ = ERL_ATOM_EXT; - *(*ext)++ = (i >>8) &0xff; - *(*ext)++ = i &0xff; - memcpy((void *) *ext, (const void *) ep->uval.aval.a, i); - *ext += i; + encode_atom(&ep->uval.aval.d, ext); return 0; case ERL_INTEGER: @@ -292,12 +306,7 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist) case ERL_PID: *(*ext)++ = ERL_PID_EXT; /* First poke in node as an atom */ - i = strlen((char *)ERL_PID_NODE(ep)); - *(*ext)++ = ERL_ATOM_EXT; - *(*ext)++ = (i >>8) &0xff; - *(*ext)++ = i &0xff; - memcpy(*ext, ERL_PID_NODE(ep), i); - *ext += i; + encode_atom(&ep->uval.pidval.node, ext); /* And then fill in the integer fields */ i = ERL_PID_NUMBER(ep); *(*ext)++ = (i >> 24) &0xff; @@ -325,11 +334,8 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist) *(*ext)++ = (len >> 8) &0xff; *(*ext)++ = len &0xff; - *(*ext)++ = ERL_ATOM_EXT; - *(*ext)++ = (i >> 8) &0xff; - *(*ext)++ = i &0xff; - memcpy(*ext, ERL_REF_NODE(ep), i); - *ext += i; + encode_atom(&ep->uval.refval.node, ext); + *(*ext)++ = ERL_REF_CREATION(ep); /* Then the integer fields */ for (j = 0; j < ERL_REF_LEN(ep); j++) { @@ -344,12 +350,7 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist) case ERL_PORT: *(*ext)++ = ERL_PORT_EXT; /* First poke in node as an atom */ - i = strlen((char *)ERL_PORT_NODE(ep)); - *(*ext)++ = ERL_ATOM_EXT; - *(*ext)++ = (i >>8) &0xff; - *(*ext)++ = i &0xff; - memcpy(*ext, ERL_PORT_NODE(ep), i); - *ext += i; + encode_atom(&ep->uval.portval.node, ext); /* Then the integer fields */ i = ERL_PORT_NUMBER(ep); *(*ext)++ = (i >> 24) &0xff; @@ -500,6 +501,16 @@ int erl_term_len(ETERM *ep) return 1+erl_term_len_helper(ep, 4); } +static int atom_len_helper(Erl_Atom_data* a) +{ + if (erl_atom_ptr_latin1(a)) { + return 1 + 2 + a->lenL; /* ERL_ATOM_EXT */ + } + else { + return 1 + 1 + (a->lenU > 255) + a->lenU; + } +} + static int erl_term_len_helper(ETERM *ep, int dist) { int len = 0; @@ -511,8 +522,7 @@ static int erl_term_len_helper(ETERM *ep, int dist) if (ep) { switch (ERL_TYPE(ep)) { case ERL_ATOM: - i = ep->uval.aval.len; - len = i + 3; + len = atom_len_helper(&ep->uval.aval.d); break; case ERL_INTEGER: @@ -544,20 +554,15 @@ static int erl_term_len_helper(ETERM *ep, int dist) break; case ERL_PID: - /* 1 + N + 4 + 4 + 1 where N = 3 + strlen */ - i = strlen((char *)ERL_PID_NODE(ep)); - len = 13 + i; + len = 1 + atom_len_helper(&ep->uval.pidval.node) + 4 + 4 + 1; break; case ERL_REF: - i = strlen((char *)ERL_REF_NODE(ep)); - len = 1 + 2 + (i+3) + 1 + ERL_REF_LEN(ep) * 4; + len = 1 + 2 + atom_len_helper(&ep->uval.refval.node) + 1 + ERL_REF_LEN(ep) * 4; break; case ERL_PORT: - /* 1 + N + 4 + 1 where N = 3 + strlen */ - i = strlen((char *)ERL_PORT_NODE(ep)); - len = 9 + i; + len = 1 + atom_len_helper(&ep->uval.portval.node) + 4 + 1; break; case ERL_EMPTY_LIST: @@ -651,11 +656,33 @@ int erl_encode_buf(ETERM *ep, unsigned char **ext) } /* erl_encode_buf */ -static int read_atom(unsigned char** ext, char* dst) +static int read_atom(unsigned char** ext, Erl_Atom_data* a) { + char buf[MAXATOMLEN_UTF8]; int offs = 0; - int ret = ei_decode_atom((char*)*ext, &offs, dst); + enum erlang_char_encoding enc; + int ret = ei_decode_atom_as((char*)*ext, &offs, buf, MAXATOMLEN_UTF8, + ERLANG_WHATEVER, NULL, &enc); *ext += offs; + + if (ret == 0) { + int i = strlen(buf); + char* clone = erl_malloc(i+1); + memcpy(clone, buf, i+1); + + a->latin1 = NULL; + a->lenL = 0; + a->utf8 = NULL; + a->lenU = 0; + if (enc == ERLANG_LATIN1 || enc == ERLANG_ASCII) { + a->latin1 = clone; + a->lenL = i; + } + if (enc == ERLANG_UTF8 || enc == ERLANG_ASCII) { + a->utf8 = clone; + a->lenU = i; + } + } return ret; } @@ -665,7 +692,6 @@ static int read_atom(unsigned char** ext, char* dst) */ static ETERM *erl_decode_it(unsigned char **ext) { - char atom_buf[MAXATOMLEN+1]; char *cp; ETERM *ep,*tp,*np; unsigned int u,sign; @@ -765,127 +791,89 @@ static ETERM *erl_decode_it(unsigned char **ext) case ERL_ATOM_EXT: case ERL_SMALL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: + ERL_TYPE(ep) = ERL_ATOM; --(*ext); - if (read_atom(ext, atom_buf) < 0) return NULL; - - i = strlen(atom_buf); - ep->uval.aval.len = i; - ep->uval.aval.a = (char *) erl_malloc(i+1); - memcpy(ep->uval.aval.a, atom_buf, i+1); + if (read_atom(ext, &ep->uval.aval.d) < 0) return NULL; return ep; case ERL_PID_EXT: - erl_free_term(ep); - { /* Why not use the constructors? */ - char* node = atom_buf; + { unsigned int number, serial; unsigned char creation; - ETERM *eterm_p; - if (read_atom(ext, node) < 0) return NULL; + ERL_TYPE(ep) = ERL_PID; + if (read_atom(ext, &ep->uval.pidval.node) < 0) return NULL; /* get the integers */ -#if 0 - /* FIXME: Remove code or whatever.... - Ints on the wire are big-endian (== network byte order) - so use ntoh[sl]. (But some are little-endian! Arrrgh!) - Also, the libc authors can be expected to optimize them - heavily. However, the marshalling makes no guarantees - about alignments -- so it won't work at all. */ - number = ntohl(*((unsigned int *)*ext)++); - serial = ntohl(*((unsigned int *)*ext)++); -#else number = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | ((*ext)[2]) << 8 | ((*ext)[3]); *ext += 4; serial = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | ((*ext)[2]) << 8 | ((*ext)[3]); *ext += 4; -#endif creation = *(*ext)++; - eterm_p = erl_mk_pid(node, number, serial, creation); - return eterm_p; + erl_mk_pid_helper(ep, number, serial, creation); + return ep; } case ERL_REFERENCE_EXT: - erl_free_term(ep); { - char* node = atom_buf; - unsigned int number; + unsigned int n[3] = {0, 0, 0}; unsigned char creation; - ETERM *eterm_p; - if (read_atom(ext, node) < 0) return NULL; + ERL_TYPE(ep) = ERL_REF; + if (read_atom(ext, &ep->uval.refval.node) < 0) return NULL; /* get the integers */ -#if 0 - number = ntohl(*((unsigned int *)*ext)++); -#else - number = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | + n[0] = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | ((*ext)[2]) << 8 | ((*ext)[3]); *ext += 4; -#endif creation = *(*ext)++; - eterm_p = erl_mk_ref(node, number, creation); - return eterm_p; + __erl_mk_reference(ep, NULL, 1, n, creation); + return ep; } case ERL_NEW_REFERENCE_EXT: - erl_free_term(ep); { - char* node = atom_buf; size_t cnt, i; unsigned int n[3]; unsigned char creation; - ETERM *eterm_p; -#if 0 - cnt = ntohs(*((unsigned short *)*ext)++); -#else + ERL_TYPE(ep) = ERL_REF; cnt = ((*ext)[0] << 8) | (*ext)[1]; *ext += 2; -#endif - if (read_atom(ext, node) < 0) return NULL; + if (read_atom(ext, &ep->uval.refval.node) < 0) return NULL; /* get the integers */ creation = *(*ext)++; for(i = 0; i < cnt; i++) { -#if 0 - n[i] = ntohl(*((unsigned int *)*ext)++); -#else n[i] = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | ((*ext)[2]) << 8 | ((*ext)[3]); *ext += 4; -#endif } - eterm_p = __erl_mk_reference(node, cnt, n, creation); - return eterm_p; + __erl_mk_reference(ep, NULL, cnt, n, creation); + return ep; } case ERL_PORT_EXT: - erl_free_term(ep); { - char* node = atom_buf; unsigned int number; unsigned char creation; - ETERM *eterm_p; - if (read_atom(ext, node) < 0) return NULL; + ERL_TYPE(ep) = ERL_PORT; + if (read_atom(ext, &ep->uval.portval.node) < 0) return NULL; /* get the integers */ -#if 0 - number = ntohl(*((unsigned int *)*ext)++); -#else number = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | ((*ext)[2]) << 8 | ((*ext)[3]); *ext += 4; -#endif creation = *(*ext)++; - eterm_p = erl_mk_port(node, number, creation); - return eterm_p; + erl_mk_port_helper(ep, number, creation); + return ep; } case ERL_NIL_EXT: @@ -1120,8 +1108,9 @@ unsigned char erl_ext_type(unsigned char *ext) case ERL_INTEGER_EXT: return ERL_INTEGER; case ERL_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: case ERL_SMALL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: return ERL_ATOM; case ERL_PID_EXT: return ERL_PID; @@ -1173,8 +1162,9 @@ int erl_ext_size(unsigned char *t) case ERL_SMALL_INTEGER_EXT: case ERL_INTEGER_EXT: case ERL_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: case ERL_SMALL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: case ERL_PID_EXT: case ERL_PORT_EXT: case ERL_REFERENCE_EXT: @@ -1221,12 +1211,13 @@ static int jump_atom(unsigned char** ext) switch (*e++) { case ERL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: len = (e[0] << 8) | e[1]; e += (len + 2); break; case ERL_SMALL_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: len = e[0]; e += (len + 1); break; @@ -1259,8 +1250,9 @@ static int jump(unsigned char **ext) *ext += 1; break; case ERL_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: case ERL_SMALL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: jump_atom(ext); break; case ERL_PID_EXT: @@ -1426,6 +1418,58 @@ static int cmpbytes(unsigned char* s1,int l1,unsigned char* s2,int l2) } /* cmpbytes */ +#define tag2enc(T) ((T)==ERL_ATOM_EXT || (T)==ERL_SMALL_ATOM_EXT ? ERLANG_LATIN1 : ERLANG_UTF8) + +static int cmpatoms(unsigned char* s1, int l1, unsigned char tag1, + unsigned char* s2, int l2, unsigned char tag2) +{ + enum erlang_char_encoding enc1 = tag2enc(tag1); + enum erlang_char_encoding enc2 = tag2enc(tag2); + + if (enc1 == enc2) { + return cmpbytes(s1, l1,s2,l2); + } + + if (enc1 == ERLANG_LATIN1) { + return cmp_latin1_vs_utf8((char*)s1, l1, (char*)s2, l2); + } + else { + return -cmp_latin1_vs_utf8((char*)s2, l2, (char*)s1, l1); + } +} + +int cmp_latin1_vs_utf8(const char* strL, int lenL, const char* strU, int lenU) +{ + unsigned char* sL = (unsigned char*)strL; + unsigned char* sU = (unsigned char*)strU; + unsigned char* sL_end = sL + lenL; + unsigned char* sU_end = sU + lenU; + + while(sL < sL_end && sU < sU_end) { + unsigned char UasL; + if (*sL >= 0x80) { + if (*sU < 0xC4 && (sU+1) < sU_end) { + UasL = ((sU[0] & 0x3) << 6) | (sU[1] & 0x3F); + } + else return -1; + } + else { + UasL = *sU; + } + if (*sL < UasL) return -1; + if (*sL > UasL) return 1; + + sL++; + if (*sU < 0x80) sU++; + else if (*sU < 0xE0) sU += 2; + else if (*sU < 0xF0) sU += 3; + else /*if (*sU < 0xF8)*/ sU += 4; + } + + return (sU >= sU_end) - (sL >= sL_end); /* -1, 0 or 1 */ +} + + #define CMP_EXT_ERROR_CODE 4711 #define CMP_EXT_INT32_BE(AP, BP) \ @@ -1561,6 +1605,7 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2) int min, ret,i,j,k; double ff1, ff2; unsigned char *tmp1, *tmp2; + unsigned char tag1, tag2; if ( ((*e1)[0] == ERL_STRING_EXT) && ((*e2)[0] == ERL_LIST_EXT) ) { return cmp_string_list(e1, e2); @@ -1568,9 +1613,10 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2) return -cmp_string_list(e2, e1); } - *e2 += 1; + tag1 = *(*e1)++; + tag2 = *(*e2)++; i = j = 0; - switch (*(*e1)++) + switch (tag1) { case ERL_SMALL_INTEGER_EXT: if (**e1 < **e2) ret = -1; @@ -1590,14 +1636,15 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2) *e1 += 4; *e2 += 4; return ret; case ERL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: i = (**e1) << 8; (*e1)++; j = (**e2) << 8; (*e2)++; /*fall through*/ case ERL_SMALL_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: i |= (**e1); (*e1)++; j |= (**e2); (*e2)++; - ret = cmpbytes(*e1, i, *e2, j); + ret = cmpatoms(*e1, i, tag1, *e2, j, tag2); *e1 += i; *e2 += j; return ret; -- cgit v1.2.3