From b553664f54034e8c04ae6f9cc44f16b7f516518b Mon Sep 17 00:00:00 2001 From: Sverker Eriksson Date: Fri, 11 Jan 2013 17:27:29 +0100 Subject: erl_interface: utf8 atoms continued --- lib/erl_interface/src/connect/ei_connect.c | 5 +- lib/erl_interface/src/connect/ei_connect_int.h | 2 + lib/erl_interface/src/connect/eirecv.c | 12 +- lib/erl_interface/src/decode/decode_atom.c | 185 +++++++++++++----- lib/erl_interface/src/decode/decode_boolean.c | 8 +- lib/erl_interface/src/decode/decode_fun.c | 6 +- lib/erl_interface/src/decode/decode_pid.c | 3 +- lib/erl_interface/src/decode/decode_port.c | 3 +- lib/erl_interface/src/decode/decode_ref.c | 6 +- lib/erl_interface/src/encode/encode_atom.c | 100 +++++++++- lib/erl_interface/src/encode/encode_fun.c | 4 +- lib/erl_interface/src/encode/encode_pid.c | 24 +-- lib/erl_interface/src/encode/encode_port.c | 23 +-- lib/erl_interface/src/encode/encode_ref.c | 24 +-- lib/erl_interface/src/legacy/erl_connect.c | 16 +- lib/erl_interface/src/legacy/erl_eterm.c | 158 +++++++++++---- lib/erl_interface/src/legacy/erl_eterm.h | 4 +- lib/erl_interface/src/legacy/erl_format.c | 20 +- lib/erl_interface/src/legacy/erl_malloc.c | 20 +- lib/erl_interface/src/legacy/erl_marshal.c | 259 +++++++++++++++---------- lib/erl_interface/src/legacy/global_whereis.c | 11 +- lib/erl_interface/src/misc/ei_decode_term.c | 13 +- lib/erl_interface/src/misc/ei_printterm.c | 5 +- lib/erl_interface/src/misc/ei_x_encode.c | 21 +- lib/erl_interface/src/misc/get_type.c | 77 +------- lib/erl_interface/src/misc/putget.h | 6 +- lib/erl_interface/src/misc/show_msg.c | 12 +- 27 files changed, 650 insertions(+), 377 deletions(-) (limited to 'lib/erl_interface/src') diff --git a/lib/erl_interface/src/connect/ei_connect.c b/lib/erl_interface/src/connect/ei_connect.c index 34362b4b9f..a17257795e 100644 --- a/lib/erl_interface/src/connect/ei_connect.c +++ b/lib/erl_interface/src/connect/ei_connect.c @@ -459,6 +459,7 @@ int ei_connect_xinit(ei_cnode* ec, const char *thishostname, /* memmove(&ec->this_ipaddr, thisipaddr, sizeof(ec->this_ipaddr)); */ strcpy(ec->self.node,thisnodename); + ec->self.node_org_enc = ERLANG_LATIN1; ec->self.num = 0; ec->self.serial = 0; ec->self.creation = creation; @@ -1332,7 +1333,9 @@ static int send_name_or_challenge(int fd, char *nodename, | DFLAG_EXTENDED_PIDS_PORTS | DFLAG_FUN_TAGS | DFLAG_NEW_FUN_TAGS - | DFLAG_NEW_FLOATS)); + | DFLAG_NEW_FLOATS + | DFLAG_SMALL_ATOM_TAGS + | DFLAG_UTF8_ATOMS)); if (f_chall) put32be(s, challenge); memcpy(s, nodename, strlen(nodename)); diff --git a/lib/erl_interface/src/connect/ei_connect_int.h b/lib/erl_interface/src/connect/ei_connect_int.h index 3c42b49b82..81c384e38d 100644 --- a/lib/erl_interface/src/connect/ei_connect_int.h +++ b/lib/erl_interface/src/connect/ei_connect_int.h @@ -102,6 +102,8 @@ extern int h_errno; #define DFLAG_NEW_FUN_TAGS 0x80 #define DFLAG_EXTENDED_PIDS_PORTS 0x100 #define DFLAG_NEW_FLOATS 0x800 +#define DFLAG_SMALL_ATOM_TAGS 0x4000 +#define DFLAG_UTF8_ATOMS 0x10000 ei_cnode *ei_fd_to_cnode(int fd); int ei_distversion(int fd); diff --git a/lib/erl_interface/src/connect/eirecv.c b/lib/erl_interface/src/connect/eirecv.c index 86852f947d..075f78e3d2 100644 --- a/lib/erl_interface/src/connect/eirecv.c +++ b/lib/erl_interface/src/connect/eirecv.c @@ -108,7 +108,7 @@ ei_recv_internal (int fd, switch (msg->msgtype) { case ERL_SEND: /* { SEND, Cookie, ToPid } */ if (ei_tracelevel >= 4) show_this_msg = 1; - if (ei_decode_atom(header,&index,msg->cookie) + if (ei_decode_atom_as(header,&index,msg->cookie,sizeof(msg->cookie),ERLANG_UTF8,NULL,NULL) || ei_decode_pid(header,&index,&msg->to)) { erl_errno = EIO; @@ -120,8 +120,8 @@ ei_recv_internal (int fd, case ERL_REG_SEND: /* { REG_SEND, From, Cookie, ToName } */ if (ei_tracelevel >= 4) show_this_msg = 1; if (ei_decode_pid(header,&index,&msg->from) - || ei_decode_atom(header,&index,msg->cookie) - || ei_decode_atom(header,&index,msg->toname)) + || ei_decode_atom_as(header,&index,msg->cookie,sizeof(msg->cookie),ERLANG_UTF8,NULL,NULL) + || ei_decode_atom_as(header,&index,msg->toname,sizeof(msg->toname),ERLANG_UTF8,NULL,NULL)) { erl_errno = EIO; return -1; @@ -157,7 +157,7 @@ ei_recv_internal (int fd, case ERL_SEND_TT: /* { SEND_TT, Cookie, ToPid, TraceToken } */ if (ei_tracelevel >= 4) show_this_msg = 1; - if (ei_decode_atom(header,&index,msg->cookie) + if (ei_decode_atom_as(header,&index,msg->cookie,sizeof(msg->cookie),ERLANG_UTF8,NULL,NULL) || ei_decode_pid(header,&index,&msg->to) || ei_decode_trace(header,&index,&msg->token)) { @@ -171,8 +171,8 @@ ei_recv_internal (int fd, case ERL_REG_SEND_TT: /* { REG_SEND_TT, From, Cookie, ToName, TraceToken } */ if (ei_tracelevel >= 4) show_this_msg = 1; if (ei_decode_pid(header,&index,&msg->from) - || ei_decode_atom(header,&index,msg->cookie) - || ei_decode_atom(header,&index,msg->toname) + || ei_decode_atom_as(header,&index,msg->cookie,sizeof(msg->cookie),ERLANG_UTF8,NULL,NULL) + || ei_decode_atom_as(header,&index,msg->toname,sizeof(msg->toname),ERLANG_UTF8,NULL,NULL) || ei_decode_trace(header,&index,&msg->token)) { erl_errno = EIO; diff --git a/lib/erl_interface/src/decode/decode_atom.c b/lib/erl_interface/src/decode/decode_atom.c index 84edf1766a..2ada418243 100644 --- a/lib/erl_interface/src/decode/decode_atom.c +++ b/lib/erl_interface/src/decode/decode_atom.c @@ -21,76 +21,155 @@ #include "eiext.h" #include "putget.h" -static int utf8_to_latin1(char* dest, const char* source, unsigned len); int ei_decode_atom(const char *buf, int *index, char *p) { - const char *s = buf + *index; - const char *s0 = s; - int len; - - switch (get8(s)) { - case ERL_ATOM_EXT: - len = get16be(s); - if (len > MAXATOMLEN) return -1; - if (p) { - memmove(p,s,len); - p[len] = (char)0; - } - break; - - case ERL_SMALL_ATOM_EXT: - len = get8(s); - if (p) { - memmove(p,s,len); - p[len] = (char)0; - } - break; - - case ERL_UNICODE_ATOM_EXT: - len = get16be(s); - - if (len > 2*MAXATOMLEN) return -1; - - if (p && utf8_to_latin1(p, s, len) < 0) return -1; - break; - - default: - return -1; - } - - s += len; - *index += s-s0; - return 0; + return ei_decode_atom_as(buf, index, p, MAXATOMLEN, ERLANG_LATIN1, NULL, NULL); } -int ei_internal_get_atom(const char** bufp, char* p) +int ei_decode_atom_as(const char *buf, int *index, char* p, int destlen, + enum erlang_char_encoding want_enc, + enum erlang_char_encoding* was_encp, + enum erlang_char_encoding* res_encp) { - int ix = 0; - if (ei_decode_atom(*bufp, &ix, p) < 0) return -1; - *bufp += ix; + const char *s = buf + *index; + const char *s0 = s; + int len; + enum erlang_char_encoding got_enc; + + switch (get8(s)) { + case ERL_ATOM_EXT: + len = get16be(s); + got_enc = ERLANG_LATIN1; + break; + case ERL_SMALL_ATOM_EXT: + len = get8(s); + got_enc = ERLANG_LATIN1; + break; + case ERL_ATOM_UTF8_EXT: + len = get16be(s); + got_enc = ERLANG_UTF8; + break; + case ERL_SMALL_ATOM_UTF8_EXT: + len = get8(s); + got_enc = ERLANG_UTF8; + break; + default: + return -1; + } + + if (want_enc == got_enc || want_enc == ERLANG_WHATEVER || want_enc == ERLANG_ASCII) { + int i, found_non_ascii = 0; + if (len >= destlen) + return -1; + for (i=0; i 0 && dest < dest_end) { - if ((source[0] & 0x80) == 0) { - *dest++ = *source++; + while (slen > 0) { + if (dst >= dst_end) return -1; + if ((src[0] & 0x80) == 0) { + if (dst_start) { + *dst = *src; + } + ++dst; + ++src; --slen; } else if (slen > 1 && - (source[0] & 0xFE) == 0xC2 && - (source[1] & 0xC0) == 0x80) { - *dest++ = (char) ((source[0] << 6) | (source[1] & 0x3F)); - source += 2; + (src[0] & 0xFE) == 0xC2 && + (src[1] & 0xC0) == 0x80) { + if (dst_start) { + *dst = (char) ((src[0] << 6) | (src[1] & 0x3F)); + } + ++dst; + src += 2; slen -= 2; + found_non_ascii = 1; } else return -1; } - *dest = 0; + if (res_encp) { + *res_encp = found_non_ascii ? ERLANG_LATIN1 : ERLANG_ASCII; + } + return dst - dst_start; +} + +int latin1_to_utf8(char* dst, const char* src, int slen, int destlen, + enum erlang_char_encoding* res_encp) +{ + const char* const src_end = src + slen; + const char* const dst_start = dst; + const char* const dst_end = dst + destlen; + int found_non_ascii = 0; + + while (src < src_end) { + if (dst >= dst_end) return -1; + if ((src[0] & 0x80) == 0) { + if (dst_start) { + *dst = *src; + } + ++dst; + } + else { + if (dst_start) { + unsigned char ch = *src; + dst[0] = 0xC0 | (ch >> 6); + dst[1] = 0x80 | (ch & 0x3F); + } + dst += 2; + found_non_ascii = 1; + } + ++src; + } + if (res_encp) { + *res_encp = found_non_ascii ? ERLANG_UTF8 : ERLANG_ASCII; + } + return dst - dst_start; +} + + + +int ei_internal_get_atom(const char** bufp, char* p, + enum erlang_char_encoding* was_encp) +{ + int ix = 0; + if (ei_decode_atom_as(*bufp, &ix, p, MAXATOMLEN_UTF8, ERLANG_UTF8, was_encp, NULL) < 0) + return -1; + *bufp += ix; return 0; } + diff --git a/lib/erl_interface/src/decode/decode_boolean.c b/lib/erl_interface/src/decode/decode_boolean.c index 0a7a06f1d4..f20690249b 100644 --- a/lib/erl_interface/src/decode/decode_boolean.c +++ b/lib/erl_interface/src/decode/decode_boolean.c @@ -24,12 +24,11 @@ /* c non-zero -> erlang "true" atom, otherwise "false" */ int ei_decode_boolean(const char *buf, int *index, int *p) { - const char *s = buf + *index; - const char *s0 = s; - char tbuf[MAXATOMLEN+1]; + char tbuf[6]; int t; - if (get_atom(&s, tbuf) < 0) return -1; + if (ei_decode_atom_as(buf, index, tbuf, sizeof(tbuf), ERLANG_ASCII, NULL, NULL) < 0) + return -1; if (memcmp(tbuf, "true", 5) == 0) t = 1; @@ -39,7 +38,6 @@ int ei_decode_boolean(const char *buf, int *index, int *p) return -1; if (p) *p = t; - *index += s-s0; return 0; } diff --git a/lib/erl_interface/src/decode/decode_fun.c b/lib/erl_interface/src/decode/decode_fun.c index 64fb9e86d8..7bbef5db44 100644 --- a/lib/erl_interface/src/decode/decode_fun.c +++ b/lib/erl_interface/src/decode/decode_fun.c @@ -42,7 +42,8 @@ int ei_decode_fun(const char *buf, int *index, erlang_fun *p) if (ei_decode_pid(s, &ix, (p == NULL ? (erlang_pid*)NULL : &p->pid)) < 0) return -1; /* then the module (atom) */ - if (ei_decode_atom(s, &ix, (p == NULL ? (char*)NULL : p->module)) < 0) + if (ei_decode_atom_as(s, &ix, (p == NULL ? (char*)NULL : p->module), + MAXATOMLEN_UTF8, ERLANG_UTF8, &p->module_org_enc, NULL) < 0) return -1; /* then the index */ if (ei_decode_long(s, &ix, (p == NULL ? (long*)NULL : &p->index)) < 0) @@ -84,7 +85,8 @@ int ei_decode_fun(const char *buf, int *index, erlang_fun *p) if (p != NULL) p->n_free_vars = i; /* then the module (atom) */ ix = 0; - if (ei_decode_atom(s, &ix, (p == NULL ? (char*)NULL : p->module)) < 0) + if (ei_decode_atom_as(s, &ix, (p == NULL ? (char*)NULL : p->module), + MAXATOMLEN_UTF8, ERLANG_UTF8, &p->module_org_enc, NULL) < 0) return -1; /* then the old_index */ if (ei_decode_long(s, &ix, (p == NULL ? (long*)NULL : &p->old_index)) < 0) diff --git a/lib/erl_interface/src/decode/decode_pid.c b/lib/erl_interface/src/decode/decode_pid.c index a762ae499e..e79952195d 100644 --- a/lib/erl_interface/src/decode/decode_pid.c +++ b/lib/erl_interface/src/decode/decode_pid.c @@ -26,12 +26,11 @@ int ei_decode_pid(const char *buf, int *index, erlang_pid *p) { const char *s = buf + *index; const char *s0 = s; - int len; if (get8(s) != ERL_PID_EXT) return -1; /* first the nodename */ - if (get_atom(&s, p->node) < 0) return -1; + if (get_atom(&s, p->node, &p->node_org_enc) < 0) return -1; /* now the numbers: num (4), serial (4), creation (1) */ if (p) { diff --git a/lib/erl_interface/src/decode/decode_port.c b/lib/erl_interface/src/decode/decode_port.c index 6eb2bc9197..5fd96b51a4 100644 --- a/lib/erl_interface/src/decode/decode_port.c +++ b/lib/erl_interface/src/decode/decode_port.c @@ -25,12 +25,11 @@ int ei_decode_port(const char *buf, int *index, erlang_port *p) { const char *s = buf + *index; const char *s0 = s; - int len; if (get8(s) != ERL_PORT_EXT) return -1; /* first the nodename */ - if (get_atom(&s, p->node) < 0) return -1; + if (get_atom(&s, p->node, &p->node_org_enc) < 0) return -1; /* now the numbers: num (4), creation (1) */ if (p) { diff --git a/lib/erl_interface/src/decode/decode_ref.c b/lib/erl_interface/src/decode/decode_ref.c index df3c30777b..7294e5d239 100644 --- a/lib/erl_interface/src/decode/decode_ref.c +++ b/lib/erl_interface/src/decode/decode_ref.c @@ -26,13 +26,13 @@ int ei_decode_ref(const char *buf, int *index, erlang_ref *p) { const char *s = buf + *index; const char *s0 = s; - int count, len, i; + int count, i; switch (get8(s)) { case ERL_REFERENCE_EXT: /* nodename */ - if (get_atom(&s, p->node) < 0) return -1; + if (get_atom(&s, p->node, &p->node_org_enc) < 0) return -1; /* now the numbers: num (4), creation (1) */ if (p) { @@ -53,7 +53,7 @@ int ei_decode_ref(const char *buf, int *index, erlang_ref *p) if (p) p->len = count; /* then the nodename */ - if (get_atom(&s, p->node) < 0) return -1; + if (get_atom(&s, p->node, &p->node_org_enc) < 0) return -1; /* creation */ if (p) { diff --git a/lib/erl_interface/src/encode/encode_atom.c b/lib/erl_interface/src/encode/encode_atom.c index 6f41f045e0..a3d7c4c759 100644 --- a/lib/erl_interface/src/encode/encode_atom.c +++ b/lib/erl_interface/src/encode/encode_atom.c @@ -26,25 +26,95 @@ int ei_encode_atom(char *buf, int *index, const char *p) { size_t len = strlen(p); - if (len >= INT_MAX) return -1; - return ei_encode_atom_len(buf, index, p, len); + if (len >= MAXATOMLEN) + len = MAXATOMLEN - 1; + return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, ERLANG_LATIN1); } int ei_encode_atom_len(char *buf, int *index, const char *p, int len) +{ + /* This function is documented to truncate at MAXATOMLEN (256) */ + if (len >= MAXATOMLEN) + len = MAXATOMLEN - 1; + return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, ERLANG_LATIN1); +} + +int ei_encode_atom_as(char *buf, int *index, const char *p, + enum erlang_char_encoding from_enc, + enum erlang_char_encoding to_enc) +{ + return ei_encode_atom_len_as(buf, index, p, strlen(p), from_enc, to_enc); +} + +int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len, + enum erlang_char_encoding from_enc, + enum erlang_char_encoding to_enc) { char *s = buf + *index; char *s0 = s; + int offs; - /* This function is documented to truncate at MAXATOMLEN (256) */ - if (len > MAXATOMLEN) - len = MAXATOMLEN; + if (from_enc == ERLANG_LATIN1 && len >= MAXATOMLEN) { + return -1; + } - if (!buf) s += 3; - else { - put8(s,ERL_ATOM_EXT); - put16be(s,len); + switch(to_enc) { + case ERLANG_LATIN1: + if (buf) { + put8(s,ERL_ATOM_EXT); + switch (from_enc) { + case ERLANG_UTF8: + len = utf8_to_latin1(s+2, p, len, MAXATOMLEN-1, NULL); + if (len < 0) return -1; + break; + case ERLANG_ASCII: + case ERLANG_LATIN1: + memcpy(s+2, p, len); + break; + default: + return -1; + } + put16be(s,len); + } + else { + s += 3; + if (from_enc == ERLANG_UTF8) { + len = utf8_to_latin1(NULL, p, len, MAXATOMLEN-1, NULL); + if (len < 0) return -1; + } + } + break; + + case ERLANG_UTF8: + offs = 1 + 1; + switch (from_enc) { + case ERLANG_LATIN1: + if (len >= 256/2) offs++; + len = latin1_to_utf8((buf ? s+offs : NULL), p, len, MAXATOMLEN_UTF8-1, NULL); + break; + case ERLANG_ASCII: + case ERLANG_UTF8: + if (len >= 256) offs++; + if (buf) memcpy(s+offs, p, len); + break; + default: + return -1; + } + if (buf) { + if (offs == 2) { + put8(s, ERL_SMALL_ATOM_UTF8_EXT); + put8(s, len); + } + else { + put8(s, ERL_ATOM_UTF8_EXT); + put16be(s, len); + } + } + else s+= offs; + break; - memmove(s,p,len); /* unterminated string */ + default: + return -1; } s += len; @@ -53,3 +123,13 @@ int ei_encode_atom_len(char *buf, int *index, const char *p, int len) return 0; } +int +ei_internal_put_atom(char** bufp, const char* p, int slen, + enum erlang_char_encoding to_enc) +{ + int ix = 0; + if (ei_encode_atom_len_as(*bufp, &ix, p, slen, ERLANG_UTF8, to_enc) < 0) + return -1; + *bufp += ix; + return 0; +} diff --git a/lib/erl_interface/src/encode/encode_fun.c b/lib/erl_interface/src/encode/encode_fun.c index 54ee2083d6..4daee32648 100644 --- a/lib/erl_interface/src/encode/encode_fun.c +++ b/lib/erl_interface/src/encode/encode_fun.c @@ -35,7 +35,7 @@ int ei_encode_fun(char *buf, int *index, const erlang_fun *p) ix += sizeof(char) + 4; if (ei_encode_pid(buf, &ix, &p->pid) < 0) return -1; - if (ei_encode_atom(buf, &ix, p->module) < 0) + if (ei_encode_atom_as(buf, &ix, p->module, ERLANG_UTF8, p->module_org_enc) < 0) return -1; if (ei_encode_long(buf, &ix, p->index) < 0) return -1; @@ -60,7 +60,7 @@ int ei_encode_fun(char *buf, int *index, const erlang_fun *p) } else size_p = NULL; ix += 1 + 4 + 1 + sizeof(p->md5) + 4 + 4; - if (ei_encode_atom(buf, &ix, p->module) < 0) + if (ei_encode_atom_as(buf, &ix, p->module, ERLANG_UTF8, p->module_org_enc) < 0) return -1; if (ei_encode_long(buf, &ix, p->old_index) < 0) return -1; diff --git a/lib/erl_interface/src/encode/encode_pid.c b/lib/erl_interface/src/encode/encode_pid.c index ee7f235c17..0cf3ef4efb 100644 --- a/lib/erl_interface/src/encode/encode_pid.c +++ b/lib/erl_interface/src/encode/encode_pid.c @@ -24,29 +24,23 @@ int ei_encode_pid(char *buf, int *index, const erlang_pid *p) { char *s = buf + *index; - char *s0 = s; - int len = strlen(p->node); - - if (!buf) s += 13 + len; - else { - put8(s,ERL_PID_EXT); - /* first the nodename */ - put8(s,ERL_ATOM_EXT); + ++(*index); /* skip ERL_PID_EXT */ + if (ei_encode_atom_len_as(buf, index, p->node, strlen(p->node), ERLANG_UTF8, p->node_org_enc) < 0) + return -1; + + if (buf) { + put8(s,ERL_PID_EXT); - put16be(s,len); - - memmove(s, p->node, len); - s += len; + s = buf + *index; /* now the integers */ put32be(s,p->num & 0x7fff); /* 15 bits */ put32be(s,p->serial & 0x1fff); /* 13 bits */ put8(s,(p->creation & 0x03)); /* 2 bits */ } - - *index += s-s0; - + + *index += 4 + 4 + 1; return 0; } diff --git a/lib/erl_interface/src/encode/encode_port.c b/lib/erl_interface/src/encode/encode_port.c index fbbb33182e..2bf9e26d78 100644 --- a/lib/erl_interface/src/encode/encode_port.c +++ b/lib/erl_interface/src/encode/encode_port.c @@ -24,28 +24,23 @@ int ei_encode_port(char *buf, int *index, const erlang_port *p) { char *s = buf + *index; - char *s0 = s; - int len = strlen(p->node); - - if (!buf) s += 9 + len; - else { - put8(s,ERL_PORT_EXT); - /* first the nodename */ - put8(s,ERL_ATOM_EXT); + ++(*index); /* skip ERL_PORT_EXT */ + if (ei_encode_atom_len_as(buf, index, p->node, strlen(p->node), ERLANG_UTF8, + p->node_org_enc) < 0) { + return -1; + } + if (buf) { + put8(s,ERL_PORT_EXT); - put16be(s,len); - - memmove(s, p->node, len); - s += len; + s = buf + *index; /* now the integers */ put32be(s,p->id & 0x0fffffff /* 28 bits */); put8(s,(p->creation & 0x03)); } - *index += s-s0; - + *index += 4 + 1; return 0; } diff --git a/lib/erl_interface/src/encode/encode_ref.c b/lib/erl_interface/src/encode/encode_ref.c index 292b452864..e8b3173315 100644 --- a/lib/erl_interface/src/encode/encode_ref.c +++ b/lib/erl_interface/src/encode/encode_ref.c @@ -24,36 +24,32 @@ int ei_encode_ref(char *buf, int *index, const erlang_ref *p) { char *s = buf + *index; - char *s0 = s; - int len = strlen(p->node); int i; + (*index) += 1 + 2; /* skip to node atom */ + if (ei_encode_atom_len_as(buf, index, p->node, strlen(p->node), ERLANG_UTF8, + p->node_org_enc) < 0) { + return -1; + } + /* Always encode as an extended reference; all participating parties are now expected to be able to decode extended references. */ - if (!buf) s += 1 + 2 + (3+len) + p->len*4 + 1; - else { + if (buf) { put8(s,ERL_NEW_REFERENCE_EXT); /* first, number of integers */ put16be(s, p->len); /* then the nodename */ - put8(s,ERL_ATOM_EXT); - - put16be(s,len); - - memmove(s, p->node, len); - s += len; + s = buf + *index; /* now the integers */ put8(s,(p->creation & 0x03)); for (i = 0; i < p->len; i++) put32be(s,p->n[i]); - - } - - *index += s-s0; + } + *index += p->len*4 + 1; return 0; } diff --git a/lib/erl_interface/src/legacy/erl_connect.c b/lib/erl_interface/src/legacy/erl_connect.c index 41d4fa3138..be83fa8469 100644 --- a/lib/erl_interface/src/legacy/erl_connect.c +++ b/lib/erl_interface/src/legacy/erl_connect.c @@ -247,9 +247,15 @@ int erl_send(int fd, ETERM *to ,ETERM *msg) erl_errno = EINVAL; return -1; } - - strncpy(topid.node, (char *)ERL_PID_NODE(to), sizeof(topid.node)); - topid.node[sizeof(topid.node)-1] = '\0'; + + if (to->uval.pidval.node.latin1) { + strcpy(topid.node, to->uval.pidval.node.latin1); + topid.node_org_enc = ERLANG_LATIN1; + } + else { + strcpy(topid.node, to->uval.pidval.node.utf8); + topid.node_org_enc = ERLANG_UTF8; + } topid.num = ERL_PID_NUMBER(to); topid.serial = ERL_PID_SERIAL(to); topid.creation = ERL_PID_CREATION(to); @@ -263,7 +269,7 @@ static int erl_do_receive_msg(int fd, ei_x_buff* x, ErlMessage* emsg) erlang_msg msg; int r; - msg.from.node[0] = msg.to.node[0] = '\0'; + msg.from.node[0] = msg.to.node[0] = msg.toname[0] = '\0'; r = ei_do_receive_msg(fd, 0, &msg, x, 0); if (r == ERL_MSG) { @@ -299,7 +305,7 @@ static int erl_do_receive_msg(int fd, ei_x_buff* x, ErlMessage* emsg) emsg->to = erl_mk_pid(msg.to.node, msg.to.num, msg.to.serial, msg.to.creation); else emsg->to = NULL; - memcpy(emsg->to_name, msg.toname, MAXATOMLEN+1); + strcpy(emsg->to_name, msg.toname); return r; } diff --git a/lib/erl_interface/src/legacy/erl_eterm.c b/lib/erl_interface/src/legacy/erl_eterm.c index 8d559f0f55..aa0fd5ddcf 100644 --- a/lib/erl_interface/src/legacy/erl_eterm.c +++ b/lib/erl_interface/src/legacy/erl_eterm.c @@ -36,6 +36,7 @@ #include "erl_error.h" #include "erl_internal.h" #include "ei_internal.h" +#include "putget.h" #define ERL_IS_BYTE(x) (ERL_IS_INTEGER(x) && (ERL_INT_VALUE(x) & ~0xFF) == 0) @@ -142,9 +143,7 @@ ETERM *erl_mk_atom (const char *s) ep = erl_alloc_eterm(ERL_ATOM); ERL_COUNT(ep) = 1; - ERL_ATOM_SIZE(ep) = strlen(s); - if ((ERL_ATOM_PTR(ep) = strsave(s)) == NULL) - { + if (erl_atom_init_latin1(&ep->uval.aval.d, s) == NULL) { erl_free_term(ep); erl_errno = ENOMEM; return NULL; @@ -152,6 +151,65 @@ ETERM *erl_mk_atom (const char *s) return ep; } +char* erl_atom_ptr_latin1(Erl_Atom_data* a) +{ + if (a->latin1 == NULL) { + enum erlang_char_encoding enc; + a->lenL = utf8_to_latin1(NULL, a->utf8, a->lenU, a->lenU, &enc); + if (a->lenL < 0) { + a->lenL = 0; + return NULL; + } + if (enc == ERLANG_ASCII) { + a->latin1 = a->utf8; + } + else { + a->latin1 = malloc(a->lenL+1); + utf8_to_latin1(a->latin1, a->utf8, a->lenU, a->lenL, NULL); + a->latin1[a->lenL] = '\0'; + } + } + return a->latin1; +} + +char* erl_atom_ptr_utf8(Erl_Atom_data* a) +{ + if (a->utf8 == NULL) { + int dlen = a->lenL * 2; /* over estimation */ + a->utf8 = malloc(dlen + 1); + a->lenU = latin1_to_utf8(a->utf8, a->latin1, a->lenL, dlen, NULL); + a->utf8[a->lenU] = '\0'; + } + return a->utf8; + +} +int erl_atom_size_latin1(Erl_Atom_data* a) +{ + if (a->latin1 == NULL) { + erl_atom_ptr_latin1(a); + } + return a->lenL; +} +int erl_atom_size_utf8(Erl_Atom_data* a) +{ + if (a->utf8 == NULL) { + erl_atom_ptr_utf8(a); + } + return a->lenU; +} +char* erl_atom_init_latin1(Erl_Atom_data* a, const char* s) +{ + a->lenL = strlen(s); + if ((a->latin1 = strsave(s)) == NULL) + { + return NULL; + } + a->utf8 = NULL; + a->lenU = 0; + return a->latin1; +} + + /* * Given a string as input, creates a list. */ @@ -208,12 +266,19 @@ ETERM *erl_mk_pid(const char *node, ep = erl_alloc_eterm(ERL_PID); ERL_COUNT(ep) = 1; - if ((ERL_PID_NODE(ep) = strsave(node)) == NULL) + if (erl_atom_init_latin1(&ep->uval.pidval.node, node) == NULL) { erl_free_term(ep); erl_errno = ENOMEM; return NULL; } + erl_mk_pid_helper(ep, number, serial, creation); + return ep; +} + +void erl_mk_pid_helper(ETERM *ep, unsigned int number, + unsigned int serial, unsigned char creation) +{ ERL_PID_NUMBER(ep) = number & 0x7fff; /* 15 bits */ if (ei_internal_use_r9_pids_ports()) { ERL_PID_SERIAL(ep) = serial & 0x07; /* 3 bits */ @@ -222,7 +287,6 @@ ETERM *erl_mk_pid(const char *node, ERL_PID_SERIAL(ep) = serial & 0x1fff; /* 13 bits */ } ERL_PID_CREATION(ep) = creation & 0x03; /* 2 bits */ - return ep; } /* @@ -239,12 +303,18 @@ ETERM *erl_mk_port(const char *node, ep = erl_alloc_eterm(ERL_PORT); ERL_COUNT(ep) = 1; - if ((ERL_PORT_NODE(ep) = strsave(node)) == NULL) + if (erl_atom_init_latin1(&ep->uval.portval.node, node) == NULL) { erl_free_term(ep); erl_errno = ENOMEM; return NULL; } + erl_mk_port_helper(ep, number, creation); + return ep; +} + +void erl_mk_port_helper(ETERM* ep, unsigned number, unsigned char creation) +{ if (ei_internal_use_r9_pids_ports()) { ERL_PORT_NUMBER(ep) = number & 0x3ffff; /* 18 bits */ } @@ -252,29 +322,29 @@ ETERM *erl_mk_port(const char *node, ERL_PORT_NUMBER(ep) = number & 0x0fffffff; /* 18 bits */ } ERL_PORT_CREATION(ep) = creation & 0x03; /* 2 bits */ - return ep; } /* * Create any kind of reference. */ -ETERM *__erl_mk_reference (const char *node, +ETERM *__erl_mk_reference (ETERM* t, + const char *node, size_t len, unsigned int n[], unsigned char creation) { - ETERM * t; - - if (node == NULL) return NULL; - - t = erl_alloc_eterm(ERL_REF); - ERL_COUNT(t) = 1; - - if ((ERL_REF_NODE(t) = strsave(node)) == NULL) - { - erl_free_term(t); - erl_errno = ENOMEM; - return NULL; + if (t == NULL) { + if (node == NULL) return NULL; + + t = erl_alloc_eterm(ERL_REF); + ERL_COUNT(t) = 1; + + if (erl_atom_init_latin1(&t->uval.refval.node, node) == NULL) + { + erl_free_term(t); + erl_errno = ENOMEM; + return NULL; + } } ERL_REF_LEN(t) = len; ERL_REF_NUMBERS(t)[0] = n[0] & 0x3ffff; /* 18 bits */ @@ -294,7 +364,7 @@ ETERM *erl_mk_ref (const char *node, { unsigned int n[3] = {0, 0, 0}; n[0] = number; - return __erl_mk_reference(node, 1, n, creation); + return __erl_mk_reference(NULL, node, 1, n, creation); } /* @@ -307,7 +377,7 @@ erl_mk_long_ref (const char *node, { unsigned int n[3] = {0, 0, 0}; n[0] = n3; n[1] = n2; n[2] = n1; - return __erl_mk_reference(node, 3, n, creation); + return __erl_mk_reference(NULL, node, 3, n, creation); } /* @@ -758,6 +828,28 @@ int erl_iolist_length (const ETERM* term) return -1; } +static int erl_atom_copy(Erl_Atom_data* dst, const Erl_Atom_data* src) +{ + if (src->latin1 == src->utf8) { + dst->latin1 = dst->utf8 = strsave(src->latin1); + dst->lenL = dst->lenU = strlen(src->latin1); + } + else if (src->latin1) { + dst->latin1 = strsave(src->latin1); + dst->lenL = strlen(src->latin1); + dst->utf8 = NULL; + dst->lenU = 0; + } + else { + dst->utf8 = strsave(src->utf8); + dst->lenU = strlen(src->utf8); + dst->latin1 = NULL; + dst->lenL = 0; + } + return (dst->latin1 != NULL || dst->utf8 == NULL); +} + + /* * Return a brand NEW COPY of an ETERM. */ @@ -796,9 +888,7 @@ ETERM *erl_copy_term(const ETERM *ep) ERL_FLOAT_VALUE(cp) = ERL_FLOAT_VALUE(ep); break; case ERL_ATOM: - ERL_ATOM_SIZE(cp) = ERL_ATOM_SIZE(ep); - ERL_ATOM_PTR(cp) = strsave(ERL_ATOM_PTR(ep)); - if (ERL_ATOM_PTR(cp) == NULL) + if (!erl_atom_copy(&cp->uval.aval.d, &ep->uval.aval.d)) { erl_free_term(cp); erl_errno = ENOMEM; @@ -810,17 +900,17 @@ ETERM *erl_copy_term(const ETERM *ep) name and plug in. Somewhat ugly (also done with port and ref below). */ memcpy(&cp->uval.pidval, &ep->uval.pidval, sizeof(Erl_Pid)); - ERL_PID_NODE(cp) = strsave(ERL_PID_NODE(ep)); + erl_atom_copy(&cp->uval.pidval.node, &ep->uval.pidval.node); ERL_COUNT(cp) = 1; break; case ERL_PORT: memcpy(&cp->uval.portval, &ep->uval.portval, sizeof(Erl_Port)); - ERL_PORT_NODE(cp) = strsave(ERL_PORT_NODE(ep)); + erl_atom_copy(&cp->uval.portval.node, &ep->uval.portval.node); ERL_COUNT(cp) = 1; break; case ERL_REF: memcpy(&cp->uval.refval, &ep->uval.refval, sizeof(Erl_Ref)); - ERL_REF_NODE(cp) = strsave(ERL_REF_NODE(ep)); + erl_atom_copy(&cp->uval.refval.node, &ep->uval.refval.node); ERL_COUNT(cp) = 1; break; case ERL_LIST: @@ -883,29 +973,29 @@ int erl_print_term(FILE *fp, const ETERM *ep) j = i = doquote = 0; switch(ERL_TYPE(ep)) { - case ERL_ATOM: + case ERL_ATOM: { + char* adata = ERL_ATOM_PTR(ep); /* FIXME: what if some weird locale is in use? */ - if (!islower((int)ERL_ATOM_PTR(ep)[0])) + if (!islower(adata[0])) doquote = 1; for (i = 0; !doquote && i < ERL_ATOM_SIZE(ep); i++) { - doquote = !(isalnum((int)ERL_ATOM_PTR(ep)[i]) - || (ERL_ATOM_PTR(ep)[i] == '_')); + doquote = !(isalnum(adata[i]) || (adata[i] == '_')); } if (doquote) { putc('\'', fp); ch_written++; } - fputs(ERL_ATOM_PTR(ep), fp); + fputs(adata, fp); ch_written += ERL_ATOM_SIZE(ep); if (doquote) { putc('\'', fp); ch_written++; } break; - + } case ERL_VARIABLE: if (!isupper((int)ERL_VAR_NAME(ep)[0])) { doquote = 1; diff --git a/lib/erl_interface/src/legacy/erl_eterm.h b/lib/erl_interface/src/legacy/erl_eterm.h index 41b008f04f..2e8129d9cd 100644 --- a/lib/erl_interface/src/legacy/erl_eterm.h +++ b/lib/erl_interface/src/legacy/erl_eterm.h @@ -55,7 +55,9 @@ typedef struct _heapmark { } Erl_HeapMark; -ETERM * __erl_mk_reference(const char *, size_t, unsigned int n[], unsigned char); +void erl_mk_port_helper(ETERM* ep, unsigned number, unsigned char creation); +void erl_mk_pid_helper(ETERM*, unsigned,unsigned, unsigned char); +ETERM * __erl_mk_reference(ETERM*, const char *, size_t, unsigned int n[], unsigned char); int erl_current_fix_desc(void); #endif /* _ERL_ETERM_H */ diff --git a/lib/erl_interface/src/legacy/erl_format.c b/lib/erl_interface/src/legacy/erl_format.c index dc85806c36..533241e396 100644 --- a/lib/erl_interface/src/legacy/erl_format.c +++ b/lib/erl_interface/src/legacy/erl_format.c @@ -574,10 +574,22 @@ static int ematch(ETERM *p, ETERM *t) switch (type_p) { - case ERL_ATOM: - return p->uval.aval.len == t->uval.aval.len && - memcmp(p->uval.aval.a, t->uval.aval.a, p->uval.aval.len) == 0; - + case ERL_ATOM: { + Erl_Atom_data* pa = &p->uval.aval.d; + Erl_Atom_data* ta = &t->uval.aval.d; + if (pa->utf8 && ta->utf8) { + return pa->lenU == ta->lenU && memcmp(pa->utf8, ta->utf8, pa->lenU)==0; + } + else if (pa->latin1 && ta->latin1) { + return pa->lenL == ta->lenL && memcmp(pa->latin1, ta->latin1, pa->lenL)==0; + } + else if (pa->latin1) { + return cmp_latin1_vs_utf8(pa->latin1, pa->lenL, ta->utf8, ta->lenU)==0; + } + else { + return cmp_latin1_vs_utf8(ta->latin1, ta->lenL, pa->utf8, pa->lenU)==0; + } + } case ERL_VARIABLE: if (strcmp(p->uval.vval.name, "_") == 0) /* anon. variable */ return ERL_TRUE; diff --git a/lib/erl_interface/src/legacy/erl_malloc.c b/lib/erl_interface/src/legacy/erl_malloc.c index f51a6c69b3..d09239e02d 100644 --- a/lib/erl_interface/src/legacy/erl_malloc.c +++ b/lib/erl_interface/src/legacy/erl_malloc.c @@ -112,6 +112,18 @@ do { \ (ptr) = NULL; \ } while (0) +static void erl_atom_free(Erl_Atom_data* p) +{ + erl_free(p->latin1); + if (p->utf8 != p->latin1) { + erl_free(p->utf8); + } + p->latin1 = NULL; + p->utf8 = NULL; + p->lenL = 0; + p->lenU = 0; +} + static void _erl_free_term (ETERM *ep, int external, int compound) { restart: @@ -122,7 +134,7 @@ restart: switch(ERL_TYPE(ep)) { case ERL_ATOM: - FREE_AND_CLEAR(ERL_ATOM_PTR(ep)); + erl_atom_free(&ep->uval.aval.d); break; case ERL_VARIABLE: FREE_AND_CLEAR(ERL_VAR_NAME(ep)); @@ -161,13 +173,13 @@ restart: FREE_AND_CLEAR(ERL_BIN_PTR(ep)); break; case ERL_PID: - FREE_AND_CLEAR(ERL_PID_NODE(ep)); + erl_atom_free(&ep->uval.pidval.node); break; case ERL_PORT: - FREE_AND_CLEAR(ERL_PORT_NODE(ep)); + erl_atom_free(&ep->uval.portval.node); break; case ERL_REF: - FREE_AND_CLEAR(ERL_REF_NODE(ep)); + erl_atom_free(&ep->uval.refval.node); break; case ERL_EMPTY_LIST: case ERL_INTEGER: diff --git a/lib/erl_interface/src/legacy/erl_marshal.c b/lib/erl_interface/src/legacy/erl_marshal.c index 775d7e82ca..884e9d421b 100644 --- a/lib/erl_interface/src/legacy/erl_marshal.c +++ b/lib/erl_interface/src/legacy/erl_marshal.c @@ -44,6 +44,9 @@ int erl_fp_compare(unsigned *a, unsigned *b); static void erl_long_to_fp(long l, unsigned *d); #endif +static int cmpbytes(unsigned char* s1,int l1,unsigned char* s2,int l2); +static int cmpatoms(unsigned char* s1, int l1, unsigned char tag1, unsigned char* s2, int l2, unsigned char tag2); + /* Used when comparing two encoded byte arrays */ /* this global data is ok (from threading point of view) since it is * initialized once and never changed @@ -111,8 +114,9 @@ void erl_init_marshal(void) cmp_array[ERL_SMALL_BIG_EXT] = ERL_NUM_CMP; cmp_array[ERL_LARGE_BIG_EXT] = ERL_NUM_CMP; cmp_array[ERL_ATOM_EXT] = ERL_ATOM_CMP; + cmp_array[ERL_ATOM_UTF8_EXT] = ERL_ATOM_CMP; cmp_array[ERL_SMALL_ATOM_EXT] = ERL_ATOM_CMP; - cmp_array[ERL_UNICODE_ATOM_EXT] = ERL_ATOM_CMP; + cmp_array[ERL_SMALL_ATOM_UTF8_EXT] = ERL_ATOM_CMP; cmp_array[ERL_REFERENCE_EXT] = ERL_REF_CMP; cmp_array[ERL_NEW_REFERENCE_EXT] = ERL_REF_CMP; cmp_array[ERL_FUN_EXT] = ERL_FUN_CMP; @@ -162,6 +166,21 @@ static int erl_length_x(const ETERM *ep) { *============================================================== */ +static void encode_atom(Erl_Atom_data* a, unsigned char **ext) +{ + int ix = 0; + if (a->latin1) { + ei_encode_atom_len_as((char*)*ext, &ix, a->latin1, a->lenL, + ERLANG_LATIN1, ERLANG_LATIN1); + } + else if (ei_encode_atom_len_as((char*)*ext, &ix, a->utf8, a->lenU, + ERLANG_UTF8, ERLANG_LATIN1) < 0) { + ei_encode_atom_len_as((char*)*ext, &ix, a->utf8, a->lenU, + ERLANG_UTF8, ERLANG_UTF8); + } + *ext += ix; +} + /* * The actual ENCODE engine. * Returns 0 on success, otherwise 1. @@ -176,12 +195,7 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist) switch(ERL_TYPE(ep)) { case ERL_ATOM: - i = ep->uval.aval.len; - *(*ext)++ = ERL_ATOM_EXT; - *(*ext)++ = (i >>8) &0xff; - *(*ext)++ = i &0xff; - memcpy((void *) *ext, (const void *) ep->uval.aval.a, i); - *ext += i; + encode_atom(&ep->uval.aval.d, ext); return 0; case ERL_INTEGER: @@ -292,12 +306,7 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist) case ERL_PID: *(*ext)++ = ERL_PID_EXT; /* First poke in node as an atom */ - i = strlen((char *)ERL_PID_NODE(ep)); - *(*ext)++ = ERL_ATOM_EXT; - *(*ext)++ = (i >>8) &0xff; - *(*ext)++ = i &0xff; - memcpy(*ext, ERL_PID_NODE(ep), i); - *ext += i; + encode_atom(&ep->uval.pidval.node, ext); /* And then fill in the integer fields */ i = ERL_PID_NUMBER(ep); *(*ext)++ = (i >> 24) &0xff; @@ -325,11 +334,8 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist) *(*ext)++ = (len >> 8) &0xff; *(*ext)++ = len &0xff; - *(*ext)++ = ERL_ATOM_EXT; - *(*ext)++ = (i >> 8) &0xff; - *(*ext)++ = i &0xff; - memcpy(*ext, ERL_REF_NODE(ep), i); - *ext += i; + encode_atom(&ep->uval.refval.node, ext); + *(*ext)++ = ERL_REF_CREATION(ep); /* Then the integer fields */ for (j = 0; j < ERL_REF_LEN(ep); j++) { @@ -344,12 +350,7 @@ int erl_encode_it(ETERM *ep, unsigned char **ext, int dist) case ERL_PORT: *(*ext)++ = ERL_PORT_EXT; /* First poke in node as an atom */ - i = strlen((char *)ERL_PORT_NODE(ep)); - *(*ext)++ = ERL_ATOM_EXT; - *(*ext)++ = (i >>8) &0xff; - *(*ext)++ = i &0xff; - memcpy(*ext, ERL_PORT_NODE(ep), i); - *ext += i; + encode_atom(&ep->uval.portval.node, ext); /* Then the integer fields */ i = ERL_PORT_NUMBER(ep); *(*ext)++ = (i >> 24) &0xff; @@ -500,6 +501,16 @@ int erl_term_len(ETERM *ep) return 1+erl_term_len_helper(ep, 4); } +static int atom_len_helper(Erl_Atom_data* a) +{ + if (erl_atom_ptr_latin1(a)) { + return 1 + 2 + a->lenL; /* ERL_ATOM_EXT */ + } + else { + return 1 + 1 + (a->lenU > 255) + a->lenU; + } +} + static int erl_term_len_helper(ETERM *ep, int dist) { int len = 0; @@ -511,8 +522,7 @@ static int erl_term_len_helper(ETERM *ep, int dist) if (ep) { switch (ERL_TYPE(ep)) { case ERL_ATOM: - i = ep->uval.aval.len; - len = i + 3; + len = atom_len_helper(&ep->uval.aval.d); break; case ERL_INTEGER: @@ -544,20 +554,15 @@ static int erl_term_len_helper(ETERM *ep, int dist) break; case ERL_PID: - /* 1 + N + 4 + 4 + 1 where N = 3 + strlen */ - i = strlen((char *)ERL_PID_NODE(ep)); - len = 13 + i; + len = 1 + atom_len_helper(&ep->uval.pidval.node) + 4 + 4 + 1; break; case ERL_REF: - i = strlen((char *)ERL_REF_NODE(ep)); - len = 1 + 2 + (i+3) + 1 + ERL_REF_LEN(ep) * 4; + len = 1 + 2 + atom_len_helper(&ep->uval.refval.node) + 1 + ERL_REF_LEN(ep) * 4; break; case ERL_PORT: - /* 1 + N + 4 + 1 where N = 3 + strlen */ - i = strlen((char *)ERL_PORT_NODE(ep)); - len = 9 + i; + len = 1 + atom_len_helper(&ep->uval.portval.node) + 4 + 1; break; case ERL_EMPTY_LIST: @@ -651,11 +656,33 @@ int erl_encode_buf(ETERM *ep, unsigned char **ext) } /* erl_encode_buf */ -static int read_atom(unsigned char** ext, char* dst) +static int read_atom(unsigned char** ext, Erl_Atom_data* a) { + char buf[MAXATOMLEN_UTF8]; int offs = 0; - int ret = ei_decode_atom((char*)*ext, &offs, dst); + enum erlang_char_encoding enc; + int ret = ei_decode_atom_as((char*)*ext, &offs, buf, MAXATOMLEN_UTF8, + ERLANG_WHATEVER, NULL, &enc); *ext += offs; + + if (ret == 0) { + int i = strlen(buf); + char* clone = erl_malloc(i+1); + memcpy(clone, buf, i+1); + + a->latin1 = NULL; + a->lenL = 0; + a->utf8 = NULL; + a->lenU = 0; + if (enc == ERLANG_LATIN1 || enc == ERLANG_ASCII) { + a->latin1 = clone; + a->lenL = i; + } + if (enc == ERLANG_UTF8 || enc == ERLANG_ASCII) { + a->utf8 = clone; + a->lenU = i; + } + } return ret; } @@ -665,7 +692,6 @@ static int read_atom(unsigned char** ext, char* dst) */ static ETERM *erl_decode_it(unsigned char **ext) { - char atom_buf[MAXATOMLEN+1]; char *cp; ETERM *ep,*tp,*np; unsigned int u,sign; @@ -765,127 +791,89 @@ static ETERM *erl_decode_it(unsigned char **ext) case ERL_ATOM_EXT: case ERL_SMALL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: + ERL_TYPE(ep) = ERL_ATOM; --(*ext); - if (read_atom(ext, atom_buf) < 0) return NULL; - - i = strlen(atom_buf); - ep->uval.aval.len = i; - ep->uval.aval.a = (char *) erl_malloc(i+1); - memcpy(ep->uval.aval.a, atom_buf, i+1); + if (read_atom(ext, &ep->uval.aval.d) < 0) return NULL; return ep; case ERL_PID_EXT: - erl_free_term(ep); - { /* Why not use the constructors? */ - char* node = atom_buf; + { unsigned int number, serial; unsigned char creation; - ETERM *eterm_p; - if (read_atom(ext, node) < 0) return NULL; + ERL_TYPE(ep) = ERL_PID; + if (read_atom(ext, &ep->uval.pidval.node) < 0) return NULL; /* get the integers */ -#if 0 - /* FIXME: Remove code or whatever.... - Ints on the wire are big-endian (== network byte order) - so use ntoh[sl]. (But some are little-endian! Arrrgh!) - Also, the libc authors can be expected to optimize them - heavily. However, the marshalling makes no guarantees - about alignments -- so it won't work at all. */ - number = ntohl(*((unsigned int *)*ext)++); - serial = ntohl(*((unsigned int *)*ext)++); -#else number = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | ((*ext)[2]) << 8 | ((*ext)[3]); *ext += 4; serial = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | ((*ext)[2]) << 8 | ((*ext)[3]); *ext += 4; -#endif creation = *(*ext)++; - eterm_p = erl_mk_pid(node, number, serial, creation); - return eterm_p; + erl_mk_pid_helper(ep, number, serial, creation); + return ep; } case ERL_REFERENCE_EXT: - erl_free_term(ep); { - char* node = atom_buf; - unsigned int number; + unsigned int n[3] = {0, 0, 0}; unsigned char creation; - ETERM *eterm_p; - if (read_atom(ext, node) < 0) return NULL; + ERL_TYPE(ep) = ERL_REF; + if (read_atom(ext, &ep->uval.refval.node) < 0) return NULL; /* get the integers */ -#if 0 - number = ntohl(*((unsigned int *)*ext)++); -#else - number = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | + n[0] = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | ((*ext)[2]) << 8 | ((*ext)[3]); *ext += 4; -#endif creation = *(*ext)++; - eterm_p = erl_mk_ref(node, number, creation); - return eterm_p; + __erl_mk_reference(ep, NULL, 1, n, creation); + return ep; } case ERL_NEW_REFERENCE_EXT: - erl_free_term(ep); { - char* node = atom_buf; size_t cnt, i; unsigned int n[3]; unsigned char creation; - ETERM *eterm_p; -#if 0 - cnt = ntohs(*((unsigned short *)*ext)++); -#else + ERL_TYPE(ep) = ERL_REF; cnt = ((*ext)[0] << 8) | (*ext)[1]; *ext += 2; -#endif - if (read_atom(ext, node) < 0) return NULL; + if (read_atom(ext, &ep->uval.refval.node) < 0) return NULL; /* get the integers */ creation = *(*ext)++; for(i = 0; i < cnt; i++) { -#if 0 - n[i] = ntohl(*((unsigned int *)*ext)++); -#else n[i] = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | ((*ext)[2]) << 8 | ((*ext)[3]); *ext += 4; -#endif } - eterm_p = __erl_mk_reference(node, cnt, n, creation); - return eterm_p; + __erl_mk_reference(ep, NULL, cnt, n, creation); + return ep; } case ERL_PORT_EXT: - erl_free_term(ep); { - char* node = atom_buf; unsigned int number; unsigned char creation; - ETERM *eterm_p; - if (read_atom(ext, node) < 0) return NULL; + ERL_TYPE(ep) = ERL_PORT; + if (read_atom(ext, &ep->uval.portval.node) < 0) return NULL; /* get the integers */ -#if 0 - number = ntohl(*((unsigned int *)*ext)++); -#else number = ((*ext)[0] << 24) | ((*ext)[1]) << 16 | ((*ext)[2]) << 8 | ((*ext)[3]); *ext += 4; -#endif creation = *(*ext)++; - eterm_p = erl_mk_port(node, number, creation); - return eterm_p; + erl_mk_port_helper(ep, number, creation); + return ep; } case ERL_NIL_EXT: @@ -1120,8 +1108,9 @@ unsigned char erl_ext_type(unsigned char *ext) case ERL_INTEGER_EXT: return ERL_INTEGER; case ERL_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: case ERL_SMALL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: return ERL_ATOM; case ERL_PID_EXT: return ERL_PID; @@ -1173,8 +1162,9 @@ int erl_ext_size(unsigned char *t) case ERL_SMALL_INTEGER_EXT: case ERL_INTEGER_EXT: case ERL_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: case ERL_SMALL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: case ERL_PID_EXT: case ERL_PORT_EXT: case ERL_REFERENCE_EXT: @@ -1221,12 +1211,13 @@ static int jump_atom(unsigned char** ext) switch (*e++) { case ERL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: len = (e[0] << 8) | e[1]; e += (len + 2); break; case ERL_SMALL_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: len = e[0]; e += (len + 1); break; @@ -1259,8 +1250,9 @@ static int jump(unsigned char **ext) *ext += 1; break; case ERL_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: case ERL_SMALL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: jump_atom(ext); break; case ERL_PID_EXT: @@ -1426,6 +1418,58 @@ static int cmpbytes(unsigned char* s1,int l1,unsigned char* s2,int l2) } /* cmpbytes */ +#define tag2enc(T) ((T)==ERL_ATOM_EXT || (T)==ERL_SMALL_ATOM_EXT ? ERLANG_LATIN1 : ERLANG_UTF8) + +static int cmpatoms(unsigned char* s1, int l1, unsigned char tag1, + unsigned char* s2, int l2, unsigned char tag2) +{ + enum erlang_char_encoding enc1 = tag2enc(tag1); + enum erlang_char_encoding enc2 = tag2enc(tag2); + + if (enc1 == enc2) { + return cmpbytes(s1, l1,s2,l2); + } + + if (enc1 == ERLANG_LATIN1) { + return cmp_latin1_vs_utf8((char*)s1, l1, (char*)s2, l2); + } + else { + return -cmp_latin1_vs_utf8((char*)s2, l2, (char*)s1, l1); + } +} + +int cmp_latin1_vs_utf8(const char* strL, int lenL, const char* strU, int lenU) +{ + unsigned char* sL = (unsigned char*)strL; + unsigned char* sU = (unsigned char*)strU; + unsigned char* sL_end = sL + lenL; + unsigned char* sU_end = sU + lenU; + + while(sL < sL_end && sU < sU_end) { + unsigned char UasL; + if (*sL >= 0x80) { + if (*sU < 0xC4 && (sU+1) < sU_end) { + UasL = ((sU[0] & 0x3) << 6) | (sU[1] & 0x3F); + } + else return -1; + } + else { + UasL = *sU; + } + if (*sL < UasL) return -1; + if (*sL > UasL) return 1; + + sL++; + if (*sU < 0x80) sU++; + else if (*sU < 0xE0) sU += 2; + else if (*sU < 0xF0) sU += 3; + else /*if (*sU < 0xF8)*/ sU += 4; + } + + return (sU >= sU_end) - (sL >= sL_end); /* -1, 0 or 1 */ +} + + #define CMP_EXT_ERROR_CODE 4711 #define CMP_EXT_INT32_BE(AP, BP) \ @@ -1561,6 +1605,7 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2) int min, ret,i,j,k; double ff1, ff2; unsigned char *tmp1, *tmp2; + unsigned char tag1, tag2; if ( ((*e1)[0] == ERL_STRING_EXT) && ((*e2)[0] == ERL_LIST_EXT) ) { return cmp_string_list(e1, e2); @@ -1568,9 +1613,10 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2) return -cmp_string_list(e2, e1); } - *e2 += 1; + tag1 = *(*e1)++; + tag2 = *(*e2)++; i = j = 0; - switch (*(*e1)++) + switch (tag1) { case ERL_SMALL_INTEGER_EXT: if (**e1 < **e2) ret = -1; @@ -1590,14 +1636,15 @@ static int cmp_exe2(unsigned char **e1, unsigned char **e2) *e1 += 4; *e2 += 4; return ret; case ERL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: i = (**e1) << 8; (*e1)++; j = (**e2) << 8; (*e2)++; /*fall through*/ case ERL_SMALL_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: i |= (**e1); (*e1)++; j |= (**e2); (*e2)++; - ret = cmpbytes(*e1, i, *e2, j); + ret = cmpatoms(*e1, i, tag1, *e2, j, tag2); *e1 += i; *e2 += j; return ret; diff --git a/lib/erl_interface/src/legacy/global_whereis.c b/lib/erl_interface/src/legacy/global_whereis.c index 2afb193504..e6c556d907 100644 --- a/lib/erl_interface/src/legacy/global_whereis.c +++ b/lib/erl_interface/src/legacy/global_whereis.c @@ -85,7 +85,16 @@ ETERM *erl_global_whereis(int fd, const char *name, char *node) opid = erl_decode((unsigned char*)buf); /* extract the nodename for the caller */ - if (node) strcpy(node,epid.node); + if (node) { + char* node_str = ERL_PID_NODE(opid); + if (node_str) { + strcpy(node, node_str); + } + else { + erl_free_term(opid); + return NULL; + } + } return opid; } diff --git a/lib/erl_interface/src/misc/ei_decode_term.c b/lib/erl_interface/src/misc/ei_decode_term.c index 6773f90bfc..65afee89cc 100644 --- a/lib/erl_interface/src/misc/ei_decode_term.c +++ b/lib/erl_interface/src/misc/ei_decode_term.c @@ -32,7 +32,7 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term) { const char* s = buf + *index, * s0 = s; - int len, i, n, sign; + int i, n, sign; char c; if (term == NULL) return -1; @@ -48,12 +48,13 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term) case NEW_FLOAT_EXT: return ei_decode_double(buf, index, &term->value.d_val); case ERL_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: case ERL_SMALL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: return ei_decode_atom(buf, index, term->value.atom_name); case ERL_REFERENCE_EXT: /* first the nodename */ - if (get_atom(&s, term->value.ref.node) < 0) return -1; + if (get_atom(&s, term->value.ref.node, &term->value.ref.node_org_enc) < 0) return -1; /* now the numbers: num (4), creation (1) */ term->value.ref.n[0] = get32be(s); term->value.ref.len = 1; @@ -63,7 +64,7 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term) /* first the integer count */ term->value.ref.len = get16be(s); /* then the nodename */ - if (get_atom(&s, term->value.ref.node) < 0) return -1; + if (get_atom(&s, term->value.ref.node, &term->value.ref.node_org_enc) < 0) return -1; /* creation */ term->value.ref.creation = get8(s) & 0x03; /* finally the id integers */ @@ -75,12 +76,12 @@ int ei_decode_ei_term(const char* buf, int* index, ei_term* term) } break; case ERL_PORT_EXT: - if (get_atom(&s, term->value.port.node) < 0) return -1; + if (get_atom(&s, term->value.port.node, &term->value.port.node_org_enc) < 0) return -1; term->value.port.id = get32be(s) & 0x0fffffff; /* 28 bits */; term->value.port.creation = get8(s) & 0x03; break; case ERL_PID_EXT: - if (get_atom(&s, term->value.pid.node) < 0) return -1; + if (get_atom(&s, term->value.pid.node, &term->value.port.node_org_enc) < 0) return -1; /* now the numbers: num (4), serial (4), creation (1) */ term->value.pid.num = get32be(s) & 0x7fff; /* 15 bits */ term->value.pid.serial = get32be(s) & 0x1fff; /* 13 bits */ diff --git a/lib/erl_interface/src/misc/ei_printterm.c b/lib/erl_interface/src/misc/ei_printterm.c index 620c6e72e2..91fe73e68c 100644 --- a/lib/erl_interface/src/misc/ei_printterm.c +++ b/lib/erl_interface/src/misc/ei_printterm.c @@ -132,9 +132,10 @@ static int print_term(FILE* fp, ei_x_buff* x, doquote = 0; ei_get_type_internal(buf, index, &ty, &n); switch (ty) { - case ERL_ATOM_EXT: + case ERL_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: case ERL_SMALL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: if (ei_decode_atom(buf, index, a) < 0) goto err; doquote = !islower((int)a[0]); diff --git a/lib/erl_interface/src/misc/ei_x_encode.c b/lib/erl_interface/src/misc/ei_x_encode.c index fa1e26ccbb..44dcff7664 100644 --- a/lib/erl_interface/src/misc/ei_x_encode.c +++ b/lib/erl_interface/src/misc/ei_x_encode.c @@ -197,18 +197,33 @@ int ei_x_encode_tuple_header(ei_x_buff* x, long n) int ei_x_encode_atom(ei_x_buff* x, const char* s) { - return ei_x_encode_atom_len(x, s, strlen(s)); + return ei_x_encode_atom_len_as(x, s, strlen(s), ERLANG_LATIN1, ERLANG_LATIN1); } int ei_x_encode_atom_len(ei_x_buff* x, const char* s, int len) +{ + return ei_x_encode_atom_len_as(x, s, len, ERLANG_LATIN1, ERLANG_LATIN1); +} + +int ei_x_encode_atom_as(ei_x_buff* x, const char* s, + enum erlang_char_encoding from_enc, + enum erlang_char_encoding to_enc) +{ + return ei_x_encode_atom_len_as(x, s, strlen(s), from_enc, to_enc); +} + +int ei_x_encode_atom_len_as(ei_x_buff* x, const char* s, int len, + enum erlang_char_encoding from_enc, + enum erlang_char_encoding to_enc) { int i = x->index; - ei_encode_atom_len(NULL, &i, s, len); + ei_encode_atom_len_as(NULL, &i, s, len, from_enc, to_enc); if (!x_fix_buff(x, i)) return -1; - return ei_encode_atom_len(x->buff, &x->index, s, len); + return ei_encode_atom_len_as(x->buff, &x->index, s, len, from_enc, to_enc); } + int ei_x_encode_pid(ei_x_buff* x, const erlang_pid* pid) { int i = x->index; diff --git a/lib/erl_interface/src/misc/get_type.c b/lib/erl_interface/src/misc/get_type.c index c9a040fbbd..54465196b0 100644 --- a/lib/erl_interface/src/misc/get_type.c +++ b/lib/erl_interface/src/misc/get_type.c @@ -33,80 +33,6 @@ int ei_get_type(const char *buf, const int *index, int *type, int *len) return ei_get_type_internal(buf, index, type, len); } -#if 0 -int ei_get_type(const char *buf, const int *index, int *type, int *len) -{ - const char *s = buf + *index; - int itype = get8(s); /* Internal type */ - - *len = 0; - - switch (*type) { - - case ERL_SMALL_INTEGER_EXT: - case ERL_INTEGER_EXT: - case ERL_SMALL_BIG_EXT: - case ERL_LARGE_BIG_EXT: - *type = EI_TYPE_INTEGER; - break; - - case ERL_FLOAT_EXT: - *type = EI_TYPE_FLOAT; - break; - - case ERL_SMALL_TUPLE_EXT: - case ERL_SMALL_ATOM_EXT: - *len = get8(s); - break; - - case ERL_ATOM_EXT: - case ERL_UNICODE_ATOM_EXT: - case ERL_STRING_EXT: - *len = get16be(s); - break; - - case ERL_LARGE_TUPLE_EXT: - case ERL_LIST_EXT: - case ERL_BINARY_EXT: - *len = get32be(s); - break; - - case ERL_SMALL_BIG_EXT: - *len = (get8(s)+1)/2; /* big arity */ - break; - - case ERL_LARGE_BIG_EXT: - *len = (get32be(s)+1)/2; /* big arity */ - break; - - case ERL_BINARY_EXT: - *type = EI_TYPE_BINARY; - break; - - case ERL_PID_EXT: - *type = EI_TYPE_PID; - break; - - case ERL_PORT_EXT: - *type = EI_TYPE_PORT; - break; - - case ERL_REFERENCE_EXT: - case ERL_NEW_REFERENCE_EXT: - *type = EI_TYPE_REF; - break; - - default: - break; - } - - /* leave index unchanged */ - return 0; -} -#endif - - -/* Old definition of function above */ int ei_get_type_internal(const char *buf, const int *index, int *type, int *len) @@ -117,12 +43,13 @@ int ei_get_type_internal(const char *buf, const int *index, switch (*type) { case ERL_SMALL_ATOM_EXT: + case ERL_SMALL_ATOM_UTF8_EXT: *type = ERL_ATOM_EXT; case ERL_SMALL_TUPLE_EXT: *len = get8(s); break; - case ERL_UNICODE_ATOM_EXT: + case ERL_ATOM_UTF8_EXT: *type = ERL_ATOM_EXT; case ERL_ATOM_EXT: case ERL_STRING_EXT: diff --git a/lib/erl_interface/src/misc/putget.h b/lib/erl_interface/src/misc/putget.h index 8b0d4d3404..77ae168f8c 100644 --- a/lib/erl_interface/src/misc/putget.h +++ b/lib/erl_interface/src/misc/putget.h @@ -105,8 +105,12 @@ ((EI_ULONGLONG)((unsigned char *)(s))[-2] << 8) | \ (EI_ULONGLONG)((unsigned char *)(s))[-1])) -int ei_internal_get_atom(const char** bufp, char* p); +int utf8_to_latin1(char* dst, const char* src, int slen, int destlen, enum erlang_char_encoding* res_encp); +int latin1_to_utf8(char* dst, const char* src, int slen, int destlen, enum erlang_char_encoding* res_encp); +int ei_internal_get_atom(const char** bufp, char* p, enum erlang_char_encoding*); +int ei_internal_put_atom(char** bufp, const char* p, int slen, enum erlang_char_encoding); #define get_atom ei_internal_get_atom +#define put_atom ei_internal_put_atom typedef union float_ext { double d; diff --git a/lib/erl_interface/src/misc/show_msg.c b/lib/erl_interface/src/misc/show_msg.c index 194296798b..ca46b15aff 100644 --- a/lib/erl_interface/src/misc/show_msg.c +++ b/lib/erl_interface/src/misc/show_msg.c @@ -132,13 +132,13 @@ int ei_show_sendmsg(FILE *stream, const char *header, const char *msgbuf) switch (msg.msgtype) { case ERL_SEND: - if (ei_decode_atom(header,&index,msg.cookie) + if (ei_decode_atom_as(header,&index,msg.cookie,sizeof(msg.cookie),ERLANG_UTF8,NULL,NULL) || ei_decode_pid(header,&index,&msg.to)) return -1; mbuf = msgbuf; break; case ERL_SEND_TT: - if (ei_decode_atom(header,&index,msg.cookie) + if (ei_decode_atom_as(header,&index,msg.cookie,sizeof(msg.cookie),ERLANG_UTF8,NULL,NULL) || ei_decode_pid(header,&index,&msg.to) || ei_decode_trace(header,&index,&msg.token)) return -1; mbuf = msgbuf; @@ -146,15 +146,15 @@ int ei_show_sendmsg(FILE *stream, const char *header, const char *msgbuf) case ERL_REG_SEND: if (ei_decode_pid(header,&index,&msg.from) - || ei_decode_atom(header,&index,msg.cookie) - || ei_decode_atom(header,&index,msg.toname)) return -1; + || ei_decode_atom_as(header,&index,msg.cookie,sizeof(msg.cookie),ERLANG_UTF8,NULL,NULL) + || ei_decode_atom_as(header,&index,msg.toname,sizeof(msg.toname),ERLANG_UTF8,NULL,NULL)) return -1; mbuf = msgbuf; break; case ERL_REG_SEND_TT: if (ei_decode_pid(header,&index,&msg.from) - || ei_decode_atom(header,&index,msg.cookie) - || ei_decode_atom(header,&index,msg.toname) + || ei_decode_atom_as(header,&index,msg.cookie,sizeof(msg.cookie),ERLANG_UTF8,NULL,NULL) + || ei_decode_atom_as(header,&index,msg.toname,sizeof(msg.toname),ERLANG_UTF8,NULL,NULL) || ei_decode_trace(header,&index,&msg.token)) return -1; mbuf = msgbuf; break; -- cgit v1.2.3