diff options
author | Sverker Eriksson <[email protected]> | 2013-01-11 17:27:29 +0100 |
---|---|---|
committer | Sverker Eriksson <[email protected]> | 2013-01-18 15:04:03 +0100 |
commit | b553664f54034e8c04ae6f9cc44f16b7f516518b (patch) | |
tree | 1522c655fea9aa52476e997aa26f1512d3ecada4 /lib/erl_interface/src/decode | |
parent | 97abb095cd2182d5c3fafd525da4943ef74dc8e5 (diff) | |
download | otp-b553664f54034e8c04ae6f9cc44f16b7f516518b.tar.gz otp-b553664f54034e8c04ae6f9cc44f16b7f516518b.tar.bz2 otp-b553664f54034e8c04ae6f9cc44f16b7f516518b.zip |
erl_interface: utf8 atoms continued
Diffstat (limited to 'lib/erl_interface/src/decode')
-rw-r--r-- | lib/erl_interface/src/decode/decode_atom.c | 185 | ||||
-rw-r--r-- | lib/erl_interface/src/decode/decode_boolean.c | 8 | ||||
-rw-r--r-- | lib/erl_interface/src/decode/decode_fun.c | 6 | ||||
-rw-r--r-- | lib/erl_interface/src/decode/decode_pid.c | 3 | ||||
-rw-r--r-- | lib/erl_interface/src/decode/decode_port.c | 3 | ||||
-rw-r--r-- | lib/erl_interface/src/decode/decode_ref.c | 6 |
6 files changed, 144 insertions, 67 deletions
diff --git a/lib/erl_interface/src/decode/decode_atom.c b/lib/erl_interface/src/decode/decode_atom.c index 84edf1766a..2ada418243 100644 --- a/lib/erl_interface/src/decode/decode_atom.c +++ b/lib/erl_interface/src/decode/decode_atom.c @@ -21,76 +21,155 @@ #include "eiext.h" #include "putget.h" -static int utf8_to_latin1(char* dest, const char* source, unsigned len); int ei_decode_atom(const char *buf, int *index, char *p) { - const char *s = buf + *index; - const char *s0 = s; - int len; - - switch (get8(s)) { - case ERL_ATOM_EXT: - len = get16be(s); - if (len > MAXATOMLEN) return -1; - if (p) { - memmove(p,s,len); - p[len] = (char)0; - } - break; - - case ERL_SMALL_ATOM_EXT: - len = get8(s); - if (p) { - memmove(p,s,len); - p[len] = (char)0; - } - break; - - case ERL_UNICODE_ATOM_EXT: - len = get16be(s); - - if (len > 2*MAXATOMLEN) return -1; - - if (p && utf8_to_latin1(p, s, len) < 0) return -1; - break; - - default: - return -1; - } - - s += len; - *index += s-s0; - return 0; + return ei_decode_atom_as(buf, index, p, MAXATOMLEN, ERLANG_LATIN1, NULL, NULL); } -int ei_internal_get_atom(const char** bufp, char* p) +int ei_decode_atom_as(const char *buf, int *index, char* p, int destlen, + enum erlang_char_encoding want_enc, + enum erlang_char_encoding* was_encp, + enum erlang_char_encoding* res_encp) { - int ix = 0; - if (ei_decode_atom(*bufp, &ix, p) < 0) return -1; - *bufp += ix; + const char *s = buf + *index; + const char *s0 = s; + int len; + enum erlang_char_encoding got_enc; + + switch (get8(s)) { + case ERL_ATOM_EXT: + len = get16be(s); + got_enc = ERLANG_LATIN1; + break; + case ERL_SMALL_ATOM_EXT: + len = get8(s); + got_enc = ERLANG_LATIN1; + break; + case ERL_ATOM_UTF8_EXT: + len = get16be(s); + got_enc = ERLANG_UTF8; + break; + case ERL_SMALL_ATOM_UTF8_EXT: + len = get8(s); + got_enc = ERLANG_UTF8; + break; + default: + return -1; + } + + if (want_enc == got_enc || want_enc == ERLANG_WHATEVER || want_enc == ERLANG_ASCII) { + int i, found_non_ascii = 0; + if (len >= destlen) + return -1; + for (i=0; i<len; i++) { + if (s[i] & 0x80) found_non_ascii = 1; + if (p) p[i] = s[i]; + } + if (p) p[len] = 0; + if (want_enc == ERLANG_ASCII && found_non_ascii) { + return -1; + } + if (res_encp) { + *res_encp = found_non_ascii ? got_enc : ERLANG_ASCII; + } + } + else { + int plen = (got_enc == ERLANG_LATIN1) ? + utf8_to_latin1(p, s, len, destlen-1, res_encp) : + latin1_to_utf8(p, s, len, destlen-1, res_encp); + if (plen < 0) return -1; + if (p) p[plen] = 0; + } + if (was_encp) { + *was_encp = got_enc; + } + + s += len; + *index += s-s0; return 0; -} +} + -static int utf8_to_latin1(char* dest, const char* source, unsigned slen) +int utf8_to_latin1(char* dst, const char* src, int slen, int destlen, + enum erlang_char_encoding* res_encp) { - const char* dest_end = dest + MAXATOMLEN - 1; + const char* const dst_start = dst; + const char* const dst_end = dst + destlen; + int found_non_ascii = 0; - while (slen > 0 && dest < dest_end) { - if ((source[0] & 0x80) == 0) { - *dest++ = *source++; + while (slen > 0) { + if (dst >= dst_end) return -1; + if ((src[0] & 0x80) == 0) { + if (dst_start) { + *dst = *src; + } + ++dst; + ++src; --slen; } else if (slen > 1 && - (source[0] & 0xFE) == 0xC2 && - (source[1] & 0xC0) == 0x80) { - *dest++ = (char) ((source[0] << 6) | (source[1] & 0x3F)); - source += 2; + (src[0] & 0xFE) == 0xC2 && + (src[1] & 0xC0) == 0x80) { + if (dst_start) { + *dst = (char) ((src[0] << 6) | (src[1] & 0x3F)); + } + ++dst; + src += 2; slen -= 2; + found_non_ascii = 1; } else return -1; } - *dest = 0; + if (res_encp) { + *res_encp = found_non_ascii ? ERLANG_LATIN1 : ERLANG_ASCII; + } + return dst - dst_start; +} + +int latin1_to_utf8(char* dst, const char* src, int slen, int destlen, + enum erlang_char_encoding* res_encp) +{ + const char* const src_end = src + slen; + const char* const dst_start = dst; + const char* const dst_end = dst + destlen; + int found_non_ascii = 0; + + while (src < src_end) { + if (dst >= dst_end) return -1; + if ((src[0] & 0x80) == 0) { + if (dst_start) { + *dst = *src; + } + ++dst; + } + else { + if (dst_start) { + unsigned char ch = *src; + dst[0] = 0xC0 | (ch >> 6); + dst[1] = 0x80 | (ch & 0x3F); + } + dst += 2; + found_non_ascii = 1; + } + ++src; + } + if (res_encp) { + *res_encp = found_non_ascii ? ERLANG_UTF8 : ERLANG_ASCII; + } + return dst - dst_start; +} + + + +int ei_internal_get_atom(const char** bufp, char* p, + enum erlang_char_encoding* was_encp) +{ + int ix = 0; + if (ei_decode_atom_as(*bufp, &ix, p, MAXATOMLEN_UTF8, ERLANG_UTF8, was_encp, NULL) < 0) + return -1; + *bufp += ix; return 0; } + diff --git a/lib/erl_interface/src/decode/decode_boolean.c b/lib/erl_interface/src/decode/decode_boolean.c index 0a7a06f1d4..f20690249b 100644 --- a/lib/erl_interface/src/decode/decode_boolean.c +++ b/lib/erl_interface/src/decode/decode_boolean.c @@ -24,12 +24,11 @@ /* c non-zero -> erlang "true" atom, otherwise "false" */ int ei_decode_boolean(const char *buf, int *index, int *p) { - const char *s = buf + *index; - const char *s0 = s; - char tbuf[MAXATOMLEN+1]; + char tbuf[6]; int t; - if (get_atom(&s, tbuf) < 0) return -1; + if (ei_decode_atom_as(buf, index, tbuf, sizeof(tbuf), ERLANG_ASCII, NULL, NULL) < 0) + return -1; if (memcmp(tbuf, "true", 5) == 0) t = 1; @@ -39,7 +38,6 @@ int ei_decode_boolean(const char *buf, int *index, int *p) return -1; if (p) *p = t; - *index += s-s0; return 0; } diff --git a/lib/erl_interface/src/decode/decode_fun.c b/lib/erl_interface/src/decode/decode_fun.c index 64fb9e86d8..7bbef5db44 100644 --- a/lib/erl_interface/src/decode/decode_fun.c +++ b/lib/erl_interface/src/decode/decode_fun.c @@ -42,7 +42,8 @@ int ei_decode_fun(const char *buf, int *index, erlang_fun *p) if (ei_decode_pid(s, &ix, (p == NULL ? (erlang_pid*)NULL : &p->pid)) < 0) return -1; /* then the module (atom) */ - if (ei_decode_atom(s, &ix, (p == NULL ? (char*)NULL : p->module)) < 0) + if (ei_decode_atom_as(s, &ix, (p == NULL ? (char*)NULL : p->module), + MAXATOMLEN_UTF8, ERLANG_UTF8, &p->module_org_enc, NULL) < 0) return -1; /* then the index */ if (ei_decode_long(s, &ix, (p == NULL ? (long*)NULL : &p->index)) < 0) @@ -84,7 +85,8 @@ int ei_decode_fun(const char *buf, int *index, erlang_fun *p) if (p != NULL) p->n_free_vars = i; /* then the module (atom) */ ix = 0; - if (ei_decode_atom(s, &ix, (p == NULL ? (char*)NULL : p->module)) < 0) + if (ei_decode_atom_as(s, &ix, (p == NULL ? (char*)NULL : p->module), + MAXATOMLEN_UTF8, ERLANG_UTF8, &p->module_org_enc, NULL) < 0) return -1; /* then the old_index */ if (ei_decode_long(s, &ix, (p == NULL ? (long*)NULL : &p->old_index)) < 0) diff --git a/lib/erl_interface/src/decode/decode_pid.c b/lib/erl_interface/src/decode/decode_pid.c index a762ae499e..e79952195d 100644 --- a/lib/erl_interface/src/decode/decode_pid.c +++ b/lib/erl_interface/src/decode/decode_pid.c @@ -26,12 +26,11 @@ int ei_decode_pid(const char *buf, int *index, erlang_pid *p) { const char *s = buf + *index; const char *s0 = s; - int len; if (get8(s) != ERL_PID_EXT) return -1; /* first the nodename */ - if (get_atom(&s, p->node) < 0) return -1; + if (get_atom(&s, p->node, &p->node_org_enc) < 0) return -1; /* now the numbers: num (4), serial (4), creation (1) */ if (p) { diff --git a/lib/erl_interface/src/decode/decode_port.c b/lib/erl_interface/src/decode/decode_port.c index 6eb2bc9197..5fd96b51a4 100644 --- a/lib/erl_interface/src/decode/decode_port.c +++ b/lib/erl_interface/src/decode/decode_port.c @@ -25,12 +25,11 @@ int ei_decode_port(const char *buf, int *index, erlang_port *p) { const char *s = buf + *index; const char *s0 = s; - int len; if (get8(s) != ERL_PORT_EXT) return -1; /* first the nodename */ - if (get_atom(&s, p->node) < 0) return -1; + if (get_atom(&s, p->node, &p->node_org_enc) < 0) return -1; /* now the numbers: num (4), creation (1) */ if (p) { diff --git a/lib/erl_interface/src/decode/decode_ref.c b/lib/erl_interface/src/decode/decode_ref.c index df3c30777b..7294e5d239 100644 --- a/lib/erl_interface/src/decode/decode_ref.c +++ b/lib/erl_interface/src/decode/decode_ref.c @@ -26,13 +26,13 @@ int ei_decode_ref(const char *buf, int *index, erlang_ref *p) { const char *s = buf + *index; const char *s0 = s; - int count, len, i; + int count, i; switch (get8(s)) { case ERL_REFERENCE_EXT: /* nodename */ - if (get_atom(&s, p->node) < 0) return -1; + if (get_atom(&s, p->node, &p->node_org_enc) < 0) return -1; /* now the numbers: num (4), creation (1) */ if (p) { @@ -53,7 +53,7 @@ int ei_decode_ref(const char *buf, int *index, erlang_ref *p) if (p) p->len = count; /* then the nodename */ - if (get_atom(&s, p->node) < 0) return -1; + if (get_atom(&s, p->node, &p->node_org_enc) < 0) return -1; /* creation */ if (p) { |