From b553664f54034e8c04ae6f9cc44f16b7f516518b Mon Sep 17 00:00:00 2001 From: Sverker Eriksson Date: Fri, 11 Jan 2013 17:27:29 +0100 Subject: erl_interface: utf8 atoms continued --- lib/erl_interface/src/decode/decode_atom.c | 185 ++++++++++++++++++++--------- 1 file changed, 132 insertions(+), 53 deletions(-) (limited to 'lib/erl_interface/src/decode/decode_atom.c') diff --git a/lib/erl_interface/src/decode/decode_atom.c b/lib/erl_interface/src/decode/decode_atom.c index 84edf1766a..2ada418243 100644 --- a/lib/erl_interface/src/decode/decode_atom.c +++ b/lib/erl_interface/src/decode/decode_atom.c @@ -21,76 +21,155 @@ #include "eiext.h" #include "putget.h" -static int utf8_to_latin1(char* dest, const char* source, unsigned len); int ei_decode_atom(const char *buf, int *index, char *p) { - const char *s = buf + *index; - const char *s0 = s; - int len; - - switch (get8(s)) { - case ERL_ATOM_EXT: - len = get16be(s); - if (len > MAXATOMLEN) return -1; - if (p) { - memmove(p,s,len); - p[len] = (char)0; - } - break; - - case ERL_SMALL_ATOM_EXT: - len = get8(s); - if (p) { - memmove(p,s,len); - p[len] = (char)0; - } - break; - - case ERL_UNICODE_ATOM_EXT: - len = get16be(s); - - if (len > 2*MAXATOMLEN) return -1; - - if (p && utf8_to_latin1(p, s, len) < 0) return -1; - break; - - default: - return -1; - } - - s += len; - *index += s-s0; - return 0; + return ei_decode_atom_as(buf, index, p, MAXATOMLEN, ERLANG_LATIN1, NULL, NULL); } -int ei_internal_get_atom(const char** bufp, char* p) +int ei_decode_atom_as(const char *buf, int *index, char* p, int destlen, + enum erlang_char_encoding want_enc, + enum erlang_char_encoding* was_encp, + enum erlang_char_encoding* res_encp) { - int ix = 0; - if (ei_decode_atom(*bufp, &ix, p) < 0) return -1; - *bufp += ix; + const char *s = buf + *index; + const char *s0 = s; + int len; + enum erlang_char_encoding got_enc; + + switch (get8(s)) { + case ERL_ATOM_EXT: + len = get16be(s); + got_enc = ERLANG_LATIN1; + break; + case ERL_SMALL_ATOM_EXT: + len = get8(s); + got_enc = ERLANG_LATIN1; + break; + case ERL_ATOM_UTF8_EXT: + len = get16be(s); + got_enc = ERLANG_UTF8; + break; + case ERL_SMALL_ATOM_UTF8_EXT: + len = get8(s); + got_enc = ERLANG_UTF8; + break; + default: + return -1; + } + + if (want_enc == got_enc || want_enc == ERLANG_WHATEVER || want_enc == ERLANG_ASCII) { + int i, found_non_ascii = 0; + if (len >= destlen) + return -1; + for (i=0; i 0 && dest < dest_end) { - if ((source[0] & 0x80) == 0) { - *dest++ = *source++; + while (slen > 0) { + if (dst >= dst_end) return -1; + if ((src[0] & 0x80) == 0) { + if (dst_start) { + *dst = *src; + } + ++dst; + ++src; --slen; } else if (slen > 1 && - (source[0] & 0xFE) == 0xC2 && - (source[1] & 0xC0) == 0x80) { - *dest++ = (char) ((source[0] << 6) | (source[1] & 0x3F)); - source += 2; + (src[0] & 0xFE) == 0xC2 && + (src[1] & 0xC0) == 0x80) { + if (dst_start) { + *dst = (char) ((src[0] << 6) | (src[1] & 0x3F)); + } + ++dst; + src += 2; slen -= 2; + found_non_ascii = 1; } else return -1; } - *dest = 0; + if (res_encp) { + *res_encp = found_non_ascii ? ERLANG_LATIN1 : ERLANG_ASCII; + } + return dst - dst_start; +} + +int latin1_to_utf8(char* dst, const char* src, int slen, int destlen, + enum erlang_char_encoding* res_encp) +{ + const char* const src_end = src + slen; + const char* const dst_start = dst; + const char* const dst_end = dst + destlen; + int found_non_ascii = 0; + + while (src < src_end) { + if (dst >= dst_end) return -1; + if ((src[0] & 0x80) == 0) { + if (dst_start) { + *dst = *src; + } + ++dst; + } + else { + if (dst_start) { + unsigned char ch = *src; + dst[0] = 0xC0 | (ch >> 6); + dst[1] = 0x80 | (ch & 0x3F); + } + dst += 2; + found_non_ascii = 1; + } + ++src; + } + if (res_encp) { + *res_encp = found_non_ascii ? ERLANG_UTF8 : ERLANG_ASCII; + } + return dst - dst_start; +} + + + +int ei_internal_get_atom(const char** bufp, char* p, + enum erlang_char_encoding* was_encp) +{ + int ix = 0; + if (ei_decode_atom_as(*bufp, &ix, p, MAXATOMLEN_UTF8, ERLANG_UTF8, was_encp, NULL) < 0) + return -1; + *bufp += ix; return 0; } + -- cgit v1.2.3