diff options
Diffstat (limited to 'lib/erl_interface/src/decode/decode_atom.c')
-rw-r--r-- | lib/erl_interface/src/decode/decode_atom.c | 182 |
1 files changed, 157 insertions, 25 deletions
diff --git a/lib/erl_interface/src/decode/decode_atom.c b/lib/erl_interface/src/decode/decode_atom.c index c2e6a0426e..b3bba82434 100644 --- a/lib/erl_interface/src/decode/decode_atom.c +++ b/lib/erl_interface/src/decode/decode_atom.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 1998-2011. All Rights Reserved. + * Copyright Ericsson AB 1998-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -21,24 +22,155 @@ #include "eiext.h" #include "putget.h" + int ei_decode_atom(const char *buf, int *index, char *p) { - const char *s = buf + *index; - const char *s0 = s; - int len; + return ei_decode_atom_as(buf, index, p, MAXATOMLEN, ERLANG_LATIN1, NULL, NULL); +} + +int ei_decode_atom_as(const char *buf, int *index, char* p, int destlen, + erlang_char_encoding want_enc, + erlang_char_encoding* was_encp, + erlang_char_encoding* res_encp) +{ + const char *s = buf + *index; + const char *s0 = s; + int len; + erlang_char_encoding got_enc; + + switch (get8(s)) { + case ERL_ATOM_EXT: + len = get16be(s); + got_enc = ERLANG_LATIN1; + break; + case ERL_SMALL_ATOM_EXT: + len = get8(s); + got_enc = ERLANG_LATIN1; + break; + case ERL_ATOM_UTF8_EXT: + len = get16be(s); + got_enc = ERLANG_UTF8; + break; + case ERL_SMALL_ATOM_UTF8_EXT: + len = get8(s); + got_enc = ERLANG_UTF8; + break; + default: + return -1; + } + + if ((want_enc & got_enc) || want_enc == ERLANG_ASCII) { + int i, found_non_ascii = 0; + if (len >= destlen) + return -1; + for (i=0; i<len; i++) { + if (s[i] & 0x80) found_non_ascii = 1; + if (p) p[i] = s[i]; + } + if (p) p[len] = 0; + if (want_enc == ERLANG_ASCII && found_non_ascii) { + return -1; + } + if (res_encp) { + *res_encp = found_non_ascii ? got_enc : ERLANG_ASCII; + } + } + else { + int plen = (got_enc == ERLANG_LATIN1) ? + latin1_to_utf8(p, s, len, destlen-1, res_encp) : + utf8_to_latin1(p, s, len, destlen-1, res_encp); + if (plen < 0) return -1; + if (p) p[plen] = 0; + } + if (was_encp) { + *was_encp = got_enc; + } + + s += len; + *index += s-s0; + return 0; +} + + +int utf8_to_latin1(char* dst, const char* src, int slen, int destlen, + erlang_char_encoding* res_encp) +{ + const char* const dst_start = dst; + const char* const dst_end = dst + destlen; + int found_non_ascii = 0; - if (get8(s) != ERL_ATOM_EXT) return -1; + while (slen > 0) { + if (dst >= dst_end) return -1; + if ((src[0] & 0x80) == 0) { + if (dst_start) { + *dst = *src; + } + ++dst; + ++src; + --slen; + } + else if (slen > 1 && + (src[0] & 0xFE) == 0xC2 && + (src[1] & 0xC0) == 0x80) { + if (dst_start) { + *dst = (char) ((src[0] << 6) | (src[1] & 0x3F)); + } + ++dst; + src += 2; + slen -= 2; + found_non_ascii = 1; + } + else return -1; + } + if (res_encp) { + *res_encp = found_non_ascii ? ERLANG_LATIN1 : ERLANG_ASCII; + } + return dst - dst_start; +} + +int latin1_to_utf8(char* dst, const char* src, int slen, int destlen, + erlang_char_encoding* res_encp) +{ + const char* const src_end = src + slen; + const char* const dst_start = dst; + const char* const dst_end = dst + destlen; + int found_non_ascii = 0; + + while (src < src_end) { + if (dst >= dst_end) return -1; + if ((src[0] & 0x80) == 0) { + if (dst_start) { + *dst = *src; + } + ++dst; + } + else { + if (dst_start) { + unsigned char ch = *src; + dst[0] = 0xC0 | (ch >> 6); + dst[1] = 0x80 | (ch & 0x3F); + } + dst += 2; + found_non_ascii = 1; + } + ++src; + } + if (res_encp) { + *res_encp = found_non_ascii ? ERLANG_UTF8 : ERLANG_ASCII; + } + return dst - dst_start; +} - len = get16be(s); - if (len > MAXATOMLEN) return -1; - if (p) { - memmove(p,s,len); - p[len] = (char)0; - } - s += len; - *index += s-s0; - - return 0; +int ei_internal_get_atom(const char** bufp, char* p, + erlang_char_encoding* was_encp) +{ + int ix = 0; + if (ei_decode_atom_as(*bufp, &ix, p, MAXATOMLEN_UTF8, ERLANG_UTF8, was_encp, NULL) < 0) + return -1; + *bufp += ix; + return 0; } + + |