/* * %CopyrightBegin% * * Copyright Ericsson AB 1998-2016. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * %CopyrightEnd% */ #include #include #include "eidef.h" #include "eiext.h" #include "putget.h" static int verify_ascii_atom(const char* src, int slen); static int verify_utf8_atom(const char* src, int slen); static int is_latin1_as_utf8(const char *p, int len); int ei_encode_atom(char *buf, int *index, const char *p) { size_t len = strlen(p); if (len >= MAXATOMLEN) len = MAXATOMLEN - 1; return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, ERLANG_LATIN1); } int ei_encode_atom_len(char *buf, int *index, const char *p, int len) { /* This function is documented to truncate at MAXATOMLEN (256) */ if (len >= MAXATOMLEN) len = MAXATOMLEN - 1; return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, ERLANG_LATIN1); } int ei_encode_atom_as(char *buf, int *index, const char *p, erlang_char_encoding from_enc, erlang_char_encoding to_enc) { return ei_encode_atom_len_as(buf, index, p, strlen(p), from_enc, to_enc); } int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len, erlang_char_encoding from_enc, erlang_char_encoding to_enc) { char *s = buf + *index; char *s0 = s; int offs; if (len >= MAXATOMLEN && (from_enc & (ERLANG_LATIN1|ERLANG_ASCII))) { return -1; } if (to_enc == (ERLANG_LATIN1 | ERLANG_UTF8)) { if (from_enc == ERLANG_UTF8) { to_enc = is_latin1_as_utf8(p, len) ? ERLANG_LATIN1 : ERLANG_UTF8; } else { to_enc = from_enc; } } switch(to_enc) { case ERLANG_LATIN1: if (buf) { put8(s,ERL_ATOM_EXT); switch (from_enc) { case ERLANG_UTF8: len = utf8_to_latin1(s+2, p, len, MAXATOMLEN-1, NULL); if (len < 0) return -1; break; case ERLANG_ASCII: if (verify_ascii_atom(p, len) < 0) return -1; memcpy(s+2, p, len); break; case ERLANG_LATIN1: memcpy(s+2, p, len); break; default: return -1; } put16be(s,len); } else { s += 3; if (from_enc == ERLANG_UTF8) { len = utf8_to_latin1(NULL, p, len, MAXATOMLEN-1, NULL); if (len < 0) return -1; } else if (from_enc == ERLANG_ASCII) if (verify_ascii_atom(p, len) < 0) return -1; } break; case ERLANG_UTF8: offs = 1 + 1; switch (from_enc) { case ERLANG_LATIN1: if (len >= 256/2) offs++; len = latin1_to_utf8((buf ? s+offs : NULL), p, len, MAXATOMLEN_UTF8-1, NULL); break; case ERLANG_ASCII: if (verify_ascii_atom(p, len) < 0) return -1; if (buf) memcpy(s+offs,p,len); break; case ERLANG_UTF8: if (len >= 256) offs++; if (verify_utf8_atom(p, len) < 0) return -1; if (buf) memcpy(s+offs,p,len); break; default: return -1; } if (buf) { if (offs == 2) { put8(s, ERL_SMALL_ATOM_UTF8_EXT); put8(s, len); } else { put8(s, ERL_ATOM_UTF8_EXT); put16be(s, len); } } else s+= offs; break; default: return -1; } s += len; *index += s-s0; return 0; } int ei_internal_put_atom(char** bufp, const char* p, int slen, erlang_char_encoding to_enc) { int ix = 0; if (ei_encode_atom_len_as(*bufp, &ix, p, slen, ERLANG_UTF8, to_enc) < 0) return -1; *bufp += ix; return 0; } static int verify_ascii_atom(const char* src, int slen) { while (slen > 0) { if ((src[0] & 0x80) != 0) return -1; src++; slen--; } return 0; } static int verify_utf8_atom(const char* src, int slen) { int num_chars = 0; while (slen > 0) { if (++num_chars >= MAXATOMLEN) return -1; if ((src[0] & 0x80) != 0) { if ((src[0] & 0xE0) == 0xC0) { if (slen < 2 || (src[1] & 0xC0) != 0x80) return -1; src++; slen--; } else if ((src[0] & 0xF0) == 0xE0) { if (slen < 3 || (src[1] & 0xC0) != 0x80 || (src[2] & 0xC0) != 0x80) return -1; src += 2; slen -= 2; } else if ((src[0] & 0xF8) == 0xF0) { if (slen < 4 || (src[1] & 0xC0) != 0x80 || (src[2] & 0xC0) != 0x80 || (src[3] & 0xC0) != 0x80) return -1; src += 3; slen -= 3; } else return -1; } src++; slen--; } return 0; } /* Only latin1 code points in utf8 string? */ static int is_latin1_as_utf8(const char *p, int len) { int i; for (i=0; i 0xC3) return 0; } return 1; }