diff options
author | Sverker Eriksson <[email protected]> | 2013-01-23 18:09:35 +0100 |
---|---|---|
committer | Sverker Eriksson <[email protected]> | 2013-01-23 18:09:35 +0100 |
commit | b8e623410d1c22fe6d5fdeb8ccb0b2305533f033 (patch) | |
tree | 708d64e36e18b61ae1801c02ec3aeef42a697be3 /lib/erl_interface/src/encode/encode_atom.c | |
parent | e99df74bee7c245ec76678e336fcd09d4b51a089 (diff) | |
parent | d6e3e256b850050b7a86323b2948009d5fcc30a9 (diff) | |
download | otp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.tar.gz otp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.tar.bz2 otp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.zip |
Merge branch 'sverk/r16/utf8-atoms'
* sverk/r16/utf8-atoms:
erl_interface: Fix bug when transcoding atoms from and to UTF8
erl_interface: Changed erlang_char_encoding interface
erts: Testcase doing unicode atom printout with ~w
erl_interface: even more utf8 atom stuff
erts: Fix bug in analyze_utf8 causing faulty latin1 detection
Add UTF-8 node name support for epmd
workaround...
Fix merge conflict with hasse
UTF-8 atom documentation
test case
erl_interface: utf8 atoms continued
Add utf8 atom distribution test cases
atom fixes for NIFs and atom_to_binary
UTF-8 support for distribution
Implement UTF-8 atom support for jinterface
erl_interface: Enable decode of unicode atoms
stdlib: Fix printing of unicode atoms
erts: Change internal representation of atoms to utf8
erts: Refactor rename DFLAG(S)_INTERNAL_TAGS for conformity
Conflicts:
erts/emulator/beam/io.c
OTP-10753
Diffstat (limited to 'lib/erl_interface/src/encode/encode_atom.c')
-rw-r--r-- | lib/erl_interface/src/encode/encode_atom.c | 154 |
1 files changed, 144 insertions, 10 deletions
```diff
diff --git a/lib/erl_interface/src/encode/encode_atom.c b/lib/erl_interface/src/encode/encode_atom.c
index 6f41f045e0..044f17cb60 100644
--- a/lib/erl_interface/src/encode/encode_atom.c
+++ b/lib/erl_interface/src/encode/encode_atom.c
@@ -22,29 +22,108 @@
 #include "eiext.h"
 #include "putget.h"
 
+
+static int copy_ascii_atom(char* dst, const char* src, int slen);
+static int copy_utf8_atom(char* dst, const char* src, int slen);
+
+
 int ei_encode_atom(char *buf, int *index, const char *p)
 {
     size_t len = strlen(p);
 
-    if (len >= INT_MAX) return -1;
-    return ei_encode_atom_len(buf, index, p, len);
+    if (len >= MAXATOMLEN)
+        len = MAXATOMLEN - 1;
+    return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, ERLANG_LATIN1);
 }
 
 int ei_encode_atom_len(char *buf, int *index, const char *p, int len)
 {
+    /* This function is documented to truncate at MAXATOMLEN (256) */
+    if (len >= MAXATOMLEN)
+        len = MAXATOMLEN - 1;
+    return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, ERLANG_LATIN1);
+}
+
+int ei_encode_atom_as(char *buf, int *index, const char *p,
+                      enum erlang_char_encoding from_enc,
+                      enum erlang_char_encoding to_enc)
+{
+    return ei_encode_atom_len_as(buf, index, p, strlen(p), from_enc, to_enc);
+}
+
+int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len,
+                          enum erlang_char_encoding from_enc,
+                          enum erlang_char_encoding to_enc)
+{
   char *s = buf + *index;
   char *s0 = s;
+  int offs;
 
-  /* This function is documented to truncate at MAXATOMLEN (256) */
-  if (len > MAXATOMLEN)
-    len = MAXATOMLEN;
+  if (len >= MAXATOMLEN && (from_enc & (ERLANG_LATIN1|ERLANG_ASCII))) {
+      return -1;
+  }
 
-  if (!buf) s += 3;
-  else {
-    put8(s,ERL_ATOM_EXT);
-    put16be(s,len);
+  switch(to_enc) {
+  case ERLANG_LATIN1:
+      if (buf) {
+          put8(s,ERL_ATOM_EXT);
+          switch (from_enc) {
+          case ERLANG_UTF8:
+              len = utf8_to_latin1(s+2, p, len, MAXATOMLEN-1, NULL);
+              if (len < 0) return -1;
+              break;
+          case ERLANG_ASCII:
+              if (copy_ascii_atom(s+2, p, len) < 0) return -1;
+              break;
+          case ERLANG_LATIN1:
+              memcpy(s+2, p, len);
+              break;
+          default:
+              return -1;
+          }
+          put16be(s,len);
+      }
+      else {
+          s += 3;
+          if (from_enc == ERLANG_UTF8) {
+              len = utf8_to_latin1(NULL, p, len, MAXATOMLEN-1, NULL);
+              if (len < 0) return -1;
+          }
+      }
+      break;
+
+  case ERLANG_UTF8:
+      offs = 1 + 1;
+      switch (from_enc) {
+      case ERLANG_LATIN1:
+          if (len >= 256/2) offs++;
+          len = latin1_to_utf8((buf ? s+offs : NULL), p, len, MAXATOMLEN_UTF8-1, NULL);
+          break;
+      case ERLANG_ASCII:
+          if (buf && copy_ascii_atom(s+offs, p, len) < 0) return -1;
+          break;
+      case ERLANG_UTF8:
+          if (len >= 256) offs++;
+          if (buf && copy_utf8_atom(s+offs, p, len) < 0) return -1;
+          break;
+      default:
+          return -1;
+      }
+      if (buf) {
+          if (offs == 2) {
+              put8(s, ERL_SMALL_ATOM_UTF8_EXT);
+              put8(s, len);
+          }
+          else {
+              put8(s, ERL_ATOM_UTF8_EXT);
+              put16be(s, len);
+          }
+      }
+      else s+= offs;
+      break;
 
-    memmove(s,p,len); /* unterminated string */
+  default:
+      return -1;
   }
   s += len;
 
@@ -53,3 +132,58 @@ int ei_encode_atom_len(char *buf, int *index, const char *p, int len)
   return 0;
 }
 
+int
+ei_internal_put_atom(char** bufp, const char* p, int slen,
+                     enum erlang_char_encoding to_enc)
+{
+    int ix = 0;
+    if (ei_encode_atom_len_as(*bufp, &ix, p, slen, ERLANG_UTF8, to_enc) < 0)
+        return -1;
+    *bufp += ix;
+    return 0;
+}
+
+
+int copy_ascii_atom(char* dst, const char* src, int slen)
+{
+    while (slen > 0) {
+        if ((src[0] & 0x80) != 0) return -1;
+        *dst++ = *src++;
+        slen--;
+    }
+    return 0;
+}
+
+int copy_utf8_atom(char* dst, const char* src, int slen)
+{
+    int num_chars = 0;
+
+    while (slen > 0) {
+        if (++num_chars >= MAXATOMLEN) return -1;
+        if ((src[0] & 0x80) != 0) {
+            if ((src[0] & 0xE0) == 0xC0) {
+                if (slen < 2 || (src[1] & 0xC0) != 0x80) return -1;
+                *dst++ = *src++;
+                slen--;
+            }
+            else if ((src[0] & 0xF0) == 0xE0) {
+                if (slen < 3 || (src[1] & 0xC0) != 0x80 || (src[2] & 0xC0) != 0x80) return -1;
+                *dst++ = *src++;
+                *dst++ = *src++;
+                slen -= 2;
+            }
+            else if ((src[0] & 0xF8) == 0xF0) {
+                if (slen < 4 || (src[1] & 0xC0) != 0x80 || (src[2] & 0xC0) != 0x80 || (src[3] & 0xC0) != 0x80) return -1;
+                *dst++ = *src++;
+                *dst++ = *src++;
+                *dst++ = *src++;
+                slen -= 3;
+            }
+            else return -1;
+        }
+        *dst++ = *src++;
+        slen--;
+    }
+    return 0;
+}
+
```