aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorSverker Eriksson <[email protected]>2013-01-22 19:25:36 +0100
committerSverker Eriksson <[email protected]>2013-01-22 19:34:03 +0100
commit1f4765cca4874fa92fcfad888fbe6d5f2fbf74d1 (patch)
treebf152c1b3dbf855dfc5a8724c3e043e161a971b6 /lib
parent8eb544073fe243a8935a54f83f9c9f1f7478e3c5 (diff)
downloadotp-1f4765cca4874fa92fcfad888fbe6d5f2fbf74d1.tar.gz
otp-1f4765cca4874fa92fcfad888fbe6d5f2fbf74d1.tar.bz2
otp-1f4765cca4874fa92fcfad888fbe6d5f2fbf74d1.zip
erl_interface: even more utf8 atom stuff
Diffstat (limited to 'lib')
-rw-r--r--lib/erl_interface/doc/src/ei.xml12
-rw-r--r--lib/erl_interface/src/connect/ei_connect.c2
-rw-r--r--lib/erl_interface/src/encode/encode_atom.c59
-rw-r--r--lib/erl_interface/src/legacy/erl_connect.c2
-rw-r--r--lib/erl_interface/src/misc/ei_format.c4
-rw-r--r--lib/erl_interface/src/misc/ei_printterm.c2
-rw-r--r--lib/erl_interface/src/misc/show_msg.c2
-rw-r--r--lib/erl_interface/src/prog/ei_fake_prog.c6
-rw-r--r--lib/erl_interface/test/ei_decode_encode_SUITE.erl63
-rw-r--r--lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c16
10 files changed, 122 insertions, 46 deletions
diff --git a/lib/erl_interface/doc/src/ei.xml b/lib/erl_interface/doc/src/ei.xml
index 0b0b1eeb79..e9c7c644b5 100644
--- a/lib/erl_interface/doc/src/ei.xml
+++ b/lib/erl_interface/doc/src/ei.xml
@@ -94,7 +94,11 @@ enum erlang_char_encoding {
ERLANG_ASCII, ERLANG_LATIN1, ERLANG_UTF8, ERLANG_WHATEVER
};
</code>
- <p>The character encoding used for atoms.</p>
+ <p>The character encoding used for atoms. <c>ERLANG_ASCII</c> represents 7-bit ASCII.
+ Latin1 and UTF8 are different extensions of 7-bit ASCII. All 7-bit ASCII characters
+ are valid Latin1 and UTF8 characters. ASCII and Latin1 both represent each character
+ by one byte. A UTF8 character can consist of one to four bytes. <c>ERLANG_WHATEVER</c>
+ is not an encoding but is rather used as a wildcard.</p>
</item>
</taglist>
</section>
@@ -256,11 +260,11 @@ enum erlang_char_encoding {
<p>Encodes an atom in the binary format with character encoding
<c><seealso marker="#erlang_char_encoding">to_enc</seealso></c> (latin1 or utf8).
The <c>p</c> parameter is the name of the atom with character encoding
- <c><seealso marker="#erlang_char_encoding">from_enc</seealso></c>.
+ <c><seealso marker="#erlang_char_encoding">from_enc</seealso></c> (ascii, latin1 or utf8).
The name must either be zero-terminated or a function variant with a <c>len</c>
parameter must be used.</p>
- <p>The encoding will fail if the atom is too long or if it can not be represented
- with character encoding <c>to_enc</c>.</p>
+ <p>The encoding will fail if <c>p</c> is not a valid string in encoding <c>from_enc</c>,
+ if the string is too long or if it cannot be represented with character encoding <c>to_enc</c>.</p>
<p>These functions were introduced in R16 release of Erlang/OTP as part of a first step
to support UTF8 atoms. Atoms encoded with <c>ERLANG_UTF8</c>
can not be decoded by earlier releases than R16.</p>
diff --git a/lib/erl_interface/src/connect/ei_connect.c b/lib/erl_interface/src/connect/ei_connect.c
index a17257795e..4421bbb7fe 100644
--- a/lib/erl_interface/src/connect/ei_connect.c
+++ b/lib/erl_interface/src/connect/ei_connect.c
@@ -1071,7 +1071,7 @@ int ei_rpc(ei_cnode* ec, int fd, char *mod, char *fun,
int i, index;
ei_term t;
erlang_msg msg;
- char rex[MAXATOMLEN+1];
+ char rex[MAXATOMLEN];
if (ei_rpc_to(ec, fd, mod, fun, inbuf, inbuflen) < 0) {
return -1;
diff --git a/lib/erl_interface/src/encode/encode_atom.c b/lib/erl_interface/src/encode/encode_atom.c
index a3d7c4c759..8bbe962396 100644
--- a/lib/erl_interface/src/encode/encode_atom.c
+++ b/lib/erl_interface/src/encode/encode_atom.c
@@ -22,6 +22,11 @@
#include "eiext.h"
#include "putget.h"
+
+static int copy_ascii_atom(char* dst, const char* src, int slen);
+static int copy_utf8_atom(char* dst, const char* src, int slen);
+
+
int ei_encode_atom(char *buf, int *index, const char *p)
{
size_t len = strlen(p);
@@ -54,7 +59,8 @@ int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len,
char *s0 = s;
int offs;
- if (from_enc == ERLANG_LATIN1 && len >= MAXATOMLEN) {
+ if (len >= MAXATOMLEN && (from_enc == ERLANG_LATIN1 ||
+ from_enc == ERLANG_ASCII)) {
return -1;
}
@@ -68,6 +74,8 @@ int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len,
if (len < 0) return -1;
break;
case ERLANG_ASCII:
+ if (copy_ascii_atom(s+2, p, len) < 0) return -1;
+ break;
case ERLANG_LATIN1:
memcpy(s+2, p, len);
break;
@@ -93,9 +101,11 @@ int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len,
len = latin1_to_utf8((buf ? s+offs : NULL), p, len, MAXATOMLEN_UTF8-1, NULL);
break;
case ERLANG_ASCII:
+ if (buf && copy_ascii_atom(s+offs, p, len) < 0) return -1;
+ break;
case ERLANG_UTF8:
if (len >= 256) offs++;
- if (buf) memcpy(s+offs, p, len);
+ if (buf && copy_utf8_atom(s+offs, p, len) < 0) return -1;
break;
default:
return -1;
@@ -133,3 +143,48 @@ ei_internal_put_atom(char** bufp, const char* p, int slen,
*bufp += ix;
return 0;
}
+/* Copy slen bytes from src to dst, accepting only bytes whose top bit is clear (7-bit ASCII). */
+/* Returns 0 when every byte was copied, or -1 at the first byte that has bit 7 set. */
+int copy_ascii_atom(char* dst, const char* src, int slen)
+{
+ while (slen > 0) {
+ if ((src[0] & 0x80) != 0) return -1; /* not ASCII; the bytes before this one are already in dst */
+ *dst++ = *src++;
+ slen--;
+ }
+ return 0;
+}
+/* Copy slen bytes of UTF-8 text from src to dst while checking its byte structure; 0 on success, -1 on failure. */
+int copy_utf8_atom(char* dst, const char* src, int slen)
+{ /* NOTE(review): overlong forms, surrogates and code points above U+10FFFF pass these bit-pattern checks. */
+ int num_chars = 0; /* characters seen so far, counted once per sequence rather than per byte */
+/* One character per pass: multi-byte branches copy all bytes but the last, the shared tail copies the final one. */
+ while (slen > 0) {
+ if (++num_chars >= MAXATOMLEN) return -1; /* fail once the character count reaches MAXATOMLEN */
+ if ((src[0] & 0x80) != 0) { /* high bit set: start of a multi-byte sequence (or an invalid byte) */
+ if ((src[0] & 0xE0) == 0xC0) { /* lead byte 110xxxxx: two-byte sequence */
+ if (slen < 2 || (src[1] & 0xC0) != 0x80) return -1; /* truncated, or next byte is not 10xxxxxx */
+ *dst++ = *src++;
+ slen--;
+ }
+ else if ((src[0] & 0xF0) == 0xE0) { /* lead byte 1110xxxx: three-byte sequence */
+ if (slen < 3 || (src[1] & 0xC0) != 0x80 || (src[2] & 0xC0) != 0x80) return -1; /* truncated or bad continuation byte */
+ *dst++ = *src++;
+ *dst++ = *src++;
+ slen -= 2;
+ }
+ else if ((src[0] & 0xF8) == 0xF0) { /* lead byte 11110xxx: four-byte sequence */
+ if (slen < 4 || (src[1] & 0xC0) != 0x80 || (src[2] & 0xC0) != 0x80 || (src[3] & 0xC0) != 0x80) return -1; /* truncated or bad continuation byte */
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ slen -= 3;
+ }
+ else return -1; /* stray continuation byte (10xxxxxx) or lead byte 11111xxx */
+ }
+ *dst++ = *src++; /* the single ASCII byte, or the last byte of a multi-byte sequence */
+ slen--;
+ }
+ return 0; /* on a -1 return above, dst may already hold the characters copied before the error */
+}
+
diff --git a/lib/erl_interface/src/legacy/erl_connect.c b/lib/erl_interface/src/legacy/erl_connect.c
index be83fa8469..f82704ea8b 100644
--- a/lib/erl_interface/src/legacy/erl_connect.c
+++ b/lib/erl_interface/src/legacy/erl_connect.c
@@ -125,7 +125,7 @@ static ei_cnode erl_if_ec;
int erl_connect_init(int this_node_number, char *cookie, short creation)
{
- char nn[MAXATOMLEN+1];
+ char nn[MAXATOMLEN];
sprintf(nn, "c%d", this_node_number);
diff --git a/lib/erl_interface/src/misc/ei_format.c b/lib/erl_interface/src/misc/ei_format.c
index 281a192535..b5f11e618e 100644
--- a/lib/erl_interface/src/misc/ei_format.c
+++ b/lib/erl_interface/src/misc/ei_format.c
@@ -139,8 +139,8 @@ static int patom(const char** fmt, ei_x_buff* x)
--(*fmt);
len = *fmt - start;
/* FIXME why truncate atom name and not fail?! */
- if (len > MAXATOMLEN)
- len = MAXATOMLEN;
+ if (len >= MAXATOMLEN)
+ len = MAXATOMLEN-1;
return ei_x_encode_atom_len(x, start, len);
}
diff --git a/lib/erl_interface/src/misc/ei_printterm.c b/lib/erl_interface/src/misc/ei_printterm.c
index 91fe73e68c..f3003a6172 100644
--- a/lib/erl_interface/src/misc/ei_printterm.c
+++ b/lib/erl_interface/src/misc/ei_printterm.c
@@ -115,7 +115,7 @@ static int print_term(FILE* fp, ei_x_buff* x,
const char* buf, int* index)
{
int i, doquote, n, m, ty, r;
- char a[MAXATOMLEN+1], *p;
+ char a[MAXATOMLEN], *p;
int ch_written = 0; /* counter of written chars */
erlang_pid pid;
erlang_port port;
diff --git a/lib/erl_interface/src/misc/show_msg.c b/lib/erl_interface/src/misc/show_msg.c
index ca46b15aff..33b09643ca 100644
--- a/lib/erl_interface/src/misc/show_msg.c
+++ b/lib/erl_interface/src/misc/show_msg.c
@@ -457,7 +457,7 @@ static void show_term(const char *termbuf, int *index, FILE *stream)
break;
case ERL_FUN_EXT: {
- char atom[MAXATOMLEN+1];
+ char atom[MAXATOMLEN];
long idx;
long uniq;
const char* s = termbuf + *index, * s0 = s;
diff --git a/lib/erl_interface/src/prog/ei_fake_prog.c b/lib/erl_interface/src/prog/ei_fake_prog.c
index 68eb537211..34101a2851 100644
--- a/lib/erl_interface/src/prog/ei_fake_prog.c
+++ b/lib/erl_interface/src/prog/ei_fake_prog.c
@@ -96,6 +96,7 @@ int main(void)
EI_ULONGLONG *ulonglongp = (EI_ULONGLONG*)NULL;
EI_ULONGLONG ulonglongx = 0;
#endif
+ enum erlang_char_encoding enc;
intx = erl_errno;
@@ -148,9 +149,13 @@ int main(void)
ei_x_encode_string(&eix, charp);
ei_x_encode_string_len(&eix, charp, intx);
ei_encode_atom(charp, intp, charp);
+ ei_encode_atom_as(charp, intp, charp, ERLANG_LATIN1, ERLANG_UTF8);
ei_encode_atom_len(charp, intp, charp, intx);
+ ei_encode_atom_len_as(charp, intp, charp, intx, ERLANG_ASCII, ERLANG_LATIN1);
ei_x_encode_atom(&eix, charp);
+ ei_x_encode_atom_as(&eix, charp, ERLANG_LATIN1, ERLANG_UTF8);
ei_x_encode_atom_len(&eix, charp, intx);
+ ei_x_encode_atom_len_as(&eix, charp, intx, ERLANG_LATIN1, ERLANG_UTF8);
ei_encode_binary(charp, intp, (void *)0, longx);
ei_x_encode_binary(&eix, (void*)0, intx);
ei_encode_pid(charp, intp, &epid);
@@ -181,6 +186,7 @@ int main(void)
ei_decode_char(charp, intp, charp);
ei_decode_string(charp, intp, charp);
ei_decode_atom(charp, intp, charp);
+ ei_decode_atom_as(charp, intp, charp, MAXATOMLEN_UTF8, ERLANG_WHATEVER, &enc, &enc);
ei_decode_binary(charp, intp, (void *)0, longp);
ei_decode_fun(charp, intp, &efun);
free_fun(&efun);
diff --git a/lib/erl_interface/test/ei_decode_encode_SUITE.erl b/lib/erl_interface/test/ei_decode_encode_SUITE.erl
index e8ae7a6f81..0c98b494ec 100644
--- a/lib/erl_interface/test/ei_decode_encode_SUITE.erl
+++ b/lib/erl_interface/test/ei_decode_encode_SUITE.erl
@@ -119,8 +119,12 @@ test_ei_decode_encode(Config) when is_list(Config) ->
?line send_rec(P, OXRef),
%% Unicode atoms
- [send_rec(P, Atom) || Atom <- unicode_atom_data()],
-
+ [begin send_rec(P, Atom),
+ send_rec(P, mk_pid({Atom,1}, 23434, 3434)),
+ send_rec(P, mk_port({Atom,1}, 2343434)),
+ send_rec(P, mk_ref({Atom,1}, [262143, 8723648, 24097245])),
+ void
+ end || Atom <- unicode_atom_data()],
?line runner:recv_eot(P),
ok.
@@ -229,38 +233,36 @@ uint8(Uint) ->
mk_pid({NodeName, Creation}, Number, Serial) when is_atom(NodeName) ->
- mk_pid({atom_to_list(NodeName), Creation}, Number, Serial);
-mk_pid({NodeName, Creation}, Number, Serial) ->
+ <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+ mk_pid({NodeNameExt, Creation}, Number, Serial);
+mk_pid({NodeNameExt, Creation}, Number, Serial) ->
case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
?PID_EXT,
- ?ATOM_EXT,
- uint16_be(length(NodeName)),
- NodeName,
+ NodeNameExt,
uint32_be(Number),
uint32_be(Serial),
uint8(Creation)])) of
Pid when is_pid(Pid) ->
Pid;
{'EXIT', {badarg, _}} ->
- exit({badarg, mk_pid, [{NodeName, Creation}, Number, Serial]});
+ exit({badarg, mk_pid, [{NodeNameExt, Creation}, Number, Serial]});
Other ->
exit({unexpected_binary_to_term_result, Other})
end.
mk_port({NodeName, Creation}, Number) when is_atom(NodeName) ->
- mk_port({atom_to_list(NodeName), Creation}, Number);
-mk_port({NodeName, Creation}, Number) ->
+ <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+ mk_port({NodeNameExt, Creation}, Number);
+mk_port({NodeNameExt, Creation}, Number) ->
case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
?PORT_EXT,
- ?ATOM_EXT,
- uint16_be(length(NodeName)),
- NodeName,
+ NodeNameExt,
uint32_be(Number),
uint8(Creation)])) of
Port when is_port(Port) ->
Port;
{'EXIT', {badarg, _}} ->
- exit({badarg, mk_port, [{NodeName, Creation}, Number]});
+ exit({badarg, mk_port, [{NodeNameExt, Creation}, Number]});
Other ->
exit({unexpected_binary_to_term_result, Other})
end.
@@ -268,33 +270,30 @@ mk_port({NodeName, Creation}, Number) ->
mk_ref({NodeName, Creation}, Numbers) when is_atom(NodeName),
is_integer(Creation),
is_list(Numbers) ->
- mk_ref({atom_to_list(NodeName), Creation}, Numbers);
-mk_ref({NodeName, Creation}, [Number]) when is_list(NodeName),
- is_integer(Creation),
- is_integer(Number) ->
+ <<?VERSION_MAGIC, NodeNameExt/binary>> = term_to_binary(NodeName),
+ mk_ref({NodeNameExt, Creation}, Numbers);
+mk_ref({NodeNameExt, Creation}, [Number]) when is_binary(NodeNameExt),
+ is_integer(Creation),
+ is_integer(Number) ->
case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
?REFERENCE_EXT,
- ?ATOM_EXT,
- uint16_be(length(NodeName)),
- NodeName,
+ NodeNameExt,
uint32_be(Number),
uint8(Creation)])) of
Ref when is_reference(Ref) ->
Ref;
{'EXIT', {badarg, _}} ->
- exit({badarg, mk_ref, [{NodeName, Creation}, [Number]]});
+ exit({badarg, mk_ref, [{NodeNameExt, Creation}, [Number]]});
Other ->
exit({unexpected_binary_to_term_result, Other})
end;
-mk_ref({NodeName, Creation}, Numbers) when is_list(NodeName),
- is_integer(Creation),
- is_list(Numbers) ->
+mk_ref({NodeNameExt, Creation}, Numbers) when is_binary(NodeNameExt),
+ is_integer(Creation),
+ is_list(Numbers) ->
case catch binary_to_term(list_to_binary([?VERSION_MAGIC,
?NEW_REFERENCE_EXT,
uint16_be(length(Numbers)),
- ?ATOM_EXT,
- uint16_be(length(NodeName)),
- NodeName,
+ NodeNameExt,
uint8(Creation),
lists:map(fun (N) ->
uint32_be(N)
@@ -303,7 +302,7 @@ mk_ref({NodeName, Creation}, Numbers) when is_list(NodeName),
Ref when is_reference(Ref) ->
Ref;
{'EXIT', {badarg, _}} ->
- exit({badarg, mk_ref, [{NodeName, Creation}, Numbers]});
+ exit({badarg, mk_ref, [{NodeNameExt, Creation}, Numbers]});
Other ->
exit({unexpected_binary_to_term_result, Other})
end.
@@ -322,9 +321,11 @@ unicode_atom_data() ->
uc_atup(lists:seq(65500, 65754)),
uc_atup(lists:seq(65500, 65563))
| lists:map(fun (N) ->
- uc_atup(lists:seq(64000+N, 64254+N))
+ Pow2 = (1 bsl N),
+ uc_atup(lists:seq(Pow2 - 127, Pow2 + 127))
end,
- lists:seq(1, 2000))].
+ lists:seq(7, 20))
+ ].
uc_atup(ATxt) ->
string_to_atom(ATxt).
diff --git a/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c b/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c
index 194ce9057b..e57663f984 100644
--- a/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c
+++ b/lib/erl_interface/test/ei_decode_encode_SUITE_data/ei_decode_encode_test.c
@@ -63,10 +63,12 @@ int ei_x_encode_my_atom(ei_x_buff* x, my_atom* a)
return ei_x_encode_atom_as(x, a->name, ERLANG_UTF8, a->enc);
}
+#define BUFSZ 2000
+
void decode_encode(struct Type* t, void* obj)
{
char *buf;
- char buf2[1024];
+ char buf2[BUFSZ];
int size1 = 0;
int size2 = 0;
int size3 = 0;
@@ -89,6 +91,11 @@ void decode_encode(struct Type* t, void* obj)
return;
}
+ if (size1 > BUFSZ) {
+ fail("size is > BUFSZ");
+ return;
+ }
+
MESSAGE("ei_encode_%s buf is NULL, arg is type %s", t->name, t->type);
err = t->ei_encode_fp(NULL, &size2, obj);
if (err != 0) {
@@ -277,8 +284,11 @@ TESTCASE(test_ei_decode_encode)
EI_DECODE_ENCODE(ref , erlang_ref);
/* Unicode atoms */
- for (i=0; i<2010; i++) {
- EI_DECODE_ENCODE(my_atom , my_atom);
+ for (i=0; i<24; i++) {
+ EI_DECODE_ENCODE(my_atom, my_atom);
+ EI_DECODE_ENCODE(pid, erlang_pid);
+ EI_DECODE_ENCODE(port, erlang_port);
+ EI_DECODE_ENCODE(ref, erlang_ref);
}
report(1);