From 32d13d59910a384ad09682cafc83c7300c96c694 Mon Sep 17 00:00:00 2001 From: Sverker Eriksson Date: Wed, 8 Mar 2017 14:46:24 +0100 Subject: erl_interface: Do not generate atoms on old latin1 ext format Solved by letting ei_encode_atom_as ignore 'to_enc' argument and always encode in UTF8 format. --- lib/erl_interface/doc/src/ei.xml | 20 +++---- lib/erl_interface/src/encode/encode_atom.c | 64 +++------------------- lib/erl_interface/src/encode/encode_boolean.c | 8 +-- lib/erl_interface/src/legacy/erl_eterm.c | 16 ++++-- lib/erl_interface/src/legacy/erl_marshal.c | 13 ++--- lib/erl_interface/test/ei_decode_SUITE.erl | 25 ++++++--- .../test/ei_decode_SUITE_data/ei_decode_test.c | 18 +++--- lib/erl_interface/test/ei_decode_encode_SUITE.erl | 1 - lib/erl_interface/test/ei_encode_SUITE.erl | 36 ++++++------ .../test/erl_ext_SUITE_data/ext_test.c | 10 ++-- 10 files changed, 83 insertions(+), 128 deletions(-) (limited to 'lib/erl_interface') diff --git a/lib/erl_interface/doc/src/ei.xml b/lib/erl_interface/doc/src/ei.xml index ddfb4d88a8..c3c776296c 100644 --- a/lib/erl_interface/doc/src/ei.xml +++ b/lib/erl_interface/doc/src/ei.xml @@ -421,22 +421,16 @@ typedef enum { intei_x_encode_atom_len_as(ei_x_buff* x, const char *p, int len, erlang_char_encoding from_enc, erlang_char_encoding to_enc) Encode an atom. -

Encodes an atom in the binary format with character encoding - to_enc - (Latin-1 or UTF-8). Parameter p is the name of the atom with +

Encodes an atom in the binary format. Parameter p is the name of the atom with character encoding from_enc (ASCII, Latin-1, or UTF-8). The name must either be NULL-terminated or - a function variant with a len parameter must be used. - If to_enc is set to the bitwise OR'd combination - (ERLANG_LATIN1|ERLANG_UTF8), UTF-8 encoding is only used if the - atom string cannot be represented in Latin-1 encoding.

-

The encoding fails if p is an invalid string in encoding - from_enc, if the string is too long, or if it cannot be - represented with character encoding to_enc.

-

These functions were introduced in Erlang/OTP R16 as part of a first - step to support UTF-8 atoms. Atoms encoded with ERLANG_UTF8 - cannot be decoded by earlier releases than R16.

+ a function variant with a len parameter must be used.

+

The encoding fails if p is not a valid string in encoding + from_enc.

+ +

Argument to_enc is ignored. As from Erlang/OTP 20 the encoding is always + done in UTF-8 which is readable by nodes as old as Erlang/OTP R16.

diff --git a/lib/erl_interface/src/encode/encode_atom.c b/lib/erl_interface/src/encode/encode_atom.c index c1817628e5..1fd7811a0e 100644 --- a/lib/erl_interface/src/encode/encode_atom.c +++ b/lib/erl_interface/src/encode/encode_atom.c @@ -26,7 +26,6 @@ static int verify_ascii_atom(const char* src, int slen); static int verify_utf8_atom(const char* src, int slen); -static int is_latin1_as_utf8(const char *p, int len); int ei_encode_atom(char *buf, int *index, const char *p) { @@ -34,7 +33,7 @@ int ei_encode_atom(char *buf, int *index, const char *p) if (len >= MAXATOMLEN) len = MAXATOMLEN - 1; - return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, ERLANG_LATIN1); + return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, 0); } int ei_encode_atom_len(char *buf, int *index, const char *p, int len) @@ -42,7 +41,7 @@ int ei_encode_atom_len(char *buf, int *index, const char *p, int len) /* This function is documented to truncate at MAXATOMLEN (256) */ if (len >= MAXATOMLEN) len = MAXATOMLEN - 1; - return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, ERLANG_LATIN1); + return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, 0); } int ei_encode_atom_as(char *buf, int *index, const char *p, @@ -64,46 +63,11 @@ int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len, return -1; } - if (to_enc == (ERLANG_LATIN1 | ERLANG_UTF8)) { - if (from_enc == ERLANG_UTF8) { - to_enc = is_latin1_as_utf8(p, len) ? ERLANG_LATIN1 : ERLANG_UTF8; - } - else { - to_enc = from_enc; - } - } - switch(to_enc) { - case ERLANG_LATIN1: - if (buf) { - put8(s,ERL_ATOM_EXT); - switch (from_enc) { - case ERLANG_UTF8: - len = utf8_to_latin1(s+2, p, len, MAXATOMLEN-1, NULL); - if (len < 0) return -1; - break; - case ERLANG_ASCII: - if (verify_ascii_atom(p, len) < 0) return -1; - memcpy(s+2, p, len); - break; - case ERLANG_LATIN1: - memcpy(s+2, p, len); - break; - default: - return -1; - } - put16be(s,len); - } - else { - s += 3; - if (from_enc == ERLANG_UTF8) { - len = utf8_to_latin1(NULL, p, len, MAXATOMLEN-1, NULL); - if (len < 0) return -1; - } else if (from_enc == ERLANG_ASCII) - if (verify_ascii_atom(p, len) < 0) return -1; - } - break; - - case ERLANG_UTF8: + /* + * Since OTP 20 we totally ignore 'to_enc' + * and alway encode as UTF8. + */ + { offs = 1 + 1; switch (from_enc) { case ERLANG_LATIN1: @@ -133,10 +97,6 @@ int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len, } } else s+= offs; - break; - - default: - return -1; } s += len; @@ -197,13 +157,3 @@ static int verify_utf8_atom(const char* src, int slen) return 0; } -/* Only latin1 code points in utf8 string? - */ -static int is_latin1_as_utf8(const char *p, int len) -{ - int i; - for (i=0; i 0xC3) return 0; - } - return 1; -} diff --git a/lib/erl_interface/src/encode/encode_boolean.c b/lib/erl_interface/src/encode/encode_boolean.c index 61e7e5e6e7..053029af05 100644 --- a/lib/erl_interface/src/encode/encode_boolean.c +++ b/lib/erl_interface/src/encode/encode_boolean.c @@ -32,12 +32,12 @@ int ei_encode_boolean(char *buf, int *index, int p) val = p ? "true" : "false"; len = strlen(val); - if (!buf) s += 3; + if (!buf) s += 2; else { - put8(s,ERL_ATOM_EXT); - put16be(s,len); + put8(s, ERL_SMALL_ATOM_UTF8_EXT); + put8(s, len); - memmove(s,val,len); /* unterminated string */ + memcpy(s,val,len); /* unterminated string */ } s += len; diff --git a/lib/erl_interface/src/legacy/erl_eterm.c b/lib/erl_interface/src/legacy/erl_eterm.c index e4b3b49c7d..5153d0f2e7 100644 --- a/lib/erl_interface/src/legacy/erl_eterm.c +++ b/lib/erl_interface/src/legacy/erl_eterm.c @@ -188,14 +188,20 @@ char* erl_atom_ptr_latin1(Erl_Atom_data* a) char* erl_atom_ptr_utf8(Erl_Atom_data* a) { if (a->utf8 == NULL) { - int dlen = a->lenL * 2; /* over estimation */ - a->utf8 = malloc(dlen + 1); - a->lenU = latin1_to_utf8(a->utf8, a->latin1, a->lenL, dlen, NULL); - a->utf8[a->lenU] = '\0'; + erlang_char_encoding enc; + a->lenU = latin1_to_utf8(NULL, a->latin1, a->lenL, a->lenL*2, &enc); + if (enc == ERLANG_ASCII) { + a->utf8 = a->latin1; + } + else { + a->utf8 = malloc(a->lenU + 1); + latin1_to_utf8(a->utf8, a->latin1, a->lenL, a->lenU, NULL); + a->utf8[a->lenU] = '\0'; + } } return a->utf8; - } + int erl_atom_size_latin1(Erl_Atom_data* a) { if (a->latin1 == NULL) { diff --git a/lib/erl_interface/src/legacy/erl_marshal.c b/lib/erl_interface/src/legacy/erl_marshal.c index 527ae0ef8f..b7a8455313 100644 --- a/lib/erl_interface/src/legacy/erl_marshal.c +++ b/lib/erl_interface/src/legacy/erl_marshal.c @@ -175,10 +175,9 @@ static void encode_atom(Erl_Atom_data* a, unsigned char **ext) int ix = 0; if (a->latin1) { ei_encode_atom_len_as((char*)*ext, &ix, a->latin1, a->lenL, - ERLANG_LATIN1, ERLANG_LATIN1); + ERLANG_LATIN1, ERLANG_UTF8); } - else if (ei_encode_atom_len_as((char*)*ext, &ix, a->utf8, a->lenU, - ERLANG_UTF8, ERLANG_LATIN1) < 0) { + else { ei_encode_atom_len_as((char*)*ext, &ix, a->utf8, a->lenU, ERLANG_UTF8, ERLANG_UTF8); } @@ -542,12 +541,8 @@ int erl_term_len(ETERM *ep) static int atom_len_helper(Erl_Atom_data* a) { - if (erl_atom_ptr_latin1(a)) { - return 1 + 2 + a->lenL; /* ERL_ATOM_EXT */ - } - else { - return 1 + 1 + (a->lenU > 255) + a->lenU; - } + (void) erl_atom_ptr_utf8(a); + return 1 + 1 + (a->lenU > 255) + a->lenU; } static int erl_term_len_helper(ETERM *ep, int dist) diff --git a/lib/erl_interface/test/ei_decode_SUITE.erl b/lib/erl_interface/test/ei_decode_SUITE.erl index 10e90685c8..8612450692 100644 --- a/lib/erl_interface/test/ei_decode_SUITE.erl +++ b/lib/erl_interface/test/ei_decode_SUITE.erl @@ -179,7 +179,8 @@ test_ei_decode_misc(Config) when is_list(Config) -> send_term_as_binary(P,foo), send_term_as_binary(P,''), - send_term_as_binary(P,'ÅÄÖåäö'), + %%send_term_as_binary(P,'ÅÄÖåäö'), + send_latin1_atom_as_binary(P, "ÅÄÖåäö"), send_term_as_binary(P,"foo"), send_term_as_binary(P,""), @@ -200,18 +201,19 @@ test_ei_decode_misc(Config) when is_list(Config) -> test_ei_decode_utf8_atom(Config) -> P = runner:start(?test_ei_decode_utf8_atom), - send_utf8_atom_as_binary(P,"Ã¥"), - send_utf8_atom_as_binary(P,"ä"), - send_term_as_binary(P,'ö'), - send_term_as_binary(P,'õ'), + send_latin1_atom_as_binary(P,"Ã¥"), + send_latin1_atom_as_binary(P,"ä"), + send_latin1_atom_as_binary(P,"ö"), + send_latin1_atom_as_binary(P,"õ"), send_utf8_atom_as_binary(P,[1758]), send_utf8_atom_as_binary(P,[1758,1758]), send_utf8_atom_as_binary(P,[1758,1758,1758]), send_utf8_atom_as_binary(P,[1758,1758,1758,1758]), - send_utf8_atom_as_binary(P,"a"), - send_utf8_atom_as_binary(P,"b"), + send_latin1_atom_as_binary(P,"a"), + send_latin1_atom_as_binary(P,"b"), + send_term_as_binary(P,'c'), send_term_as_binary(P,'d'), @@ -230,6 +232,9 @@ send_raw(Port, Bin) when is_port(Port) -> send_utf8_atom_as_binary(Port, String) -> Port ! {self(), {command, term_to_binary(uc_atup(String))}}. +send_latin1_atom_as_binary(Port, String) -> + Port ! {self(), {command, encode_latin1_atom(String)}}. + send_integers(P) -> send_term_as_binary(P,0), % SMALL_INTEGER_EXT smallest send_term_as_binary(P,255), % SMALL_INTEGER_EXT largest @@ -304,6 +309,12 @@ send_integers2(P) -> send_term_as_binary(P, []), % illegal type ok. +encode_latin1_atom(String) -> + Len = length(String), + %% Use ATOM_EXT (not SMALL_*) to simulate old term_to_binary + TagLen = [$d, Len bsr 8, Len band 16#ff], + list_to_binary([131, TagLen, String]). + uc_atup(ATxt) -> string_to_atom(ATxt). diff --git a/lib/erl_interface/test/ei_decode_SUITE_data/ei_decode_test.c b/lib/erl_interface/test/ei_decode_SUITE_data/ei_decode_test.c index cfe9083065..649dc9a677 100644 --- a/lib/erl_interface/test/ei_decode_SUITE_data/ei_decode_test.c +++ b/lib/erl_interface/test/ei_decode_SUITE_data/ei_decode_test.c @@ -102,7 +102,7 @@ int ei_decode_my_string(const char *buf, int *index, char *to, } \ \ if (size1 != SIZE) { \ - fail("size of encoded data is incorrect"); \ + fail1("size of encoded data (%d) is incorrect", size1); \ return; \ } \ } \ @@ -614,11 +614,11 @@ TESTCASE(test_ei_decode_misc) EI_DECODE_2(decode_double, 9, double, -1.0); EI_DECODE_2(decode_double, 9, double, 1.0); - EI_DECODE_2(decode_boolean, 8, int, 0); - EI_DECODE_2(decode_boolean, 7, int, 1); + EI_DECODE_2(decode_boolean, 7, int, 0); + EI_DECODE_2(decode_boolean, 6, int, 1); - EI_DECODE_STRING(decode_my_atom, 6, "foo"); - EI_DECODE_STRING(decode_my_atom, 3, ""); + EI_DECODE_STRING(decode_my_atom, 5, "foo"); + EI_DECODE_STRING(decode_my_atom, 2, ""); EI_DECODE_STRING(decode_my_atom, 9, "ÅÄÖåäö"); EI_DECODE_STRING(decode_my_string, 6, "foo"); @@ -665,10 +665,10 @@ TESTCASE(test_ei_decode_utf8_atom) P99({ERLANG_ANY,ERLANG_LATIN1,ERLANG_ASCII})); EI_DECODE_STRING_4(decode_my_atom_as, 4, "b", P99({ERLANG_UTF8,ERLANG_LATIN1,ERLANG_ASCII})); - EI_DECODE_STRING_4(decode_my_atom_as, 4, "c", - P99({ERLANG_LATIN1,ERLANG_LATIN1,ERLANG_ASCII})); - EI_DECODE_STRING_4(decode_my_atom_as, 4, "d", - P99({ERLANG_ASCII,ERLANG_LATIN1,ERLANG_ASCII})); + EI_DECODE_STRING_4(decode_my_atom_as, 3, "c", + P99({ERLANG_LATIN1,ERLANG_UTF8,ERLANG_ASCII})); + EI_DECODE_STRING_4(decode_my_atom_as, 3, "d", + P99({ERLANG_ASCII,ERLANG_UTF8,ERLANG_ASCII})); report(1); } diff --git a/lib/erl_interface/test/ei_decode_encode_SUITE.erl b/lib/erl_interface/test/ei_decode_encode_SUITE.erl index 570a91e2da..108a1f5142 100644 --- a/lib/erl_interface/test/ei_decode_encode_SUITE.erl +++ b/lib/erl_interface/test/ei_decode_encode_SUITE.erl @@ -170,7 +170,6 @@ get_binary(P) -> -define(VERSION_MAGIC, 131). --define(ATOM_EXT, 100). -define(REFERENCE_EXT, 101). -define(PORT_EXT, 102). -define(PID_EXT, 103). diff --git a/lib/erl_interface/test/ei_encode_SUITE.erl b/lib/erl_interface/test/ei_encode_SUITE.erl index ac6ec9cf4e..43484a1319 100644 --- a/lib/erl_interface/test/ei_encode_SUITE.erl +++ b/lib/erl_interface/test/ei_encode_SUITE.erl @@ -184,17 +184,17 @@ test_ei_encode_misc(Config) when is_list(Config) -> {<<70,_:8/binary>>,Fp1} = get_buf_and_term(P), true = match_float(Fp1, 1.0), - {<<100,0,5,"false">>,false} = get_buf_and_term(P), - {<<100,0,4,"true">> ,true} = get_buf_and_term(P), - {<<100,0,4,"true">> ,true} = get_buf_and_term(P), - {<<100,0,4,"true">> ,true} = get_buf_and_term(P), - - {<<100,0,3,"foo">>,foo} = get_buf_and_term(P), - {<<100,0,3,"foo">>,foo} = get_buf_and_term(P), - {<<100,0,0,"">>,''} = get_buf_and_term(P), - {<<100,0,0,"">>,''} = get_buf_and_term(P), - {<<100,0,6,"ÅÄÖåäö">>,'ÅÄÖåäö'} = get_buf_and_term(P), - {<<100,0,6,"ÅÄÖåäö">>,'ÅÄÖåäö'} = get_buf_and_term(P), + {<<$w,5,"false">>,false} = get_buf_and_term(P), + {<<$w,4,"true">> ,true} = get_buf_and_term(P), + {<<$w,4,"true">> ,true} = get_buf_and_term(P), + {<<$w,4,"true">> ,true} = get_buf_and_term(P), + + {<<$w,3,"foo">>,foo} = get_buf_and_term(P), + {<<$w,3,"foo">>,foo} = get_buf_and_term(P), + {<<$w,0,"">>,''} = get_buf_and_term(P), + {<<$w,0,"">>,''} = get_buf_and_term(P), + {<<$w,12,"ÅÄÖåäö"/utf8>>,'ÅÄÖåäö'} = get_buf_and_term(P), + {<<$w,12,"ÅÄÖåäö"/utf8>>,'ÅÄÖåäö'} = get_buf_and_term(P), {<<107,0,3,"foo">>,"foo"} = get_buf_and_term(P), {<<107,0,3,"foo">>,"foo"} = get_buf_and_term(P), @@ -239,12 +239,12 @@ test_ei_encode_utf8_atom(Config) -> P = runner:start(?test_ei_encode_utf8_atom), {<<119,2,195,133>>,'Ã…'} = get_buf_and_term(P), - {<<100,0,1,197>>,'Ã…'} = get_buf_and_term(P), - {<<100,0,1,197>>,'Ã…'} = get_buf_and_term(P), + {<<119,2,195,133>>,'Ã…'} = get_buf_and_term(P), + {<<119,2,195,133>>,'Ã…'} = get_buf_and_term(P), {<<119,2,195,133>>,'Ã…'} = get_buf_and_term(P), {<<119,1,$A>>,'A'} = get_buf_and_term(P), - {<<100,0,1,$A>>,'A'} = get_buf_and_term(P), + {<<119,1,$A>>,'A'} = get_buf_and_term(P), runner:recv_eot(P), ok. @@ -254,13 +254,13 @@ test_ei_encode_utf8_atom_len(Config) -> P = runner:start(?test_ei_encode_utf8_atom_len), {<<119,2,195,133>>,'Ã…'} = get_buf_and_term(P), - {<<100,0,2,197,196>>,'ÅÄ'} = get_buf_and_term(P), - {<<100,0,1,197>>,'Ã…'} = get_buf_and_term(P), + {<<119,4,195,133,195,132>>,'ÅÄ'} = get_buf_and_term(P), + {<<119,2,195,133>>,'Ã…'} = get_buf_and_term(P), {<<119,4,195,133,195,132>>,'ÅÄ'} = get_buf_and_term(P), {<<119,1,$A>>,'A'} = get_buf_and_term(P), - {<<100,0,2,$A,$B>>,'AB'} = get_buf_and_term(P), - {<<100,0,255,_:(255*8)>>,_} = get_buf_and_term(P), + {<<119,2,$A,$B>>,'AB'} = get_buf_and_term(P), + {<<119,255,_:(255*8)>>,_} = get_buf_and_term(P), runner:recv_eot(P), ok. diff --git a/lib/erl_interface/test/erl_ext_SUITE_data/ext_test.c b/lib/erl_interface/test/erl_ext_SUITE_data/ext_test.c index 36cf086ed2..afac5485e8 100644 --- a/lib/erl_interface/test/erl_ext_SUITE_data/ext_test.c +++ b/lib/erl_interface/test/erl_ext_SUITE_data/ext_test.c @@ -407,7 +407,7 @@ test_compare_ext(char *test_desc, } -#define ATOM_EXT (100) +#define SMALL_ATOM_UTF8_EXT (119) #define REFERENCE_EXT (101) #define PORT_EXT (102) #define PID_EXT (103) @@ -429,13 +429,13 @@ write_atom(unsigned char *buf, char *atom) len = 0; while(atom[len]) { - buf[len + 3] = atom[len]; + buf[len + 2] = atom[len]; len++; } - buf[0] = ATOM_EXT; - PUT_UINT16(&buf[1], len); + buf[0] = SMALL_ATOM_UTF8_EXT; + buf[1] = len; - return buf + 3 + len; + return buf + 2 + len; } static unsigned char * -- cgit v1.2.3