aboutsummaryrefslogtreecommitdiffstats
path: root/lib/erl_interface
diff options
context:
space:
mode:
authorSverker Eriksson <[email protected]>2017-03-08 14:46:24 +0100
committerSverker Eriksson <[email protected]>2017-03-08 18:07:09 +0100
commit32d13d59910a384ad09682cafc83c7300c96c694 (patch)
tree1f5bfca2565243210b16a6de2bdc3ae66cbff8bb /lib/erl_interface
parent65b04e233e09e3cc2e0fda3c28e155b95c5a4baf (diff)
downloadotp-32d13d59910a384ad09682cafc83c7300c96c694.tar.gz
otp-32d13d59910a384ad09682cafc83c7300c96c694.tar.bz2
otp-32d13d59910a384ad09682cafc83c7300c96c694.zip
erl_interface: Do not generate atoms on old latin1 ext format
Solved by letting ei_encode_atom_as ignore 'to_enc' argument and always encode in UTF8 format.
Diffstat (limited to 'lib/erl_interface')
-rw-r--r--lib/erl_interface/doc/src/ei.xml20
-rw-r--r--lib/erl_interface/src/encode/encode_atom.c64
-rw-r--r--lib/erl_interface/src/encode/encode_boolean.c8
-rw-r--r--lib/erl_interface/src/legacy/erl_eterm.c16
-rw-r--r--lib/erl_interface/src/legacy/erl_marshal.c13
-rw-r--r--lib/erl_interface/test/ei_decode_SUITE.erl25
-rw-r--r--lib/erl_interface/test/ei_decode_SUITE_data/ei_decode_test.c18
-rw-r--r--lib/erl_interface/test/ei_decode_encode_SUITE.erl1
-rw-r--r--lib/erl_interface/test/ei_encode_SUITE.erl36
-rw-r--r--lib/erl_interface/test/erl_ext_SUITE_data/ext_test.c10
10 files changed, 83 insertions, 128 deletions
diff --git a/lib/erl_interface/doc/src/ei.xml b/lib/erl_interface/doc/src/ei.xml
index ddfb4d88a8..c3c776296c 100644
--- a/lib/erl_interface/doc/src/ei.xml
+++ b/lib/erl_interface/doc/src/ei.xml
@@ -421,22 +421,16 @@ typedef enum {
<name><ret>int</ret><nametext>ei_x_encode_atom_len_as(ei_x_buff* x, const char *p, int len, erlang_char_encoding from_enc, erlang_char_encoding to_enc)</nametext></name>
<fsummary>Encode an atom.</fsummary>
<desc>
- <p>Encodes an atom in the binary format with character encoding
- <seealso marker="#erlang_char_encoding"><c>to_enc</c></seealso>
- (Latin-1 or UTF-8). Parameter <c>p</c> is the name of the atom with
+ <p>Encodes an atom in the binary format. Parameter <c>p</c> is the name of the atom with
character encoding
<seealso marker="#erlang_char_encoding"><c>from_enc</c></seealso>
(ASCII, Latin-1, or UTF-8). The name must either be <c>NULL</c>-terminated or
- a function variant with a <c>len</c> parameter must be used.
- If <c>to_enc</c> is set to the bitwise OR'd combination
- <c>(ERLANG_LATIN1|ERLANG_UTF8)</c>, UTF-8 encoding is only used if the
- atom string cannot be represented in Latin-1 encoding.</p>
- <p>The encoding fails if <c>p</c> is an invalid string in encoding
- <c>from_enc</c>, if the string is too long, or if it cannot be
- represented with character encoding <c>to_enc</c>.</p>
- <p>These functions were introduced in Erlang/OTP R16 as part of a first
- step to support UTF-8 atoms. Atoms encoded with <c>ERLANG_UTF8</c>
- cannot be decoded by earlier releases than R16.</p>
+ a function variant with a <c>len</c> parameter must be used.</p>
+ <p>The encoding fails if <c>p</c> is not a valid string in encoding
+ <c>from_enc</c>.</p>
+
+ <p>Argument <c>to_enc</c> is ignored. As from Erlang/OTP 20 the encoding is always
+ done in UTF-8 which is readable by nodes as old as Erlang/OTP R16.</p>
</desc>
</func>
diff --git a/lib/erl_interface/src/encode/encode_atom.c b/lib/erl_interface/src/encode/encode_atom.c
index c1817628e5..1fd7811a0e 100644
--- a/lib/erl_interface/src/encode/encode_atom.c
+++ b/lib/erl_interface/src/encode/encode_atom.c
@@ -26,7 +26,6 @@
static int verify_ascii_atom(const char* src, int slen);
static int verify_utf8_atom(const char* src, int slen);
-static int is_latin1_as_utf8(const char *p, int len);
int ei_encode_atom(char *buf, int *index, const char *p)
{
@@ -34,7 +33,7 @@ int ei_encode_atom(char *buf, int *index, const char *p)
if (len >= MAXATOMLEN)
len = MAXATOMLEN - 1;
- return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, ERLANG_LATIN1);
+ return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, 0);
}
int ei_encode_atom_len(char *buf, int *index, const char *p, int len)
@@ -42,7 +41,7 @@ int ei_encode_atom_len(char *buf, int *index, const char *p, int len)
/* This function is documented to truncate at MAXATOMLEN (256) */
if (len >= MAXATOMLEN)
len = MAXATOMLEN - 1;
- return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, ERLANG_LATIN1);
+ return ei_encode_atom_len_as(buf, index, p, len, ERLANG_LATIN1, 0);
}
int ei_encode_atom_as(char *buf, int *index, const char *p,
@@ -64,46 +63,11 @@ int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len,
return -1;
}
- if (to_enc == (ERLANG_LATIN1 | ERLANG_UTF8)) {
- if (from_enc == ERLANG_UTF8) {
- to_enc = is_latin1_as_utf8(p, len) ? ERLANG_LATIN1 : ERLANG_UTF8;
- }
- else {
- to_enc = from_enc;
- }
- }
- switch(to_enc) {
- case ERLANG_LATIN1:
- if (buf) {
- put8(s,ERL_ATOM_EXT);
- switch (from_enc) {
- case ERLANG_UTF8:
- len = utf8_to_latin1(s+2, p, len, MAXATOMLEN-1, NULL);
- if (len < 0) return -1;
- break;
- case ERLANG_ASCII:
- if (verify_ascii_atom(p, len) < 0) return -1;
- memcpy(s+2, p, len);
- break;
- case ERLANG_LATIN1:
- memcpy(s+2, p, len);
- break;
- default:
- return -1;
- }
- put16be(s,len);
- }
- else {
- s += 3;
- if (from_enc == ERLANG_UTF8) {
- len = utf8_to_latin1(NULL, p, len, MAXATOMLEN-1, NULL);
- if (len < 0) return -1;
- } else if (from_enc == ERLANG_ASCII)
- if (verify_ascii_atom(p, len) < 0) return -1;
- }
- break;
-
- case ERLANG_UTF8:
+ /*
+ * Since OTP 20 we totally ignore 'to_enc'
+ * and alway encode as UTF8.
+ */
+ {
offs = 1 + 1;
switch (from_enc) {
case ERLANG_LATIN1:
@@ -133,10 +97,6 @@ int ei_encode_atom_len_as(char *buf, int *index, const char *p, int len,
}
}
else s+= offs;
- break;
-
- default:
- return -1;
}
s += len;
@@ -197,13 +157,3 @@ static int verify_utf8_atom(const char* src, int slen)
return 0;
}
-/* Only latin1 code points in utf8 string?
- */
-static int is_latin1_as_utf8(const char *p, int len)
-{
- int i;
- for (i=0; i<len; i++) {
- if ((unsigned char)p[i] > 0xC3) return 0;
- }
- return 1;
-}
diff --git a/lib/erl_interface/src/encode/encode_boolean.c b/lib/erl_interface/src/encode/encode_boolean.c
index 61e7e5e6e7..053029af05 100644
--- a/lib/erl_interface/src/encode/encode_boolean.c
+++ b/lib/erl_interface/src/encode/encode_boolean.c
@@ -32,12 +32,12 @@ int ei_encode_boolean(char *buf, int *index, int p)
val = p ? "true" : "false";
len = strlen(val);
- if (!buf) s += 3;
+ if (!buf) s += 2;
else {
- put8(s,ERL_ATOM_EXT);
- put16be(s,len);
+ put8(s, ERL_SMALL_ATOM_UTF8_EXT);
+ put8(s, len);
- memmove(s,val,len); /* unterminated string */
+ memcpy(s,val,len); /* unterminated string */
}
s += len;
diff --git a/lib/erl_interface/src/legacy/erl_eterm.c b/lib/erl_interface/src/legacy/erl_eterm.c
index e4b3b49c7d..5153d0f2e7 100644
--- a/lib/erl_interface/src/legacy/erl_eterm.c
+++ b/lib/erl_interface/src/legacy/erl_eterm.c
@@ -188,14 +188,20 @@ char* erl_atom_ptr_latin1(Erl_Atom_data* a)
char* erl_atom_ptr_utf8(Erl_Atom_data* a)
{
if (a->utf8 == NULL) {
- int dlen = a->lenL * 2; /* over estimation */
- a->utf8 = malloc(dlen + 1);
- a->lenU = latin1_to_utf8(a->utf8, a->latin1, a->lenL, dlen, NULL);
- a->utf8[a->lenU] = '\0';
+ erlang_char_encoding enc;
+ a->lenU = latin1_to_utf8(NULL, a->latin1, a->lenL, a->lenL*2, &enc);
+ if (enc == ERLANG_ASCII) {
+ a->utf8 = a->latin1;
+ }
+ else {
+ a->utf8 = malloc(a->lenU + 1);
+ latin1_to_utf8(a->utf8, a->latin1, a->lenL, a->lenU, NULL);
+ a->utf8[a->lenU] = '\0';
+ }
}
return a->utf8;
-
}
+
int erl_atom_size_latin1(Erl_Atom_data* a)
{
if (a->latin1 == NULL) {
diff --git a/lib/erl_interface/src/legacy/erl_marshal.c b/lib/erl_interface/src/legacy/erl_marshal.c
index 527ae0ef8f..b7a8455313 100644
--- a/lib/erl_interface/src/legacy/erl_marshal.c
+++ b/lib/erl_interface/src/legacy/erl_marshal.c
@@ -175,10 +175,9 @@ static void encode_atom(Erl_Atom_data* a, unsigned char **ext)
int ix = 0;
if (a->latin1) {
ei_encode_atom_len_as((char*)*ext, &ix, a->latin1, a->lenL,
- ERLANG_LATIN1, ERLANG_LATIN1);
+ ERLANG_LATIN1, ERLANG_UTF8);
}
- else if (ei_encode_atom_len_as((char*)*ext, &ix, a->utf8, a->lenU,
- ERLANG_UTF8, ERLANG_LATIN1) < 0) {
+ else {
ei_encode_atom_len_as((char*)*ext, &ix, a->utf8, a->lenU,
ERLANG_UTF8, ERLANG_UTF8);
}
@@ -542,12 +541,8 @@ int erl_term_len(ETERM *ep)
static int atom_len_helper(Erl_Atom_data* a)
{
- if (erl_atom_ptr_latin1(a)) {
- return 1 + 2 + a->lenL; /* ERL_ATOM_EXT */
- }
- else {
- return 1 + 1 + (a->lenU > 255) + a->lenU;
- }
+ (void) erl_atom_ptr_utf8(a);
+ return 1 + 1 + (a->lenU > 255) + a->lenU;
}
static int erl_term_len_helper(ETERM *ep, int dist)
diff --git a/lib/erl_interface/test/ei_decode_SUITE.erl b/lib/erl_interface/test/ei_decode_SUITE.erl
index 10e90685c8..8612450692 100644
--- a/lib/erl_interface/test/ei_decode_SUITE.erl
+++ b/lib/erl_interface/test/ei_decode_SUITE.erl
@@ -179,7 +179,8 @@ test_ei_decode_misc(Config) when is_list(Config) ->
send_term_as_binary(P,foo),
send_term_as_binary(P,''),
- send_term_as_binary(P,'ÅÄÖåäö'),
+ %%send_term_as_binary(P,'ÅÄÖåäö'),
+ send_latin1_atom_as_binary(P, "ÅÄÖåäö"),
send_term_as_binary(P,"foo"),
send_term_as_binary(P,""),
@@ -200,18 +201,19 @@ test_ei_decode_misc(Config) when is_list(Config) ->
test_ei_decode_utf8_atom(Config) ->
P = runner:start(?test_ei_decode_utf8_atom),
- send_utf8_atom_as_binary(P,"å"),
- send_utf8_atom_as_binary(P,"ä"),
- send_term_as_binary(P,'ö'),
- send_term_as_binary(P,'õ'),
+ send_latin1_atom_as_binary(P,"å"),
+ send_latin1_atom_as_binary(P,"ä"),
+ send_latin1_atom_as_binary(P,"ö"),
+ send_latin1_atom_as_binary(P,"õ"),
send_utf8_atom_as_binary(P,[1758]),
send_utf8_atom_as_binary(P,[1758,1758]),
send_utf8_atom_as_binary(P,[1758,1758,1758]),
send_utf8_atom_as_binary(P,[1758,1758,1758,1758]),
- send_utf8_atom_as_binary(P,"a"),
- send_utf8_atom_as_binary(P,"b"),
+ send_latin1_atom_as_binary(P,"a"),
+ send_latin1_atom_as_binary(P,"b"),
+
send_term_as_binary(P,'c'),
send_term_as_binary(P,'d'),
@@ -230,6 +232,9 @@ send_raw(Port, Bin) when is_port(Port) ->
send_utf8_atom_as_binary(Port, String) ->
Port ! {self(), {command, term_to_binary(uc_atup(String))}}.
+send_latin1_atom_as_binary(Port, String) ->
+ Port ! {self(), {command, encode_latin1_atom(String)}}.
+
send_integers(P) ->
send_term_as_binary(P,0), % SMALL_INTEGER_EXT smallest
send_term_as_binary(P,255), % SMALL_INTEGER_EXT largest
@@ -304,6 +309,12 @@ send_integers2(P) ->
send_term_as_binary(P, []), % illegal type
ok.
+encode_latin1_atom(String) ->
+ Len = length(String),
+ %% Use ATOM_EXT (not SMALL_*) to simulate old term_to_binary
+ TagLen = [$d, Len bsr 8, Len band 16#ff],
+ list_to_binary([131, TagLen, String]).
+
uc_atup(ATxt) ->
string_to_atom(ATxt).
diff --git a/lib/erl_interface/test/ei_decode_SUITE_data/ei_decode_test.c b/lib/erl_interface/test/ei_decode_SUITE_data/ei_decode_test.c
index cfe9083065..649dc9a677 100644
--- a/lib/erl_interface/test/ei_decode_SUITE_data/ei_decode_test.c
+++ b/lib/erl_interface/test/ei_decode_SUITE_data/ei_decode_test.c
@@ -102,7 +102,7 @@ int ei_decode_my_string(const char *buf, int *index, char *to,
} \
\
if (size1 != SIZE) { \
- fail("size of encoded data is incorrect"); \
+ fail1("size of encoded data (%d) is incorrect", size1); \
return; \
} \
} \
@@ -614,11 +614,11 @@ TESTCASE(test_ei_decode_misc)
EI_DECODE_2(decode_double, 9, double, -1.0);
EI_DECODE_2(decode_double, 9, double, 1.0);
- EI_DECODE_2(decode_boolean, 8, int, 0);
- EI_DECODE_2(decode_boolean, 7, int, 1);
+ EI_DECODE_2(decode_boolean, 7, int, 0);
+ EI_DECODE_2(decode_boolean, 6, int, 1);
- EI_DECODE_STRING(decode_my_atom, 6, "foo");
- EI_DECODE_STRING(decode_my_atom, 3, "");
+ EI_DECODE_STRING(decode_my_atom, 5, "foo");
+ EI_DECODE_STRING(decode_my_atom, 2, "");
EI_DECODE_STRING(decode_my_atom, 9, "������");
EI_DECODE_STRING(decode_my_string, 6, "foo");
@@ -665,10 +665,10 @@ TESTCASE(test_ei_decode_utf8_atom)
P99({ERLANG_ANY,ERLANG_LATIN1,ERLANG_ASCII}));
EI_DECODE_STRING_4(decode_my_atom_as, 4, "b",
P99({ERLANG_UTF8,ERLANG_LATIN1,ERLANG_ASCII}));
- EI_DECODE_STRING_4(decode_my_atom_as, 4, "c",
- P99({ERLANG_LATIN1,ERLANG_LATIN1,ERLANG_ASCII}));
- EI_DECODE_STRING_4(decode_my_atom_as, 4, "d",
- P99({ERLANG_ASCII,ERLANG_LATIN1,ERLANG_ASCII}));
+ EI_DECODE_STRING_4(decode_my_atom_as, 3, "c",
+ P99({ERLANG_LATIN1,ERLANG_UTF8,ERLANG_ASCII}));
+ EI_DECODE_STRING_4(decode_my_atom_as, 3, "d",
+ P99({ERLANG_ASCII,ERLANG_UTF8,ERLANG_ASCII}));
report(1);
}
diff --git a/lib/erl_interface/test/ei_decode_encode_SUITE.erl b/lib/erl_interface/test/ei_decode_encode_SUITE.erl
index 570a91e2da..108a1f5142 100644
--- a/lib/erl_interface/test/ei_decode_encode_SUITE.erl
+++ b/lib/erl_interface/test/ei_decode_encode_SUITE.erl
@@ -170,7 +170,6 @@ get_binary(P) ->
-define(VERSION_MAGIC, 131).
--define(ATOM_EXT, 100).
-define(REFERENCE_EXT, 101).
-define(PORT_EXT, 102).
-define(PID_EXT, 103).
diff --git a/lib/erl_interface/test/ei_encode_SUITE.erl b/lib/erl_interface/test/ei_encode_SUITE.erl
index ac6ec9cf4e..43484a1319 100644
--- a/lib/erl_interface/test/ei_encode_SUITE.erl
+++ b/lib/erl_interface/test/ei_encode_SUITE.erl
@@ -184,17 +184,17 @@ test_ei_encode_misc(Config) when is_list(Config) ->
{<<70,_:8/binary>>,Fp1} = get_buf_and_term(P),
true = match_float(Fp1, 1.0),
- {<<100,0,5,"false">>,false} = get_buf_and_term(P),
- {<<100,0,4,"true">> ,true} = get_buf_and_term(P),
- {<<100,0,4,"true">> ,true} = get_buf_and_term(P),
- {<<100,0,4,"true">> ,true} = get_buf_and_term(P),
-
- {<<100,0,3,"foo">>,foo} = get_buf_and_term(P),
- {<<100,0,3,"foo">>,foo} = get_buf_and_term(P),
- {<<100,0,0,"">>,''} = get_buf_and_term(P),
- {<<100,0,0,"">>,''} = get_buf_and_term(P),
- {<<100,0,6,"ÅÄÖåäö">>,'ÅÄÖåäö'} = get_buf_and_term(P),
- {<<100,0,6,"ÅÄÖåäö">>,'ÅÄÖåäö'} = get_buf_and_term(P),
+ {<<$w,5,"false">>,false} = get_buf_and_term(P),
+ {<<$w,4,"true">> ,true} = get_buf_and_term(P),
+ {<<$w,4,"true">> ,true} = get_buf_and_term(P),
+ {<<$w,4,"true">> ,true} = get_buf_and_term(P),
+
+ {<<$w,3,"foo">>,foo} = get_buf_and_term(P),
+ {<<$w,3,"foo">>,foo} = get_buf_and_term(P),
+ {<<$w,0,"">>,''} = get_buf_and_term(P),
+ {<<$w,0,"">>,''} = get_buf_and_term(P),
+ {<<$w,12,"ÅÄÖåäö"/utf8>>,'ÅÄÖåäö'} = get_buf_and_term(P),
+ {<<$w,12,"ÅÄÖåäö"/utf8>>,'ÅÄÖåäö'} = get_buf_and_term(P),
{<<107,0,3,"foo">>,"foo"} = get_buf_and_term(P),
{<<107,0,3,"foo">>,"foo"} = get_buf_and_term(P),
@@ -239,12 +239,12 @@ test_ei_encode_utf8_atom(Config) ->
P = runner:start(?test_ei_encode_utf8_atom),
{<<119,2,195,133>>,'Å'} = get_buf_and_term(P),
- {<<100,0,1,197>>,'Å'} = get_buf_and_term(P),
- {<<100,0,1,197>>,'Å'} = get_buf_and_term(P),
+ {<<119,2,195,133>>,'Å'} = get_buf_and_term(P),
+ {<<119,2,195,133>>,'Å'} = get_buf_and_term(P),
{<<119,2,195,133>>,'Å'} = get_buf_and_term(P),
{<<119,1,$A>>,'A'} = get_buf_and_term(P),
- {<<100,0,1,$A>>,'A'} = get_buf_and_term(P),
+ {<<119,1,$A>>,'A'} = get_buf_and_term(P),
runner:recv_eot(P),
ok.
@@ -254,13 +254,13 @@ test_ei_encode_utf8_atom_len(Config) ->
P = runner:start(?test_ei_encode_utf8_atom_len),
{<<119,2,195,133>>,'Å'} = get_buf_and_term(P),
- {<<100,0,2,197,196>>,'ÅÄ'} = get_buf_and_term(P),
- {<<100,0,1,197>>,'Å'} = get_buf_and_term(P),
+ {<<119,4,195,133,195,132>>,'ÅÄ'} = get_buf_and_term(P),
+ {<<119,2,195,133>>,'Å'} = get_buf_and_term(P),
{<<119,4,195,133,195,132>>,'ÅÄ'} = get_buf_and_term(P),
{<<119,1,$A>>,'A'} = get_buf_and_term(P),
- {<<100,0,2,$A,$B>>,'AB'} = get_buf_and_term(P),
- {<<100,0,255,_:(255*8)>>,_} = get_buf_and_term(P),
+ {<<119,2,$A,$B>>,'AB'} = get_buf_and_term(P),
+ {<<119,255,_:(255*8)>>,_} = get_buf_and_term(P),
runner:recv_eot(P),
ok.
diff --git a/lib/erl_interface/test/erl_ext_SUITE_data/ext_test.c b/lib/erl_interface/test/erl_ext_SUITE_data/ext_test.c
index 36cf086ed2..afac5485e8 100644
--- a/lib/erl_interface/test/erl_ext_SUITE_data/ext_test.c
+++ b/lib/erl_interface/test/erl_ext_SUITE_data/ext_test.c
@@ -407,7 +407,7 @@ test_compare_ext(char *test_desc,
}
-#define ATOM_EXT (100)
+#define SMALL_ATOM_UTF8_EXT (119)
#define REFERENCE_EXT (101)
#define PORT_EXT (102)
#define PID_EXT (103)
@@ -429,13 +429,13 @@ write_atom(unsigned char *buf, char *atom)
len = 0;
while(atom[len]) {
- buf[len + 3] = atom[len];
+ buf[len + 2] = atom[len];
len++;
}
- buf[0] = ATOM_EXT;
- PUT_UINT16(&buf[1], len);
+ buf[0] = SMALL_ATOM_UTF8_EXT;
+ buf[1] = len;
- return buf + 3 + len;
+ return buf + 2 + len;
}
static unsigned char *