aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBjörn Gustavsson <[email protected]>2017-02-01 16:25:47 +0100
committerBjörn Gustavsson <[email protected]>2017-02-02 12:18:54 +0100
commitdb442323e9e86528edeb7226d55404e290b088b3 (patch)
tree58248d652e7878947f2e75d1627a9fccecd35bb9
parentcce3120dd0021c5ab5bf8d5b4088e7364f678dda (diff)
downloadotp-db442323e9e86528edeb7226d55404e290b088b3.tar.gz
otp-db442323e9e86528edeb7226d55404e290b088b3.tar.bz2
otp-db442323e9e86528edeb7226d55404e290b088b3.zip
Make "~s" fail for Unicode atoms
26b59dfe67e introduced support for arbitrary Unicode characters in atoms. After that commit, it is possible to print any atom with a "~s" format string: 1> io:format("~s\n", ['спутник']). спутник Note that the same text as a string will fail: 2> io:format("~s\n", ["спутник"]). ** exception error: bad argument in function io:format/3 called as io:format(<0.53.0>,"~s\n", [[1089,1087,1091,1090,1085,1080,1082]]) Being more permissive for atoms is probably beneficial for io:format/2. However, for io_lib:format/2, the new behavior breaks this guarantee in the documentation for io_lib:format/2: If and only if the Unicode translation modifier is used in the format string (that is, ~ts or ~tc), the resulting list can contain characters beyond the ISO Latin-1 character range (that is, numbers > 255). The problem is that you can no longer be sure whether io_lib:format/2 will return an iolist that can be successfully passed to a port or iolist_to_binary/1. We see three solutions: 1. Keep the new behavior. That means that you can get non-iolist data when you use ~s for printing an atom, but a 'badarg' when printing Unicode strings. That is inconsistent, and it delays error detection if the result is passed to a port or iolist_to_binary/1. 2. Always allow Unicode characters for ~s. That would be incompatible, because ~s says that any binary is encoded in latin1, while ~ts says that any binary is encoded in UTF-8. To implement this solution, we could no longer support latin1 binaries; all binaries would have to be encoded in UTF-8. 3. Only allow ~s for atoms where all characters are less than 256. Require ~ts to print atoms such as 'спутник'. We reject solution 1 because it is slightly incompatible and is inconsistent. We reject solution 2 because it is too incompatible. Therefore, this commit implements solution 3.
-rw-r--r--lib/stdlib/src/io_lib_format.erl5
-rw-r--r--lib/stdlib/test/io_SUITE.erl25
2 files changed, 27 insertions, 3 deletions
diff --git a/lib/stdlib/src/io_lib_format.erl b/lib/stdlib/src/io_lib_format.erl
index c7b75961cb..3113767614 100644
--- a/lib/stdlib/src/io_lib_format.erl
+++ b/lib/stdlib/src/io_lib_format.erl
@@ -265,7 +265,10 @@ control($W, [A,Depth], F, Adj, P, Pad, _Enc, _Str, _I) when is_integer(Depth) ->
term(io_lib:write(A, Depth), F, Adj, P, Pad);
control($P, [A,Depth], F, Adj, P, Pad, Enc, Str, I) when is_integer(Depth) ->
print(A, Depth, F, Adj, P, Pad, Enc, Str, I);
-control($s, [A], F, Adj, P, Pad, _Enc, _Str, _I) when is_atom(A) ->
+control($s, [A], F, Adj, P, Pad, latin1, _Str, _I) when is_atom(A) ->
+ L = iolist_to_chars(atom_to_list(A)),
+ string(L, F, Adj, P, Pad);
+control($s, [A], F, Adj, P, Pad, unicode, _Str, _I) when is_atom(A) ->
string(atom_to_list(A), F, Adj, P, Pad);
control($s, [L0], F, Adj, P, Pad, latin1, _Str, _I) ->
L = iolist_to_chars(L0),
diff --git a/lib/stdlib/test/io_SUITE.erl b/lib/stdlib/test/io_SUITE.erl
index 7d48cbc97c..b0a1e461e3 100644
--- a/lib/stdlib/test/io_SUITE.erl
+++ b/lib/stdlib/test/io_SUITE.erl
@@ -30,7 +30,7 @@
io_lib_print_binary_depth_one/1, otp_10302/1, otp_10755/1,
otp_10836/1, io_lib_width_too_small/1,
io_with_huge_message_queue/1, format_string/1,
- maps/1, coverage/1]).
+ maps/1, coverage/1, otp_14178_unicode_atoms/1]).
-export([pretty/2]).
@@ -61,7 +61,7 @@ all() ->
printable_range, bad_printable_range,
io_lib_print_binary_depth_one, otp_10302, otp_10755, otp_10836,
io_lib_width_too_small, io_with_huge_message_queue,
- format_string, maps, coverage].
+ format_string, maps, coverage, otp_14178_unicode_atoms].
%% Error cases for output.
error_1(Config) when is_list(Config) ->
@@ -2106,3 +2106,24 @@ coverage(_Config) ->
io:format("~s\n", [S2]),
ok.
+
+%% Test UTF-8 atoms.
+otp_14178_unicode_atoms(_Config) ->
+ "atom" = fmt("~ts", ['atom']),
+ "кирилли́ческий атом" = fmt("~ts", ['кирилли́ческий атом']),
+ [16#10FFFF] = fmt("~ts", ['\x{10FFFF}']),
+
+ %% ~s must not accept code points greater than 255.
+ bad_io_lib_format("~s", ['\x{100}']),
+ bad_io_lib_format("~s", ['кирилли́ческий атом']),
+
+ ok.
+
+bad_io_lib_format(F, S) ->
+ try io_lib:format(F, S) of
+ _ ->
+ ct:fail({should_fail,F,S})
+ catch
+ error:badarg ->
+ ok
+ end.