aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDan Gudmundsson <[email protected]>2015-03-11 16:12:05 +0100
committerDan Gudmundsson <[email protected]>2015-03-11 16:12:05 +0100
commit7efa93c9dd4ce25c5754c1b72ec804e8172c0099 (patch)
treeea10d7114eb0cba466edc6917871655218c0569b
parent5d11a9f728de74b2df0c374658ce21ad318f4b93 (diff)
parentf896e24696f7d7ddb0c665751389fe2dc7ab8503 (diff)
downloadotp-7efa93c9dd4ce25c5754c1b72ec804e8172c0099.tar.gz
otp-7efa93c9dd4ce25c5754c1b72ec804e8172c0099.tar.bz2
otp-7efa93c9dd4ce25c5754c1b72ec804e8172c0099.zip
Merge branch 'dgud/kernel/unicode_file_read/OTP-12144'
* dgud/kernel/unicode_file_read/OTP-12144: kernel: Add test for unicode mode in file kernel: Fix file:read_line/1 unicode error handling
-rw-r--r--lib/kernel/src/file_io_server.erl114
-rw-r--r--lib/kernel/test/file_SUITE.erl152
2 files changed, 201 insertions, 65 deletions
diff --git a/lib/kernel/src/file_io_server.erl b/lib/kernel/src/file_io_server.erl
index 0e9ff5bc0f..7d30e7e1d8 100644
--- a/lib/kernel/src/file_io_server.erl
+++ b/lib/kernel/src/file_io_server.erl
@@ -307,18 +307,18 @@ io_request({get_chars,Enc,_Prompt,N},
#state{}=State) ->
get_chars(N, Enc, State);
-%%
-%% This optimization gives almost nothing - needs more working...
-%% Disabled for now. /PaN
-%%
-%% io_request({get_line,Enc,_Prompt},
-%% #state{unic=latin1}=State) ->
-%% get_line(Enc,State);
-
-io_request({get_line,Enc,_Prompt},
- #state{}=State) ->
- get_chars(io_lib, collect_line, [], Enc, State);
-
+io_request({get_line,OutEnc,_Prompt}, #state{buf=Buf, read_mode=Mode, unic=InEnc} = State0) ->
+ try
+ %% Minimize the encoding conversions
+ WorkEnc = case InEnc of
+ {_,_} -> OutEnc; %% utf16 or utf32
+ _ -> InEnc %% Byte oriented utf8 or latin1
+ end,
+ {Res, State} = get_line(start, convert_enc(Buf, InEnc, WorkEnc), WorkEnc, State0),
+ {reply, cast(Res, Mode, WorkEnc, OutEnc), State}
+ catch exit:ExError ->
+ {stop,ExError,{error,ExError},State0#state{buf= <<>>}}
+ end;
io_request({setopts, Opts},
#state{}=State) when is_list(Opts) ->
@@ -386,56 +386,40 @@ put_chars(Chars, InEncoding, #state{handle=Handle, unic=OutEncoding}=State) ->
{stop,normal,{error,{no_translation, InEncoding, OutEncoding}},State}
end.
-%%
-%% Process the I/O request get_line for latin1 encoding of file specially
-%% Unfortunately this function gives almost nothing, it needs more work
-%% I disable it for now /PaN
-%%
-%% srch(<<>>,_,_) ->
-%% nomatch;
-%% srch(<<X:8,_/binary>>,X,N) ->
-%% {match,N};
-%% srch(<<_:8,T/binary>>,X,N) ->
-%% srch(T,X,N+1).
-%% get_line(OutEnc, #state{handle=Handle,buf = <<>>,unic=latin1}=State) ->
-%% case ?PRIM_FILE:read(Handle,?READ_SIZE_BINARY) of
-%% {ok, B} ->
-%% get_line(OutEnc, State#state{buf = B});
-%% eof ->
-%% {reply,eof,State};
-%% {error,Reason}=Error ->
-%% {stop,Reason,Error,State}
-%% end;
-%% get_line(OutEnc, #state{handle=Handle,buf=Buf,read_mode=ReadMode,unic=latin1}=State) ->
-%% case srch(Buf,$\n,0) of
-%% nomatch ->
-%% case ?PRIM_FILE:read(Handle,?READ_SIZE_BINARY) of
-%% {ok, B} ->
-%% get_line(OutEnc,State#state{buf = <<Buf/binary,B/binary>>});
-%% eof ->
-%% std_reply(cast(Buf, ReadMode,latin1,OutEnc), State);
-%% {error,Reason}=Error ->
-%% {stop,Reason,Error,State#state{buf= <<>>}}
-%% end;
-%% {match,Pos} when Pos >= 1->
-%% PosP1 = Pos + 1,
-%% <<Res0:PosP1/binary,NewBuf/binary>> = Buf,
-%% PosM1 = Pos - 1,
-%% Res = case Res0 of
-%% <<Chomped:PosM1/binary,$\r:8,$\n:8>> ->
-%% cat(Chomped, <<"\n">>, ReadMode,latin1,OutEnc);
-%% _Other ->
-%% cast(Res0, ReadMode,latin1,OutEnc)
-%% end,
-%% {reply,Res,State#state{buf=NewBuf}};
-%% {match,Pos} ->
-%% PosP1 = Pos + 1,
-%% <<Res:PosP1/binary,NewBuf/binary>> = Buf,
-%% {reply,Res,State#state{buf=NewBuf}}
-%% end;
-%% get_line(_, #state{}=State) ->
-%% {error,{error,get_line},State}.
-
+get_line(S, {<<>>, Cont}, OutEnc,
+ #state{handle=Handle, read_mode=Mode, unic=InEnc}=State) ->
+ case ?PRIM_FILE:read(Handle, read_size(Mode)) of
+ {ok,Bin} ->
+ get_line(S, convert_enc([Cont, Bin], InEnc, OutEnc), OutEnc, State);
+ eof ->
+ get_line(S, {eof, Cont}, OutEnc, State);
+ {error,Reason}=Error ->
+ {stop,Reason,Error,State}
+ end;
+get_line(S0, {Buf, BCont}, OutEnc, #state{unic=InEnc}=State) ->
+ case io_lib:collect_line(S0, Buf, OutEnc, []) of
+ {stop, Result, Cont0} ->
+ %% Convert both buffers back to file InEnc encoding
+ {Cont, <<>>} = convert_enc(Cont0, OutEnc, InEnc),
+ {Result, State#state{buf=cast_binary([Cont, BCont])}};
+ S ->
+ get_line(S, {<<>>, BCont}, OutEnc, State)
+ end.
+
+convert_enc(Bins, Enc, Enc) ->
+ {cast_binary(Bins), <<>>};
+convert_enc(eof, _, _) ->
+ {<<>>, <<>>};
+convert_enc(Bin, InEnc, OutEnc) ->
+ case unicode:characters_to_binary(Bin, InEnc, OutEnc) of
+ Res when is_binary(Res) ->
+ {Res, <<>>};
+ {incomplete, Res, Cont} ->
+ {Res, Cont};
+ {error, _, _} ->
+ exit({no_translation, InEnc, OutEnc})
+ end.
+
%%
%% Process the I/O request get_chars
%%
@@ -640,8 +624,6 @@ invalid_unicode_error(Mod, Func, XtraArg, S) ->
%% Convert error code to make it look as before
err_func(io_lib, get_until, {_,F,_}) ->
- F;
-err_func(_, F, _) ->
F.
@@ -713,6 +695,8 @@ cat(B1, B2, list, latin1,_) ->
binary_to_list(B1)++binary_to_list(B2).
%% Cast binary to list or binary
+cast(eof, _, _, _) ->
+ eof;
cast(B, binary, latin1, latin1) ->
B;
cast(B, binary, InEncoding, OutEncoding) ->
@@ -736,6 +720,8 @@ cast(B, list, InEncoding, OutEncoding) ->
%% Convert buffer to binary
cast_binary(Binary) when is_binary(Binary) ->
Binary;
+cast_binary([<<>>|List]) ->
+ cast_binary(List);
cast_binary(List) when is_list(List) ->
list_to_binary(List);
cast_binary(_EOF) ->
diff --git a/lib/kernel/test/file_SUITE.erl b/lib/kernel/test/file_SUITE.erl
index 2ce2303ba3..1213d8e37e 100644
--- a/lib/kernel/test/file_SUITE.erl
+++ b/lib/kernel/test/file_SUITE.erl
@@ -93,6 +93,8 @@
-export([old_io_protocol/1]).
+-export([unicode_mode/1]).
+
%% Debug exports
-export([create_file_slow/2, create_file/2, create_bin/2]).
-export([verify_file/2, verify_bin/3]).
@@ -105,6 +107,7 @@
-include_lib("test_server/include/test_server.hrl").
-include_lib("kernel/include/file.hrl").
+-define(THROW_ERROR(RES), throw({fail, ?LINE, RES})).
suite() -> [{ct_hooks,[ts_install_cth]}].
@@ -116,7 +119,9 @@ all() ->
delayed_write, read_ahead, segment_read, segment_write,
ipread, pid2name, interleaved_read_write, otp_5814, otp_10852,
large_file, large_write, read_line_1, read_line_2, read_line_3,
- read_line_4, standard_io, old_io_protocol].
+ read_line_4, standard_io, old_io_protocol,
+ unicode_mode
+ ].
groups() ->
[{dirs, [], [make_del_dir, cur_dir_0, cur_dir_1,
@@ -347,8 +352,153 @@ old_io_protocol(Config) when is_list(Config) ->
[] = flush(),
ok.
+unicode_mode(suite) -> [];
+unicode_mode(doc) -> [""];
+unicode_mode(Config) ->
+ Dir = {dir, ?config(priv_dir,Config)},
+ OptVariants = [[Dir],
+ [Dir, {encoding, utf8}],
+ [Dir, binary],
+ [Dir, binary, {encoding, utf8}]
+ ],
+ ReadVariants = [{read, fun(Fd) -> um_read(Fd, fun(Fd1) -> file:read(Fd1, 1024) end) end},
+ {read_line, fun(Fd) -> um_read(Fd, fun(Fd1) -> file:read_line(Fd1) end) end}
+ %%{pread, fun(Fd) -> file:pread(Fd, 0, 1024) end},
+ %%{preadl, fun(Fd) -> file:pread(Fd, [{0, 1024}]) end},
+ ],
+
+ _ = [read_write_0("ASCII: list: Hello World", Read, Opt) ||
+ Opt <- OptVariants, Read <- ReadVariants],
+ _ = [read_write_0("LATIN1: list: åäöÅÄÖ", Read, Opt) ||
+ Opt <- OptVariants, Read <- ReadVariants],
+ _ = [read_write_0(<<"ASCII: bin: Hello World">>, Read, Opt) ||
+ Opt <- OptVariants, Read <- ReadVariants],
+ _ = [read_write_0(<<"LATIN1: bin: åäöÅÄÖ">>, Read, Opt) ||
+ Opt <- OptVariants, Read <- ReadVariants],
+ %% These will be double encoded if option is encoding utf-8
+ _ = [read_write_0(<<"UTF8: bin: Ωß"/utf8>>, Read, Opt) ||
+ Opt <- OptVariants, Read <- ReadVariants],
+ %% These should not work (with encoding set to utf-8)
+ %% according to file's documentation
+ _ = [read_write_0("UTF8: list: Ωß", Read, Opt) ||
+ Opt <- OptVariants, Read <- ReadVariants],
+ ok.
+
+read_write_0(Str, {Func, ReadFun}, Options) ->
+ try
+ Res = read_write_1(Str, ReadFun, Options),
+ io:format("~p: ~ts ~p '~p'~n", [Func, Str, tl(Options), Res]),
+ ok
+ catch {fail, Line, ReadBytes = [_|_]} ->
+ io:format("~p:~p: ~p ERROR: ~w vs~n ~w~n - ~p~n",
+ [?MODULE, Line, Func, Str, ReadBytes, Options]),
+ exit({error, ?LINE});
+ {fail, Line, ReadBytes} ->
+ io:format("~p:~p: ~p ERROR: ~ts vs~n ~w~n - ~p~n",
+ [?MODULE, Line, Func, Str, ReadBytes, Options]),
+ exit({error, ?LINE});
+ error:What ->
+ io:format("~p:??: ~p ERROR: ~p from~n ~w~n ~p~n",
+ [?MODULE, Func, What, Str, Options]),
+
+ io:format("\t~p~n", [erlang:get_stacktrace()]),
+ exit({error, ?LINE})
+ end.
+
+read_write_1(Str0, ReadFun, [{dir,Dir}|Options]) ->
+ File = um_filename(Str0, Dir, Options),
+ Pre = "line 1\n", Post = "\nlast line\n",
+ Str = case is_list(Str0) andalso lists:max(Str0) > 255 of
+ false -> %% Normal case Use options
+ {ok, FdW} = file:open(File, [write|Options]),
+ IO = [Pre, Str0, Post],
+ ok = file:write(FdW, IO),
+ case is_binary(Str0) of
+ true -> iolist_to_binary(IO);
+ false -> lists:append(IO)
+ end;
+ true -> %% Test unicode lists
+ {ok, FdW} = file:open(File, [write]),
+ Utf8 = unicode:characters_to_binary([Pre, Str0, Post]),
+ file:write(FdW, Utf8),
+ {unicode, Utf8}
+ end,
+ file:close(FdW),
+ {ok, FdR} = file:open(File, [read|Options]),
+ ReadRes = ReadFun(FdR),
+ file:close(FdR),
+ Res = um_check(Str, ReadRes, Options),
+ file:delete(File),
+ Res.
+um_read(Fd, Fun) ->
+ um_read(Fd, Fun, []).
+
+um_read(Fd, Fun, Acc) ->
+ case Fun(Fd) of
+ eof ->
+ case is_binary(hd(Acc)) of
+ true -> {ok, iolist_to_binary(lists:reverse(Acc))};
+ false -> {ok, lists:append(lists:reverse(Acc))}
+ end;
+ {ok, Data} ->
+ um_read(Fd, Fun, [Data|Acc]);
+ Error ->
+ Error
+ end.
+
+
+um_check(Str, {ok, Str}, _) -> ok;
+um_check(Bin, {ok, Res}, _Options) when is_binary(Bin), is_list(Res) ->
+ case list_to_binary(Res) of
+ Bin -> ok;
+ _ -> ?THROW_ERROR(Res)
+ end;
+um_check(Str, {ok, Res}, _Options) when is_list(Str), is_binary(Res) ->
+ case iolist_to_binary(Str) of
+ Res -> ok;
+ _ -> ?THROW_ERROR(Res)
+ end;
+um_check({unicode, Utf8Bin}, Res, Options) ->
+ um_check_unicode(Utf8Bin, Res,
+ proplists:get_value(binary, Options, false),
+ proplists:get_value(encoding, Options, none));
+um_check(_Str, Res, _Options) ->
+ ?THROW_ERROR(Res).
+
+um_check_unicode(Utf8Bin, {ok, Utf8Bin}, true, none) ->
+ ok;
+um_check_unicode(Utf8Bin, {ok, List = [_|_]}, false, none) ->
+ case binary_to_list(Utf8Bin) == List of
+ true -> ok;
+ false -> ?THROW_ERROR(List)
+ end;
+um_check_unicode(_Utf8Bin, {error, {no_translation, unicode, latin1}}, _, _) ->
+ no_translation;
+um_check_unicode(_Utf8Bin, Error = {error, _}, _, _Unicode) ->
+ ?THROW_ERROR(Error);
+um_check_unicode(_Utf8Bin, {ok, _ListOrBin}, _, _UTF8_) ->
+ %% List = if is_binary(ListOrBin) -> unicode:characters_to_list(ListOrBin);
+ %% true -> ListOrBin
+ %% end,
+ %% io:format("In: ~w~n", [binary_to_list(Utf8Bin)]),
+ %% io:format("Ut: ~w~n", [List]),
+ ?THROW_ERROR({shoud_be, no_translation}).
+
+um_filename(Bin, Dir, Options) when is_binary(Bin) ->
+ um_filename(binary_to_list(Bin), Dir, Options);
+um_filename(Str = [_|_], Dir, Options) ->
+ Name = hd(string:tokens(Str, ":")),
+ Enc = atom_to_list(proplists:get_value(encoding, Options, latin1)),
+ File = case lists:member(binary, Options) of
+ true ->
+ "test_" ++ Name ++ "_bin_enc_" ++ Enc;
+ false ->
+ "test_" ++ Name ++ "_list_enc_" ++ Enc
+ end,
+ filename:join(Dir, File).
+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
read_write_file(suite) -> [];