From 94fe1619da9d5e67e5f67189562408cf1c42e618 Mon Sep 17 00:00:00 2001 From: Dan Gudmundsson Date: Thu, 5 Mar 2015 15:07:02 +0100 Subject: kernel: Fix file:read_line/1 unicode error handling When a file was opened with some unicode encoding, file:read_line/1 could return unicode codepoints > 255 in list mode and wrong error message in bin mode. Chose to break out 'get_line' functionality from get_chars/5 since 'get_until' handling is different (comes from io module which should return unicode lists) and seems to have its own (doc?) problems. --- lib/kernel/src/file_io_server.erl | 114 +++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 64 deletions(-) diff --git a/lib/kernel/src/file_io_server.erl b/lib/kernel/src/file_io_server.erl index 0e9ff5bc0f..7d30e7e1d8 100644 --- a/lib/kernel/src/file_io_server.erl +++ b/lib/kernel/src/file_io_server.erl @@ -307,18 +307,18 @@ io_request({get_chars,Enc,_Prompt,N}, #state{}=State) -> get_chars(N, Enc, State); -%% -%% This optimization gives almost nothing - needs more working... -%% Disabled for now. /PaN -%% -%% io_request({get_line,Enc,_Prompt}, -%% #state{unic=latin1}=State) -> -%% get_line(Enc,State); - -io_request({get_line,Enc,_Prompt}, - #state{}=State) -> - get_chars(io_lib, collect_line, [], Enc, State); - +io_request({get_line,OutEnc,_Prompt}, #state{buf=Buf, read_mode=Mode, unic=InEnc} = State0) -> + try + %% Minimize the encoding conversions + WorkEnc = case InEnc of + {_,_} -> OutEnc; %% utf16 or utf32 + _ -> InEnc %% Byte oriented utf8 or latin1 + end, + {Res, State} = get_line(start, convert_enc(Buf, InEnc, WorkEnc), WorkEnc, State0), + {reply, cast(Res, Mode, WorkEnc, OutEnc), State} + catch exit:ExError -> + {stop,ExError,{error,ExError},State0#state{buf= <<>>}} + end; io_request({setopts, Opts}, #state{}=State) when is_list(Opts) -> @@ -386,56 +386,40 @@ put_chars(Chars, InEncoding, #state{handle=Handle, unic=OutEncoding}=State) -> {stop,normal,{error,{no_translation, InEncoding, OutEncoding}},State} end. -%% -%% Process the I/O request get_line for latin1 encoding of file specially -%% Unfortunately this function gives almost nothing, it needs more work -%% I disable it for now /PaN -%% -%% srch(<<>>,_,_) -> -%% nomatch; -%% srch(<>,X,N) -> -%% {match,N}; -%% srch(<<_:8,T/binary>>,X,N) -> -%% srch(T,X,N+1). -%% get_line(OutEnc, #state{handle=Handle,buf = <<>>,unic=latin1}=State) -> -%% case ?PRIM_FILE:read(Handle,?READ_SIZE_BINARY) of -%% {ok, B} -> -%% get_line(OutEnc, State#state{buf = B}); -%% eof -> -%% {reply,eof,State}; -%% {error,Reason}=Error -> -%% {stop,Reason,Error,State} -%% end; -%% get_line(OutEnc, #state{handle=Handle,buf=Buf,read_mode=ReadMode,unic=latin1}=State) -> -%% case srch(Buf,$\n,0) of -%% nomatch -> -%% case ?PRIM_FILE:read(Handle,?READ_SIZE_BINARY) of -%% {ok, B} -> -%% get_line(OutEnc,State#state{buf = <>}); -%% eof -> -%% std_reply(cast(Buf, ReadMode,latin1,OutEnc), State); -%% {error,Reason}=Error -> -%% {stop,Reason,Error,State#state{buf= <<>>}} -%% end; -%% {match,Pos} when Pos >= 1-> -%% PosP1 = Pos + 1, -%% <> = Buf, -%% PosM1 = Pos - 1, -%% Res = case Res0 of -%% <> -> -%% cat(Chomped, <<"\n">>, ReadMode,latin1,OutEnc); -%% _Other -> -%% cast(Res0, ReadMode,latin1,OutEnc) -%% end, -%% {reply,Res,State#state{buf=NewBuf}}; -%% {match,Pos} -> -%% PosP1 = Pos + 1, -%% <> = Buf, -%% {reply,Res,State#state{buf=NewBuf}} -%% end; -%% get_line(_, #state{}=State) -> -%% {error,{error,get_line},State}. - +get_line(S, {<<>>, Cont}, OutEnc, + #state{handle=Handle, read_mode=Mode, unic=InEnc}=State) -> + case ?PRIM_FILE:read(Handle, read_size(Mode)) of + {ok,Bin} -> + get_line(S, convert_enc([Cont, Bin], InEnc, OutEnc), OutEnc, State); + eof -> + get_line(S, {eof, Cont}, OutEnc, State); + {error,Reason}=Error -> + {stop,Reason,Error,State} + end; +get_line(S0, {Buf, BCont}, OutEnc, #state{unic=InEnc}=State) -> + case io_lib:collect_line(S0, Buf, OutEnc, []) of + {stop, Result, Cont0} -> + %% Convert both buffers back to file InEnc encoding + {Cont, <<>>} = convert_enc(Cont0, OutEnc, InEnc), + {Result, State#state{buf=cast_binary([Cont, BCont])}}; + S -> + get_line(S, {<<>>, BCont}, OutEnc, State) + end. + +convert_enc(Bins, Enc, Enc) -> + {cast_binary(Bins), <<>>}; +convert_enc(eof, _, _) -> + {<<>>, <<>>}; +convert_enc(Bin, InEnc, OutEnc) -> + case unicode:characters_to_binary(Bin, InEnc, OutEnc) of + Res when is_binary(Res) -> + {Res, <<>>}; + {incomplete, Res, Cont} -> + {Res, Cont}; + {error, _, _} -> + exit({no_translation, InEnc, OutEnc}) + end. + %% %% Process the I/O request get_chars %% @@ -640,8 +624,6 @@ invalid_unicode_error(Mod, Func, XtraArg, S) -> %% Convert error code to make it look as before err_func(io_lib, get_until, {_,F,_}) -> - F; -err_func(_, F, _) -> F. @@ -713,6 +695,8 @@ cat(B1, B2, list, latin1,_) -> binary_to_list(B1)++binary_to_list(B2). %% Cast binary to list or binary +cast(eof, _, _, _) -> + eof; cast(B, binary, latin1, latin1) -> B; cast(B, binary, InEncoding, OutEncoding) -> @@ -736,6 +720,8 @@ cast(B, list, InEncoding, OutEncoding) -> %% Convert buffer to binary cast_binary(Binary) when is_binary(Binary) -> Binary; +cast_binary([<<>>|List]) -> + cast_binary(List); cast_binary(List) when is_list(List) -> list_to_binary(List); cast_binary(_EOF) -> -- cgit v1.2.3 From f896e24696f7d7ddb0c665751389fe2dc7ab8503 Mon Sep 17 00:00:00 2001 From: Dan Gudmundsson Date: Tue, 10 Mar 2015 07:58:16 +0100 Subject: kernel: Add test for unicode mode in file --- lib/kernel/test/file_SUITE.erl | 152 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 151 insertions(+), 1 deletion(-) diff --git a/lib/kernel/test/file_SUITE.erl b/lib/kernel/test/file_SUITE.erl index 2ce2303ba3..1213d8e37e 100644 --- a/lib/kernel/test/file_SUITE.erl +++ b/lib/kernel/test/file_SUITE.erl @@ -93,6 +93,8 @@ -export([old_io_protocol/1]). +-export([unicode_mode/1]). + %% Debug exports -export([create_file_slow/2, create_file/2, create_bin/2]). -export([verify_file/2, verify_bin/3]). @@ -105,6 +107,7 @@ -include_lib("test_server/include/test_server.hrl"). -include_lib("kernel/include/file.hrl"). +-define(THROW_ERROR(RES), throw({fail, ?LINE, RES})). suite() -> [{ct_hooks,[ts_install_cth]}]. @@ -116,7 +119,9 @@ all() -> delayed_write, read_ahead, segment_read, segment_write, ipread, pid2name, interleaved_read_write, otp_5814, otp_10852, large_file, large_write, read_line_1, read_line_2, read_line_3, - read_line_4, standard_io, old_io_protocol]. + read_line_4, standard_io, old_io_protocol, + unicode_mode + ]. groups() -> [{dirs, [], [make_del_dir, cur_dir_0, cur_dir_1, @@ -347,8 +352,153 @@ old_io_protocol(Config) when is_list(Config) -> [] = flush(), ok. +unicode_mode(suite) -> []; +unicode_mode(doc) -> [""]; +unicode_mode(Config) -> + Dir = {dir, ?config(priv_dir,Config)}, + OptVariants = [[Dir], + [Dir, {encoding, utf8}], + [Dir, binary], + [Dir, binary, {encoding, utf8}] + ], + ReadVariants = [{read, fun(Fd) -> um_read(Fd, fun(Fd1) -> file:read(Fd1, 1024) end) end}, + {read_line, fun(Fd) -> um_read(Fd, fun(Fd1) -> file:read_line(Fd1) end) end} + %%{pread, fun(Fd) -> file:pread(Fd, 0, 1024) end}, + %%{preadl, fun(Fd) -> file:pread(Fd, [{0, 1024}]) end}, + ], + + _ = [read_write_0("ASCII: list: Hello World", Read, Opt) || + Opt <- OptVariants, Read <- ReadVariants], + _ = [read_write_0("LATIN1: list: åäöÅÄÖ", Read, Opt) || + Opt <- OptVariants, Read <- ReadVariants], + _ = [read_write_0(<<"ASCII: bin: Hello World">>, Read, Opt) || + Opt <- OptVariants, Read <- ReadVariants], + _ = [read_write_0(<<"LATIN1: bin: åäöÅÄÖ">>, Read, Opt) || + Opt <- OptVariants, Read <- ReadVariants], + %% These will be double encoded if option is encoding utf-8 + _ = [read_write_0(<<"UTF8: bin: Ωß"/utf8>>, Read, Opt) || + Opt <- OptVariants, Read <- ReadVariants], + %% These should not work (with encoding set to utf-8) + %% according to file's documentation + _ = [read_write_0("UTF8: list: Ωß", Read, Opt) || + Opt <- OptVariants, Read <- ReadVariants], + ok. + +read_write_0(Str, {Func, ReadFun}, Options) -> + try + Res = read_write_1(Str, ReadFun, Options), + io:format("~p: ~ts ~p '~p'~n", [Func, Str, tl(Options), Res]), + ok + catch {fail, Line, ReadBytes = [_|_]} -> + io:format("~p:~p: ~p ERROR: ~w vs~n ~w~n - ~p~n", + [?MODULE, Line, Func, Str, ReadBytes, Options]), + exit({error, ?LINE}); + {fail, Line, ReadBytes} -> + io:format("~p:~p: ~p ERROR: ~ts vs~n ~w~n - ~p~n", + [?MODULE, Line, Func, Str, ReadBytes, Options]), + exit({error, ?LINE}); + error:What -> + io:format("~p:??: ~p ERROR: ~p from~n ~w~n ~p~n", + [?MODULE, Func, What, Str, Options]), + + io:format("\t~p~n", [erlang:get_stacktrace()]), + exit({error, ?LINE}) + end. + +read_write_1(Str0, ReadFun, [{dir,Dir}|Options]) -> + File = um_filename(Str0, Dir, Options), + Pre = "line 1\n", Post = "\nlast line\n", + Str = case is_list(Str0) andalso lists:max(Str0) > 255 of + false -> %% Normal case Use options + {ok, FdW} = file:open(File, [write|Options]), + IO = [Pre, Str0, Post], + ok = file:write(FdW, IO), + case is_binary(Str0) of + true -> iolist_to_binary(IO); + false -> lists:append(IO) + end; + true -> %% Test unicode lists + {ok, FdW} = file:open(File, [write]), + Utf8 = unicode:characters_to_binary([Pre, Str0, Post]), + file:write(FdW, Utf8), + {unicode, Utf8} + end, + file:close(FdW), + {ok, FdR} = file:open(File, [read|Options]), + ReadRes = ReadFun(FdR), + file:close(FdR), + Res = um_check(Str, ReadRes, Options), + file:delete(File), + Res. +um_read(Fd, Fun) -> + um_read(Fd, Fun, []). + +um_read(Fd, Fun, Acc) -> + case Fun(Fd) of + eof -> + case is_binary(hd(Acc)) of + true -> {ok, iolist_to_binary(lists:reverse(Acc))}; + false -> {ok, lists:append(lists:reverse(Acc))} + end; + {ok, Data} -> + um_read(Fd, Fun, [Data|Acc]); + Error -> + Error + end. + + +um_check(Str, {ok, Str}, _) -> ok; +um_check(Bin, {ok, Res}, _Options) when is_binary(Bin), is_list(Res) -> + case list_to_binary(Res) of + Bin -> ok; + _ -> ?THROW_ERROR(Res) + end; +um_check(Str, {ok, Res}, _Options) when is_list(Str), is_binary(Res) -> + case iolist_to_binary(Str) of + Res -> ok; + _ -> ?THROW_ERROR(Res) + end; +um_check({unicode, Utf8Bin}, Res, Options) -> + um_check_unicode(Utf8Bin, Res, + proplists:get_value(binary, Options, false), + proplists:get_value(encoding, Options, none)); +um_check(_Str, Res, _Options) -> + ?THROW_ERROR(Res). + +um_check_unicode(Utf8Bin, {ok, Utf8Bin}, true, none) -> + ok; +um_check_unicode(Utf8Bin, {ok, List = [_|_]}, false, none) -> + case binary_to_list(Utf8Bin) == List of + true -> ok; + false -> ?THROW_ERROR(List) + end; +um_check_unicode(_Utf8Bin, {error, {no_translation, unicode, latin1}}, _, _) -> + no_translation; +um_check_unicode(_Utf8Bin, Error = {error, _}, _, _Unicode) -> + ?THROW_ERROR(Error); +um_check_unicode(_Utf8Bin, {ok, _ListOrBin}, _, _UTF8_) -> + %% List = if is_binary(ListOrBin) -> unicode:characters_to_list(ListOrBin); + %% true -> ListOrBin + %% end, + %% io:format("In: ~w~n", [binary_to_list(Utf8Bin)]), + %% io:format("Ut: ~w~n", [List]), + ?THROW_ERROR({shoud_be, no_translation}). + +um_filename(Bin, Dir, Options) when is_binary(Bin) -> + um_filename(binary_to_list(Bin), Dir, Options); +um_filename(Str = [_|_], Dir, Options) -> + Name = hd(string:tokens(Str, ":")), + Enc = atom_to_list(proplists:get_value(encoding, Options, latin1)), + File = case lists:member(binary, Options) of + true -> + "test_" ++ Name ++ "_bin_enc_" ++ Enc; + false -> + "test_" ++ Name ++ "_list_enc_" ++ Enc + end, + filename:join(Dir, File). + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% read_write_file(suite) -> []; -- cgit v1.2.3