From 94fe1619da9d5e67e5f67189562408cf1c42e618 Mon Sep 17 00:00:00 2001 From: Dan Gudmundsson Date: Thu, 5 Mar 2015 15:07:02 +0100 Subject: kernel: Fix file:read_line/1 unicode error handling When a file was opened with some unicode encoding, file:read_line/1 could return unicode codepoints > 255 in list mode and wrong error message in bin mode. Chose to break out 'get_line' functionality from get_chars/5 since 'get_until' handling is different (comes from io module which should return unicode lists) and seems to have its own (doc?) problems. --- lib/kernel/src/file_io_server.erl | 114 +++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 64 deletions(-) (limited to 'lib') diff --git a/lib/kernel/src/file_io_server.erl b/lib/kernel/src/file_io_server.erl index 0e9ff5bc0f..7d30e7e1d8 100644 --- a/lib/kernel/src/file_io_server.erl +++ b/lib/kernel/src/file_io_server.erl @@ -307,18 +307,18 @@ io_request({get_chars,Enc,_Prompt,N}, #state{}=State) -> get_chars(N, Enc, State); -%% -%% This optimization gives almost nothing - needs more working... -%% Disabled for now. /PaN -%% -%% io_request({get_line,Enc,_Prompt}, -%% #state{unic=latin1}=State) -> -%% get_line(Enc,State); - -io_request({get_line,Enc,_Prompt}, - #state{}=State) -> - get_chars(io_lib, collect_line, [], Enc, State); - +io_request({get_line,OutEnc,_Prompt}, #state{buf=Buf, read_mode=Mode, unic=InEnc} = State0) -> + try + %% Minimize the encoding conversions + WorkEnc = case InEnc of + {_,_} -> OutEnc; %% utf16 or utf32 + _ -> InEnc %% Byte oriented utf8 or latin1 + end, + {Res, State} = get_line(start, convert_enc(Buf, InEnc, WorkEnc), WorkEnc, State0), + {reply, cast(Res, Mode, WorkEnc, OutEnc), State} + catch exit:ExError -> + {stop,ExError,{error,ExError},State0#state{buf= <<>>}} + end; io_request({setopts, Opts}, #state{}=State) when is_list(Opts) -> @@ -386,56 +386,40 @@ put_chars(Chars, InEncoding, #state{handle=Handle, unic=OutEncoding}=State) -> {stop,normal,{error,{no_translation, InEncoding, OutEncoding}},State} end. -%% -%% Process the I/O request get_line for latin1 encoding of file specially -%% Unfortunately this function gives almost nothing, it needs more work -%% I disable it for now /PaN -%% -%% srch(<<>>,_,_) -> -%% nomatch; -%% srch(<>,X,N) -> -%% {match,N}; -%% srch(<<_:8,T/binary>>,X,N) -> -%% srch(T,X,N+1). -%% get_line(OutEnc, #state{handle=Handle,buf = <<>>,unic=latin1}=State) -> -%% case ?PRIM_FILE:read(Handle,?READ_SIZE_BINARY) of -%% {ok, B} -> -%% get_line(OutEnc, State#state{buf = B}); -%% eof -> -%% {reply,eof,State}; -%% {error,Reason}=Error -> -%% {stop,Reason,Error,State} -%% end; -%% get_line(OutEnc, #state{handle=Handle,buf=Buf,read_mode=ReadMode,unic=latin1}=State) -> -%% case srch(Buf,$\n,0) of -%% nomatch -> -%% case ?PRIM_FILE:read(Handle,?READ_SIZE_BINARY) of -%% {ok, B} -> -%% get_line(OutEnc,State#state{buf = <>}); -%% eof -> -%% std_reply(cast(Buf, ReadMode,latin1,OutEnc), State); -%% {error,Reason}=Error -> -%% {stop,Reason,Error,State#state{buf= <<>>}} -%% end; -%% {match,Pos} when Pos >= 1-> -%% PosP1 = Pos + 1, -%% <> = Buf, -%% PosM1 = Pos - 1, -%% Res = case Res0 of -%% <> -> -%% cat(Chomped, <<"\n">>, ReadMode,latin1,OutEnc); -%% _Other -> -%% cast(Res0, ReadMode,latin1,OutEnc) -%% end, -%% {reply,Res,State#state{buf=NewBuf}}; -%% {match,Pos} -> -%% PosP1 = Pos + 1, -%% <> = Buf, -%% {reply,Res,State#state{buf=NewBuf}} -%% end; -%% get_line(_, #state{}=State) -> -%% {error,{error,get_line},State}. - +get_line(S, {<<>>, Cont}, OutEnc, + #state{handle=Handle, read_mode=Mode, unic=InEnc}=State) -> + case ?PRIM_FILE:read(Handle, read_size(Mode)) of + {ok,Bin} -> + get_line(S, convert_enc([Cont, Bin], InEnc, OutEnc), OutEnc, State); + eof -> + get_line(S, {eof, Cont}, OutEnc, State); + {error,Reason}=Error -> + {stop,Reason,Error,State} + end; +get_line(S0, {Buf, BCont}, OutEnc, #state{unic=InEnc}=State) -> + case io_lib:collect_line(S0, Buf, OutEnc, []) of + {stop, Result, Cont0} -> + %% Convert both buffers back to file InEnc encoding + {Cont, <<>>} = convert_enc(Cont0, OutEnc, InEnc), + {Result, State#state{buf=cast_binary([Cont, BCont])}}; + S -> + get_line(S, {<<>>, BCont}, OutEnc, State) + end. + +convert_enc(Bins, Enc, Enc) -> + {cast_binary(Bins), <<>>}; +convert_enc(eof, _, _) -> + {<<>>, <<>>}; +convert_enc(Bin, InEnc, OutEnc) -> + case unicode:characters_to_binary(Bin, InEnc, OutEnc) of + Res when is_binary(Res) -> + {Res, <<>>}; + {incomplete, Res, Cont} -> + {Res, Cont}; + {error, _, _} -> + exit({no_translation, InEnc, OutEnc}) + end. + %% %% Process the I/O request get_chars %% @@ -640,8 +624,6 @@ invalid_unicode_error(Mod, Func, XtraArg, S) -> %% Convert error code to make it look as before err_func(io_lib, get_until, {_,F,_}) -> - F; -err_func(_, F, _) -> F. @@ -713,6 +695,8 @@ cat(B1, B2, list, latin1,_) -> binary_to_list(B1)++binary_to_list(B2). %% Cast binary to list or binary +cast(eof, _, _, _) -> + eof; cast(B, binary, latin1, latin1) -> B; cast(B, binary, InEncoding, OutEncoding) -> @@ -736,6 +720,8 @@ cast(B, list, InEncoding, OutEncoding) -> %% Convert buffer to binary cast_binary(Binary) when is_binary(Binary) -> Binary; +cast_binary([<<>>|List]) -> + cast_binary(List); cast_binary(List) when is_list(List) -> list_to_binary(List); cast_binary(_EOF) -> -- cgit v1.2.3