From bfe7e45cda6591dc31405ed1be961f079cc541c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Mon, 3 Mar 2014 13:50:23 +0100 Subject: Don't fail compilation for modules that contain invalid UTF-8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default encoding for Erlang modules is now UTF-8, and the compilation would fail if a module contained byte sequences that are not valid UTF-8 sequences. In a large project with say many hundreds of Erlang modules with names of developers such as "Björn" or "Håkan" encoded in latin-1, that could mean that many hundreds of files would need to be modified just to get started testing OTP 17. As a temporary measure to ease the transition, automatically fall back to the latin-1 encoding with a warning for any module that contains invalid byte sequences and for which no encoding has been specified. The intention is to remove this workaround in OTP 18 or 19. --- lib/compiler/src/compile.erl | 59 ++++++++++++++++++++++++++++++------ lib/compiler/test/error_SUITE.erl | 28 +++++++++++++++-- lib/compiler/test/warnings_SUITE.erl | 38 +++++++++++++++++++++-- 3 files changed, 111 insertions(+), 14 deletions(-) (limited to 'lib') diff --git a/lib/compiler/src/compile.erl b/lib/compiler/src/compile.erl index 9030dd998b..c7d91070f6 100644 --- a/lib/compiler/src/compile.erl +++ b/lib/compiler/src/compile.erl @@ -234,7 +234,9 @@ format_error({crash,Pass,Reason}) -> format_error({bad_return,Pass,Reason}) -> io_lib:format("internal error in ~p;\nbad return value: ~ts", [Pass,format_error_reason(Reason)]); format_error({module_name,Mod,Filename}) -> - io_lib:format("Module name '~s' does not match file name '~ts'", [Mod,Filename]). + io_lib:format("Module name '~s' does not match file name '~ts'", [Mod,Filename]); +format_error(reparsing_invalid_unicode) -> + "Non-UTF-8 character(s) detected, but no encoding declared. Encode the file in UTF-8 or add \"%% coding: latin-1\" at the beginning of the file. Retrying with latin-1 encoding.". format_error_reason({Reason, Stack}) when is_list(Stack) -> StackFun = fun @@ -792,20 +794,59 @@ no_native_compilation(BeamFile, #compile{options=Opts0}) -> _ -> false end. -parse_module(St) -> - Opts = St#compile.options, - Cwd = ".", - IncludePath = [Cwd, St#compile.dir|inc_paths(Opts)], - R = epp:parse_file(St#compile.ifile, IncludePath, pre_defs(Opts)), +parse_module(St0) -> + case do_parse_module(utf8, St0) of + {ok,_}=Ret -> + Ret; + {error,_}=Ret -> + Ret; + {invalid_unicode,File,Line} -> + case do_parse_module(latin1, St0) of + {ok,St} -> + Es = [{File,[{Line,?MODULE,reparsing_invalid_unicode}]}], + {ok,St#compile{warnings=Es++St#compile.warnings}}; + {error,St} -> + Es = [{File,[{Line,?MODULE,reparsing_invalid_unicode}]}], + {error,St#compile{errors=Es++St#compile.errors}} + end + end. + +do_parse_module(DefEncoding, #compile{ifile=File,options=Opts,dir=Dir}=St) -> + R = epp:parse_file(File, + [{includes,[".",Dir|inc_paths(Opts)]}, + {macros,pre_defs(Opts)}, + {default_encoding,DefEncoding}, + extra]), case R of - {ok,Forms} -> - Encoding = epp:read_encoding(St#compile.ifile), - {ok,St#compile{code=Forms,encoding=Encoding}}; + {ok,Forms,Extra} -> + Encoding = proplists:get_value(encoding, Extra), + case find_invalid_unicode(Forms, File) of + none -> + {ok,St#compile{code=Forms,encoding=Encoding}}; + {invalid_unicode,_,_}=Ret -> + case Encoding of + none -> + Ret; + _ -> + {ok,St#compile{code=Forms,encoding=Encoding}} + end + end; {error,E} -> Es = [{St#compile.ifile,[{none,?MODULE,{epp,E}}]}], {error,St#compile{errors=St#compile.errors ++ Es}} end. +find_invalid_unicode([H|T], File0) -> + case H of + {attribute,_,file,{File,_}} -> + find_invalid_unicode(T, File); + {error,{Line,file_io_server,invalid_unicode}} -> + {invalid_unicode,File0,Line}; + _Other -> + find_invalid_unicode(T, File0) + end; +find_invalid_unicode([], _) -> none. + parse_core(St) -> case file:read_file(St#compile.ifile) of {ok,Bin} -> diff --git a/lib/compiler/test/error_SUITE.erl b/lib/compiler/test/error_SUITE.erl index 5cdf429a5f..bd877bb528 100644 --- a/lib/compiler/test/error_SUITE.erl +++ b/lib/compiler/test/error_SUITE.erl @@ -23,7 +23,7 @@ -export([all/0, suite/0,groups/0,init_per_suite/1, end_per_suite/1, init_per_group/2,end_per_group/2, head_mismatch_line/1,warnings_as_errors/1, bif_clashes/1, - transforms/1,forbidden_maps/1]). + transforms/1,forbidden_maps/1,bad_utf8/1]). %% Used by transforms/1 test case. -export([parse_transform/2]). @@ -36,7 +36,8 @@ all() -> groups() -> [{p,test_lib:parallel(), - [head_mismatch_line,warnings_as_errors,bif_clashes,transforms,forbidden_maps]}]. + [head_mismatch_line,warnings_as_errors,bif_clashes, + transforms,forbidden_maps,bad_utf8]}]. init_per_suite(Config) -> Config. @@ -254,6 +255,23 @@ forbidden_maps(Config) when is_list(Config) -> [] = run2(Config, Ts1), ok. +bad_utf8(Config) -> + Ts = [{bad_utf8, + %% If coding is specified explicitly as utf-8, there should be + %% a compilation error; we must not fallback to parsing the + %% file in latin-1 mode. + <<"%% coding: utf-8 + %% Bj",246,"rn + t() -> \"",246,"\". + ">>, + [], + {error,[{2,epp,cannot_parse}, + {2,file_io_server,invalid_unicode}], + []} + }], + [] = run2(Config, Ts), + ok. + run(Config, Tests) -> ?line File = test_filename(Config), @@ -318,6 +336,7 @@ run_test(Test0, File, Warnings, WriteBeam) -> ?line compile:file(File, [binary,report|Warnings]), %% Test result of compilation. + io:format("~p\n", [Opts]), ?line Res = case compile:file(File, Opts) of {ok,Mod,_,[{_File,Ws}]} -> %io:format("compile:file(~s,~p) ->~n~p~n", @@ -335,6 +354,11 @@ run_test(Test0, File, Warnings, WriteBeam) -> %io:format("compile:file(~s,~p) ->~n~p~n", % [File,Opts,_ZZ]), {error,Es,Ws}; + {error,[{XFile,Es1},{XFile,Es2}],Ws} = _ZZ + when is_list(XFile) -> + %io:format("compile:file(~s,~p) ->~n~p~n", + % [File,Opts,_ZZ]), + {error,Es1++Es2,Ws}; {error,Es,[{_File,Ws}]} = _ZZ-> %io:format("compile:file(~s,~p) ->~n~p~n", % [File,Opts,_ZZ]), diff --git a/lib/compiler/test/warnings_SUITE.erl b/lib/compiler/test/warnings_SUITE.erl index de56a59e12..c3b02819f9 100644 --- a/lib/compiler/test/warnings_SUITE.erl +++ b/lib/compiler/test/warnings_SUITE.erl @@ -37,8 +37,9 @@ -export([pattern/1,pattern2/1,pattern3/1,pattern4/1, guard/1,bad_arith/1,bool_cases/1,bad_apply/1, - files/1,effect/1,bin_opt_info/1,bin_construction/1, comprehensions/1, - maps/1,redundant_boolean_clauses/1]). + files/1,effect/1,bin_opt_info/1,bin_construction/1, + comprehensions/1,maps/1,redundant_boolean_clauses/1, + latin1_fallback/1]). % Default timetrap timeout (set in init_per_testcase). -define(default_timeout, ?t:minutes(2)). @@ -63,7 +64,7 @@ groups() -> [pattern,pattern2,pattern3,pattern4,guard, bad_arith,bool_cases,bad_apply,files,effect, bin_opt_info,bin_construction,comprehensions,maps, - redundant_boolean_clauses]}]. + redundant_boolean_clauses,latin1_fallback]}]. init_per_suite(Config) -> Config. @@ -591,6 +592,37 @@ redundant_boolean_clauses(Config) when is_list(Config) -> run(Config, Ts), ok. +latin1_fallback(Conf) when is_list(Conf) -> + DataDir = ?privdir, + IncFile = filename:join(DataDir, "include_me.hrl"), + file:write_file(IncFile, <<"%% ",246," in include file\n">>), + Ts1 = [{latin1_fallback1, + %% Test that the compiler fall backs to latin-1 with + %% a warning if a file has no encoding and does not + %% contain correct UTF-8 sequences. + <<"%% Bj",246,"rn + t(_) -> \"",246,"\"; + t(x) -> ok. + ">>, + [], + {warnings,[{1,compile,reparsing_invalid_unicode}, + {3,sys_core_fold,{nomatch_shadow,2}}]}}], + [] = run(Conf, Ts1), + + Ts2 = [{latin1_fallback2, + %% Test that the compiler fall backs to latin-1 with + %% a warning if a file has no encoding and does not + %% contain correct UTF-8 sequences. + <<" + + -include(\"include_me.hrl\"). + ">>, + [], + {warnings,[{1,compile,reparsing_invalid_unicode}]} + }], + [] = run(Conf, Ts2), + ok. + %%% %%% End of test cases. %%% -- cgit v1.2.3