diff options
author | Björn Gustavsson <bjorn@erlang.org> | 2014-03-03 13:50:23 +0100 |
---|---|---|
committer | Björn Gustavsson <bjorn@erlang.org> | 2014-03-18 17:47:51 +0100 |
commit | bfe7e45cda6591dc31405ed1be961f079cc541c9 (patch) | |
tree | 45c894c1fb3a291d6ab7e5e083b648640554e540 /lib | |
parent | f3cee0e9f409c5850709f11ba15cec22d7387401 (diff) | |
download | otp-bfe7e45cda6591dc31405ed1be961f079cc541c9.tar.gz otp-bfe7e45cda6591dc31405ed1be961f079cc541c9.tar.bz2 otp-bfe7e45cda6591dc31405ed1be961f079cc541c9.zip |
Don't fail compilation for modules that contain invalid UTF-8
The default encoding for Erlang modules is now UTF-8, and the
compilation would fail if a module contained byte sequences that
are not valid UTF-8 sequences.
In a large project with say many hundreds of Erlang modules
with names of developers such as "Björn" or "Håkan" encoded in
latin-1, that could mean that many hundreds of files would need
to be modified just to get started testing OTP 17.
As a temporary measure to ease the transition, automatically
fall back to the latin-1 encoding with a warning for any module
that contains invalid byte sequences and for which no encoding
has been specified.
The intention is to remove this workaround in OTP 18 or 19.
Diffstat (limited to 'lib')
-rw-r--r-- | lib/compiler/src/compile.erl | 59 | ||||
-rw-r--r-- | lib/compiler/test/error_SUITE.erl | 28 | ||||
-rw-r--r-- | lib/compiler/test/warnings_SUITE.erl | 38 |
3 files changed, 111 insertions, 14 deletions
diff --git a/lib/compiler/src/compile.erl b/lib/compiler/src/compile.erl index 9030dd998b..c7d91070f6 100644 --- a/lib/compiler/src/compile.erl +++ b/lib/compiler/src/compile.erl @@ -234,7 +234,9 @@ format_error({crash,Pass,Reason}) -> format_error({bad_return,Pass,Reason}) -> io_lib:format("internal error in ~p;\nbad return value: ~ts", [Pass,format_error_reason(Reason)]); format_error({module_name,Mod,Filename}) -> - io_lib:format("Module name '~s' does not match file name '~ts'", [Mod,Filename]). + io_lib:format("Module name '~s' does not match file name '~ts'", [Mod,Filename]); +format_error(reparsing_invalid_unicode) -> + "Non-UTF-8 character(s) detected, but no encoding declared. Encode the file in UTF-8 or add \"%% coding: latin-1\" at the beginning of the file. Retrying with latin-1 encoding.". format_error_reason({Reason, Stack}) when is_list(Stack) -> StackFun = fun @@ -792,20 +794,59 @@ no_native_compilation(BeamFile, #compile{options=Opts0}) -> _ -> false end. -parse_module(St) -> - Opts = St#compile.options, - Cwd = ".", - IncludePath = [Cwd, St#compile.dir|inc_paths(Opts)], - R = epp:parse_file(St#compile.ifile, IncludePath, pre_defs(Opts)), +parse_module(St0) -> + case do_parse_module(utf8, St0) of + {ok,_}=Ret -> + Ret; + {error,_}=Ret -> + Ret; + {invalid_unicode,File,Line} -> + case do_parse_module(latin1, St0) of + {ok,St} -> + Es = [{File,[{Line,?MODULE,reparsing_invalid_unicode}]}], + {ok,St#compile{warnings=Es++St#compile.warnings}}; + {error,St} -> + Es = [{File,[{Line,?MODULE,reparsing_invalid_unicode}]}], + {error,St#compile{errors=Es++St#compile.errors}} + end + end. + +do_parse_module(DefEncoding, #compile{ifile=File,options=Opts,dir=Dir}=St) -> + R = epp:parse_file(File, + [{includes,[".",Dir|inc_paths(Opts)]}, + {macros,pre_defs(Opts)}, + {default_encoding,DefEncoding}, + extra]), case R of - {ok,Forms} -> - Encoding = epp:read_encoding(St#compile.ifile), - {ok,St#compile{code=Forms,encoding=Encoding}}; + {ok,Forms,Extra} -> + Encoding = proplists:get_value(encoding, Extra), + case find_invalid_unicode(Forms, File) of + none -> + {ok,St#compile{code=Forms,encoding=Encoding}}; + {invalid_unicode,_,_}=Ret -> + case Encoding of + none -> + Ret; + _ -> + {ok,St#compile{code=Forms,encoding=Encoding}} + end + end; {error,E} -> Es = [{St#compile.ifile,[{none,?MODULE,{epp,E}}]}], {error,St#compile{errors=St#compile.errors ++ Es}} end. +find_invalid_unicode([H|T], File0) -> + case H of + {attribute,_,file,{File,_}} -> + find_invalid_unicode(T, File); + {error,{Line,file_io_server,invalid_unicode}} -> + {invalid_unicode,File0,Line}; + _Other -> + find_invalid_unicode(T, File0) + end; +find_invalid_unicode([], _) -> none. + parse_core(St) -> case file:read_file(St#compile.ifile) of {ok,Bin} -> diff --git a/lib/compiler/test/error_SUITE.erl b/lib/compiler/test/error_SUITE.erl index 5cdf429a5f..bd877bb528 100644 --- a/lib/compiler/test/error_SUITE.erl +++ b/lib/compiler/test/error_SUITE.erl @@ -23,7 +23,7 @@ -export([all/0, suite/0,groups/0,init_per_suite/1, end_per_suite/1, init_per_group/2,end_per_group/2, head_mismatch_line/1,warnings_as_errors/1, bif_clashes/1, - transforms/1,forbidden_maps/1]). + transforms/1,forbidden_maps/1,bad_utf8/1]). %% Used by transforms/1 test case. -export([parse_transform/2]). @@ -36,7 +36,8 @@ all() -> groups() -> [{p,test_lib:parallel(), - [head_mismatch_line,warnings_as_errors,bif_clashes,transforms,forbidden_maps]}]. + [head_mismatch_line,warnings_as_errors,bif_clashes, + transforms,forbidden_maps,bad_utf8]}]. init_per_suite(Config) -> Config. @@ -254,6 +255,23 @@ forbidden_maps(Config) when is_list(Config) -> [] = run2(Config, Ts1), ok. +bad_utf8(Config) -> + Ts = [{bad_utf8, + %% If coding is specified explicitly as utf-8, there should be + %% a compilation error; we must not fallback to parsing the + %% file in latin-1 mode. + <<"%% coding: utf-8 + %% Bj",246,"rn + t() -> \"",246,"\". + ">>, + [], + {error,[{2,epp,cannot_parse}, + {2,file_io_server,invalid_unicode}], + []} + }], + [] = run2(Config, Ts), + ok. + run(Config, Tests) -> ?line File = test_filename(Config), @@ -318,6 +336,7 @@ run_test(Test0, File, Warnings, WriteBeam) -> ?line compile:file(File, [binary,report|Warnings]), %% Test result of compilation. + io:format("~p\n", [Opts]), ?line Res = case compile:file(File, Opts) of {ok,Mod,_,[{_File,Ws}]} -> %io:format("compile:file(~s,~p) ->~n~p~n", @@ -335,6 +354,11 @@ run_test(Test0, File, Warnings, WriteBeam) -> %io:format("compile:file(~s,~p) ->~n~p~n", % [File,Opts,_ZZ]), {error,Es,Ws}; + {error,[{XFile,Es1},{XFile,Es2}],Ws} = _ZZ + when is_list(XFile) -> + %io:format("compile:file(~s,~p) ->~n~p~n", + % [File,Opts,_ZZ]), + {error,Es1++Es2,Ws}; {error,Es,[{_File,Ws}]} = _ZZ-> %io:format("compile:file(~s,~p) ->~n~p~n", % [File,Opts,_ZZ]), diff --git a/lib/compiler/test/warnings_SUITE.erl b/lib/compiler/test/warnings_SUITE.erl index de56a59e12..c3b02819f9 100644 --- a/lib/compiler/test/warnings_SUITE.erl +++ b/lib/compiler/test/warnings_SUITE.erl @@ -37,8 +37,9 @@ -export([pattern/1,pattern2/1,pattern3/1,pattern4/1, guard/1,bad_arith/1,bool_cases/1,bad_apply/1, - files/1,effect/1,bin_opt_info/1,bin_construction/1, comprehensions/1, - maps/1,redundant_boolean_clauses/1]). + files/1,effect/1,bin_opt_info/1,bin_construction/1, + comprehensions/1,maps/1,redundant_boolean_clauses/1, + latin1_fallback/1]). % Default timetrap timeout (set in init_per_testcase). -define(default_timeout, ?t:minutes(2)). @@ -63,7 +64,7 @@ groups() -> [pattern,pattern2,pattern3,pattern4,guard, bad_arith,bool_cases,bad_apply,files,effect, bin_opt_info,bin_construction,comprehensions,maps, - redundant_boolean_clauses]}]. + redundant_boolean_clauses,latin1_fallback]}]. init_per_suite(Config) -> Config. @@ -591,6 +592,37 @@ redundant_boolean_clauses(Config) when is_list(Config) -> run(Config, Ts), ok. +latin1_fallback(Conf) when is_list(Conf) -> + DataDir = ?privdir, + IncFile = filename:join(DataDir, "include_me.hrl"), + file:write_file(IncFile, <<"%% ",246," in include file\n">>), + Ts1 = [{latin1_fallback1, + %% Test that the compiler fall backs to latin-1 with + %% a warning if a file has no encoding and does not + %% contain correct UTF-8 sequences. + <<"%% Bj",246,"rn + t(_) -> \"",246,"\"; + t(x) -> ok. + ">>, + [], + {warnings,[{1,compile,reparsing_invalid_unicode}, + {3,sys_core_fold,{nomatch_shadow,2}}]}}], + [] = run(Conf, Ts1), + + Ts2 = [{latin1_fallback2, + %% Test that the compiler fall backs to latin-1 with + %% a warning if a file has no encoding and does not + %% contain correct UTF-8 sequences. + <<" + + -include(\"include_me.hrl\"). + ">>, + [], + {warnings,[{1,compile,reparsing_invalid_unicode}]} + }], + [] = run(Conf, Ts2), + ok. + %%% %%% End of test cases. %%% |