From 300c5466a7c9cfe3ed22bba2a88ba21058406402 Mon Sep 17 00:00:00 2001 From: Hans Bolinder Date: Thu, 4 Oct 2012 15:58:26 +0200 Subject: [stdlib, kernel] Introduce Unicode support for Erlang source files Expect modifications, additions and corrections. There is a kludge in file_io_server and erl_scan:continuation_location() that's not so pleasing. --- lib/parsetools/include/yeccpre.hrl | 6 +- lib/parsetools/src/leex.erl | 99 ++++++++++++++++-------- lib/parsetools/src/yecc.erl | 120 ++++++++++++++++++----------- lib/parsetools/src/yeccscan.erl | 6 +- lib/parsetools/test/leex_SUITE.erl | 115 ++++++++++++++++++++++++++- lib/parsetools/test/yecc_SUITE.erl | 154 ++++++++++++++++++++++++++++++++++++- 6 files changed, 412 insertions(+), 88 deletions(-) (limited to 'lib/parsetools') diff --git a/lib/parsetools/include/yeccpre.hrl b/lib/parsetools/include/yeccpre.hrl index 3672394fc5..e4c3ba52be 100644 --- a/lib/parsetools/include/yeccpre.hrl +++ b/lib/parsetools/include/yeccpre.hrl @@ -1,7 +1,7 @@ %% %% %CopyrightBegin% %% -%% Copyright Ericsson AB 1996-2011. All Rights Reserved. +%% Copyright Ericsson AB 1996-2012. All Rights Reserved. %% %% The contents of this file are subject to the Erlang Public License, %% Version 1.1, (the "License"); you may not use this file except in @@ -36,7 +36,7 @@ parse_and_scan({M, F, A}) -> -spec format_error(any()) -> [char() | list()]. 
format_error(Message) -> - case io_lib:deep_char_list(Message) of + case io_lib:deep_unicode_char_list(Message) of true -> Message; _ -> @@ -164,7 +164,7 @@ yecctoken_location(Token) -> yecctoken2string({atom, _, A}) -> io_lib:write(A); yecctoken2string({integer,_,N}) -> io_lib:write(N); yecctoken2string({float,_,F}) -> io_lib:write(F); -yecctoken2string({char,_,C}) -> io_lib:write_char(C); +yecctoken2string({char,_,C}) -> io_lib:write_unicode_char(C); yecctoken2string({var,_,V}) -> io_lib:format("~s", [V]); yecctoken2string({string,_,S}) -> io_lib:write_unicode_string(S); yecctoken2string({reserved_symbol, _, A}) -> io_lib:write(A); diff --git a/lib/parsetools/src/leex.erl b/lib/parsetools/src/leex.erl index cdf20461d9..bbef4053b4 100644 --- a/lib/parsetools/src/leex.erl +++ b/lib/parsetools/src/leex.erl @@ -58,6 +58,7 @@ gfile=[], % Graph file module, % Module name opts=[], % Options + encoding=none, % Encoding of Xrl file % posix=false, % POSIX regular expressions errors=[], warnings=[] @@ -146,7 +147,9 @@ format_error({regexp,E})-> end, ["bad regexp `",Es,"'"]; format_error(ignored_characters) -> - "ignored characters". + "ignored characters"; +format_error(cannot_parse) -> + io_lib:fwrite("cannot parse; probably encoding mismatch", []). %%% %%% Local functions @@ -298,10 +301,10 @@ pack_warnings([]) -> report_errors(St) -> when_opt(fun () -> foreach(fun({File,{none,Mod,E}}) -> - io:fwrite("~s: ~s\n", + io:fwrite("~s: ~ts\n", [File,Mod:format_error(E)]); ({File,{Line,Mod,E}}) -> - io:fwrite("~s:~w: ~s\n", + io:fwrite("~s:~w: ~ts\n", [File,Line,Mod:format_error(E)]) end, sort(St#leex.errors)) end, report_errors, St#leex.opts). 
@@ -316,11 +319,11 @@ report_warnings(St) -> ShouldReport = member(report_warnings, St#leex.opts) orelse ReportWerror, when_bool(fun () -> foreach(fun({File,{none,Mod,W}}) -> - io:fwrite("~s: ~s~s\n", + io:fwrite("~s: ~s~ts\n", [File,Prefix, Mod:format_error(W)]); ({File,{Line,Mod,W}}) -> - io:fwrite("~s:~w: ~s~s\n", + io:fwrite("~s:~w: ~s~ts\n", [File,Line,Prefix, Mod:format_error(W)]) end, sort(St#leex.warnings)) @@ -396,17 +399,18 @@ verbose_print(St, Format, Args) -> parse_file(St0) -> case file:open(St0#leex.xfile, [read]) of {ok,Xfile} -> + St1 = St0#leex{encoding = epp:set_encoding(Xfile)}, try - verbose_print(St0, "Parsing file ~s, ", [St0#leex.xfile]), + verbose_print(St1, "Parsing file ~s, ", [St1#leex.xfile]), %% We KNOW that errors throw so we can ignore them here. - {ok,Line1,St1} = parse_head(Xfile, St0), - {ok,Line2,Macs,St2} = parse_defs(Xfile, Line1, St1), - {ok,Line3,REAs,Actions,St3} = - parse_rules(Xfile, Line2, Macs, St2), - {ok,Code,St4} = parse_code(Xfile, Line3, St3), - verbose_print(St1, "contained ~w rules.~n", [length(REAs)]), - {ok,REAs,Actions,Code,St4} - after file:close(Xfile) + {ok,Line1,St2} = parse_head(Xfile, St1), + {ok,Line2,Macs,St3} = parse_defs(Xfile, Line1, St2), + {ok,Line3,REAs,Actions,St4} = + parse_rules(Xfile, Line2, Macs, St3), + {ok,Code,St5} = parse_code(Xfile, Line3, St4), + verbose_print(St5, "contained ~w rules.~n", [length(REAs)]), + {ok,REAs,Actions,Code,St5} + after ok = file:close(Xfile) end; {error,Error} -> add_error({none,leex,{file_error,Error}}, St0) @@ -415,7 +419,7 @@ parse_file(St0) -> %% parse_head(File, State) -> {ok,NextLine,State}. %% Parse the head of the file. Skip all comments and blank lines. -parse_head(Ifile, St) -> {ok,nextline(Ifile, 0),St}. +parse_head(Ifile, St) -> {ok,nextline(Ifile, 0, St),St}. %% parse_defs(File, Line, State) -> {ok,NextLine,Macros,State}. %% Parse the macro definition section of a file. This must exist. 
@@ -423,7 +427,7 @@ parse_head(Ifile, St) -> {ok,nextline(Ifile, 0),St}. parse_defs(Ifile, {ok,?DEFS_HEAD ++ Rest,L}, St) -> St1 = warn_ignored_chars(L, Rest, St), - parse_defs(Ifile, nextline(Ifile, L), [], St1); + parse_defs(Ifile, nextline(Ifile, L, St), [], St1); parse_defs(_, {ok,_,L}, St) -> add_error({L,leex,missing_defs}, St); parse_defs(_, {eof,L}, St) -> @@ -435,7 +439,7 @@ parse_defs(Ifile, {ok,Chars,L}=Line, Ms, St) -> case re:run(Chars, MS, [{capture,all_but_first,list}]) of {match,[Name,Def]} -> %%io:fwrite("~p = ~p\n", [Name,Def]), - parse_defs(Ifile, nextline(Ifile, L), [{Name,Def}|Ms], St); + parse_defs(Ifile, nextline(Ifile, L, St), [{Name,Def}|Ms], St); _ -> {ok,Line,Ms,St} % Anything else end; parse_defs(_, Line, Ms, St) -> @@ -446,7 +450,7 @@ parse_defs(_, Line, Ms, St) -> parse_rules(Ifile, {ok,?RULE_HEAD ++ Rest,L}, Ms, St) -> St1 = warn_ignored_chars(L, Rest, St), - parse_rules(Ifile, nextline(Ifile, L), Ms, [], [], 0, St1); + parse_rules(Ifile, nextline(Ifile, L, St), Ms, [], [], 0, St1); parse_rules(_, {ok,_,L}, _, St) -> add_error({L,leex,missing_rules}, St); parse_rules(_, {eof,L}, _, St) -> @@ -464,7 +468,7 @@ parse_rules(Ifile, NextLine, Ms, REAs, As, N, St) -> case collect_rule(Ifile, Chars, L0) of {ok,Re,Atoks,L1} -> {ok,REA,A,St1} = parse_rule(Re, L0, Atoks, Ms, N, St), - parse_rules(Ifile, nextline(Ifile, L1), Ms, + parse_rules(Ifile, nextline(Ifile, L1, St), Ms, [REA|REAs], [A|As], N+1, St1); {error,E} -> add_error(E, St) end; @@ -497,8 +501,10 @@ collect_rule(Ifile, Chars, L0) -> {error,E,_} -> {error,E} end. 
+collect_action(_Ifile, {error, _}, L, _Cont0) -> + {error, {L, leex, cannot_parse}, ignored_end_line}; collect_action(Ifile, Chars, L0, Cont0) -> - case erl_scan:tokens(Cont0, Chars, L0) of + case erl_scan:tokens(Cont0, Chars, L0, [unicode]) of {done,{ok,Toks,_},_} -> {ok,Toks,L0}; {done,{eof,_},_} -> {eof,L0}; {done,{error,E,_},_} -> {error,E,L0}; @@ -560,29 +566,32 @@ parse_code(Ifile, {ok,?CODE_HEAD ++ Rest,CodeL}, St) -> St1 = warn_ignored_chars(CodeL, Rest, St), {ok, CodePos} = file:position(Ifile, cur), %% Just count the lines; copy the code from file to file later. - NCodeLines = count_lines(Ifile, 0), + EndCodeLine = count_lines(Ifile, CodeL, St), + NCodeLines = EndCodeLine - CodeL, {ok,{CodeL,CodePos,NCodeLines},St1}; parse_code(_, {ok,_,L}, St) -> add_error({L,leex,missing_code}, St); parse_code(_, {eof,L}, St) -> add_error({L,leex,missing_code}, St). -count_lines(File, N) -> +count_lines(File, N, St) -> case io:get_line(File, leex) of eof -> N; - _Line -> count_lines(File, N+1) + {error, _} -> add_error({N+1, leex, cannot_parse}, St); + _Line -> count_lines(File, N+1, St) end. -%% nextline(InputFile, PrevLineNo) -> {ok,Chars,LineNo} | {eof,LineNo}. +%% nextline(InputFile, PrevLineNo, State) -> {ok,Chars,LineNo} | {eof,LineNo}. %% Get the next line skipping comment lines and blank lines. -nextline(Ifile, L) -> +nextline(Ifile, L, St) -> case io:get_line(Ifile, leex) of eof -> {eof,L}; + {error, _} -> add_error({L+1, leex, cannot_parse}, St); Chars -> case substr(Chars, span(Chars, " \t\n")+1) of - [$%|_Rest] -> nextline(Ifile, L+1); - [] -> nextline(Ifile, L+1); + [$%|_Rest] -> nextline(Ifile, L+1, St); + [] -> nextline(Ifile, L+1, St); _Other -> {ok,Chars,L+1} end end. 
@@ -1289,19 +1298,21 @@ out_file(St0, DFA, DF, Actions, Code) -> try case file:open(St0#leex.efile, [write]) of {ok,Ofile} -> + set_encoding(St0, Ofile), try + output_encoding_comment(Ofile, St0), output_file_directive(Ofile, St0#leex.ifile, 0), out_file(Ifile, Ofile, St0, DFA, DF, Actions, Code, 1), verbose_print(St0, "ok~n", []), St0 - after file:close(Ofile) + after ok = file:close(Ofile) end; {error,Error} -> verbose_print(St0, "error~n", []), add_error({none,leex,{file_error,Error}}, St0) end - after file:close(Ifile) + after ok = file:close(Ifile) end; {{error,Error},Ifile} -> add_error(Ifile, {none,leex,{file_error,Error}}, St0) @@ -1310,7 +1321,9 @@ out_file(St0, DFA, DF, Actions, Code) -> open_inc_file(State) -> Ifile = State#leex.ifile, case file:open(Ifile, [read]) of - {ok,F} -> {ok,F}; + {ok,F} -> + _ = epp:set_encoding(F), + {ok,F}; Error -> {Error,Ifile} end. @@ -1328,6 +1341,7 @@ inc_file_name(Filename) -> out_file(Ifile, Ofile, St, DFA, DF, Actions, Code, L) -> case io:get_line(Ifile, leex) of eof -> output_file_directive(Ofile, St#leex.ifile, L); + {error, _} -> add_error(St#leex.ifile, {L, leex, cannot_parse}, St); Line -> case substr(Line, 1, 5) of "##mod" -> out_module(Ofile, St); @@ -1347,14 +1361,23 @@ out_erlang_code(File, St, Code, L) -> output_file_directive(File, St#leex.xfile, CodeL), {ok,Xfile} = file:open(St#leex.xfile, [read]), try + set_encoding(St, Xfile), {ok,_} = file:position(Xfile, CodePos), - {ok,_} = file:copy(Xfile, File) + ok = file_copy(Xfile, File) after - file:close(Xfile) + ok = file:close(Xfile) end, io:nl(File), output_file_directive(File, St#leex.ifile, L). +file_copy(From, To) -> + case io:get_line(From, leex) of + eof -> ok; + Line when is_list(Line) -> + io:fwrite(To, "~ts", [Line]), + file_copy(From, To) + end. + out_dfa(File, St, DFA, Code, DF, L) -> {_CodeL,_CodePos,NCodeLines} = Code, %% Three file attributes before this one... 
@@ -1569,7 +1592,7 @@ out_dfa_graph(St, DFA, DF) -> io:fwrite(Gfile, "}~n", []), verbose_print(St, "ok~n", []), St - after file:close(Gfile) + after ok = file:close(Gfile) end; {error,Error} -> verbose_print(St, "error~n", []), @@ -1610,6 +1633,16 @@ dfa_edgelabel(Cranges) -> (C) -> [quote(C)] end, Cranges) ++ "]". +set_encoding(#leex{encoding = none}, File) -> + ok = io:setopts(File, [{encoding, epp:default_encoding()}]); +set_encoding(#leex{encoding = E}, File) -> + ok = io:setopts(File, [{encoding, E}]). + +output_encoding_comment(_File, #leex{encoding = none}) -> + ok; +output_encoding_comment(File, #leex{encoding = Encoding}) -> + io:fwrite(File, <<"%% ~s\n">>, [epp:encoding_to_string(Encoding)]). + output_file_directive(File, Filename, Line) -> io:fwrite(File, <<"-file(~s, ~w).\n">>, [format_filename(Filename), Line]). diff --git a/lib/parsetools/src/yecc.erl b/lib/parsetools/src/yecc.erl index b0792a6ed8..dbb7d025ae 100644 --- a/lib/parsetools/src/yecc.erl +++ b/lib/parsetools/src/yecc.erl @@ -1,7 +1,7 @@ %% %% %CopyrightBegin% %% -%% Copyright Ericsson AB 1996-2011. All Rights Reserved. +%% Copyright Ericsson AB 1996-2012. All Rights Reserved. %% %% The contents of this file are subject to the Erlang Public License, %% Version 1.1, (the "License"); you may not use this file except in @@ -42,6 +42,7 @@ includefile, includefile_version, module, + encoding = none, options = [], verbose = false, file_attrs = true, @@ -224,7 +225,11 @@ format_error({unused_nonterminal, Nonterminal}) -> [format_symbol(Nonterminal)]); format_error({unused_terminal, Terminal}) -> io_lib:fwrite("terminal symbol ~s not used", - [format_symbol(Terminal)]). + [format_symbol(Terminal)]); +format_error({bad_symbol, String}) -> + io_lib:fwrite("bad symbol ~ts", [String]); +format_error(cannot_parse) -> + io_lib:fwrite("cannot parse; possibly encoding mismatch", []). file(File) -> file(File, [report_errors, report_warnings]). 
@@ -257,7 +262,7 @@ yecc(Infile, Outfile, Verbose) -> yecc(Infile, Outfile, Verbose, []). yecc(Infilex, Outfilex, Verbose, Includefilex) -> - statistics(runtime), + _ = statistics(runtime), case file(Infilex, [{parserfile, Outfilex}, {verbose, Verbose}, {report, true}, @@ -407,7 +412,9 @@ infile(Parent, Infilex, Options) -> St = case file:open(St0#yecc.infile, [read, read_ahead]) of {ok, Inport} -> try - outfile(St0#yecc{inport = Inport}) + Encoding = epp:set_encoding(Inport), + St1 = St0#yecc{inport = Inport, encoding = Encoding}, + outfile(St1) after ok = file:close(Inport) end; @@ -428,6 +435,8 @@ outfile(St0) -> case file:open(St0#yecc.outfile, [write, delayed_write]) of {ok, Outport} -> try + %% Set the same encoding as infile: + set_encoding(St0, Outport), generate(St0#yecc{outport = Outport, line = 1}) catch throw: St1 -> @@ -466,13 +475,14 @@ timeit(Name, Fun, St0) -> -define(PASS(P), {P, fun P/1}). generate(St0) -> + St1 = output_encoding_comment(St0), Passes = [?PASS(parse_grammar), ?PASS(check_grammar), ?PASS(states_and_goto_table), ?PASS(parse_actions), ?PASS(action_conflicts), ?PASS(write_file)], - F = case member(time, St0#yecc.options) of + F = case member(time, St1#yecc.options) of true -> io:fwrite(<<"Generating parser from grammar in ~s\n">>, - [format_filename(St0#yecc.infile)]), + [format_filename(St1#yecc.infile)]), fun timeit/3; false -> fun(_Name, Fn, St) -> Fn(St) end @@ -484,13 +494,13 @@ generate(St0) -> true -> throw(St2) end end, - foldl(Fun, St0, Passes). + foldl(Fun, St1, Passes). parse_grammar(St) -> parse_grammar(St#yecc.inport, 1, St). parse_grammar(Inport, Line, St) -> - {NextLine, Grammar} = read_grammar(Inport, Line), + {NextLine, Grammar} = read_grammar(Inport, St, Line), parse_grammar(Grammar, Inport, NextLine, St). 
parse_grammar(eof, _Inport, _NextLine, St) -> @@ -523,6 +533,8 @@ parse_grammar({rule, Rule, Tokens}, St0) -> St#yecc{rules_list = [RuleDef | St#yecc.rules_list]}; parse_grammar({prec, Prec}, St) -> St#yecc{prec = Prec ++ St#yecc.prec}; +parse_grammar({#symbol{}, [{string,Line,String}]}, St) -> + add_error(Line, {bad_symbol, String}, St); parse_grammar({#symbol{line = Line, name = Name}, Symbols}, St) -> CF = fun(I) -> case element(I, St) of @@ -543,12 +555,17 @@ parse_grammar({#symbol{line = Line, name = Name}, Symbols}, St) -> _ -> add_warning(Line, bad_declaration, St) end. -read_grammar(Inport, Line) -> +read_grammar(Inport, St, Line) -> case yeccscan:scan(Inport, '', Line) of {eof, NextLine} -> {NextLine, eof}; {error, {ErrorLine, Mod, What}, NextLine} -> {NextLine, {error, ErrorLine, {error, Mod, What}}}; + {error, terminated} -> + throw(St); + {error, _} -> + File = St#yecc.infile, + throw(add_error(File, none, cannot_parse, St)); {ok, Input, NextLine} -> {NextLine, case yeccparser:parse(Input) of {error, {ErrorLine, Mod, Message}} -> @@ -738,9 +755,9 @@ states_and_goto_table(St0) -> create_precedence_table(St). parse_actions(St) -> - erase(), % the pd is used when decoding lookahead sets + _ = erase(), % the pd is used when decoding lookahead sets ParseActions = compute_parse_actions(St#yecc.n_states, St, []), - erase(), + _ = erase(), St#yecc{parse_actions = ParseActions, state_tab = []}. 
action_conflicts(St0) -> @@ -841,10 +858,10 @@ report_errors(St) -> case member(report_errors, St#yecc.options) of true -> foreach(fun({File,{none,Mod,E}}) -> - io:fwrite(<<"~s: ~s\n">>, + io:fwrite(<<"~s: ~ts\n">>, [File,Mod:format_error(E)]); ({File,{Line,Mod,E}}) -> - io:fwrite(<<"~s:~w: ~s\n">>, + io:fwrite(<<"~s:~w: ~ts\n">>, [File,Line,Mod:format_error(E)]) end, sort(St#yecc.errors)); false -> @@ -861,11 +878,11 @@ report_warnings(St) -> case member(report_warnings, St#yecc.options) orelse ReportWerror of true -> foreach(fun({File,{none,Mod,W}}) -> - io:fwrite(<<"~s: ~s~s\n">>, + io:fwrite(<<"~s: ~s~ts\n">>, [File,Prefix, Mod:format_error(W)]); ({File,{Line,Mod,W}}) -> - io:fwrite(<<"~s:~w: ~s~s\n">>, + io:fwrite(<<"~s:~w: ~s~ts\n">>, [File,Line,Prefix, Mod:format_error(W)]) end, sort(St#yecc.warnings)); @@ -1024,7 +1041,7 @@ compute_states(St0) -> rp_info = RulePointerInfo, goto = GotoTab}, - erase(), + _ = erase(), EndsymCode = code_terminal(StC#yecc.endsymbol, StC#yecc.symbol_tab), {StateId, State0} = compute_state([{EndsymCode, 1}], Tables), @@ -1923,9 +1940,10 @@ output_prelude(Outport, Inport, St0) when St0#yecc.includefile =:= [] -> {St20, 0, no_erlang_code}; Next_line -> St_10 = output_file_directive(St20, Infile, Next_line-1), - Nmbr_of_lines = include1([], Inport, Outport), - {St_10, Nmbr_of_lines, - {last_erlang_code_line, Next_line+Nmbr_of_lines}} + Last_line = include1([], Inport, Outport, Infile, + Next_line, St_10), + Nmbr_of_lines = Last_line - Next_line, + {St_10, Nmbr_of_lines, {last_erlang_code_line, Last_line}} end, St30 = nl(St25), IncludeFile = @@ -1946,13 +1964,13 @@ output_prelude(Outport, Inport, St0) -> {St30, N_lines_1, no_erlang_code}; Next_line -> St = output_file_directive(St30, Infile, Next_line-1), - Nmbr_of_lines = include1([], Inport, Outport), - {St, Nmbr_of_lines + N_lines_1, - {last_erlang_code_line, Next_line+Nmbr_of_lines}} + Last_line = include1([], Inport, Outport, Infile, Next_line, St), + Nmbr_of_lines = Last_line - 
Next_line, + {St, Nmbr_of_lines + N_lines_1, {last_erlang_code_line, Last_line}} end. output_header(St0) -> - lists:foldl(fun(Str, St) -> fwrite(St, <<"~s\n">>, [Str]) + lists:foldl(fun(Str, St) -> fwrite(St, <<"~ts\n">>, [Str]) end, St0, St0#yecc.header). output_goto(St, [{_Nonterminal, []} | Go], StateInfo) -> @@ -2250,8 +2268,8 @@ output_inlined(St0, FunctionName, Reduce, Infile) -> [append(["[", tl(A), " | __Stack]"])]) end, St = St40#yecc{line = St40#yecc.line + NLines}, - fwrite(St, <<" [begin\n ~s\n end | ~s].\n\n">>, - [pp_tokens(Tokens, Line0), Stack]). + fwrite(St, <<" [begin\n ~ts\n end | ~s].\n\n">>, + [pp_tokens(Tokens, Line0, St#yecc.encoding), Stack]). inlined_function_name(State, "Cat") -> inlined_function_name(State, ""); @@ -2421,24 +2439,24 @@ include(St, File, Outport) -> {error, Reason} -> throw(add_error(File, none, {file_error, Reason}, St)); {ok, Inport} -> + _ = epp:set_encoding(Inport), Line = io:get_line(Inport, ''), - N_lines = include1(Line, Inport, Outport), - file:close(Inport), - N_lines + try include1(Line, Inport, Outport, File, 1, St) - 1 + after ok = file:close(Inport) + end end. -include1(Line, Inport, Outport) -> - include1(Line, Inport, Outport, 0). - -include1(eof, _, _, Nmbr_of_lines) -> - Nmbr_of_lines; -include1(Line, Inport, Outport, Nmbr_of_lines) -> +include1(eof, _, _, _File, L, _St) -> + L; +include1({error, _}=_Error, _Inport, _Outport, File, L, St) -> + throw(add_error(File, L, cannot_parse, St)); +include1(Line, Inport, Outport, File, L, St) -> Incr = case member($\n, Line) of true -> 1; false -> 0 end, io:put_chars(Outport, Line), - include1(io:get_line(Inport, ''), Inport, Outport, Nmbr_of_lines + Incr). + include1(io:get_line(Inport, ''), Inport, Outport, File, L + Incr, St). includefile_version([]) -> {1,4}; @@ -2465,18 +2483,22 @@ parse_file(Epp) -> end. %% Keeps the line breaks of the original code. -pp_tokens(Tokens, Line0) -> - concat(pp_tokens1(Tokens, Line0, [])). 
+pp_tokens(Tokens, Line0, Enc) -> + concat(pp_tokens1(Tokens, Line0, Enc, [])). -pp_tokens1([], _Line0, _T0) -> +pp_tokens1([], _Line0, _Enc, _T0) -> []; -pp_tokens1([T | Ts], Line0, T0) -> +pp_tokens1([T | Ts], Line0, Enc, T0) -> Line = element(2, T), - [pp_sep(Line, Line0, T0), pp_symbol(T) | pp_tokens1(Ts, Line, T)]. + [pp_sep(Line, Line0, T0), pp_symbol(T, Enc)|pp_tokens1(Ts, Line, Enc, T)]. -pp_symbol({var,_,Var}) -> Var; -pp_symbol({_,_,Symbol}) -> io_lib:fwrite(<<"~p">>, [Symbol]); -pp_symbol({Symbol, _}) -> Symbol. +pp_symbol({var,_,Var}, _Enc) -> Var; +pp_symbol({string,_,String}, latin1) -> + io_lib:write_unicode_string_as_latin1(String); +pp_symbol({string,_,String}, _Enc) -> io_lib:write_unicode_string(String); +pp_symbol({_,_,Symbol}, latin1) -> io_lib:fwrite(<<"~p">>, [Symbol]); +pp_symbol({_,_,Symbol}, _Enc) -> io_lib:fwrite(<<"~tp">>, [Symbol]); +pp_symbol({Symbol, _}, _Enc) -> Symbol. pp_sep(Line, Line0, T0) when Line > Line0 -> ["\n " | pp_sep(Line - 1, Line0, T0)]; @@ -2485,6 +2507,16 @@ pp_sep(_Line, _Line0, {'.',_}) -> pp_sep(_Line, _Line0, _T0) -> " ". +set_encoding(#yecc{encoding = none}, Port) -> + ok = io:setopts(Port, [{encoding, epp:default_encoding()}]); +set_encoding(#yecc{encoding = E}, Port) -> + ok = io:setopts(Port, [{encoding, E}]). + +output_encoding_comment(#yecc{encoding = none}=St) -> + St; +output_encoding_comment(#yecc{encoding = Encoding}=St) -> + fwrite(St, <<"%% ~s\n">>, [epp:encoding_to_string(Encoding)]). 
+ output_file_directive(St, Filename, Line) when St#yecc.file_attrs -> fwrite(St, <<"-file(~s, ~w).\n">>, [format_filename(Filename), Line]); @@ -2529,7 +2561,7 @@ format_assoc(nonassoc) -> format_symbol(Symbol) -> String = concat([Symbol]), - case erl_scan:string(String) of + case erl_scan:string(String, 1, [unicode]) of {ok, [{atom, _, _}], _} -> io_lib:fwrite(<<"~w">>, [Symbol]); {ok, [{Word, _}], _} when Word =/= ':', Word =/= '->' -> diff --git a/lib/parsetools/src/yeccscan.erl b/lib/parsetools/src/yeccscan.erl index d7ec3ba8d3..9e0e85143a 100644 --- a/lib/parsetools/src/yeccscan.erl +++ b/lib/parsetools/src/yeccscan.erl @@ -1,7 +1,7 @@ %% %% %CopyrightBegin% %% -%% Copyright Ericsson AB 1996-2009. All Rights Reserved. +%% Copyright Ericsson AB 1996-2012. All Rights Reserved. %% %% The contents of this file are subject to the Erlang Public License, %% Version 1.1, (the "License"); you may not use this file except in @@ -24,7 +24,7 @@ scan(Inport) -> scan(Inport, '', 1). scan(Inport, Prompt, Line1) -> - case catch io:scan_erl_form(Inport, Prompt, Line1) of + case catch io:scan_erl_form(Inport, Prompt, Line1, [unicode]) of {eof, Line2} -> {eof, Line2}; {ok, Tokens, Line2} -> @@ -34,6 +34,8 @@ scan(Inport, Prompt, Line1) -> _ -> {ok, lex(Tokens), Line2} end; + {error, Reason} -> + {error, Reason}; {error, Descriptor, Line2} -> {error, Descriptor, Line2}; {'EXIT', Why} -> diff --git a/lib/parsetools/test/leex_SUITE.erl b/lib/parsetools/test/leex_SUITE.erl index 1e50aedf07..a0d4fd7c48 100644 --- a/lib/parsetools/test/leex_SUITE.erl +++ b/lib/parsetools/test/leex_SUITE.erl @@ -1,7 +1,8 @@ +%% -*- coding: latin-1 -*- %% %% %CopyrightBegin% %% -%% Copyright Ericsson AB 2010-2011. All Rights Reserved. +%% Copyright Ericsson AB 2010-2012. All Rights Reserved. 
%% %% The contents of this file are subject to the Erlang Public License, %% Version 1.1, (the "License"); you may not use this file except in @@ -42,7 +43,9 @@ -export([ file/1, compile/1, syntax/1, - pt/1, man/1, ex/1, ex2/1, not_yet/1]). + pt/1, man/1, ex/1, ex2/1, not_yet/1, + + otp_10302/1]). % Default timetrap timeout (set in init_per_testcase). -define(default_timeout, ?t:minutes(1)). @@ -63,7 +66,8 @@ all() -> groups() -> [{checks, [], [file, compile, syntax]}, - {examples, [], [pt, man, ex, ex2, not_yet]}]. + {examples, [], [pt, man, ex, ex2, not_yet]}, + {tickets, [], [otp_10302]}]. init_per_suite(Config) -> Config. @@ -875,6 +879,111 @@ not_yet(Config) when is_list(Config) -> ok. +otp_10302(doc) -> + "OTP-10302. Unicode characters scanner/parser."; +otp_10302(suite) -> []; +otp_10302(Config) when is_list(Config) -> + Dir = ?privdir, + Filename = filename:join(Dir, "file.xrl"), + Ret = [return, {report, true}], + + ok = file:write_file(Filename,<< + "%% coding: UTF-8\n" + "ä" + >>), + {error,[{_,[{2,leex,cannot_parse}]}],[]} = + leex:file(Filename, Ret), + + ok = file:write_file(Filename,<< + "%% coding: UTF-8\n" + "Definitions.\n" + "ä" + >>), + {error,[{_,[{3,leex,cannot_parse}]}],[]} = leex:file(Filename, Ret), + + ok = file:write_file(Filename,<< + "%% coding: UTF-8\n" + "Definitions.\n" + "A = a\n" + "L = [{A}-{Z}]\n" + "Z = z\n" + "Rules.\n" + "{L}+ : {token,{list_to_atom(TokenChars),Häpp}}.\n" + >>), + {error,[{_,[{7,leex,cannot_parse}]}],[]} = leex:file(Filename, Ret), + + ok = file:write_file(Filename,<< + "%% coding: UTF-8\n" + "Definitions.\n" + "A = a\n" + "L = [{A}-{Z}]\n" + "Z = z\n" + "Rules.\n" + "{L}+ : {token,{list_to_atom(TokenChars)}}.\n" + "Erlang code.\n" + "-export([t/0]).\n" + "t() ->\n" + " Häpp\n" + >>), + {error,[{_,[{11,leex,cannot_parse}]}],[]} = leex:file(Filename, Ret), + + Mini = <<"Definitions.\n" + "D = [0-9]\n" + "Rules.\n" + "{L}+ : {token,{word,TokenLine,TokenChars}}.\n" + "Erlang code.\n">>, + LeexPre = 
filename:join(Dir, "leexinc.hrl"), + ?line ok = file:write_file(LeexPre, <<"%% coding: UTF-8\n ä">>), + PreErrors = run_test(Config, Mini, LeexPre), + {error,[{IncludeFile,[{2,leex,cannot_parse}]}],[]} = PreErrors, + "leexinc.hrl" = filename:basename(IncludeFile), + + Ts = [{uni_1, + <<"%% coding: UTF-8\n" + "Definitions.\n" + "A = a\n" + "L = [{A}-{Z}]\n" + "Z = z\n" + "Rules.\n" + "{L}+ : {token,{list_to_atom(TokenChars),\n" + "begin Häpp = foo, Häpp end," + " 'Häpp',\"\\x{400}B\",\"örn_Ѐ\"}}.\n" + "Erlang code.\n" + "-export([t/0]).\n" + "t() ->\n" + " %% Häpp, 'Häpp',\"\\x{400}B\",\"örn_Ѐ\"\n" + " {ok, [R], 1} = string(\"tip\"),\n" + " {tip,foo,'Häpp',[1024,66],[246,114,110,95,1024]} = R,\n" + " Häpp = foo,\n" + " {tip, Häpp, 'Häpp',\"\\x{400}B\",\"örn_Ѐ\"} = R,\n" + " ok.\n">>, + default, + ok}, + {uni_2, + <<"%% coding: Latin-1\n" + "Definitions.\n" + "A = a\n" + "L = [{A}-{Z}]\n" + "Z = z\n" + "Rules.\n" + "{L}+ : {token,{list_to_atom(TokenChars),\n" + "begin Häpp = foo, Häpp end," + " 'Häpp',\"\\x{400}B\",\"örn_Ѐ\"}}.\n" + "Erlang code.\n" + "-export([t/0]).\n" + "t() ->\n" + " %% Häpp, 'Häpp',\"\\x{400}B\",\"örn_Ѐ\"\n" + " {ok, [R], 1} = string(\"tip\"),\n" + " {tip,foo,'Häpp',[1024,66],[195,182,114,110,95,208,128]} = R,\n" + " Häpp = foo,\n" + " {tip, Häpp, 'Häpp',\"\\x{400}B\",\"örn_Ѐ\"} = R,\n" + " ok.\n">>, + default, + ok}], + run(Config, Ts), + + ok. + unwritable(Fname) -> {ok, Info} = file:read_file_info(Fname), Mode = Info#file_info.mode - 8#00200, diff --git a/lib/parsetools/test/yecc_SUITE.erl b/lib/parsetools/test/yecc_SUITE.erl index 3d26adf1be..c306dbe833 100644 --- a/lib/parsetools/test/yecc_SUITE.erl +++ b/lib/parsetools/test/yecc_SUITE.erl @@ -1,7 +1,8 @@ +%% -*- coding: latin-1 -*- %% %% %CopyrightBegin% %% -%% Copyright Ericsson AB 2005-2011. All Rights Reserved. +%% Copyright Ericsson AB 2005-2012. All Rights Reserved. 
%% %% The contents of this file are subject to the Erlang Public License, %% Version 1.1, (the "License"); you may not use this file except in @@ -48,7 +49,7 @@ otp_5369/1, otp_6362/1, otp_7945/1, otp_8483/1, otp_8486/1, - otp_7292/1, otp_7969/1, otp_8919/1]). + otp_7292/1, otp_7969/1, otp_8919/1, otp_10302/1]). % Default timetrap timeout (set in init_per_testcase). -define(default_timeout, ?t:minutes(1)). @@ -75,7 +76,7 @@ groups() -> [empty, prec, yeccpre, lalr, old_yecc, other_examples]}, {bugs, [], [otp_5369, otp_6362, otp_7945, otp_8483, otp_8486]}, - {improvements, [], [otp_7292, otp_7969, otp_8919]}]. + {improvements, [], [otp_7292, otp_7969, otp_8919, otp_10302]}]. init_per_suite(Config) -> Config. @@ -1815,6 +1816,153 @@ otp_8919(Config) when is_list(Config) -> "syntax error before: \"hello\"" = lists:flatten(Mod:format_error(Mess)), ok. +otp_10302(doc) -> + "OTP-10302. Unicode characters scanner/parser."; +otp_10302(suite) -> []; +otp_10302(Config) when is_list(Config) -> + Dir = ?privdir, + Filename = filename:join(Dir, "OTP-10302.yrl"), + Ret = [return, {report, true}], + Mini1 = <<"%% coding: utf-8 + Nonterminals Häpp. + nt -> t.">>, + ok = file:write_file(Filename, Mini1), + %% This could (and should) be refined: + {error,[{Filename,[{2,Mod1,Err1}]}],[]} = + yecc:file(Filename, Ret), + "cannot translate from UTF-8" = Mod1:format_error(Err1), + + Mini2 = <<"%% coding: Utf-8 + Nonterminals Hopp. + Terminals t. + Rootsymbol Hopp. + + Hopp -> t. + + Erlang code. + + t() -> + Häpp.">>, + ok = file:write_file(Filename, Mini2), + {error,[{Filename,[{11,Mod2,Err2}]}],[]} = + yecc:file(Filename, Ret), + "cannot parse; possibly encoding mismatch" = Mod2:format_error(Err2), + + Mini3 = <<"%% coding: latin-1 + Nonterminals Hopp. + Terminals t. + Rootsymbol Hopp. + + Hopp -> t. + + Erlang code. 
+ + t() -> + Häpp.">>, + ok = file:write_file(Filename, Mini3), + YeccPre = filename:join(Dir, "yeccpre.hrl"), + ok = file:write_file(YeccPre, [<<"%% coding: UTF-8\n ä.\n">>]), + Inc = [{includefile,YeccPre}], + {error,[{_,[{2,yecc,cannot_parse}]}],[]} = + yecc:file(Filename, Inc ++ Ret), + + ok = file:write_file(Filename, + <<"%% coding: UTF-8 + Nonterminals Hopp. + Terminals t. + Rootsymbol \"örn_Ѐ\". + Hopp -> t : '$1'.">>), + {error,[{Filename,[{4,yecc,{bad_symbol,"örn_"++[1024]}}]}],[]} = + yecc:file(Filename, Ret), + + ok = file:write_file(Filename, + <<"%% coding: UTF-8 + Nonterminals Hopp. + Terminals t. + Rootsymbol Hopp. + Endsymbol \"örn_Ѐ\". + Hopp -> t : '$1'.">>), + {error,[{Filename,[{5,yecc,{bad_symbol,"örn_"++[1024]}}]}],[]} = + yecc:file(Filename, Ret), + + ok = file:write_file(Filename, + <<"%% coding: UTF-8 + Nonterminals Hopp. + Terminals t. + Rootsymbol Hopp. + Expect \"örn_Ѐ\". + Hopp -> t : '$1'.">>), + {error,[{Filename,[{5,yecc,{bad_symbol,"örn_"++[1024]}}]}],[]} = + yecc:file(Filename, Ret), + + ok = file:write_file(Filename, + <<"%% coding: UTF-8 + Nonterminals Hopp. + Terminals t. + Rootsymbol Hopp. + States \"örn_Ѐ\". + Hopp -> t : '$1'.">>), + {error,[{Filename,[{5,yecc,{bad_symbol,"örn_"++[1024]}}]}],[]} = + yecc:file(Filename, Ret), + + Ts = [{otp_10302_1,<<" + %% coding: UTF-8 + Header \"%% örn_Ѐ\" \"%% \\x{400}B\". + Nonterminals Häpp list. + Terminals element. + Rootsymbol Häpp. + + Häpp -> list : '$1'. + + list -> element : '$1'. + list -> list element : + begin + Häpp = foo, + {Häpp, 'Häpp',\"\\x{400}B\",\"örn_Ѐ\"} + end. + + Erlang code. + + -export([t/0]). + + t() -> + L = [{element, 1}, {element,2}], + {ok, R} = parse(L), + Häpp = foo, + {_,_,[1024,66],[246,114,110,95,1024]} = R, + {Häpp,'Häpp',\"\\x{400}B\",\"örn_Ѐ\"} = R, + ok. + ">>,default,ok}, + {otp_10302_2,<<" + %% coding: Latin-1 + Nonterminals Häpp list. + Terminals element. + Rootsymbol Häpp. + + Häpp -> list : '$1'. + + list -> element : '$1'. 
+ list -> list element : + begin + Häpp = foo, + {Häpp, 'Häpp',\"\\x{400}B\",\"örn_Ѐ\"} + end. + + Erlang code. + + -export([t/0]). + + t() -> + L = [{element, 1}, {element,2}], + {ok, R} = parse(L), + Häpp = foo, + {_,_,[1024,66],[195,182,114,110,95,208,128]} = R, + {Häpp,'Häpp',\"\\x{400}B\",\"örn_Ѐ\"} = R, + ok. + ">>,default,ok}], + run(Config, Ts), + ok. + yeccpre_size() -> yeccpre_size(default_yeccpre()). -- cgit v1.2.3