From 0dcd574b493daa864e22a8332d11be9945466cb6 Mon Sep 17 00:00:00 2001 From: Hans Bolinder Date: Mon, 3 Dec 2012 12:38:07 +0100 Subject: [syntax_tools] Introduce Unicode support for Erlang source files Not complete. Unicode in wild attribute doesn't work. No support for Unicode regarding Igor stubs. --- lib/syntax_tools/src/epp_dodger.erl | 4 ++- lib/syntax_tools/src/erl_comment_scan.erl | 12 ++++++- lib/syntax_tools/src/erl_prettypr.erl | 14 +++++--- lib/syntax_tools/src/erl_syntax.erl | 48 +++++++++++++++++++++++++-- lib/syntax_tools/src/erl_tidy.erl | 11 ++++--- lib/syntax_tools/src/igor.erl | 55 ++++++++++++++++++++++--------- 6 files changed, 116 insertions(+), 28 deletions(-) diff --git a/lib/syntax_tools/src/epp_dodger.erl b/lib/syntax_tools/src/epp_dodger.erl index b3ced34c14..70395848a1 100644 --- a/lib/syntax_tools/src/epp_dodger.erl +++ b/lib/syntax_tools/src/epp_dodger.erl @@ -186,6 +186,7 @@ quick_parse_file(File, Options) -> parse_file(File, Parser, Options) -> case file:open(File, [read]) of {ok, Dev} -> + _ = epp:set_encoding(Dev), try Parser(Dev, 1, Options) after ok = file:close(Dev) end; @@ -400,7 +401,7 @@ quick_parse_form(Dev, L0, Options) -> parse_form(Dev, L0, Parser, Options) -> NoFail = proplists:get_bool(no_fail, Options), Opt = #opt{clever = proplists:get_bool(clever, Options)}, - case io:scan_erl_form(Dev, "", L0) of + case io:scan_erl_form(Dev, "", L0, [unicode]) of {ok, Ts, L1} -> case catch {ok, Parser(Ts, Opt)} of {'EXIT', Term} -> @@ -419,6 +420,7 @@ parse_form(Dev, L0, Parser, Options) -> {ok, F, L1} end; {error, _IoErr, _L1} = Err -> Err; + {error, _Reason} -> {eof, L0}; % This is probably encoding problem {eof, _L1} = Eof -> Eof end. 
diff --git a/lib/syntax_tools/src/erl_comment_scan.erl b/lib/syntax_tools/src/erl_comment_scan.erl index b833e1c069..a70e7ba413 100644 --- a/lib/syntax_tools/src/erl_comment_scan.erl +++ b/lib/syntax_tools/src/erl_comment_scan.erl @@ -72,7 +72,17 @@ file(Name) -> {ok, V} -> case V of {ok, B} -> - string(binary_to_list(B)); + Enc = case epp:read_encoding(Name) of + none -> epp:default_encoding(); + Enc0 -> Enc0 + end, + case catch unicode:characters_to_list(B, Enc) of + String when is_list(String) -> + string(String); + R -> + error_read_file(Name1), + exit(R) + end; {error, E} -> error_read_file(Name1), exit({read, E}) diff --git a/lib/syntax_tools/src/erl_prettypr.erl b/lib/syntax_tools/src/erl_prettypr.erl index f4bbf975c3..577dd21a77 100644 --- a/lib/syntax_tools/src/erl_prettypr.erl +++ b/lib/syntax_tools/src/erl_prettypr.erl @@ -60,7 +60,9 @@ hook = ?NOHOOK :: hook(), paper = ?PAPER :: integer(), ribbon = ?RIBBON :: integer(), - user = ?NOUSER :: term()}). + user = ?NOUSER :: term(), + encoding = epp:default_encoding() :: epp:source_encoding()}). + -type context() :: #ctxt{}. %% ===================================================================== @@ -231,6 +233,8 @@ format(Node) -> %%
<dt>{user, term()}</dt>
%%
<dd>User-specific data for use in hook functions. The default %% value is `undefined'.</dd>
+%%
<dt>{encoding, epp:source_encoding()}</dt>
+%%
<dd>Specifies the encoding of the generated file.</dd>
%% %% %% A hook function (cf. the {@link hook()} type) is passed the current @@ -342,7 +346,9 @@ layout(Node, Options) -> #ctxt{hook = proplists:get_value(hook, Options, ?NOHOOK), paper = proplists:get_value(paper, Options, ?PAPER), ribbon = proplists:get_value(ribbon, Options, ?RIBBON), - user = proplists:get_value(user, Options)}). + user = proplists:get_value(user, Options), + encoding = proplists:get_value(encoding, Options, + epp:default_encoding())}). lay(Node, Ctxt) -> case erl_syntax:get_ann(Node) of @@ -445,10 +451,10 @@ lay_2(Node, Ctxt) -> text(tidy_float(erl_syntax:float_literal(Node))); char -> - text(erl_syntax:char_literal(Node)); + text(erl_syntax:char_literal(Node, Ctxt#ctxt.encoding)); string -> - lay_string(erl_syntax:string_literal(Node), Ctxt); + lay_string(erl_syntax:string_literal(Node, Ctxt#ctxt.encoding), Ctxt); nil -> text("[]"); diff --git a/lib/syntax_tools/src/erl_syntax.erl b/lib/syntax_tools/src/erl_syntax.erl index 151f04b03b..93b9dc54dd 100644 --- a/lib/syntax_tools/src/erl_syntax.erl +++ b/lib/syntax_tools/src/erl_syntax.erl @@ -161,6 +161,7 @@ is_char/2, char_value/1, char_literal/1, + char_literal/2, clause/2, clause/3, clause_body/1, @@ -271,6 +272,7 @@ is_string/2, string_value/1, string_literal/1, + string_literal/2, text/1, text_string/1, try_expr/2, @@ -1628,6 +1630,7 @@ float_literal(Node) -> %% %% @see char_value/1 %% @see char_literal/1 +%% @see char_literal/2 %% @see is_char/2 %% type(Node) = char @@ -1687,13 +1690,34 @@ char_value(Node) -> %% ===================================================================== %% @doc Returns the literal string represented by a `char' %% node. This includes the leading "`$'" character. +%% Characters beyond 255 will be escaped. %% %% @see char/1 -spec char_literal(syntaxTree()) -> nonempty_string(). char_literal(Node) -> - io_lib:write_char(char_value(Node)). + char_literal(Node, latin1). 
+ + +%% ===================================================================== +%% @doc Returns the literal string represented by a `char' +%% node. This includes the leading "`$'" character. +%% Depending on the encoding a character beyond 255 will be escaped +%% ('latin1') or copied as is ('utf8'). +%% +%% @see char/1 + +-type encoding() :: 'utf8' | 'unicode' | 'latin1'. + +-spec char_literal(syntaxTree(), encoding()) -> nonempty_string(). + +char_literal(Node, unicode) -> + io_lib:write_unicode_char(char_value(Node)); +char_literal(Node, utf8) -> + io_lib:write_unicode_char(char_value(Node)); +char_literal(Node, latin1) -> + io_lib:write_unicode_char_as_latin1(char_value(Node)). %% ===================================================================== @@ -1708,6 +1732,7 @@ char_literal(Node) -> %% %% @see string_value/1 %% @see string_literal/1 +%% @see string_literal/2 %% @see is_string/2 %% @see char/1 @@ -1768,13 +1793,32 @@ string_value(Node) -> %% ===================================================================== %% @doc Returns the literal string represented by a `string' %% node. This includes surrounding double-quote characters. +%% Characters beyond 255 will be escaped. %% %% @see string/1 -spec string_literal(syntaxTree()) -> nonempty_string(). string_literal(Node) -> - io_lib:write_string(string_value(Node)). + string_literal(Node, latin1). + + +%% ===================================================================== +%% @doc Returns the literal string represented by a `string' +%% node. This includes surrounding double-quote characters. +%% Depending on the encoding characters beyond 255 will be escaped +%% ('latin1') or copied as is ('utf8'). +%% +%% @see string/1 + +-spec string_literal(syntaxTree(), encoding()) -> nonempty_string(). 
+ +string_literal(Node, utf8) -> + io_lib:write_unicode_string(string_value(Node)); +string_literal(Node, unicode) -> + io_lib:write_unicode_string(string_value(Node)); +string_literal(Node, latin1) -> + io_lib:write_unicode_string_as_latin1(string_value(Node)). %% ===================================================================== diff --git a/lib/syntax_tools/src/erl_tidy.erl b/lib/syntax_tools/src/erl_tidy.erl index 59cf6c0a92..e9a88caff3 100644 --- a/lib/syntax_tools/src/erl_tidy.erl +++ b/lib/syntax_tools/src/erl_tidy.erl @@ -375,6 +375,8 @@ write_module(Tree, Name, Opts) -> end, filename(filename:join(Dir, Name1)) end, + Encoding = [{encoding,Enc} || Enc <- [epp:read_encoding(Name)], + Enc =/= none], case proplists:get_bool(backups, Opts) of true -> backup_file(File, Opts); @@ -382,9 +384,9 @@ write_module(Tree, Name, Opts) -> ok end, Printer = proplists:get_value(printer, Opts), - FD = open_output_file(File), + FD = open_output_file(File, Encoding), verbose("writing to file `~s'.", [File], Opts), - V = (catch {ok, output(FD, Printer, Tree, Opts)}), + V = (catch {ok, output(FD, Printer, Tree, Opts++Encoding)}), ok = file:close(FD), case V of {ok, _} -> @@ -432,8 +434,9 @@ file_type(Name, Links) -> throw(R) end. 
-open_output_file(FName) -> - case catch file:open(FName, [write]) of +open_output_file(FName, Options) -> +io:format("Options ~p~n", [Options]), + case catch file:open(FName, [write]++Options) of {ok, FD} -> FD; {error, R} -> diff --git a/lib/syntax_tools/src/igor.erl b/lib/syntax_tools/src/igor.erl index 37e561cbbe..8abc3f41cb 100644 --- a/lib/syntax_tools/src/igor.erl +++ b/lib/syntax_tools/src/igor.erl @@ -341,10 +341,12 @@ merge(Name, Files) -> merge(Name, Files, Opts) -> Opts1 = Opts ++ ?DEFAULT_MERGE_OPTS, - {Tree, Stubs} = merge_files(Name, Files, Opts1), + {Sources, Enc} = merge_files1(Files, Opts1), + {Tree, Stubs} = merge_sources(Name, Sources, Opts1), Dir = proplists:get_value(dir, Opts1, ""), Filename = proplists:get_value(outfile, Opts1, Name), - File = write_module(Tree, Filename, Dir, Opts1), + Encoding = [{encoding, Enc} || Enc =/= none], + File = write_module(Tree, Filename, Dir, Encoding ++ Opts1), [File | maybe_create_stubs(Stubs, Opts1)]. @@ -459,16 +461,21 @@ merge_files(Name, Files, Options) -> -spec merge_files(atom(), erl_syntax:forms(), [file:filename()], [option()]) -> {erl_syntax:syntaxTree(), [stubDescriptor()]}. -merge_files(_, _Trees, [], _) -> +merge_files(Name, Trees, Files, Opts) -> + {Sources, _Encoding} = merge_files1(Files, Opts), + merge_sources(Name, Trees ++ Sources, Opts). + +merge_files1([], _) -> report_error("no files to merge."), exit(badarg); -merge_files(Name, Trees, Files, Opts) -> +merge_files1(Files, Opts) -> Opts1 = Opts ++ [{includes, ?DEFAULT_INCLUDES}, {macros, ?DEFAULT_MACROS}, {preprocess, false}, comments], - Sources = [read_module(F, Opts1) || F <- Files], - merge_sources(Name, Trees ++ Sources, Opts1). + SourceEncodings = [read_module(F, Opts1) || F <- Files], + {Sources, [Encoding | _]} = lists:unzip(SourceEncodings), + {Sources, Encoding}. 
%% ===================================================================== @@ -2512,7 +2519,11 @@ rename(Files, Renamings, Opts) -> lists:flatmap(fun (F) -> rename_file(F, Dict, Opts1) end, Files). rename_file(File, Dict, Opts) -> - S = read_module(File, Opts), + {S, Enc} = read_module(File, Opts), + %% Try to avoid *two* coding: comments: + Encoding = [{encoding, Enc} || + Enc =/= none, + not proplists:get_bool(comments, Opts)], M = get_module_info(S), Name = M#module.name, Name1 = case dict:find(Name, Dict) of @@ -2526,10 +2537,10 @@ rename_file(File, Dict, Opts) -> Opts1 = [no_headers, {export, [Name]}, {static, [Name]}, - {redirect, dict:to_list(Dict1)}] ++ Opts, + {redirect, dict:to_list(Dict1)}] ++ Encoding ++ Opts, {Tree, Stubs} = merge_sources(Name1, [S], Opts1), Dir = filename:dirname(filename(File)), - File1 = write_module(Tree, Name1, Dir, Opts), + File1 = write_module(Tree, Name1, Dir, Opts++Encoding), %% We create the stub file in the same directory as the source file %% and the target file. @@ -2648,7 +2659,7 @@ error_text(D, Name) -> {L, M, E} when is_integer(L), is_atom(M) -> case catch M:format_error(E) of S when is_list(S) -> - io_lib:fwrite("`~w', line ~w: ~s.", + io_lib:fwrite("`~w', line ~w: ~ts.", [Name, L, S]); _ -> error_text_1(D, Name) @@ -2706,7 +2717,17 @@ open_output_file(FName) -> exit(R) end. -%% read_module(Name, Options) -> syntaxTree() +output_encoding(FD, Opts) -> + case proplists:get_value(encoding, Opts) of + undefined -> + ok = io:setopts(FD, [{encoding, epp:default_encoding()}]); + Encoding -> + ok = io:setopts(FD, [{encoding, Encoding}]), + EncS = epp:encoding_to_string(Encoding), + ok = io:fwrite(FD, <<"%% ~s\n">>, [EncS]) + end. + +%% read_module(Name, Options) -> {syntaxTree(), epp:source_encoding()} %% %% This also tries to locate the real source file, if "Name" does not %% point directly to a particular file. 
@@ -2729,20 +2750,21 @@ read_module(Name, Options) -> read_module_1(Name, Options) -> verbose("reading module `~s'.", [filename(Name)], Options), - Forms = read_module_2(Name, Options), + {Forms, Enc} = read_module_2(Name, Options), case proplists:get_bool(comments, Options) of false -> - Forms; + {Forms, Enc}; true -> Comments = erl_comment_scan:file(Name), - erl_recomment:recomment_forms(Forms, Comments) + {erl_recomment:recomment_forms(Forms, Comments), Enc} end. read_module_2(Name, Options) -> case read_module_3(Name, Options) of {ok, Forms} -> check_forms(Forms, Name), - Forms; + Enc = epp:read_encoding(Name), + {Forms, Enc}; {error, _} = Error -> error_read_file(Name), exit(Error) @@ -2772,7 +2794,7 @@ check_forms([F | Fs], File) -> _ -> "unknown error" end, - report_error("in file `~s' at line ~w:\n ~s", + report_error("in file `~s' at line ~w:\n ~ts", [filename(File), erl_syntax:get_pos(F), S]), exit(error); _ -> @@ -2847,6 +2869,7 @@ write_module(Tree, Name, Dir, Opts) -> end, Printer = proplists:get_value(printer, Opts), FD = open_output_file(File), + ok = output_encoding(FD, Opts), verbose("writing to file `~s'.", [File], Opts), V = (catch {ok, output(FD, Printer, Tree, Opts)}), ok = file:close(FD), -- cgit v1.2.3