aboutsummaryrefslogtreecommitdiffstats
path: root/lib/stdlib/src/epp.erl
diff options
context:
space:
mode:
authorHans Bolinder <[email protected]>2012-10-04 15:58:26 +0200
committerHans Bolinder <[email protected]>2013-01-02 10:15:17 +0100
commit300c5466a7c9cfe3ed22bba2a88ba21058406402 (patch)
treeb8c30800b17d5ae98255de2fd2818d8b5d4d6eba /lib/stdlib/src/epp.erl
parent7a884a31cfcaaf23f7920ba1a006aa2855529030 (diff)
downloadotp-300c5466a7c9cfe3ed22bba2a88ba21058406402.tar.gz
otp-300c5466a7c9cfe3ed22bba2a88ba21058406402.tar.bz2
otp-300c5466a7c9cfe3ed22bba2a88ba21058406402.zip
[stdlib, kernel] Introduce Unicode support for Erlang source files
Expect modifications, additions and corrections. There is a kludge in file_io_server and erl_scan:continuation_location() that's not so pleasing.
Diffstat (limited to 'lib/stdlib/src/epp.erl')
-rw-r--r--lib/stdlib/src/epp.erl211
1 files changed, 194 insertions, 17 deletions
diff --git a/lib/stdlib/src/epp.erl b/lib/stdlib/src/epp.erl
index 2c8d84a9e1..a0f7660ecf 100644
--- a/lib/stdlib/src/epp.erl
+++ b/lib/stdlib/src/epp.erl
@@ -23,13 +23,18 @@
-export([open/2,open/3,open/5,close/1,format_error/1]).
-export([scan_erl_form/1,parse_erl_form/1,macro_defs/1]).
-export([parse_file/1, parse_file/3]).
+-export([default_encoding/0, encoding_to_string/1,
+ read_encoding/1, read_encoding/2, set_encoding/1]).
-export([interpret_file_attribute/1]).
-export([normalize_typed_record_fields/1,restore_typed_record_fields/1]).
%%------------------------------------------------------------------------
+-export_type([source_encoding/0]).
+
-type macros() :: [{atom(), term()}].
-type epp_handle() :: pid().
+-type source_encoding() :: latin1 | utf8.
%% Epp state record.
-record(epp, {file, %Current file
@@ -213,6 +218,173 @@ parse_file(Epp) ->
[{eof,Location}]
end.
+-define(DEFAULT_ENCODING, latin1).
+
+-spec default_encoding() -> source_encoding().
+
+default_encoding() ->
+ ?DEFAULT_ENCODING.
+
+-spec encoding_to_string(Encoding) -> string() when
+ Encoding :: source_encoding().
+
+encoding_to_string(latin1) -> "coding: latin-1";
+encoding_to_string(utf8) -> "coding: utf-8".
+
+-spec read_encoding(FileName) -> source_encoding() | none when
+ FileName :: file:name().
+
+read_encoding(Name) ->
+ read_encoding(Name, []).
+
+-spec read_encoding(FileName, Options) -> source_encoding() | none when
+ FileName :: file:name(),
+ Options :: [Option],
+ Option :: {in_comment_only, boolean()}.
+
+read_encoding(Name, Options) ->
+ InComment = proplists:get_value(in_comment_only, Options, true),
+ case file:open(Name, [read]) of
+ {ok,File} ->
+ try read_encoding_from_file(File, InComment)
+ after ok = file:close(File)
+ end;
+ _Error ->
+ none
+ end.
+
+-spec set_encoding(File) -> source_encoding() | none when
+ File :: io:device(). % pid(); raw files don't work
+
+set_encoding(File) ->
+ Encoding = read_encoding_from_file(File, true),
+ Enc = case Encoding of
+ none -> default_encoding();
+ Encoding -> Encoding
+ end,
+ ok = io:setopts(File, [{encoding, Enc}]),
+ Encoding.
+
+-spec read_encoding_from_file(File, InComment) -> source_encoding() | none when
+ File :: io:device(),
+ InComment :: boolean().
+
+-define(ENC_CHUNK, 32).
+-define(N_ENC_CHUNK, 16). % a total of 512 bytes
+
+read_encoding_from_file(File, InComment) ->
+ {ok, Pos0} = file:position(File, cur),
+ Opts = io:getopts(File),
+ Encoding0 = lists:keyfind(encoding, 1, Opts),
+ Binary0 = lists:keyfind(binary, 1, Opts),
+ ok = io:setopts(File, [binary, {encoding, latin1}]),
+ try
+ {B, Fun} = (reader(File, 0))(),
+ com_nl(B, Fun, 0, InComment)
+ catch
+ throw:no ->
+ none
+ after
+ {ok, Pos0} = file:position(File, Pos0),
+ ok = io:setopts(File, [Binary0, Encoding0])
+ end.
+
+reader(Fd, N) ->
+ fun() when N =:= ?N_ENC_CHUNK ->
+ throw(no);
+ () ->
+ case file:read(Fd, ?ENC_CHUNK) of
+ eof ->
+ {<<>>, reader(Fd, N+1)};
+ {ok, Bin} ->
+ {Bin, reader(Fd, N+1)};
+ {error, _} ->
+ throw(no) % ignore errors
+ end
+ end.
+
+com_nl(_, _, 2, _) ->
+ throw(no);
+com_nl(B, Fun, N, false=Com) ->
+ com_c(B, Fun, N, Com);
+com_nl(B, Fun, N, true=Com) ->
+ com(B, Fun, N, Com).
+
+com(<<"\n",B/binary>>, Fun, N, Com) ->
+ com_nl(B, Fun, N+1, Com);
+com(<<"%", B/binary>>, Fun, N, Com) ->
+ com_c(B, Fun, N, Com);
+com(<<_:1/unit:8,B/binary>>, Fun, N, Com) ->
+ com(B, Fun, N, Com);
+com(<<>>, Fun, N, Com) ->
+ {B, Fun1} = Fun(),
+ com(B, Fun1, N, Com).
+
+com_c(<<"c",B/binary>>, Fun, N, Com) ->
+ com_oding(B, Fun, N, Com);
+com_c(<<"\n",B/binary>>, Fun, N, Com) ->
+ com_nl(B, Fun, N+1, Com);
+com_c(<<_:1/unit:8,B/binary>>, Fun, N, Com) ->
+ com_c(B, Fun, N, Com);
+com_c(<<>>, Fun, N, Com) ->
+ {B, Fun1} = Fun(),
+ com_c(B, Fun1, N, Com).
+
+com_oding(<<"oding",B/binary>>, Fun, N, Com) ->
+ com_sep(B, Fun, N, Com);
+com_oding(B, Fun, N, Com) when byte_size(B) >= length("oding") ->
+ com_c(B, Fun, N, Com);
+com_oding(B, Fun, N, Com) ->
+ {B1, Fun1} = Fun(),
+ com_oding(list_to_binary([B, B1]), Fun1, N, Com).
+
+com_sep(<<":",B/binary>>, Fun, N, Com) ->
+ com_space(B, Fun, N, Com);
+com_sep(<<"=",B/binary>>, Fun, N, Com) ->
+ com_space(B, Fun, N, Com);
+com_sep(<<"\s",B/binary>>, Fun, N, Com) ->
+ com_sep(B, Fun, N, Com);
+com_sep(<<>>, Fun, N, Com) ->
+ {B, Fun1} = Fun(),
+ com_sep(B, Fun1, N, Com);
+com_sep(B, Fun, N, Com) ->
+ com_c(B, Fun, N, Com).
+
+com_space(<<"\s",B/binary>>, Fun, N, Com) ->
+ com_space(B, Fun, N, Com);
+com_space(<<>>, Fun, N, Com) ->
+ {B, Fun1} = Fun(),
+ com_space(B, Fun1, N, Com);
+com_space(B, Fun, N, _Com) ->
+ com_enc(B, Fun, N, [], []).
+
+com_enc(<<C:1/unit:8,B/binary>>, Fun, N, L, Ps) when C >= $a, C =< $z;
+ C >= $A, C =< $Z;
+ C >= $0, C =< $9 ->
+ com_enc(B, Fun, N, [C | L], Ps);
+com_enc(<<>>, Fun, N, L, Ps) ->
+ case Fun() of
+ {<<>>, _} ->
+ com_enc_end([L | Ps]);
+ {B, Fun1} ->
+ com_enc(B, Fun1, N, L, Ps)
+ end;
+com_enc(<<"-",B/binary>>, Fun, N, L, Ps) ->
+ com_enc(B, Fun, N, [], [L | Ps]);
+com_enc(_B, _Fun, _N, L, Ps) ->
+ com_enc_end([L | Ps]).
+
+com_enc_end(Ps0) ->
+ Ps = lists:reverse([lists:reverse(string:to_lower(P)) || P <- Ps0]),
+ com_encoding(Ps).
+
+com_encoding(["latin","1"|_]) ->
+ latin1;
+com_encoding(["utf","8"|_]) ->
+ utf8;
+com_encoding(_) ->
+ throw(no). % Don't try any further
+
normalize_typed_record_fields([]) ->
{typed, []};
normalize_typed_record_fields(Fields) ->
@@ -266,14 +438,17 @@ init_server(Pid, Name, File, AtLocation, Path, Pdm, Pre) ->
Ms0 = predef_macros(Name),
case user_predef(Pdm, Ms0) of
{ok,Ms1} ->
- epp_reply(Pid, {ok,self()}),
- %% ensure directory of current source file is first in path
+ _ = set_encoding(File),
+ epp_reply(Pid, {ok,self()}),
+ %% ensure directory of current source file is
+ %% first in path
Path1 = [filename:dirname(Name) | Path],
- St = #epp{file=File, location=AtLocation, delta=0, name=Name,
- name2=Name, path=Path1, macs=Ms1, pre_opened = Pre},
- From = wait_request(St),
- enter_file_reply(From, Name, AtLocation, AtLocation),
- wait_req_scan(St);
+ St = #epp{file=File, location=AtLocation, delta=0,
+ name=Name, name2=Name, path=Path1, macs=Ms1,
+ pre_opened = Pre},
+ From = wait_request(St),
+ enter_file_reply(From, Name, AtLocation, AtLocation),
+ wait_req_scan(St);
{error,E} ->
epp_reply(Pid, {error,E})
end.
@@ -385,19 +560,20 @@ enter_file(NewName, Inc, From, St) ->
%% enter_file2(File, FullName, From, EppState, AtLocation) -> EppState.
%% Set epp to use this file and "enter" it.
-enter_file2(NewF, Pname, From, St, AtLocation) ->
+enter_file2(NewF, Pname, From, St0, AtLocation) ->
Loc = start_loc(AtLocation),
enter_file_reply(From, Pname, Loc, AtLocation),
- Ms = dict:store({atom,'FILE'}, {none,[{string,Loc,Pname}]}, St#epp.macs),
+ Ms = dict:store({atom,'FILE'}, {none,[{string,Loc,Pname}]}, St0#epp.macs),
%% update the head of the include path to be the directory of the new
%% source file, so that an included file can always include other files
%% relative to its current location (this is also how C does it); note
%% that the directory of the parent source file (the previous head of
%% the path) must be dropped, otherwise the path used within the current
%% file will depend on the order of file inclusions in the parent files
- Path = [filename:dirname(Pname) | tl(St#epp.path)],
+ Path = [filename:dirname(Pname) | tl(St0#epp.path)],
+ _ = set_encoding(NewF),
#epp{file=NewF,location=Loc,name=Pname,delta=0,
- sstk=[St|St#epp.sstk],path=Path,macs=Ms}.
+ sstk=[St0|St0#epp.sstk],path=Path,macs=Ms}.
enter_file_reply(From, Name, Location, AtLocation) ->
Attr = loc_attr(AtLocation),
@@ -456,7 +632,7 @@ leave_file(From, St) ->
%% scan_toks(Tokens, From, EppState)
scan_toks(From, St) ->
- case io:scan_erl_form(St#epp.file, '', St#epp.location) of
+ case io:scan_erl_form(St#epp.file, '', St#epp.location, [unicode]) of
{ok,Toks,Cl} ->
scan_toks(Toks, From, St#epp{location=Cl});
{error,E,Cl} ->
@@ -830,7 +1006,7 @@ new_location(Ln, {Le,_}, {Lf,_}) ->
%% nested conditionals and repeated 'else's.
skip_toks(From, St, [I|Sis]) ->
- case io:scan_erl_form(St#epp.file, '', St#epp.location) of
+ case io:scan_erl_form(St#epp.file, '', St#epp.location, [unicode]) of
{ok,[{'-',_Lh},{atom,_Li,ifdef}|_Toks],Cl} ->
skip_toks(From, St#epp{location=Cl}, [ifdef,I|Sis]);
{ok,[{'-',_Lh},{atom,_Li,ifndef}|_Toks],Cl} ->
@@ -1094,6 +1270,7 @@ expand_arg([], Ts, L, Rest, Bs) ->
%%% tokenized would yield the token list Ts.
%% erl_scan:token_info(T, text) is not backward compatible with this.
+%% Note that escaped characters will be replaced by themselves.
token_src({dot, _}) ->
".";
token_src({X, _}) when is_atom(X) ->
@@ -1101,16 +1278,16 @@ token_src({X, _}) when is_atom(X) ->
token_src({var, _, X}) ->
atom_to_list(X);
token_src({char,_,C}) ->
- io_lib:write_char(C);
+ io_lib:write_unicode_char(C);
token_src({string, _, X}) ->
- lists:flatten(io_lib:format("~p", [X]));
+ io_lib:write_unicode_string(X);
token_src({_, _, X}) ->
- lists:flatten(io_lib:format("~w", [X])).
+ io_lib:format("~w", [X]).
stringify1([]) ->
[];
stringify1([T | Tokens]) ->
- [io_lib:format(" ~s", [token_src(T)]) | stringify1(Tokens)].
+ [io_lib:format(" ~ts", [token_src(T)]) | stringify1(Tokens)].
stringify(Ts, L) ->
[$\s | S] = lists:flatten(stringify1(Ts)),