aboutsummaryrefslogtreecommitdiffstats
path: root/lib/compiler/src/core_scan.erl
diff options
context:
space:
mode:
Diffstat (limited to 'lib/compiler/src/core_scan.erl')
-rw-r--r--lib/compiler/src/core_scan.erl468
1 files changed, 468 insertions, 0 deletions
diff --git a/lib/compiler/src/core_scan.erl b/lib/compiler/src/core_scan.erl
new file mode 100644
index 0000000000..5aab8ae855
--- /dev/null
+++ b/lib/compiler/src/core_scan.erl
@@ -0,0 +1,468 @@
+%%
+%% %CopyrightBegin%
+%%
+%% Copyright Ericsson AB 2000-2009. All Rights Reserved.
+%%
+%% The contents of this file are subject to the Erlang Public License,
+%% Version 1.1, (the "License"); you may not use this file except in
+%% compliance with the License. You should have received a copy of the
+%% Erlang Public License along with this software. If not, it can be
+%% retrieved online at http://www.erlang.org/.
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and limitations
+%% under the License.
+%%
+%% %CopyrightEnd%
+%%
+%% Purpose: Scanner for Core Erlang.
+
+%% For handling ISO 8859-1 (Latin-1) we use the following type
+%% information:
+%%
+%% 000 - 037 NUL - US control
+%% 040 - 057 SPC - / punctuation
+%% 060 - 071 0 - 9 digit
+%% 072 - 100 : - @ punctuation
+%% 101 - 132 A - Z uppercase
+%% 133 - 140 [ - ` punctuation
+%% 141 - 172 a - z lowercase
+%% 173 - 176 { - ~ punctuation
+%% 177 DEL control
+%% 200 - 237 control
+%% 240 - 277 NBSP - ¿ punctuation
+%% 300 - 326 À - Ö uppercase
+%% 327 × punctuation
+%% 330 - 336 Ø - Þ uppercase
+%% 337 - 366 ß - ö lowercase
+%% 367 ÷ punctuation
+%% 370 - 377 ø - ÿ lowercase
+%%
+%% Many punctuation characters have special meaning. Must watch out
+%% when using × \327, which looks very close to x \170.
+
+-module(core_scan).
+
+-export([string/1, string/2, format_error/1]).
+
+-import(lists, [reverse/1]).
+
+%% string(Chars) -> Result
+%% string(Chars, StartPos) -> Result
+%%      Result = {ok,[Tok],EndPos} |
+%%               {error,{Pos,core_scan,What},EndPos}
+%%  Tokenise a complete character list. string/1 starts counting
+%%  positions at 1; the position is advanced at every newline.
+
+string(Chars) ->
+    string(Chars, 1).
+
+string(Chars, StartPos) ->
+    %% First the pre-scan pass (which replaces comments and makes sure
+    %% quoted constructs are terminated), then the real tokeniser on the
+    %% characters the pre-scan collected in reverse order.
+    case string_pre_scan(Chars, [], StartPos) of
+        {done,_Rest,Scanned,EndPos} ->
+            case scan(lists:reverse(Scanned), StartPos) of
+                {error,Reason} -> {error,Reason,EndPos};
+                {ok,Tokens} -> {ok,Tokens,EndPos}
+            end;
+        PreScanError ->
+            %% Already of the form {error,ErrorInfo,EndPos}.
+            PreScanError
+    end.
+
+%% string_pre_scan(Chars, SoFar0, StartPos) ->
+%%      {done,Rest,SoFar,EndPos} | {error,E,EndPos}.
+%%  Run pre_scan/3 over a complete string. When the pre-scanner asks
+%%  for more input, 'eof' is attached to what it handed back and the
+%%  pre-scan is resumed from there.
+
+string_pre_scan(Chars, Acc0, StartPos) ->
+    case pre_scan(Chars, Acc0, StartPos) of
+        {more,Pending,Acc1,Pos} ->
+            %% Pending is either [] or a short list that may start with a
+            %% re-entry token; 'eof' deliberately becomes its tail.
+            string_pre_scan(Pending ++ eof, Acc1, Pos);
+        DoneOrError ->
+            DoneOrError
+    end.
+
+%% format_error(ErrorDescriptor) -> iolist()
+%%  Turn the descriptor part of a scanner error into readable text.
+%%  Unknown descriptors are printed as plain terms.
+
+-spec format_error(term()) -> iolist().
+
+format_error({string,Quote,Head}) ->
+    [lists:append(["unterminated ", string_thing(Quote),
+                   " starting with ", io_lib:write_string(Head, Quote)])];
+format_error({illegal,Type}) ->
+    io_lib:fwrite("illegal ~w", [Type]);
+format_error({base,Base}) ->
+    io_lib:fwrite("illegal base '~w'", [Base]);
+format_error(char) ->
+    "unterminated character";
+format_error(scan) ->
+    "premature end";
+format_error(float) ->
+    "bad float";
+format_error(Other) ->
+    io_lib:write(Other).
+
+%% string_thing(QuoteChar) -> Name
+%%  Name, for error messages, of the construct delimited by QuoteChar.
+
+string_thing($\') -> "atom";
+string_thing($\") -> "string".
+
+%% Re-entrant pre-scanner.
+%%
+%% If the input list of characters is insufficient to build a term the
+%% scanner returns a request for more characters and a continuation to be
+%% used when trying to build a term with more characters. To indicate
+%% end-of-file the input character list should be replaced with 'eof'
+%% as an empty list has meaning.
+%%
+%% When more characters are needed inside a comment, string or quoted
+%% atom, which can become rather long, instead of pushing the
+%% characters read so far back onto RestChars to be reread, a special
+%% reentry token is returned indicating the middle of a construct.
+%% The token is the start character as an atom, '%', '"' and '\''.
+
+%% pre_scan(Chars, SoFar, StartPos) ->
+%%      {done,RestChars,ScannedChars,NewPos} |
+%%      {more,RestChars,ScannedChars,NewPos} |
+%%      {error,{ErrorPos,core_scan,Description},NewPos}.
+%%  Entry point of the pre-scanner. It only takes the first element off
+%%  the input and hands it to pre_scan/4; an empty list is a request
+%%  for more input and 'eof' finishes the pre-scan. The split into two
+%%  functions is for efficiency; with a good indexing compiler it would
+%%  be unnecessary.
+
+pre_scan(eof, Acc, Pos) ->
+    {done,eof,Acc,Pos};
+pre_scan([], Acc, Pos) ->
+    {more,[],Acc,Pos};
+pre_scan([First|Rest], Acc, Pos) ->
+    pre_scan(First, Rest, Acc, Pos).
+
+%% pre_scan(Char, [Char], SoFar, Pos)
+%%  Dispatch on the first element of the input. Besides an ordinary
+%%  character this can be a re-entry token ({'\'',StartPos},
+%%  {'"',StartPos} or '%') that an earlier {more,...} return left at
+%%  the head of RestChars. Scanned characters are pushed onto SoFar,
+%%  so SoFar is in reverse order. Pos is incremented at each newline.
+
+pre_scan($$, Cs0, SoFar0, Pos) ->
+ %% Character constant: copy the character (or its escape) verbatim.
+ case pre_char(Cs0, [$$|SoFar0]) of
+ {Cs,SoFar} ->
+ pre_scan(Cs, SoFar, Pos);
+ more ->
+ %% Put the $ back in front of the unread input; SoFar0 is unchanged.
+ {more,[$$|Cs0],SoFar0, Pos};
+ error ->
+ pre_error(char, Pos, Pos)
+ end;
+pre_scan($', Cs, SoFar, Pos) ->
+ %% Start of a quoted atom; Pos is remembered as its start position.
+ pre_string(Cs, $', '\'', Pos, [$'|SoFar], Pos);
+pre_scan({'\'',Sp}, Cs, SoFar, Pos) -> %Re-entering quoted atom
+ pre_string(Cs, $', '\'', Sp, SoFar, Pos);
+pre_scan($", Cs, SoFar, Pos) ->
+ %% Start of a string; Pos is remembered as its start position.
+ pre_string(Cs, $", '"', Pos, [$"|SoFar], Pos);
+pre_scan({'"',Sp}, Cs, SoFar, Pos) -> %Re-entering string
+ pre_string(Cs, $", '"', Sp, SoFar, Pos);
+pre_scan($%, Cs, SoFar, Pos) ->
+ %% Start of a comment; the % itself is not copied to SoFar.
+ pre_comment(Cs, SoFar, Pos);
+pre_scan('%', Cs, SoFar, Pos) -> %Re-entering comment
+ pre_comment(Cs, SoFar, Pos);
+pre_scan($\n, Cs, SoFar, Pos) ->
+ pre_scan(Cs, [$\n|SoFar], Pos+1);
+pre_scan(C, Cs, SoFar, Pos) ->
+ %% Any other character is copied unchanged.
+ pre_scan(Cs, [C|SoFar], Pos).
+
+%% pre_string(Chars, Quote, Reent, StartPos, SoFar, Pos)
+%%  Copy the characters of a string or quoted atom onto SoFar, up to
+%%  and including the closing Quote, then continue with pre_scan/3.
+%%  Reent is the re-entry token name and StartPos the position where
+%%  the construct started; together they are returned as {Reent,StartPos}
+%%  at the head of RestChars when the input runs out. Reaching 'eof'
+%%  inside the construct is an error.
+
+pre_string([Quote|Rest], Quote, _Reent, _Start, Acc, Pos) ->
+    pre_scan(Rest, [Quote|Acc], Pos);
+pre_string([$\n|Rest], Quote, Reent, Start, Acc, Pos) ->
+    pre_string(Rest, Quote, Reent, Start, [$\n|Acc], Pos + 1);
+pre_string([$\\|Rest0], Quote, Reent, Start, Acc0, Pos) ->
+    case pre_escape(Rest0, Acc0) of
+        more ->
+            %% Keep the backslash in the unread input so that the
+            %% escape is seen again on re-entry.
+            {more,[{Reent,Start},$\\|Rest0],Acc0,Pos};
+        error ->
+            pre_string_error(Quote, Start, Acc0, Pos);
+        {Rest,Acc} ->
+            pre_string(Rest, Quote, Reent, Start, Acc, Pos)
+    end;
+pre_string([Ch|Rest], Quote, Reent, Start, Acc, Pos) ->
+    pre_string(Rest, Quote, Reent, Start, [Ch|Acc], Pos);
+pre_string([], _Quote, Reent, Start, Acc, Pos) ->
+    {more,[{Reent,Start}],Acc,Pos};
+pre_string(eof, Quote, _Reent, Start, Acc, Pos) ->
+    pre_string_error(Quote, Start, Acc, Pos).
+
+%% pre_string_error(Quote, StartPos, SoFar, Pos) -> {error,ErrorInfo,Pos}
+%%  Build the error for an unterminated string or quoted atom. SoFar
+%%  holds the scanned characters in reverse order, so everything in
+%%  front of the nearest Quote is what was read after it. At most the
+%%  first 16 of those characters are reported. The error position is
+%%  StartPos.
+
+pre_string_error(Q, Sp, SoFar, Pos) ->
+    %% lists:takewhile/2 and lists:sublist/2 replace the obsolete
+    %% string:chr/2 and string:substr/3, and cannot be handed a
+    %% negative length.
+    Body = lists:reverse(lists:takewhile(fun (C) -> C =/= Q end, SoFar)),
+    pre_error({string,Q,lists:sublist(Body, 16)}, Sp, Pos).
+
+%% pre_char(Chars, SoFar) -> {RestChars,SoFar1} | more | error
+%%  Copy the character following a $ onto SoFar. A backslash is
+%%  handed to pre_escape/2. 'more' means the input ran out, 'error'
+%%  that 'eof' was reached.
+
+pre_char(eof, _Acc) -> error;
+pre_char([], _Acc) -> more;
+pre_char([Ch|Rest], Acc) -> pre_char(Ch, Rest, Acc).
+
+pre_char(Ch, Rest, Acc) when Ch =:= $\\ ->
+    pre_escape(Rest, Acc);
+pre_char(Ch, Rest, Acc) ->
+    {Rest,[Ch|Acc]}.
+
+%% pre_escape(Chars, SoFar) -> {RestChars,SoFar1} | more | error
+%%  Chars is the input just after a backslash. The backslash and the
+%%  escaped character (for \^X, both the ^ and the X) are pushed onto
+%%  SoFar. 'more' means the input ran out, 'error' that 'eof' was
+%%  reached.
+
+pre_escape(eof, _Acc) -> error;
+pre_escape([], _Acc) -> more;
+pre_escape([$^|AfterCaret], Acc) ->
+    case AfterCaret of
+        eof -> error;
+        [] -> more;
+        [Ctl|Rest] -> {Rest,[Ctl,$^,$\\|Acc]}
+    end;
+pre_escape([Ch|Rest], Acc) ->
+    {Rest,[Ch,$\\|Acc]}.
+
+%% pre_comment(Chars, SoFar, Pos)
+%%  Skip the rest of a comment. The comment is replaced by one SPACE
+%%  in SoFar; a terminating newline is kept and advances Pos. When the
+%%  input runs out the re-entry token '%' is returned as RestChars.
+
+pre_comment(eof, Acc, Pos) ->
+    pre_scan(eof, [$\s|Acc], Pos);
+pre_comment([], Acc, Pos) ->
+    {more,['%'],Acc,Pos};
+pre_comment([$\n|Rest], Acc, Pos) ->
+    pre_scan(Rest, [$\n,$\s|Acc], Pos + 1);
+pre_comment([_Skipped|Rest], Acc, Pos) ->
+    pre_comment(Rest, Acc, Pos).
+
+%% pre_error(Description, ErrorPos, Pos) -> {error,ErrorInfo,Pos}
+%%  Build the pre-scanner's error return value.
+
+pre_error(What, ErrorPos, Pos) ->
+    ErrorInfo = {ErrorPos,core_scan,What},
+    {error,ErrorInfo,Pos}.
+
+%% scan(CharList, StartPos) ->
+%%      {ok,[Tok]} | {error,{ErrorPos,core_scan,What}}
+%%  Tokenise a list of characters.
+%%
+%%  Tokens are pushed onto a stack (a reversed list) to avoid
+%%  appending; the stack is reversed once all tokens have been
+%%  collected. Most tokens are built in the same way.
+
+scan(Chars, StartPos) ->
+    EmptyStack = [],
+    scan1(Chars, EmptyStack, StartPos).
+
+%% scan1(Characters, TokenStack, Position) ->
+%%      {ok,[Tok]} | {error,{ErrorPos,core_scan,What}}
+%%  Scan a list of characters into tokens. Tokens are pushed onto
+%%  TokenStack and the stack is reversed when the input is exhausted.
+%%  Position is incremented at every newline.
+%%
+%%  The Latin-1 letter ranges are written as octal escapes so that
+%%  they do not depend on the encoding of this file:
+%%      \337 - \377 except \367    lowercase
+%%      \300 - \336 except \327    uppercase
+
+scan1([$\n|Cs], Toks, Pos) ->                           %Skip newline
+    scan1(Cs, Toks, Pos+1);
+scan1([C|Cs], Toks, Pos) when C >= $\000, C =< $\s ->   %Skip control chars
+    scan1(Cs, Toks, Pos);
+scan1([C|Cs], Toks, Pos) when C >= $\200, C =< $\240 ->
+    scan1(Cs, Toks, Pos);
+scan1([C|Cs], Toks, Pos) when C >= $a, C =< $z ->       %Keywords
+    scan_key_word(C, Cs, Toks, Pos);
+scan1([C|Cs], Toks, Pos) when C >= $\337, C =< $\377, C /= $\367 ->
+    scan_key_word(C, Cs, Toks, Pos);
+scan1([C|Cs], Toks, Pos) when C >= $A, C =< $Z ->       %Variables
+    scan_variable(C, Cs, Toks, Pos);
+scan1([C|Cs], Toks, Pos) when C >= $\300, C =< $\336, C /= $\327 ->
+    scan_variable(C, Cs, Toks, Pos);
+scan1([C|Cs], Toks, Pos) when C >= $0, C =< $9 ->       %Numbers
+    scan_number(C, Cs, Toks, Pos);
+scan1([$-,C|Cs], Toks, Pos) when C >= $0, C =< $9 ->    %Signed numbers
+    scan_signed_number($-, C, Cs, Toks, Pos);
+scan1([$+,C|Cs], Toks, Pos) when C >= $0, C =< $9 ->    %Signed numbers
+    scan_signed_number($+, C, Cs, Toks, Pos);
+scan1([$_|Cs], Toks, Pos) ->                            %_ variables
+    scan_variable($_, Cs, Toks, Pos);
+scan1([$$|Cs0], Toks, Pos) ->                           %Character constant
+    {C,Cs,Pos1} = scan_char(Cs0, Pos),
+    scan1(Cs, [{char,Pos,C}|Toks], Pos1);
+scan1([$'|Cs0], Toks, Pos) ->                           %Atom (always quoted)
+    {S,Cs1,Pos1} = scan_string(Cs0, $', Pos),
+    case catch list_to_atom(S) of
+        A when is_atom(A) ->
+            scan1(Cs1, [{atom,Pos,A}|Toks], Pos1);
+        _Error -> scan_error({illegal,atom}, Pos)
+    end;
+scan1([$"|Cs0], Toks, Pos) ->                           %String
+    {S,Cs1,Pos1} = scan_string(Cs0, $", Pos),
+    scan1(Cs1, [{string,Pos,S}|Toks], Pos1);
+%% Punctuation characters and operators, first recognise multiples.
+scan1("->" ++ Cs, Toks, Pos) ->
+    scan1(Cs, [{'->',Pos}|Toks], Pos);
+scan1("-|" ++ Cs, Toks, Pos) ->
+    scan1(Cs, [{'-|',Pos}|Toks], Pos);
+scan1([C|Cs], Toks, Pos) ->                             %Punctuation character
+    P = list_to_atom([C]),
+    scan1(Cs, [{P,Pos}|Toks], Pos);
+scan1([], Toks0, _) ->
+    Toks = reverse(Toks0),
+    {ok,Toks}.
+
+%% scan_key_word(FirstChar, CharList, Tokens, Pos)
+%%  Collect the name characters following FirstChar, push the token
+%%  {Name,Pos} and continue scanning. If the name cannot be made
+%%  into an atom the result is an {illegal,atom} error at Pos.
+
+scan_key_word(C, Cs0, Toks, Pos) ->
+    {Wcs,Cs} = scan_name(Cs0, []),
+    %% try ... of keeps the continued scan outside the protected part,
+    %% so only a failing list_to_atom/1 is turned into a scan error.
+    try list_to_atom([C|lists:reverse(Wcs)]) of
+        Name -> scan1(Cs, [{Name,Pos}|Toks], Pos)
+    catch
+        error:_ -> scan_error({illegal,atom}, Pos)
+    end.
+
+%% scan_variable(FirstChar, CharList, Tokens, Pos)
+%%  Collect the name characters following FirstChar, push the token
+%%  {var,Pos,Name} and continue scanning. If the name cannot be made
+%%  into an atom the result is an {illegal,var} error at Pos.
+
+scan_variable(C, Cs0, Toks, Pos) ->
+    {Wcs,Cs} = scan_name(Cs0, []),
+    %% try ... of keeps the continued scan outside the protected part,
+    %% so only a failing list_to_atom/1 is turned into a scan error.
+    try list_to_atom([C|lists:reverse(Wcs)]) of
+        Name -> scan1(Cs, [{var,Pos,Name}|Toks], Pos)
+    catch
+        error:_ -> scan_error({illegal,var}, Pos)
+    end.
+
+%% scan_name(Chars, NameStack) -> {NameStack1,RestChars}
+%%  Move the leading name characters of Chars onto NameStack, so that
+%%  they end up in reverse order on top of what was already there, and
+%%  return the new stack together with the unread input.
+
+scan_name(Cs, Ncs) ->
+    {Name,Rest} = lists:splitwith(fun name_char/1, Cs),
+    {lists:reverse(Name, Ncs),Rest}.
+
+%% name_char(Char) -> boolean()
+%%  True for the characters allowed inside a name: ASCII and Latin-1
+%%  letters, digits, '_' and '@'. The Latin-1 ranges are written as
+%%  octal escapes so that they do not depend on the encoding of this
+%%  file:
+%%      \337 - \377 except \367    lowercase
+%%      \300 - \336 except \327    uppercase
+
+name_char(C) when C >= $a, C =< $z -> true;
+name_char(C) when C >= $\337, C =< $\377, C /= $\367 -> true;
+name_char(C) when C >= $A, C =< $Z -> true;
+name_char(C) when C >= $\300, C =< $\336, C /= $\327 -> true;
+name_char(C) when C >= $0, C =< $9 -> true;
+name_char($_) -> true;
+name_char($@) -> true;
+name_char(_) -> false.
+
+%% scan_string(CharList, QuoteChar, Pos) -> {StringChars,RestChars,NewPos}.
+%%  Read up to the closing QuoteChar, translating escape sequences.
+%%  The quote itself is consumed but not part of StringChars. There
+%%  is no clause for running out of input before the closing quote.
+
+scan_string(Cs, Q, Pos) ->
+    scan_string(Cs, [], Q, Pos).
+
+scan_string([Q|Rest], Acc, Q, Pos) ->
+    {lists:reverse(Acc),Rest,Pos};
+scan_string([$\n|Rest], Acc, Q, Pos) ->
+    scan_string(Rest, [$\n|Acc], Q, Pos + 1);
+scan_string([$\\|Rest0], Acc, Q, Pos0) ->
+    {Ch,Rest,Pos} = scan_escape(Rest0, Pos0),
+    scan_string(Rest, [Ch|Acc], Q, Pos);
+scan_string([Ch|Rest], Acc, Q, Pos) ->
+    scan_string(Rest, [Ch|Acc], Q, Pos).
+
+%% scan_char(Chars, Pos) -> {Char,RestChars,NewPos}.
+%%  Read the single character of a character constant, translating an
+%%  escape sequence if there is one. A literal newline advances the
+%%  position. Errors have already been checked for by the pre-scan
+%%  phase.
+
+scan_char([$\\|Escaped], Pos) ->
+    scan_escape(Escaped, Pos);
+scan_char([Ch|Rest], Pos) when Ch =:= $\n ->
+    {Ch,Rest,Pos + 1};
+scan_char([Ch|Rest], Pos) ->
+    {Ch,Rest,Pos}.
+
+%% scan_escape(Chars, Pos) -> {Char,RestChars,NewPos}.
+%%  Chars is the input just after a backslash. Recognised forms:
+%%      one to three octal digits   the character with that code
+%%      ^X                          X band 31 (CTL-X)
+%%      a newline                   the newline itself, Pos advanced
+%%      anything else               see escape_char/1
+%%  The longest run of octal digits (at most three) is taken.
+
+scan_escape([D1,D2,D3|Rest], Pos) when
+      D1 >= $0, D1 =< $7, D2 >= $0, D2 =< $7, D3 >= $0, D3 =< $7 ->
+    {((D1 - $0) * 8 + (D2 - $0)) * 8 + (D3 - $0),Rest,Pos};
+scan_escape([D1,D2|Rest], Pos) when
+      D1 >= $0, D1 =< $7, D2 >= $0, D2 =< $7 ->
+    {(D1 - $0) * 8 + (D2 - $0),Rest,Pos};
+scan_escape([D1|Rest], Pos) when D1 >= $0, D1 =< $7 ->
+    {D1 - $0,Rest,Pos};
+scan_escape([$^,Ch|Rest], Pos) ->
+    {Ch band 31,Rest,Pos};
+scan_escape([$\n|Rest], Pos) ->
+    {$\n,Rest,Pos + 1};
+scan_escape([Ch|Rest], Pos) ->
+    {escape_char(Ch),Rest,Pos}.
+
+%% escape_char(Char) -> Char1
+%%  The character denoted by \Char for the single-letter escapes.
+%%  Any other character stands for itself.
+
+escape_char(C) ->
+    case C of
+        $n -> $\n;                              %\n = LF
+        $r -> $\r;                              %\r = CR
+        $t -> $\t;                              %\t = TAB
+        $v -> $\v;                              %\v = VT
+        $b -> $\b;                              %\b = BS
+        $f -> $\f;                              %\f = FF
+        $e -> $\e;                              %\e = ESC
+        $s -> $\s;                              %\s = SPC
+        $d -> $\d;                              %\d = DEL
+        _ -> C
+    end.
+
+%% scan_number(Char, CharList, TokenStack, Pos)
+%% We can handle simple radix notation:
+%% <digit>#<digits> - the digits read in that base
+%% <digits> - the digits in base 10
+%% <digits>.<digits>
+%% <digits>.<digits>E+-<digits>
+%%
+%% Except for explicitly based integers we build a list of all the
+%% characters and then use list_to_integer/1 or list_to_float/1 to
+%% generate the value.
+
+%% SPos == Start position
+%% CPos == Current position
+
+%% Scan an unsigned number whose first digit is C. The digits are
+%% collected on a stack, i.e. in reverse order.
+scan_number(C, Cs0, Toks, Pos) ->
+    {Digits,Rest,CPos} = scan_integer(Cs0, [C], Pos),
+    scan_after_int(Rest, Digits, Toks, Pos, CPos).
+
+%% Scan a number with sign character S and first digit C. The sign
+%% is the bottom element of the digit stack.
+scan_signed_number(S, C, Cs0, Toks, Pos) ->
+    {Digits,Rest,CPos} = scan_integer(Cs0, [C,S], Pos),
+    scan_after_int(Rest, Digits, Toks, Pos, CPos).
+
+%% scan_integer(Chars, Stack, Pos) -> {Stack1,RestChars,Pos}
+%%  Push the leading decimal digits of Chars onto Stack.
+scan_integer([D|Rest], Acc, Pos) when $0 =< D, D =< $9 ->
+    scan_integer(Rest, [D|Acc], Pos);
+scan_integer(Rest, Acc, Pos) ->
+    {Acc,Rest,Pos}.
+
+%% scan_after_int(Chars, NumberCharStack, TokenStack, StartPos, CurPos)
+%%  Decide what the digits read so far are: the integer part of a
+%%  float (a '.' and a digit follow), the base of a based integer
+%%  (a '#' follows; the base must be 2..16) or a complete integer.
+
+scan_after_int([$.,D|Cs0], Ncs0, Toks, SPos, CPos0) when $0 =< D, D =< $9 ->
+    {Ncs,Cs,CPos} = scan_integer(Cs0, [D,$.|Ncs0], CPos0),
+    scan_after_fraction(Cs, Ncs, Toks, SPos, CPos);
+scan_after_int([$#|Cs], Ncs, Toks, SPos, CPos) ->
+    Base = list_to_integer(lists:reverse(Ncs)),
+    if
+        Base >= 2, Base =< 16 ->
+            scan_based_int(Cs, 0, Base, Toks, SPos, CPos);
+        true ->
+            scan_error({base,Base}, CPos)
+    end;
+scan_after_int(Cs, Ncs, Toks, SPos, CPos) ->
+    Value = list_to_integer(lists:reverse(Ncs)),
+    scan1(Cs, [{integer,SPos,Value}|Toks], CPos).
+
+%% scan_based_int(Chars, SoFar, Base, TokenStack, StartPos, CurPos)
+%%  Accumulate the digits of a Base#Digits integer into SoFar. The
+%%  first character that is not a digit valid in Base ends the number;
+%%  the {integer,StartPos,SoFar} token is pushed and scanning goes on
+%%  from that character.
+
+scan_based_int([C|Cs] = Input, SoFar, Base, Toks, SPos, CPos) ->
+    case based_digit(C) of
+        Digit when Digit =/= none, Digit < Base ->
+            scan_based_int(Cs, SoFar * Base + Digit, Base, Toks, SPos, CPos);
+        _ ->
+            scan1(Input, [{integer,SPos,SoFar}|Toks], CPos)
+    end;
+scan_based_int(Cs, SoFar, _Base, Toks, SPos, CPos) ->
+    scan1(Cs, [{integer,SPos,SoFar}|Toks], CPos).
+
+%% based_digit(Char) -> Value | none
+%%  Value of Char as a digit in base 16 or lower; both cases of a-f
+%%  are accepted.
+based_digit(C) when C >= $0, C =< $9 -> C - $0;
+based_digit(C) when C >= $a, C =< $f -> C - $a + 10;
+based_digit(C) when C >= $A, C =< $F -> C - $A + 10;
+based_digit(_) -> none.
+
+%% scan_after_fraction(Chars, NumberCharStack, TokenStack, StartPos, CurPos)
+%%  Called after <digits>.<digits> has been read. An 'E' or 'e'
+%%  starts an exponent (always stacked as 'E'); otherwise the float is
+%%  complete and the {float,StartPos,Value} token is pushed. A number
+%%  list_to_float/1 rejects gives an {illegal,float} error at StartPos.
+
+scan_after_fraction([$E|Cs], Ncs, Toks, SPos, CPos) ->
+    scan_exponent(Cs, [$E|Ncs], Toks, SPos, CPos);
+scan_after_fraction([$e|Cs], Ncs, Toks, SPos, CPos) ->
+    scan_exponent(Cs, [$E|Ncs], Toks, SPos, CPos);
+scan_after_fraction(Cs, Ncs, Toks, SPos, CPos) ->
+    %% try ... of keeps the continued scan outside the protected part,
+    %% so only a failing list_to_float/1 is turned into a scan error.
+    try list_to_float(lists:reverse(Ncs)) of
+        N -> scan1(Cs, [{float,SPos,N}|Toks], CPos)
+    catch
+        error:_ -> scan_error({illegal,float}, SPos)
+    end.
+
+%% scan_exponent(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
+%%  Called after the 'E' of a float. An optional sign is stacked and
+%%  the exponent digits are then read by scan_exponent1/5, which
+%%  generates the error if no digits follow.
+
+scan_exponent([Sign|Cs], Ncs, Toks, SPos, CPos) when Sign =:= $+; Sign =:= $- ->
+    scan_exponent1(Cs, [Sign|Ncs], Toks, SPos, CPos);
+scan_exponent(Cs, Ncs, Toks, SPos, CPos) ->
+    scan_exponent1(Cs, Ncs, Toks, SPos, CPos).
+
+%% scan_exponent1(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
+%%  Read the exponent digits and push the {float,StartPos,Value}
+%%  token. No digit at all is a 'float' error at CurPos; a number
+%%  list_to_float/1 rejects is an {illegal,float} error at StartPos.
+
+scan_exponent1([C|Cs0], Ncs0, Toks, SPos, CPos) when C >= $0, C =< $9 ->
+    {Ncs,Cs,CPos1} = scan_integer(Cs0, [C|Ncs0], CPos),
+    %% try ... of keeps the continued scan outside the protected part,
+    %% so only a failing list_to_float/1 is turned into a scan error.
+    try list_to_float(lists:reverse(Ncs)) of
+        N -> scan1(Cs, [{float,SPos,N}|Toks], CPos1)
+    catch
+        error:_ -> scan_error({illegal,float}, SPos)
+    end;
+scan_exponent1(_, _, _, _, CPos) ->
+    scan_error(float, CPos).
+
+%% scan_error(Description, Pos) -> {error,ErrorInfo}
+%%  Build the tokeniser's error return value.
+scan_error(What, Pos) ->
+    ErrorInfo = {Pos,core_scan,What},
+    {error,ErrorInfo}.