1 files changed, 468 insertions, 0 deletions
diff --git a/lib/compiler/src/core_scan.erl b/lib/compiler/src/core_scan.erl
new file mode 100644
index 0000000000..5aab8ae855
--- /dev/null
+++ b/lib/compiler/src/core_scan.erl
@@ -0,0 +1,468 @@
+%%
+%% %CopyrightBegin%
+%% 
+%% Copyright Ericsson AB 2000-2009. All Rights Reserved.
+%% 
+%% The contents of this file are subject to the Erlang Public License,
+%% Version 1.1, (the "License"); you may not use this file except in
+%% compliance with the License. You should have received a copy of the
+%% Erlang Public License along with this software. If not, it can be
+%% retrieved online at http://www.erlang.org/.
+%% 
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and limitations
+%% under the License.
+%% 
+%% %CopyrightEnd%
+%%
+%% Purpose: Scanner for Core Erlang.
+
+%% For handling ISO 8859-1 (Latin-1) we use the following type
+%% information:
+%%
+%% 000 - 037	NUL - US	control
+%% 040 - 057	SPC - /		punctuation
+%% 060 - 071	0 - 9		digit
+%% 072 - 100	: - @		punctuation
+%% 101 - 132	A - Z		uppercase
+%% 133 - 140	[ - `		punctuation
+%% 141 - 172	a - z		lowercase
+%% 173 - 176	{ - ~		punctuation
+%% 177		DEL		control
+%% 200 - 237			control
+%% 240 - 277	NBSP - ¿	punctuation
+%% 300 - 326	À - Ö		uppercase
+%% 327		×		punctuation
+%% 330 - 336	Ø - Þ		uppercase
+%% 337 - 366	ß - ö		lowercase
+%% 367		÷		punctuation
+%% 370 - 377	ø - ÿ		lowercase
+%%
+%% Many punctuation characters have special meaning.  Take care with
+%% × (\327): it looks very much like x (\170).
+
+-module(core_scan).
+
+-export([string/1, string/2, format_error/1]).
+
+-import(lists, [reverse/1]).
+
+%% string([Char]) ->
+%% string([Char], StartPos) ->
+%%    {ok, [Tok], EndPos} |
+%%    {error, {Pos,core_scan,What}, EndPos}
+%%  Scan a whole character list; the start position defaults to 1.
+string(Chars) -> string(Chars, 1).
+
+string(Chars, StartPos) ->
+    %% string_pre_scan/3 supplies the terminating 'eof' itself.
+    case string_pre_scan(Chars, [], StartPos) of
+        {done,_Rest,RevChars,EndPos} ->         %Pre-scan went well
+            case scan(reverse(RevChars), StartPos) of
+                {error,Reason} -> {error,Reason,EndPos};
+                {ok,Tokens} -> {ok,Tokens,EndPos}
+            end;
+        PreScanError -> PreScanError            %{error,E,EndPos} as is
+    end.
+
+%% string_pre_scan(Cs, SoFar0, StartPos) ->
+%%      {done,Rest,SoFar,EndPos} | {error,E,EndPos}.
+%%  Pre-scan Cs, ending the input with 'eof' when more is requested.
+string_pre_scan(Cs, SoFar0, Sp) ->
+    case pre_scan(Cs, SoFar0, Sp) of
+        {more,Pending,SoFar1,Ep} ->             %Ran out of characters
+            %% No more input exists: make 'eof' the tail of what is left.
+            string_pre_scan(Pending ++ eof, SoFar1, Ep);
+        {done,_,_,_} = Done -> Done;            %Got complete tokens
+        Error -> Error                          %An error has occurred
+    end.
+
+%% format_error(Error)
+%%  Return a printable description (an iolist) of a scanner error.
+
+-spec format_error(term()) -> iolist().
+
+format_error(char) -> "unterminated character";
+format_error(scan) -> "premature end";
+format_error(float) -> "bad float";
+format_error({base,Base}) -> io_lib:format("illegal base '~w'", [Base]);
+format_error({illegal,What}) -> io_lib:format("illegal ~w", [What]);
+format_error({string,Q,Start}) ->              %Q is the quote character
+    ["unterminated " ++ string_thing(Q) ++
+     " starting with " ++ io_lib:write_string(Start,Q)];
+format_error(Unknown) -> io_lib:write(Unknown).        %Any other term
+
+string_thing($\') -> "atom";                   %Quoted atom delimiter
+string_thing($\") -> "string".                 %String delimiter
+
+%% Re-entrant pre-scanner.
+%%
+%% If the input list of characters is insufficient to build a term the
+%% scanner returns a request for more characters and a continuation to be
+%% used when trying to build a term with more characters. To indicate
+%% end-of-file the input character list should be replaced with 'eof'
+%% as an empty list has meaning.
+%%
+%% When more characters are needed inside a comment, string or quoted
+%% atom, which can become rather long, instead of pushing the
+%% characters read so far back onto RestChars to be reread, a special
+%% reentry token is returned indicating the middle of a construct.
+%% The token is the start character as an atom, '%', '"' and '\''.
+
+%% pre_scan([Char], SoFar, StartPos) ->
+%%	{done,RestChars,ScannedChars,NewPos} |
+%%	{more,RestChars,ScannedChars,NewPos} |
+%%	{error,{ErrorPos,core_scan,Description},NewPos}.
+%%  Main pre-scan function.  It is split in two (pre_scan/3 and
+%%  pre_scan/4) for efficiency; good clause indexing would make that moot.
+
+pre_scan(eof, Scanned, Pos) ->                 %End of input: all done
+    {done,eof,Scanned,Pos};
+pre_scan([], Scanned, Pos) ->                  %Out of characters for now
+    {more,[],Scanned,Pos};
+pre_scan([Char|Rest], Scanned, Pos) ->         %Dispatch on next character
+    pre_scan(Char, Rest, Scanned, Pos).
+
+%% pre_scan(Char, [Char], SoFar, Pos)
+%%  Dispatch on one character (or re-entry token) of the input.
+pre_scan($\n, Rest, Acc, Pos) ->               %Newline bumps the position
+    pre_scan(Rest, [$\n|Acc], Pos+1);
+pre_scan($%, Rest, Acc, Pos) ->                %Start of comment
+    pre_comment(Rest, Acc, Pos);
+pre_scan('%', Rest, Acc, Pos) ->               %Re-entering comment
+    pre_comment(Rest, Acc, Pos);
+pre_scan($\", Rest, Acc, Pos) ->               %Start of string
+    pre_string(Rest, $\", '"', Pos, [$\"|Acc], Pos);
+pre_scan({'"',Start}, Rest, Acc, Pos) ->       %Re-entering string
+    pre_string(Rest, $\", '"', Start, Acc, Pos);
+pre_scan($\', Rest, Acc, Pos) ->               %Start of quoted atom
+    pre_string(Rest, $\', '\'', Pos, [$\'|Acc], Pos);
+pre_scan({'\'',Start}, Rest, Acc, Pos) ->      %Re-entering quoted atom
+    pre_string(Rest, $\', '\'', Start, Acc, Pos);
+pre_scan($$, Rest0, Acc0, Pos) ->              %Character constant
+    case pre_char(Rest0, [$$|Acc0]) of
+        more ->                                 %Push the $ back for re-scan
+            {more,[$$|Rest0],Acc0,Pos};
+        error ->
+            pre_error(char, Pos, Pos);
+        {Rest,Acc} ->
+            pre_scan(Rest, Acc, Pos)
+    end;
+pre_scan(Other, Rest, Acc, Pos) ->             %Anything else is kept as is
+    pre_scan(Rest, [Other|Acc], Pos).
+
+%% pre_string([Char], Quote, Reent, StartPos, SoFar, Pos)
+%%  Collect characters up to and including the closing Quote.
+pre_string(eof, Quote, _, Start, Acc, Pos) ->  %Never terminated
+    pre_string_error(Quote, Start, Acc, Pos);
+pre_string([], _, Reent, Start, Acc, Pos) ->   %Ask for more, leave token
+    {more,[{Reent,Start}],Acc,Pos};
+pre_string([Quote|Rest], Quote, _, _, Acc, Pos) ->     %Closing quote
+    pre_scan(Rest, [Quote|Acc], Pos);
+pre_string([$\n|Rest], Quote, Reent, Start, Acc, Pos) ->
+    pre_string(Rest, Quote, Reent, Start, [$\n|Acc], Pos+1);
+pre_string([$\\|Rest0], Quote, Reent, Start, Acc0, Pos) ->
+    case pre_escape(Rest0, Acc0) of
+        more ->                                 %Escape cut short: redo it
+            {more,[{Reent,Start},$\\|Rest0],Acc0,Pos};
+        error ->
+            pre_string_error(Quote, Start, Acc0, Pos);
+        {Rest,Acc} ->
+            pre_string(Rest, Quote, Reent, Start, Acc, Pos)
+    end;
+pre_string([Char|Rest], Quote, Reent, Start, Acc, Pos) ->
+    pre_string(Rest, Quote, Reent, Start, [Char|Acc], Pos).
+
+pre_string_error(Q, Sp, SoFar, Pos) ->         %Unterminated string/atom
+    Head = lists:reverse(lists:takewhile(fun (C) -> C =/= Q end, SoFar)),
+    pre_error({string,Q,lists:sublist(Head, 16)}, Sp, Pos).    %Max 16 chars
+
+pre_char(eof, _) -> error;                     %Nothing follows the $
+pre_char([], _) -> more;                       %Need more characters
+pre_char([Char|Rest], Acc) -> pre_char(Char, Rest, Acc).
+
+pre_char($\\, Rest, Acc) ->                    %Escaped character
+    pre_escape(Rest, Acc);
+pre_char(Char, Rest, Acc) ->                   %Ordinary character
+    {Rest,[Char|Acc]}.
+
+%% pre_escape([Char], SoFar) -> {RestChars,SoFar} | more | error.
+%%  Copy the escape after a backslash onto SoFar without interpreting it.
+pre_escape(eof, _) -> error;
+pre_escape([], _) -> more;
+pre_escape([$^|AfterCaret], Acc) ->            %\^X needs one more character
+    case AfterCaret of
+        eof -> error;
+        [] -> more;
+        [Ctl|Rest] -> {Rest,[Ctl,$^,$\\|Acc]}
+    end;
+pre_escape([Char|Rest], Acc) -> {Rest,[Char,$\\|Acc]}.
+
+%% pre_comment([Char], SoFar, Pos)
+%%  Comments are replaced by one SPACE.
+
+pre_comment(eof, Acc, Pos) ->                  %Comment ran to end of input
+    pre_scan(eof, [$\s|Acc], Pos);
+pre_comment([], Acc, Pos) ->                   %Resume here with more input
+    {more,['%'],Acc,Pos};
+pre_comment([$\n|Rest], Acc, Pos) ->           %Newline terminates comment
+    pre_scan(Rest, [$\n,$\s|Acc], Pos+1);
+pre_comment([_|Rest], Acc, Pos) ->             %Drop comment character
+    pre_comment(Rest, Acc, Pos).
+
+pre_error(What, ErrPos, EndPos) ->             %Build a pre-scan error result
+    {error,{ErrPos,core_scan,What},EndPos}.
+
+%% scan(CharList, StartPos)
+%%  Tokenise a list of pre-scanned characters.
+%%
+%%  Tokens are pushed onto a stack, i.e. collected in reverse order to
+%%  avoid appending, and the stack is reversed once everything has been
+%%  scanned.  Most tokens are built in the same way.
+%%
+%%  Returns:
+%%	{ok,[Tok]}
+%%	{error,{ErrorPos,core_scan,What}}
+
+scan(Chars, StartPos) ->                       %Start with no tokens
+    scan1(Chars, [], StartPos).
+
+%% scan1(Characters, TokenStack, Position)
+%%  Scan a list of characters into tokens.
+%%  Latin-1 letters are given as octal escapes (see the table above).
+scan1([$\n|Cs], Toks, Pos) ->                          %Skip newline
+    scan1(Cs, Toks, Pos+1);
+scan1([C|Cs], Toks, Pos) when C >= $\000, C =< $\s ->  %Skip control chars
+    scan1(Cs, Toks, Pos);
+scan1([C|Cs], Toks, Pos) when C >= $\200, C =< $\240 -> %Also up to NBSP
+    scan1(Cs, Toks, Pos);
+scan1([C|Cs], Toks, Pos) when C >= $a, C =< $z ->      %Keywords
+    scan_key_word(C, Cs, Toks, Pos);
+scan1([C|Cs], Toks, Pos) when C >= $\337, C =< $\377, C /= $\367 ->
+    scan_key_word(C, Cs, Toks, Pos);                   %Latin-1 lowercase
+scan1([C|Cs], Toks, Pos) when C >= $A, C =< $Z ->      %Variables
+    scan_variable(C, Cs, Toks, Pos);
+scan1([C|Cs], Toks, Pos) when C >= $\300, C =< $\336, C /= $\327 ->
+    scan_variable(C, Cs, Toks, Pos);                   %Latin-1 uppercase
+scan1([C|Cs], Toks, Pos) when C >= $0, C =< $9 ->      %Numbers
+    scan_number(C, Cs, Toks, Pos);
+scan1([$-,C|Cs], Toks, Pos) when C >= $0, C =< $9 ->   %Signed numbers
+    scan_signed_number($-, C, Cs, Toks, Pos);
+scan1([$+,C|Cs], Toks, Pos) when C >= $0, C =< $9 ->   %Signed numbers
+    scan_signed_number($+, C, Cs, Toks, Pos);
+scan1([$_|Cs], Toks, Pos) ->                           %_ variables
+    scan_variable($_, Cs, Toks, Pos);
+scan1([$$|Cs0], Toks, Pos) ->                          %Character constant
+    {C,Cs,Pos1} = scan_char(Cs0, Pos),
+    scan1(Cs, [{char,Pos,C}|Toks], Pos1);
+scan1([$'|Cs0], Toks, Pos) ->                          %Atom (always quoted)
+    {S,Cs1,Pos1} = scan_string(Cs0, $', Pos),
+    try list_to_atom(S) of                             %Only this call is guarded
+        A ->
+            scan1(Cs1, [{atom,Pos,A}|Toks], Pos1)
+    catch error:_ -> scan_error({illegal,atom}, Pos)
+    end;
+scan1([$"|Cs0], Toks, Pos) ->                          %String
+    {S,Cs1,Pos1} = scan_string(Cs0, $", Pos),
+    scan1(Cs1, [{string,Pos,S}|Toks], Pos1);
+%% Punctuation characters and operators, first recognise multiples.
+scan1("->" ++ Cs, Toks, Pos) ->
+    scan1(Cs, [{'->',Pos}|Toks], Pos);
+scan1("-|" ++ Cs, Toks, Pos) ->
+    scan1(Cs, [{'-|',Pos}|Toks], Pos);
+scan1([C|Cs], Toks, Pos) ->                            %Punctuation character
+    P = list_to_atom([C]),
+    scan1(Cs, [{P,Pos}|Toks], Pos);
+scan1([], Toks0, _) ->                                 %Done: restore order
+    Toks = reverse(Toks0),
+    {ok,Toks}.
+
+%% scan_key_word(FirstChar, CharList, Tokens, Pos)
+%% scan_variable(FirstChar, CharList, Tokens, Pos)
+%%  Scan the rest of a name and push it as {Name,Pos} or {var,Pos,Name}.
+scan_key_word(C, Cs0, Toks, Pos) ->
+    {Wcs,Cs} = scan_name(Cs0, []),             %Wcs comes back reversed
+    try list_to_atom([C|reverse(Wcs)]) of      %Only this call is guarded
+        Name ->
+            scan1(Cs, [{Name,Pos}|Toks], Pos)
+    catch error:_ -> scan_error({illegal,atom}, Pos)
+    end.
+
+scan_variable(C, Cs0, Toks, Pos) ->            %Push a {var,Pos,Name} token
+    {Wcs,Cs} = scan_name(Cs0, []),             %Wcs comes back reversed
+    try list_to_atom([C|reverse(Wcs)]) of      %Only this call is guarded
+        Name ->
+            scan1(Cs, [{var,Pos,Name}|Toks], Pos)
+    catch error:_ -> scan_error({illegal,var}, Pos)
+    end.
+
+%% scan_name(Cs, Ncs) -> {ReversedNameChars,RestChars}.
+%%  Like lists:splitwith/2 with name_char/1, but the name is reversed.
+scan_name([], Acc) ->
+    {Acc,[]};
+scan_name([C|Rest] = Chars, Acc) ->
+    case name_char(C) of
+        false -> {Acc,Chars};                   %First non-name character
+        true -> scan_name(Rest, [C|Acc])
+    end.
+
+name_char(C) when C >= $a, C =< $z -> true;            %Lowercase
+name_char(C) when C >= $\337, C =< $\377, C /= $\367 -> true; %Latin-1 lower
+name_char(C) when C >= $A, C =< $Z -> true;            %Uppercase
+name_char(C) when C >= $\300, C =< $\336, C /= $\327 -> true; %Latin-1 upper
+name_char(C) when C >= $0, C =< $9 -> true;            %Digit
+name_char($_) -> true;
+name_char($@) -> true;
+name_char(_) -> false.                                 %Not part of a name
+
+%% scan_string(CharList, QuoteChar, Pos) -> {StringChars,RestChars,NewPos}.
+%%  Read up to the closing quote, translating escapes on the way.
+scan_string(Chars, Quote, Pos) ->
+    scan_string(Chars, [], Quote, Pos).
+
+scan_string([Quote|Rest], Acc, Quote, Pos) ->  %Closing quote
+    {reverse(Acc),Rest,Pos};
+scan_string([$\n|Rest], Acc, Quote, Pos) ->    %Newline
+    scan_string(Rest, [$\n|Acc], Quote, Pos+1);
+scan_string([$\\|Rest0], Acc, Quote, Pos0) ->  %Escape sequence
+    {Char,Rest,Pos} = scan_escape(Rest0, Pos0),
+    scan_string(Rest, [Char|Acc], Quote, Pos);
+scan_string([Char|Rest], Acc, Quote, Pos) ->
+    scan_string(Rest, [Char|Acc], Quote, Pos).
+
+%% scan_char(Chars, Pos) -> {Char,RestChars,NewPos}.
+%%  Read the single character of a character constant.  Errors have
+%%  already been checked for by the pre-scan phase.
+
+scan_char([$\n|Rest], Pos) ->                  %Newline
+    {$\n,Rest,Pos+1};
+scan_char([$\\|Rest], Pos) ->                  %Escape sequence
+    scan_escape(Rest, Pos);
+scan_char([Char|Rest], Pos) ->                 %Plain character
+    {Char,Rest,Pos}.
+
+%% scan_escape(Chars, Pos) -> {Char,RestChars,NewPos}.
+%%  Translate the escape sequence that follows a backslash.  Octal
+%%  escapes take the longest run of one to three octal digits.
+
+scan_escape([D1,D2,D3|Rest], Pos) when         %\<1-3> octal digits
+    D1 >= $0, D1 =< $7, D2 >= $0, D2 =< $7, D3 >= $0, D3 =< $7 ->
+    Val = ((D1 - $0)*8 + (D2 - $0))*8 + (D3 - $0),
+    {Val,Rest,Pos};
+scan_escape([D1,D2|Rest], Pos) when
+    D1 >= $0, D1 =< $7, D2 >= $0, D2 =< $7 ->
+    Val = (D1 - $0)*8 + (D2 - $0),
+    {Val,Rest,Pos};
+scan_escape([D1|Rest], Pos) when
+    D1 >= $0, D1 =< $7 ->
+    {D1 - $0,Rest,Pos};
+scan_escape([$^,Ctl|Rest], Pos) ->             %\^X -> CTL-X
+    Val = Ctl band 31,
+    {Val,Rest,Pos};
+scan_escape([$\n|Rest], Pos) ->                %Escaped newline is kept
+    {$\n,Rest,Pos+1};
+scan_escape([Char|Rest], Pos) ->               %Named or literal escape
+    Val = escape_char(Char),
+    {Val,Rest,Pos}.
+
+escape_char($b) -> $\b;                        %Backspace
+escape_char($d) -> $\d;                        %Delete
+escape_char($e) -> $\e;                        %Escape
+escape_char($f) -> $\f;                        %Form feed
+escape_char($n) -> $\n;                        %Line feed
+escape_char($r) -> $\r;                        %Carriage return
+escape_char($s) -> $\s;                        %Space
+escape_char($t) -> $\t;                        %Horizontal tab
+escape_char($v) -> $\v;                        %Vertical tab
+escape_char(Other) -> Other.                   %No special meaning
+
+%% scan_number(Char, CharList, TokenStack, Pos)
+%%  The following simple notations are handled:
+%%    <digit>#<digits>		- the digits read in that base
+%%    <digits>			- the digits in base 10
+%%    <digits>.<digits>
+%%    <digits>.<digits>E+-<digits>
+%%
+%%  Apart from explicitly based integers, all the characters of a
+%%  number are collected in a list which is then converted with
+%%  list_to_integer/1 or list_to_float/1.
+
+%%  SPos == Start position
+%%  CPos == Current position
+
+scan_number(First, Cs0, Toks, SPos) ->         %First digit already read
+    {Digits,Cs,CPos} = scan_integer(Cs0, [First], SPos),
+    scan_after_int(Cs, Digits, Toks, SPos, CPos).
+
+scan_signed_number(Sign, First, Cs0, Toks, SPos) ->    %Sign precedes First
+    {Digits,Cs,CPos} = scan_integer(Cs0, [First,Sign], SPos),
+    scan_after_int(Cs, Digits, Toks, SPos, CPos).
+
+scan_integer([D|Rest], Digits, Pos) when $0 =< D, D =< $9 ->
+    scan_integer(Rest, [D|Digits], Pos);       %Push digit, keep going
+scan_integer(Rest, Digits, Pos) ->             %Digits are in reverse order
+    {Digits,Rest,Pos}.
+
+scan_after_int([$#|Cs], Ncs, Toks, SPos, CPos) ->      %Base#Digits
+    Base = list_to_integer(reverse(Ncs)),
+    if
+        Base >= 2, Base =< 16 ->
+            scan_based_int(Cs, 0, Base, Toks, SPos, CPos);
+        true ->
+            scan_error({base,Base}, CPos)
+    end;
+scan_after_int([$.,D|Cs0], Ncs0, Toks, SPos, CPos) when D >= $0, D =< $9 ->
+    {Ncs,Cs,CPos1} = scan_integer(Cs0, [D,$.|Ncs0], CPos), %Fraction digits
+    scan_after_fraction(Cs, Ncs, Toks, SPos, CPos1);
+scan_after_int(Cs, Ncs, Toks, SPos, CPos) ->           %Plain integer
+    scan1(Cs, [{integer,SPos,list_to_integer(reverse(Ncs))}|Toks], CPos).
+
+%% scan_based_int(CharList, SoFar, Base, TokenStack, StartPos, CurPos)
+%%  Accumulate the digits of a Base#Digits integer into SoFar; the
+%%  first character that is not a digit of Base ends the number.
+scan_based_int([C|Cs], Acc, Base, Toks, SPos, CPos) when
+    $0 =< C, C =< $9, C - $0 < Base ->         %Decimal digit below Base
+    scan_based_int(Cs, Acc * Base + (C - $0), Base, Toks, SPos, CPos);
+scan_based_int([C|Cs], Acc, Base, Toks, SPos, CPos) when
+    $a =< C, C =< $f, C - $a + 10 < Base ->    %Lower-case digit a-f
+    scan_based_int(Cs, Acc * Base + (C - $a + 10), Base, Toks, SPos, CPos);
+scan_based_int([C|Cs], Acc, Base, Toks, SPos, CPos) when
+    $A =< C, C =< $F, C - $A + 10 < Base ->    %Upper-case digit A-F
+    scan_based_int(Cs, Acc * Base + (C - $A + 10), Base, Toks, SPos, CPos);
+scan_based_int(Cs, Acc, _, Toks, SPos, CPos) ->        %No more digits
+    scan1(Cs, [{integer,SPos,Acc}|Toks], CPos).
+
+scan_after_fraction([$E|Cs], Ncs, Toks, SPos, CPos) -> %Exponent follows
+    scan_exponent(Cs, [$E|Ncs], Toks, SPos, CPos);
+scan_after_fraction([$e|Cs], Ncs, Toks, SPos, CPos) -> %Stored as upper-case E
+    scan_exponent(Cs, [$E|Ncs], Toks, SPos, CPos);
+scan_after_fraction(Cs, Ncs, Toks, SPos, CPos) ->      %Float without exponent
+    try list_to_float(reverse(Ncs)) of                 %Only this call is guarded
+        N ->
+            scan1(Cs, [{float,SPos,N}|Toks], CPos)
+    catch error:_ -> scan_error({illegal,float}, SPos)
+    end.
+
+%% scan_exponent(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
+%%  Take an optional sign; scan_exponent1/5 reports missing digits.
+scan_exponent(Cs0, Ncs0, Toks, SPos, CPos) ->
+    {Cs,Ncs} = case Cs0 of
+                   [$+|T] -> {T,[$+|Ncs0]};
+                   [$-|T] -> {T,[$-|Ncs0]};
+                   _ -> {Cs0,Ncs0}
+               end,
+    scan_exponent1(Cs, Ncs, Toks, SPos, CPos).
+
+scan_exponent1([C|Cs0], Ncs0, Toks, SPos, CPos) when C >= $0, C =< $9 ->
+    {Ncs,Cs,CPos1} = scan_integer(Cs0, [C|Ncs0], CPos),
+    try list_to_float(reverse(Ncs)) of                 %Only this call is guarded
+        N ->
+            scan1(Cs, [{float,SPos,N}|Toks], CPos1)
+    catch error:_ -> scan_error({illegal,float}, SPos)
+    end;
+scan_exponent1(_, _, _, _, CPos) ->            %No exponent digits at all
+    scan_error(float, CPos).
+
+scan_error(What, Pos) ->                       %Build a scan error result
+    {error,{Pos,core_scan,What}}.