%% ``Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% The Initial Developer of the Original Code is Ericsson Utvecklings
%% AB. Portions created by Ericsson are Copyright 1999, Ericsson
%% Utvecklings AB. All Rights Reserved.''
%%
%% @private
%% @copyright Richard Carlsson 2001-2003. Portions created by Ericsson
%% are Copyright 1999, Ericsson Utvecklings AB. All Rights Reserved.
%% @author Richard Carlsson
%% @see edoc
%% @end

%% @doc Tokeniser for EDoc. Based on the Erlang standard library module
%% {@link //stdlib/erl_scan}.

-module(edoc_scanner).

%% NOTE: the interface to this module is ancient and should be updated.
%% Please do not regard these exported functions as stable. Their
%% behaviour is described in the documentation of the module `erl_scan'.
%%
%% Since there are no `full stop' tokens in EDoc specifications, the
%% `tokens' function *always* returns `{more, Continuation}' unless an
%% error occurs.

-export([string/1,string/2,format_error/1]).

-import(lists, [reverse/1]).

%% string(Chars)
%%  Tokenise Chars with the position counter starting at 1.

string(Cs) ->
    string(Cs, 1).

%% string(Chars, StartPos)
%%  Tokenise Chars with the position counter starting at StartPos.
%%
%%  Returns:
%%      {ok,Tokens,StartPos}
%%      {error,ErrorInfo,StartPos}
%%
%%  The third element is always the StartPos that was passed in; it is
%%  not advanced to the position where scanning stopped.

string(Cs, StartPos) ->
    Scanned = scan(Cs, StartPos),
    case Scanned of
        {ok, Tokens} ->
            {ok, Tokens, StartPos};
        {error, ErrorInfo} ->
            {error, ErrorInfo, StartPos}
    end.

%% format_error(Error)
%%  Return a string describing the error.
%%  Error is the descriptor carried as the third element of the
%%  {ErrorPos,edoc_scanner,What} tuple built by the scanner. The result
%%  is character data (it may be a deep list, not a flat string).
%%  Descriptors without a dedicated clause are printed as plain terms.

format_error({string,Quote,Head}) ->
    ["unterminated string starting with " ++ io_lib:write_string(Head,Quote)];
format_error({illegal,Type}) -> io_lib:fwrite("illegal ~w", [Type]);
format_error(char) -> "unterminated character";
format_error(scan) -> "premature end";
format_error({base,Base}) -> io_lib:fwrite("illegal base '~w'", [Base]);
format_error(float) -> "bad float";
format_error(Other) -> io_lib:write(Other).

%% Reserved words, not atoms:
%%  A reserved word is emitted as the token {Word,Pos} instead of
%%  {atom,Pos,Word}.

reserved('where') -> true;
reserved(_) -> false.

%% scan(CharList, StartPos)
%%  This takes a list of characters and tries to tokenise them.
%%
%%  The token list is built in reverse order (in a stack) to save appending
%%  and then reversed when all the tokens have been collected. Most tokens
%%  are built in the same way.
%%
%%  Returns:
%%      {ok,[Tok]}
%%      {error,{ErrorPos,edoc_scanner,What}}

scan(Cs, Pos) ->
    scan1(Cs, [], Pos).

%% scan1(Characters, TokenStack, Position)
%%  Scan a list of characters into tokens.
%%
%%  Position is incremented for every newline seen at token level, so it
%%  acts as a line counter. Every failure reported by the string and
%%  character sub-scanners is converted into an
%%  {error,{Pos,edoc_scanner,What}} result; none of them may escape as a
%%  `case_clause' crash.

scan1([$\n|Cs], Toks, Pos) ->                           % Newline
    scan1(Cs, Toks, Pos+1);
scan1([C|Cs], Toks, Pos) when C >= 0, C =< $\s ->       % Skip blanks
    scan1(Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $a, C =< $z ->       % Unquoted atom
    scan_atom(C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $\337, C =< $\377, C /= $\367 ->
    scan_atom(C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $0, C =< $9 ->       % Numbers
    scan_number(C, Cs, Toks, Pos);
scan1([$-,C| Cs], Toks, Pos) when C >= $0, C =< $9 ->   % Signed numbers
    scan_signed_number($-, C, Cs, Toks, Pos);
scan1([$+,C| Cs], Toks, Pos) when C >= $0, C =< $9 ->   % Signed numbers
    scan_signed_number($+, C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $A, C =< $Z ->       % Variables
    scan_variable(C, Cs, Toks, Pos);
scan1([$_|Cs], Toks, Pos) ->                            % Variables
    scan_variable($_, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $\300, C =< $\336, C /= $\327 ->
    scan_variable(C, Cs, Toks, Pos);
scan1([$$|Cs], Toks, Pos) ->                            % Character constant
    %% On success scan_char_const/3 continues scanning the rest of the
    %% input itself, so what comes back is either the final result of
    %% the whole scan or a bare reason from reading the character.
    case scan_char_const(Cs, Toks, Pos) of
        {ok, Result} ->
            {ok, Result};
        {error, {_ErrPos, _Module, _What}} = ScanError ->
            %% Complete error raised further down the input; pass it on.
            ScanError;
        {error, truncated_char} ->
            scan_error(char, Pos);
        {error, _BadChar} ->
            %% illegal_character, undefined_escape_sequence,
            %% illegal_control_character
            scan_error({illegal, char}, Pos)
    end;
scan1([$'|Cs0], Toks, Pos) ->                           % Quoted atom
    case scan_string(Cs0, $', Pos) of
        {S,Cs1,Pos1} ->
            try list_to_atom(S) of
                A ->
                    scan1(Cs1, [{atom,Pos,A}|Toks], Pos1)
            catch
                error:Reason when Reason =:= badarg;
                                  Reason =:= system_limit ->
                    scan_error({illegal,atom}, Pos)
            end;
        {error, premature_end} ->
            scan_error({string,$',Cs0}, Pos);
        {error, truncated_char} ->
            scan_error(char, Pos);
        {error, _BadChar} ->
            %% illegal_character or a malformed escape sequence
            scan_error({illegal, atom}, Pos)
    end;
scan1([$"|Cs0], Toks, Pos) ->                           % String
    case scan_string(Cs0, $", Pos) of
        {S,Cs1,Pos1} ->
            case Toks of
                [{string, Pos0, S0} | Toks1] ->
                    %% Adjacent string literals are merged into one
                    %% token that keeps the position of the first.
                    scan1(Cs1, [{string, Pos0, S0 ++ S} | Toks1], Pos1);
                _ ->
                    scan1(Cs1, [{string,Pos,S}|Toks], Pos1)
            end;
        {error, premature_end} ->
            scan_error({string,$",Cs0}, Pos);
        {error, truncated_char} ->
            scan_error(char, Pos);
        {error, _BadChar} ->
            %% illegal_character or a malformed escape sequence
            scan_error({illegal, string}, Pos)
    end;
%% Punctuation characters and operators, first recognise multiples.
scan1([$=,$>|Cs], Toks, Pos) ->
    scan1(Cs, [{'=>',Pos}|Toks], Pos);
scan1([$<,$<|Cs], Toks, Pos) ->
    scan1(Cs, [{'<<',Pos}|Toks], Pos);
scan1([$>,$>|Cs], Toks, Pos) ->
    scan1(Cs, [{'>>',Pos}|Toks], Pos);
scan1([$-,$>|Cs], Toks, Pos) ->
    scan1(Cs, [{'->',Pos}|Toks], Pos);
scan1([$:,$=|Cs], Toks, Pos) ->
    scan1(Cs, [{':=',Pos}|Toks], Pos);
scan1([$:,$:|Cs], Toks, Pos) ->
    scan1(Cs, [{'::',Pos}|Toks], Pos);
scan1([$/,$/|Cs], Toks, Pos) ->
    scan1(Cs, [{'//',Pos}|Toks], Pos);
scan1([$.,$.,$.|Cs], Toks, Pos) ->
    scan1(Cs, [{'...',Pos}|Toks], Pos);
scan1([$.,$.|Cs], Toks, Pos) ->
    scan1(Cs, [{'..',Pos}|Toks], Pos);
scan1([C|Cs], Toks, Pos) ->                             % Punctuation character
    P = list_to_atom([C]),
    scan1(Cs, [{P,Pos}|Toks], Pos);
scan1([], Toks0, _Pos) ->
    Toks = lists:reverse(Toks0),
    {ok,Toks}.

%% Note that `_' is not accepted as a variable token.
%%  A lone `_' yields the token {an_var,Pos,'_'}; any other name yields
%%  {var,Pos,Name}. C is the first character (already consumed by the
%%  caller) and Cs the remaining input.

scan_variable(C, Cs, Toks, Pos) ->
    {Wcs,Cs1} = scan_name(Cs, []),
    W = [C|lists:reverse(Wcs)],
    case W of
        "_" ->
            scan1(Cs1, [{an_var,Pos,'_'}|Toks], Pos);
        _ ->
            %% Only the conversion is protected; the continued scan in
            %% the `of' part runs outside the try.
            try list_to_atom(W) of
                A ->
                    scan1(Cs1, [{var,Pos,A}|Toks], Pos)
            catch
                error:Reason when Reason =:= badarg;
                                  Reason =:= system_limit ->
                    scan_error({illegal,variable}, Pos)
            end
    end.

%% scan_atom(FirstChar, RestChars, TokenStack, Pos)
%%  Scan an unquoted atom. Reserved words become {Word,Pos} tokens,
%%  everything else becomes {atom,Pos,Atom}.

scan_atom(C, Cs, Toks, Pos) ->
    {Wcs,Cs1} = scan_name(Cs, []),
    W = [C|lists:reverse(Wcs)],
    try list_to_atom(W) of
        A ->
            case reserved(A) of
                true ->
                    scan1(Cs1, [{A,Pos}|Toks], Pos);
                false ->
                    scan1(Cs1, [{atom,Pos,A}|Toks], Pos)
            end
    catch
        error:Reason when Reason =:= badarg;
                          Reason =:= system_limit ->
            scan_error({illegal,token}, Pos)
    end.

%% scan_name(Cs) -> lists:splitwith(fun (C) -> name_char(C) end, Cs).
%%  Collects the leading name characters of the input onto the given
%%  stack. The collected characters are returned in REVERSE order
%%  together with the unconsumed input.

scan_name([C|Cs], Ncs) ->
    case name_char(C) of
        true ->
            scan_name(Cs, [C|Ncs]);
        false ->
            {Ncs,[C|Cs]}                % Must rebuild here, sigh!
    end;
scan_name([], Ncs) ->
    {Ncs,[]}.

%% name_char(Char) -> boolean()
%%  True for ASCII letters and digits, the two letter ranges
%%  $\300..$\336 (except $\327) and $\337..$\377 (except $\367),
%%  `_' and `@'.

name_char(C) when C >= $a, C =< $z -> true;
name_char(C) when C >= $\337, C =< $\377, C /= $\367 -> true;
name_char(C) when C >= $A, C =< $Z -> true;
name_char(C) when C >= $\300, C =< $\336, C /= $\327 -> true;
name_char(C) when C >= $0, C =< $9 -> true;
name_char($_) -> true;
name_char($@) -> true;
name_char(_) -> false.

%% scan_string(CharList, QuoteChar, Pos) ->
%%      {StringChars,RestChars, NewPos}
%%  CharList is the input following the opening quote. On failure the
%%  result is {error, premature_end} when the input ends before the
%%  closing quote, or whatever {error, Reason} scan_char/2 returned.

scan_string(Cs, Quote, Pos) ->
    scan_string(Cs, [], Quote, Pos).

scan_string([Quote|Cs], Scs, Quote, Pos) ->
    {lists:reverse(Scs),Cs,Pos};
scan_string([], _Scs, _Quote, _Pos) ->
    {error, premature_end};
scan_string(Cs0, Scs, Quote, Pos) ->
    case scan_char(Cs0, Pos) of
        {C,Cs,Pos1} ->
            %% Only build the string here
            scan_string(Cs, [C|Scs], Quote, Pos1);
        Error ->
            Error
    end.

%% scan_char_const(CharList, TokenStack, Pos)
%%  Reads the character following a `$', pushes a {char,Pos,C} token
%%  and continues scanning the remaining input with scan1/3, so on
%%  success the result is the result of scanning the rest of the input.
%%  Note that space characters are not allowed

scan_char_const([$\040 | _Cs0], _Toks, _Pos) ->
    {error, illegal_character};
scan_char_const(Cs0, Toks, Pos) ->
    case scan_char(Cs0, Pos) of
        {C,Cs,Pos1} ->
            scan1(Cs, [{char,Pos,C}|Toks], Pos1);
        Error ->
            Error
    end.

%% {Character,RestChars,NewPos} = scan_char(Chars, Pos)
%%  Read a single character from a string or character constant.
%%  The pre-scan phase has checked for errors here.
%%  NOTE(review): no pre-scan phase is visible in this module; failures
%%  are reported from here as {error, Reason}.
%%  Note that control characters are not allowed.
%%
%%  Returns {Character,RestChars,NewPos}, or {error, Reason} where
%%  Reason is illegal_character, truncated_char or one of the reasons
%%  produced by scan_escape/2.

scan_char([$\\|Cs], Pos) ->
    scan_escape(Cs, Pos);
scan_char([C | _Cs], _Pos) when C =< 16#1f ->
    {error, illegal_character};
scan_char([C|Cs], Pos) ->
    {C,Cs,Pos};
scan_char([], _Pos) ->
    {error, truncated_char}.

%% The following conforms to Standard Erlang escape sequences.

%% Both macros are fully parenthesised so that they expand to a single
%% boolean expression wherever they are used.
-define(HEX(C), ((C) >= $0 andalso (C) =< $9 orelse
                 (C) >= $A andalso (C) =< $F orelse
                 (C) >= $a andalso (C) =< $f)).

-define(UNICODE(C),
        ((C) >= 0 andalso (C) < 16#D800 orelse
         (C) > 16#DFFF andalso (C) < 16#FFFE orelse
         (C) > 16#FFFF andalso (C) =< 16#10FFFF)).

%% scan_escape(CharsAfterBackslash, Pos) ->
%%      {Character,RestChars,Pos} | {error, Reason}
%%  Reason is undefined_escape_sequence, illegal_control_character or
%%  truncated_char.

scan_escape([O1, O2, O3 | Cs], Pos) when        % \<1-3> octal digits
  O1 >= $0, O1 =< $3, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 ->
    %% 73 = 64 + 8 + 1: removes the $0 offset from all three digits.
    Val = (O1*8 + O2)*8 + O3 - 73*$0,
    {Val,Cs,Pos};
scan_escape([O1, O2 | Cs], Pos) when
  O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7 ->
    %% 9 = 8 + 1: removes the $0 offset from both digits.
    Val = (O1*8 + O2) - 9*$0,
    {Val,Cs,Pos};
scan_escape([O1 | Cs], Pos) when
  O1 >= $0, O1 =< $7 ->
    {O1 - $0,Cs,Pos};
scan_escape([$x, ${ | Cs], Pos) ->              % \x{H...}
    scan_hex(Cs, Pos, []);
scan_escape([$x, H1, H2 | Cs], Pos) when ?HEX(H1), ?HEX(H2) ->  % \xHH
    %% Subtracting a fixed $0 offset is only valid for the digits 0-9;
    %% let the library conversion deal with A-F and a-f as well.
    Val = list_to_integer([H1, H2], 16),
    {Val,Cs,Pos};
scan_escape([$^, C | Cs], Pos) when C >= $\100, C =< $\137 ->   % \^X -> CTL-X
    {C - $\100,Cs,Pos};
scan_escape([$^, _C | _Cs], _Pos) ->
    {error, illegal_control_character};
scan_escape([C | Cs], Pos) ->
    case escape_char(C) of
        C1 when C1 > $\000 ->
            {C1,Cs,Pos};
        _ ->
            {error, undefined_escape_sequence}
    end;
scan_escape([], _Pos) ->
    {error, truncated_char}.

%% scan_hex(Chars, Pos, HexDigitStack)
%%  Reads the hexadecimal digits of a \x{...} escape up to the closing
%%  brace. An empty digit sequence, a missing brace or a value outside
%%  the ranges accepted by ?UNICODE gives undefined_escape_sequence.

scan_hex([C | Cs], Pos, HCs) when ?HEX(C) ->
    scan_hex(Cs, Pos, [C | HCs]);
scan_hex([$} | Cs], Pos, [_ | _] = HCs) ->
    %% HCs is non-empty and holds only hex digits, so the conversion
    %% cannot fail.
    case list_to_integer(lists:reverse(HCs), 16) of
        Val when ?UNICODE(Val) ->
            {Val,Cs,Pos};
        _ ->
            {error, undefined_escape_sequence}
    end;
scan_hex(_Cs, _Pos, _HCs) ->
    {error, undefined_escape_sequence}.

%% Note that we return $\000 for undefined escapes.
%% escape_char(Char) -> Char
%%  Maps the character following a backslash to the character it
%%  denotes; anything without an entry below maps to $\000.

escape_char($b) -> $\010;               % \b = BS
escape_char($d) -> $\177;               % \d = DEL
escape_char($e) -> $\033;               % \e = ESC
escape_char($f) -> $\014;               % \f = FF
escape_char($n) -> $\012;               % \n = LF
escape_char($r) -> $\015;               % \r = CR
escape_char($s) -> $\040;               % \s = SPC
escape_char($t) -> $\011;               % \t = HT
escape_char($v) -> $\013;               % \v = VT
escape_char($\\) -> $\134;              % \\ = \
escape_char($') -> $\047;               % \' = '
escape_char($") -> $\042;               % \" = "
escape_char(_C) -> $\000.

%% scan_number(Char, CharList, TokenStack, Pos)
%%  We handle sign and radix notation:
%%    [+-]<digits>                      - the digits in base [+-]10
%%    [+-]<digits>.<digits>
%%    [+-]<digits>.<digits>E+-<digits>
%%    [+-]<digits>#<digits>             - the digits read in base [+-]B
%%
%%  NOTE(review): the clauses below only implement the first three
%%  forms; nothing here consumes a `#', so a based integer is not
%%  scanned as a single token.
%%
%%  Except for explicitly based integers we build a list of all the
%%  characters and then use list_to_integer/1 or list_to_float/1 to
%%  generate the value.

%%  SPos == Start position
%%  CPos == Current position

scan_number(C, Cs0, Toks, Pos) ->
    {Ncs,Cs,Pos1} = scan_integer(Cs0, [C], Pos),
    scan_after_int(Cs, Ncs, Toks, Pos, Pos1).

%% scan_signed_number(Sign, Digit, CharList, TokenStack, Pos)
%%  As scan_number/4, with the sign character kept as the first
%%  character of the number text.

scan_signed_number(S, C, Cs0, Toks, Pos) ->
    {Ncs,Cs,Pos1} = scan_integer(Cs0, [C, S], Pos),
    scan_after_int(Cs, Ncs, Toks, Pos, Pos1).

%% scan_integer(CharList, CharStack, Pos) -> {CharStack1,RestChars,Pos}
%%  Pushes the leading decimal digits of CharList onto CharStack (so the
%%  stack is in reverse order). Pos is returned unchanged.

scan_integer([C|Cs], Stack, Pos) when C >= $0, C =< $9 ->
    scan_integer(Cs, [C|Stack], Pos);
scan_integer(Cs, Stack, Pos) ->
    {Stack,Cs,Pos}.

%% scan_after_int(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  A `.' counts as a decimal point only when a digit follows it;
%%  otherwise the characters collected so far form an integer token.

scan_after_int([$.,C|Cs0], Ncs0, Toks, SPos, CPos) when C >= $0, C =< $9 ->
    {Ncs,Cs,CPos1} = scan_integer(Cs0, [C,$.|Ncs0], CPos),
    scan_after_fraction(Cs, Ncs, Toks, SPos, CPos1);
scan_after_int(Cs, Ncs, Toks, SPos, CPos) ->
    N = list_to_integer(lists:reverse(Ncs)),
    scan1(Cs, [{integer,SPos,N}|Toks], CPos).

%% scan_after_fraction(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Continues with the exponent if `E' or `e' follows, otherwise
%%  converts the collected text to a float token.

scan_after_fraction([$E|Cs], Ncs, Toks, SPos, CPos) ->
    scan_exponent(Cs, [$E|Ncs], Toks, SPos, CPos);
scan_after_fraction([$e|Cs], Ncs, Toks, SPos, CPos) ->
    scan_exponent(Cs, [$e|Ncs], Toks, SPos, CPos);
scan_after_fraction(Cs, Ncs, Toks, SPos, CPos) ->
    %% Only the conversion is protected; the continued scan in the `of'
    %% part runs outside the try.
    try list_to_float(lists:reverse(Ncs)) of
        N ->
            scan1(Cs, [{float,SPos,N}|Toks], CPos)
    catch
        error:badarg ->
            scan_error({illegal,float}, SPos)
    end.
%% scan_exponent(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Generate an error here if E{+|-} not followed by any digits.
%%  An optional sign is pushed onto the number text before the exponent
%%  digits are read.

scan_exponent([$+|Cs], Ncs, Toks, SPos, CPos) ->
    scan_exponent1(Cs, [$+|Ncs], Toks, SPos, CPos);
scan_exponent([$-|Cs], Ncs, Toks, SPos, CPos) ->
    scan_exponent1(Cs, [$-|Ncs], Toks, SPos, CPos);
scan_exponent(Cs, Ncs, Toks, SPos, CPos) ->
    scan_exponent1(Cs, Ncs, Toks, SPos, CPos).

%% scan_exponent1(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Reads the exponent digits and converts the complete text to a float
%%  token. Reports `float' at CurPos if no digit follows, and
%%  {illegal,float} at StartPos if the text cannot be converted.

scan_exponent1([C|Cs0], Ncs0, Toks, SPos, CPos) when C >= $0, C =< $9 ->
    {Ncs,Cs,CPos1} = scan_integer(Cs0, [C|Ncs0], CPos),
    %% Only the conversion is protected; the continued scan in the `of'
    %% part runs outside the try.
    try list_to_float(lists:reverse(Ncs)) of
        N ->
            scan1(Cs, [{float,SPos,N}|Toks], CPos1)
    catch
        error:badarg ->
            scan_error({illegal,float}, SPos)
    end;
scan_exponent1(_, _, _, _, CPos) ->
    scan_error(float, CPos).

%% scan_error(What, Pos) -> {error,{Pos,edoc_scanner,What}}
%%  Builds the error result returned by the scanner. What is a
%%  descriptor understood by format_error/1.

scan_error(In, Pos) ->
    {error,{Pos,edoc_scanner,In}}.