diff options
Diffstat (limited to 'lib/edoc/src/edoc_scanner.erl')
-rw-r--r-- | lib/edoc/src/edoc_scanner.erl | 358 |
1 files changed, 358 insertions, 0 deletions
%% ``The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved via the world wide web at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and
%% limitations under the License.
%%
%% The Initial Developer of the Original Code is Ericsson Utvecklings
%% AB. Portions created by Ericsson are Copyright 1999, Ericsson
%% Utvecklings AB. All Rights Reserved.''
%%
%% $Id$
%%
%% @private
%% @copyright Richard Carlsson 2001-2003. Portions created by Ericsson
%% are Copyright 1999, Ericsson Utvecklings AB. All Rights Reserved.
%% @author Richard Carlsson <[email protected]>
%% @see edoc
%% @end

%% @doc Tokeniser for EDoc. Based on the Erlang standard library module
%% {@link //stdlib/erl_scan}.

-module(edoc_scanner).

%% NOTE: the interface to this module is ancient and should be updated.
%% Please do not regard these exported functions as stable. Their
%% behaviour is described in the documentation of the module `erl_scan'.
%%
%% Since there are no `full stop' tokens in EDoc specifications, the
%% `tokens' function *always* returns `{more, Continuation}' unless an
%% error occurs.

-export([string/1,string/2,format_error/1]).

-import(lists, [reverse/1]).

%% string(CharList)
%%  Tokenise CharList, counting lines from 1.

-spec string(string()) ->
          {ok, [tuple()], integer()} |
          {error, {integer(), edoc_scanner, term()}, integer()}.
string(Chars) ->
    string(Chars, 1).

%% string(CharList, StartPos)
%%  Tokenise CharList, counting lines from StartPos.
%%
%%  Returns:
%%      {ok,Tokens,StartPos}
%%      {error,{ErrorPos,edoc_scanner,What},StartPos}
%%
%%  Note that the last element is always the start position that was
%%  passed in; no end position is computed.

-spec string(string(), integer()) ->
          {ok, [tuple()], integer()} |
          {error, {integer(), edoc_scanner, term()}, integer()}.
string(Chars, StartPos) ->
    attach_pos(scan(Chars, StartPos), StartPos).

%% Extend the two-element result of scan/2 with a position.
attach_pos({ok,Tokens}, Pos) -> {ok,Tokens,Pos};
attach_pos({error,Info}, Pos) -> {error,Info,Pos}.

%% format_error(Error)
%%  Turn the `What' part of an error tuple produced by this module into
%%  printable text. The result is character data as produced by io_lib
%%  (it may be a deep list), not necessarily a flat string. Terms that
%%  are not recognised are printed as they are.

-spec format_error(term()) -> io_lib:chars().
format_error({string,QuoteChar,Start}) ->
    ["unterminated string starting with " ++
         io_lib:write_string(Start, QuoteChar)];
format_error({illegal,What}) ->
    io_lib:format("illegal ~w", [What]);
format_error(char) ->
    "unterminated character";
format_error(scan) ->
    "premature end";
format_error({base,B}) ->
    io_lib:format("illegal base '~w'", [B]);
format_error(float) ->
    "bad float";
format_error(Unknown) ->
    io_lib:write(Unknown).

%% Reserved words, not atoms. Only `where' is reserved.
reserved(Word) ->
    Word =:= 'where'.

%% scan(CharList, StartPos)
%%  This takes a list of characters and tries to tokenise them.
%%
%%  Tokens are pushed onto a stack (a list in reverse order) while
%%  scanning, which avoids appending; the stack is reversed once, when
%%  the input has been consumed.
%%
%%  Returns:
%%      {ok,[Tok]}
%%      {error,{ErrorPos,edoc_scanner,What}}

scan(Chars, StartPos) ->
    scan1(Chars, [], StartPos).

%% scan1(Characters, TokenStack, Position)
%%  Scan a list of characters into tokens.

%% The order of the clauses below is significant: the newline clause must
%% precede the blank-skipping clause (whose range includes newline), and
%% every multi-character token must be tried before the final
%% single-character punctuation clause.
%%
%% scan_char/2, scan_escape/2 and scan_string/4 report problems as
%% `{error, Reason}' with an atom Reason. All such reasons are turned
%% into proper scan errors here (see char_error/3), so that a malformed
%% escape sequence yields an error tuple instead of a case_clause crash.

scan1([$\n|Cs], Toks, Pos) ->                             % Newline
    scan1(Cs, Toks, Pos+1);
scan1([C|Cs], Toks, Pos) when C >= 0, C =< $\s ->         % Skip blanks
    scan1(Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $a, C =< $z ->         % Unquoted atom
    scan_atom(C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $0, C =< $9 ->         % Numbers
    scan_number(C, Cs, Toks, Pos);
scan1([$-,C|Cs], Toks, Pos) when C >= $0, C =< $9 ->      % Signed numbers
    scan_signed_number($-, C, Cs, Toks, Pos);
scan1([$+,C|Cs], Toks, Pos) when C >= $0, C =< $9 ->      % Signed numbers
    scan_signed_number($+, C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $A, C =< $Z ->         % Variables
    scan_variable(C, Cs, Toks, Pos);
scan1([$_|Cs], Toks, Pos) ->                              % Variables
    scan_variable($_, Cs, Toks, Pos);
scan1([$$|Cs], Toks, Pos) ->                              % Character constant
    %% On success scan_char_const/3 goes on to scan the rest of the
    %% input, so what comes back is either the final result for the
    %% whole input or a bare reason concerning this character constant.
    case scan_char_const(Cs, Toks, Pos) of
        {ok, _} = Ok ->
            Ok;
        {error, {_ErrPos, _Mod, _What}} = Err ->
            %% A later token failed; the error is already complete.
            Err;
        {error, Reason} ->
            char_error(Reason, char, Pos)
    end;
scan1([$'|Cs0], Toks, Pos) ->                             % Quoted atom
    case scan_string(Cs0, $', Pos) of
        {S,Cs1,Pos1} ->
            try list_to_atom(S) of
                A ->
                    scan1(Cs1, [{atom,Pos,A}|Toks], Pos1)
            catch
                error:_ ->
                    scan_error({illegal,atom}, Pos)
            end;
        {error, premature_end} ->
            scan_error({string,$',Cs0}, Pos);
        {error, Reason} ->
            char_error(Reason, atom, Pos)
    end;
scan1([$"|Cs0], Toks, Pos) ->                             % String
    case scan_string(Cs0, $", Pos) of
        {S,Cs1,Pos1} ->
            case Toks of
                [{string,Pos0,S0}|Toks1] ->
                    %% Adjacent string literals are merged into one
                    %% token, keeping the position of the first.
                    scan1(Cs1, [{string,Pos0,S0 ++ S}|Toks1], Pos1);
                _ ->
                    scan1(Cs1, [{string,Pos,S}|Toks], Pos1)
            end;
        {error, premature_end} ->
            scan_error({string,$",Cs0}, Pos);
        {error, Reason} ->
            char_error(Reason, string, Pos)
    end;
%% Punctuation characters and operators, first recognise multiples.
scan1([$-,$>|Cs], Toks, Pos) ->
    scan1(Cs, [{'->',Pos}|Toks], Pos);
scan1([$:,$:|Cs], Toks, Pos) ->
    scan1(Cs, [{'::',Pos}|Toks], Pos);
scan1([$/,$/|Cs], Toks, Pos) ->
    scan1(Cs, [{'//',Pos}|Toks], Pos);
scan1([C|Cs], Toks, Pos) ->                               % Punctuation character
    P = list_to_atom([C]),
    scan1(Cs, [{P,Pos}|Toks], Pos);
scan1([], Toks0, _Pos) ->
    {ok,lists:reverse(Toks0)}.

%% char_error(Reason, Kind, Pos)
%%  Convert a reason atom from the character-level scanning functions
%%  into a scan error for a token of the given Kind (`char', `atom' or
%%  `string'), located at Pos.

char_error(truncated_char, _Kind, Pos) ->
    scan_error(char, Pos);
char_error(Reason, Kind, Pos)
  when Reason =:= illegal_character;
       Reason =:= illegal_control_character;
       Reason =:= undefined_escape_sequence ->
    scan_error({illegal,Kind}, Pos).

%% scan_variable(FirstChar, CharList, TokenStack, Pos)
%%  Scan a variable name starting with FirstChar and continue scanning.
%%  Note that `_' is not accepted as a variable token.

scan_variable(C, Cs, Toks, Pos) ->
    {Wcs,Cs1} = scan_name(Cs, []),
    case [C|lists:reverse(Wcs)] of
        "_" ->
            scan_error({illegal,token}, Pos);
        W ->
            try list_to_atom(W) of
                A ->
                    scan1(Cs1, [{var,Pos,A}|Toks], Pos)
            catch
                error:_ ->
                    scan_error({illegal,variable}, Pos)
            end
    end.

%% scan_atom(FirstChar, CharList, TokenStack, Pos)
%%  Scan an unquoted atom starting with FirstChar and continue scanning.
%%  Reserved words become `{Word,Pos}' tokens instead of atom tokens.

scan_atom(C, Cs, Toks, Pos) ->
    {Wcs,Cs1} = scan_name(Cs, []),
    W = [C|lists:reverse(Wcs)],
    try list_to_atom(W) of
        A ->
            case reserved(A) of
                true ->
                    scan1(Cs1, [{A,Pos}|Toks], Pos);
                false ->
                    scan1(Cs1, [{atom,Pos,A}|Toks], Pos)
            end
    catch
        error:_ ->
            scan_error({illegal,token}, Pos)
    end.

%% scan_name(CharList, NameCharStack) -> {NameCharStack1,RestChars}
%%  Collect the leading name characters of CharList. This is like
%%  lists:splitwith(fun name_char/1, CharList), except that the name
%%  characters are pushed onto the stack, i.e. returned in reverse order.

scan_name([C|Cs] = Cs0, Ncs) ->
    case name_char(C) of
        true ->
            scan_name(Cs, [C|Ncs]);
        false ->
            {Ncs,Cs0}
    end;
scan_name([], Ncs) ->
    {Ncs,[]}.

%% name_char(Char) -> boolean()
%%  Characters allowed after the first character of an atom or variable:
%%  ASCII letters and digits, `_', `@', and the ISO 8859-1 (Latin-1)
%%  letters, i.e. the ranges 8#300-8#377 except the multiplication
%%  sign (8#327) and the division sign (8#367).

name_char(C) when C >= $a, C =< $z -> true;
name_char(C) when C >= $\337, C =< $\377, C /= $\367 -> true;
name_char(C) when C >= $A, C =< $Z -> true;
name_char(C) when C >= $\300, C =< $\336, C /= $\327 -> true;
name_char(C) when C >= $0, C =< $9 -> true;
name_char($_) -> true;
name_char($@) -> true;
name_char(_) -> false.

%% scan_string(CharList, QuoteChar, Pos) ->
%%      {StringChars,RestChars,NewPos} | {error,Reason}
%%  Scan up to the closing QuoteChar; the opening quote must already
%%  have been consumed by the caller.

scan_string(Cs, Quote, Pos) ->
    scan_string(Cs, [], Quote, Pos).

%% scan_string(CharList, StringCharStack, QuoteChar, Pos)
%%  Worker for scan_string/3. Characters are pushed onto the stack one
%%  at a time until the closing quote is found; running out of input
%%  first gives `{error,premature_end}'.

scan_string([Q|Rest], Acc, Q, Pos) ->
    {lists:reverse(Acc),Rest,Pos};
scan_string([], _Acc, _Q, _Pos) ->
    {error, premature_end};
scan_string(Chars, Acc, Q, Pos) ->
    case scan_char(Chars, Pos) of
        {Char,Rest,Pos1} ->
            %% Only build the string here
            scan_string(Rest, [Char|Acc], Q, Pos1);
        Failure ->
            Failure
    end.

%% scan_char_const(CharList, TokenStack, Pos)
%%  Scan the character following a `$', push a `char' token and carry
%%  on scanning the rest of the input with scan1/3. If the character
%%  itself cannot be read, `{error,Reason}' is returned instead.
%%  Note that space characters are not allowed.

scan_char_const([$\s|_Rest], _Toks, _Pos) ->
    {error, illegal_character};
scan_char_const(Chars, Toks, Pos) ->
    case scan_char(Chars, Pos) of
        {Char,Rest,Pos1} ->
            scan1(Rest, [{char,Pos,Char}|Toks], Pos1);
        Failure ->
            Failure
    end.

%% scan_char(Chars, Pos) -> {Character,RestChars,NewPos} | {error,Reason}
%%  Read a single, possibly escaped, character from a string or
%%  character constant.
%%  Note that control characters are not allowed.

scan_char([$\\|Rest], Pos) ->
    scan_escape(Rest, Pos);
scan_char([Char|_Rest], _Pos) when Char =< 16#1f ->
    {error, illegal_character};
scan_char([Char|Rest], Pos) ->
    {Char,Rest,Pos};
scan_char([], _Pos) ->
    {error, truncated_char}.

%% scan_escape(Chars, Pos) -> {Character,RestChars,NewPos} | {error,Reason}
%%  Decode the escape sequence that follows a backslash.
%%  The following conforms to Standard Erlang escape sequences.

scan_escape([D1,D2,D3|Rest], Pos)               % \<1-3> octal digits
  when D1 >= $0, D1 =< $3, D2 >= $0, D2 =< $7, D3 >= $0, D3 =< $7 ->
    Code = ((D1 - $0) * 8 + (D2 - $0)) * 8 + (D3 - $0),
    {Code,Rest,Pos};
scan_escape([D1,D2|Rest], Pos)
  when D1 >= $0, D1 =< $7, D2 >= $0, D2 =< $7 ->
    Code = (D1 - $0) * 8 + (D2 - $0),
    {Code,Rest,Pos};
scan_escape([D1|Rest], Pos)
  when D1 >= $0, D1 =< $7 ->
    {D1 - $0,Rest,Pos};
scan_escape([$^,Char|Rest], Pos)                % \^X -> CTL-X
  when Char >= $\100, Char =< $\137 ->
    {Char - $\100,Rest,Pos};
scan_escape([$^,_Char|_Rest], _Pos) ->
    {error, illegal_control_character};
scan_escape([Char|Rest], Pos) ->
    case escape_char(Char) of
        Code when Code > $\000 ->
            {Code,Rest,Pos};
        _ ->
            {error, undefined_escape_sequence}
    end;
scan_escape([], _Pos) ->
    {error, truncated_char}.

%% Note that we return $\000 for undefined escapes.
%% escape_char(Char) -> Character
%%  The character denoted by a backslash followed by Char, or 0 when
%%  there is no such escape sequence.
escape_char($b) -> $\b;                         % \b = BS
escape_char($d) -> $\d;                         % \d = DEL
escape_char($e) -> $\e;                         % \e = ESC
escape_char($f) -> $\f;                         % \f = FF
escape_char($n) -> $\n;                         % \n = LF
escape_char($r) -> $\r;                         % \r = CR
escape_char($s) -> $\s;                         % \s = SPC
escape_char($t) -> $\t;                         % \t = HT
escape_char($v) -> $\v;                         % \v = VT
escape_char($\\) -> $\\;                        % \\ = \
escape_char($') -> $\';                         % \' = '
escape_char($") -> $\";                         % \" = "
escape_char(_Other) -> 0.

%% scan_number(Char, CharList, TokenStack, Pos)
%%  Scan an optionally signed number:
%%      [+-]<digits>
%%      [+-]<digits>.<digits>
%%      [+-]<digits>.<digits>E+-<digits>
%%
%%  Radix notation (<digits>#<digits>) is not recognised by these
%%  functions; scanning of the number stops in front of the `#'.
%%
%%  The characters of the number are collected on a stack, which is
%%  then reversed and converted with list_to_integer/1 or
%%  list_to_float/1.

%% SPos == Start position
%% CPos == Current position

scan_number(Digit, Rest, Toks, Pos) ->
    scan_number_chars([Digit], Rest, Toks, Pos).

scan_signed_number(Sign, Digit, Rest, Toks, Pos) ->
    scan_number_chars([Digit,Sign], Rest, Toks, Pos).

%% Collect the remaining digits of the integer part on top of the
%% given stack, then decide what kind of number it is.
scan_number_chars(Stack0, Rest0, Toks, Pos) ->
    {Stack,Rest,Pos1} = scan_integer(Rest0, Stack0, Pos),
    scan_after_int(Rest, Stack, Toks, Pos, Pos1).

%% scan_integer(CharList, NumberCharStack, Pos) ->
%%      {NumberCharStack1,RestChars,Pos}
%%  Push the leading decimal digits of CharList onto the stack.

scan_integer([Digit|Rest], Stack, Pos) when Digit >= $0, Digit =< $9 ->
    scan_integer(Rest, [Digit|Stack], Pos);
scan_integer(Rest, Stack, Pos) ->
    {Stack,Rest,Pos}.

%% scan_after_int(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  A `.' directly followed by a digit starts a fraction; otherwise the
%%  characters collected so far form an integer token.

scan_after_int([$.,Digit|Rest0], Stack0, Toks, SPos, CPos)
  when Digit >= $0, Digit =< $9 ->
    {Stack,Rest,CPos1} = scan_integer(Rest0, [Digit,$.|Stack0], CPos),
    scan_after_fraction(Rest, Stack, Toks, SPos, CPos1);
scan_after_int(Rest, Stack, Toks, SPos, CPos) ->
    Value = list_to_integer(lists:reverse(Stack)),
    scan1(Rest, [{integer,SPos,Value}|Toks], CPos).

%% scan_after_fraction(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  The fraction digits have been collected. An `E' or `e' introduces
%%  an exponent; anything else ends the float.

scan_after_fraction([E|Rest], Stack, Toks, SPos, CPos)
  when E =:= $E; E =:= $e ->
    scan_exponent(Rest, [E|Stack], Toks, SPos, CPos);
scan_after_fraction(Rest, Stack, Toks, SPos, CPos) ->
    float_token(Stack, Rest, Toks, SPos, CPos).

%% scan_exponent(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Accept an optional sign, then require at least one digit.
%%  Generate an error here if E{+|-} not followed by any digits.

scan_exponent([Sign|Rest], Stack, Toks, SPos, CPos)
  when Sign =:= $+; Sign =:= $- ->
    scan_exponent1(Rest, [Sign|Stack], Toks, SPos, CPos);
scan_exponent(Rest, Stack, Toks, SPos, CPos) ->
    scan_exponent1(Rest, Stack, Toks, SPos, CPos).

scan_exponent1([Digit|Rest0], Stack0, Toks, SPos, CPos)
  when Digit >= $0, Digit =< $9 ->
    {Stack,Rest,CPos1} = scan_integer(Rest0, [Digit|Stack0], CPos),
    float_token(Stack, Rest, Toks, SPos, CPos1);
scan_exponent1(_Rest, _Stack, _Toks, _SPos, CPos) ->
    scan_error(float, CPos).

%% float_token(NumberCharStack, RestChars, TokenStack, StartPos, CurPos)
%%  Convert the collected characters to a float, push the token and
%%  continue scanning. Characters that list_to_float/1 rejects give an
%%  `{illegal,float}' error at the start position of the number.

float_token(Stack, Rest, Toks, SPos, CPos) ->
    try list_to_float(lists:reverse(Stack)) of
        Value ->
            scan1(Rest, [{float,SPos,Value}|Toks], CPos)
    catch
        error:_ ->
            scan_error({illegal,float}, SPos)
    end.

%% scan_error(What, Pos) -> {error,{Pos,edoc_scanner,What}}
%%  Build the error value returned by the scanner; What can be turned
%%  into text with format_error/1.

scan_error(What, Pos) ->
    {error,{Pos,edoc_scanner,What}}.