%% Description : Token scanner for XPATH grammar
%%% The XPATH grammar is a bit tricky, due to operator overloading.
%%% This version of the scanner is based on the XPATH spec:
%%% http://www.w3.org/TR/1999/REC-xpath-19991116 (XPATH version 1.0)
%%% Quote from the spec:
%%% "The following special tokenization rules must be applied in the order
%%% specified to disambiguate the ExprToken grammar:
%%% o If there is a preceding token and the preceding token is not one of
%%% @, ::, (, [, , or an Operator, then a * must be recognized as a
%%% MultiplyOperator and an NCName must be recognized as an OperatorName
%%% o If the character following an NCName (possibly after intervening
%%% ExprWhiteSpace) is (, then the token must be recognized as a NodeType
%%% or a FunctionName.
%%% o If the two characters following an NCName (possibly after intervening
%%% ExprWhiteSpace) are ::, then the token must be recognized as an
%%% AxisName.
%%% o Otherwise, the token must not be recognized as a MultiplyOperator, an
%%% OperatorName, a NodeType, a FunctionName, or an AxisName."
%% main API
%% exported helper functions
-define(L, 1).
%% tokens(Str) -> [Token]
%%
%% Tokenizes the XPath expression string Str. Leading whitespace is
%% skipped before every token. The result is the list of tokens in source
%% order, always terminated by the end marker {'$end', ?L, '$end'}.
tokens(Str) ->
    tokens(strip_ws(Str), []).

%% tokens(Str, Acc) -> [Token]
%%
%% Worker loop. Acc holds the tokens scanned so far, most recent first;
%% scan_token/2 inspects its head to disambiguate overloaded symbols.
tokens([], Acc) ->
    lists:reverse([{'$end', ?L, '$end'}|Acc]);
tokens(Str, Acc) ->
    case scan_token(Str, Acc) of
        {rescan, NewStr} ->
            %% An abbreviation was expanded in place: scan the rewritten
            %% input again without emitting a token.
            tokens(NewStr, Acc);
        {Token, T} ->
            tokens(strip_ws(T), [Token|Acc])
    end.
%% Expr Tokens
%% scan_token(Str, Acc) -> {Token, Rest} | {rescan, NewStr}
%%
%% Scans a single token from the head of Str. Acc is the list of tokens
%% produced so far, most recent first; it is only used to decide how an
%% overloaded symbol ('*' or a bare name) must be read.
%%
%% Returns {Token, Rest}, where Token is a 3-tuple {Category, ?L, Value}
%% and Rest is the unconsumed input, or {rescan, NewStr} when an
%% abbreviated form ("..", "@", "//", ".") has been rewritten to its
%% unabbreviated equivalent and NewStr must be scanned again.
%%
%% Clause order matters: longer symbols ("..", "//", "<=", ">=") must be
%% tried before their one-character prefixes.
%%
%% NOTE(review): names are converted with list_to_atom/1, so every distinct
%% name in the input creates an atom - confirm that callers only pass
%% trusted expressions.
scan_token("(" ++ T, _A) -> {{'(', ?L, '('}, T};
scan_token(")" ++ T, _A) -> {{')', ?L, ')'}, T};
scan_token("[" ++ T, _A) -> {{'[', ?L, '['}, T};
scan_token("]" ++ T, _A) -> {{']', ?L, ']'}, T};
%% ".." and "@" are abbreviations: expand them and scan again.
scan_token(".." ++ T, _A) -> {rescan, "parent::node()" ++ T};
scan_token("@" ++ T, _A) -> {rescan, "attribute::" ++ T};
scan_token("," ++ T, _A) -> {{',', ?L, ','}, T};
scan_token("::" ++ T, _A) -> {{'::', ?L, '::'}, T};
%% operators
%% "//" is an abbreviation as well.
scan_token("//" ++ T, _A) -> {rescan, "/descendant-or-self::node()/" ++ T};
scan_token("/" ++ T, _A) -> {{'/', ?L, '/'}, T};
scan_token("|" ++ T, _A) -> {{'|', ?L, '|'}, T};
scan_token("+" ++ T, _A) -> {{'+', ?L, '+'}, T};
scan_token("-" ++ T, _A) -> {{'-', ?L, '-'}, T};
scan_token("=" ++ T, _A) -> {{'=', ?L, '='}, T};
scan_token("!=" ++ T, _A) -> {{'!=', ?L, '!='}, T};
scan_token("<=" ++ T, _A) -> {{'<=', ?L, '<='}, T};
scan_token("<" ++ T, _A) -> {{'<', ?L, '<'}, T};
scan_token(">=" ++ T, _A) -> {{'>=', ?L, '>='}, T};
scan_token(">" ++ T, _A) -> {{'>', ?L, '>'}, T};
%% '*' is the multiply operator only when it follows a token after which
%% an operator may appear; otherwise it is the wildcard name test.
scan_token("*" ++ T, A) ->
    case operand_expected(A) of
        true ->
            {{'wildcard', ?L, 'wildcard'}, T};
        false ->
            {{'*', ?L, '*'}, T}
    end;
%% numbers
scan_token(Str = [H|_], _A) when H >= $0, H =< $9 ->
    scan_number(Str);
scan_token(Str = [$., H|_], _A) when H >= $0, H =< $9 ->
    %% The number scanner must start from an empty digit accumulator; the
    %% token list is not a digit accumulator and must not be passed on.
    scan_number(Str);
%% A lone "." abbreviates the context node.
scan_token("." ++ T, _A) ->
    {rescan, "self::node()" ++ T};
%% Variable Reference
scan_token([$$|T], _A) ->
    {{Prefix, Local}, T1} = scan_name(T),
    case Prefix of
        [] ->
            {{var_reference, ?L, list_to_atom(Local)}, T1};
        _ ->
            {{var_reference, ?L, list_to_atom(Prefix++":"++Local)}, T1}
    end;
%% String literal; the opening quote character is also the terminator.
scan_token([H|T], _A) when H == $" ; H == $' ->
    {Literal, T1} = scan_literal(T, H, []),
    {{literal, ?L, Literal}, T1};
%% Anything else must be a name. After a token that can be followed by an
%% operator the name is an operator name; otherwise it is classified by
%% what follows it, skipping intervening whitespace as the XPath
%% tokenization rules require.
scan_token(T, A) ->
    {{Prefix, Local}, T1} = scan_name(T),
    case operand_expected(A) of
        true ->
            other_name(Prefix, Local, strip_ws(T1));
        false ->
            operator_name(Prefix, Local, T1)
    end.

%% operand_expected(Acc) -> boolean()
%%
%% True when there is no preceding token, or when special_token/1 returns
%% true for the category of the preceding token. In those positions '*'
%% is a wildcard and a name is not an operator name.
operand_expected([{Prev, _, _}|_]) ->
    special_token(Prev);
operand_expected(_) ->
    true.
%% operator_name(Prefix, Local, Rest) -> {Token, Rest}
%%
%% Builds the token for one of the named operators and, or, mod and div.
%% Only an unprefixed name from that set is accepted; any other argument
%% combination fails with function_clause.
operator_name([], Name, Rest)
  when Name =:= "and"; Name =:= "or"; Name =:= "mod"; Name =:= "div" ->
    Op = list_to_atom(Name),
    {{Op, ?L, Op}, Rest}.
%% other_name(Prefix, Local, Rest) -> {Token, Rest1}
%%
%% Classifies a name that is not an operator name by looking at the input
%% that follows it:
%%   empty local part followed by "*"  -> prefix_test token ("*" consumed)
%%   followed by "("                   -> node type or function name
%%   followed by "::"                  -> axis name
%%   anything else                     -> name token carrying
%%                                        {QualifiedAtom, Prefix, Local}
other_name(Prefix, [], [$* | Rest]) ->
    %% [37] NameTest ::= '*' | NCName ':' '*' | QName
    {{prefix_test, ?L, Prefix}, Rest};
other_name(Prefix, Local, [$( | _] = Rest) ->
    node_type_or_function_name(Prefix, Local, Rest);
other_name(Prefix, Local, [$:, $: | _] = Rest) ->
    axis(Prefix, Local, Rest);
other_name(Prefix, Local, Rest) ->
    QName =
        case Prefix of
            [] -> Local;
            _ -> Prefix ++ ":" ++ Local
        end,
    {{name, ?L, {list_to_atom(QName), Prefix, Local}}, Rest}.
%% node types
%% node_type_or_function_name(Prefix, Local, Rest) -> {Token, Rest}
%%
%% Builds the token for a name that is directly followed by "(".
%% The unprefixed names comment, text and node give a node_type token,
%% processing-instruction gets a token category of its own, and every
%% other name is a function_name whose atom is the prefix and local part
%% concatenated without a separator.
node_type_or_function_name([], "processing-instruction", Rest) ->
    {{'processing-instruction', ?L, 'processing-instruction'}, Rest};
node_type_or_function_name([], Name, Rest)
  when Name =:= "comment"; Name =:= "text"; Name =:= "node" ->
    {{node_type, ?L, list_to_atom(Name)}, Rest};
node_type_or_function_name(Prefix, Local, Rest) ->
    {{function_name, ?L, list_to_atom(Prefix ++ Local)}, Rest}.
%% axis names
%% axis(Prefix, Local, Rest) -> {{axis, ?L, AxisAtom}, Rest}
%%
%% Builds the token for an axis name (a name directly followed by "::").
%% Only unprefixed names from the fixed set in axis_name/1 are accepted;
%% anything else fails with function_clause.
axis([], Name, Rest) ->
    {{axis, ?L, axis_name(Name)}, Rest}.

%% axis_name(Name) -> atom()
%%
%% Maps the textual axis name to its atom; hyphens become underscores.
axis_name("ancestor-or-self") -> ancestor_or_self;
axis_name("ancestor") -> ancestor;
axis_name("attribute") -> attribute;
axis_name("child") -> child;
axis_name("descendant-or-self") -> descendant_or_self;
axis_name("descendant") -> descendant;
axis_name("following-sibling") -> following_sibling;
axis_name("following") -> following;
axis_name("namespace") -> namespace;
axis_name("parent") -> parent;
axis_name("preceding-sibling") -> preceding_sibling;
axis_name("preceding") -> preceding;
axis_name("self") -> self.
%% scan_literal(Str, Delim, Acc) -> {Literal, Rest}
%%
%% Collects characters from Str up to the first occurrence of the quote
%% character Delim. Acc holds the characters collected so far in reverse
%% order. Returns the literal text (without the closing quote) and the
%% input following the closing quote. There is no clause for empty input,
%% so an unterminated literal fails with function_clause.
scan_literal([Ch | Rest], Delim, Acc) when Ch =:= Delim ->
    {lists:reverse(Acc), Rest};
scan_literal([Ch | Rest], Delim, Acc) ->
    scan_literal(Rest, Delim, [Ch | Acc]).
%% scan_name(Str) -> {{Prefix, Local}, Rest}
%%
%% Scans a (possibly prefixed) name from the head of Str and returns its
%% prefix and local part together with the remaining input; the result is
%% produced by scan_prefix/2. Exits with {invalid_name, Context} when Str
%% does not start with a valid name start character; Context is a short
%% excerpt of the offending input.
%%
%% ?whitespace is not defined in this part of the file - presumably it
%% comes from an included header.
scan_name([H1, H2 | T]) when H1 == $: ; H1 == $_ ->
    if ?whitespace(H2) ->
            exit({invalid_name, [H1, H2, '...']});
       true ->
            %% NOTE(review): H2 is accepted without checking that it is a
            %% name character - confirm this is intended.
            scan_prefix(T, [H2, H1])
    end;
scan_name([H|T]) ->
    case xmerl_lib:is_letter(H) of
        true ->
            scan_prefix(T, [H]);
        false ->
            exit({invalid_name, lists:sublist([H|T], 1, 6)})
    end;
scan_name(Str) ->
    exit({invalid_name, lists:sublist(Str, 1, 6)}).
%% scan_prefix(Str, Acc) -> {{Prefix, Local}, Rest}
%%
%% Continues scanning a name whose first characters are already in Acc
%% (reverse order). If a single ":" is met, the characters collected so
%% far are the prefix and the local part is scanned by scan_local_part/2.
%% Otherwise the name ends at end of input, at whitespace, at "::" or at
%% the first character for which xmerl_lib:is_namechar/1 is false; in all
%% of those cases Prefix is [] and the collected characters are the local
%% part.
scan_prefix([], Acc) ->
    {{[], lists:reverse(Acc)}, []};
scan_prefix(Str = [H|_], Acc) when ?whitespace(H) ->
    {{[], lists:reverse(Acc)}, Str};
scan_prefix(T = "::" ++ _, Acc) ->
    %% "::" is the next token (axis separator), not part of the name.
    {{[], lists:reverse(Acc)}, T};
scan_prefix(":" ++ T, Acc) ->
    {LocalPart, T1} = scan_local_part(T, []),
    Prefix = lists:reverse(Acc),
    {{Prefix, LocalPart}, T1};
scan_prefix(Str = [H|T], Acc) ->
    case xmerl_lib:is_namechar(H) of
        true ->
            scan_prefix(T, [H|Acc]);
        false ->
            {{[], lists:reverse(Acc)}, Str}
    end.
%% scan_local_part(Str, Acc) -> {LocalPart, Rest}
%%
%% Scans the local part of a prefixed name. Characters are collected in
%% Acc (reverse order) until end of input, whitespace, or the first
%% character for which xmerl_lib:is_namechar/1 is false. The local part
%% may be empty, e.g. when the prefix is followed by "*".
scan_local_part([], Acc) ->
    {lists:reverse(Acc), []};
scan_local_part(Str = [H|_], Acc) when ?whitespace(H) ->
    {lists:reverse(Acc), Str};
scan_local_part(Str = [H|T], Acc) ->
    case xmerl_lib:is_namechar(H) of
        true ->
            scan_local_part(T, [H|Acc]);
        false ->
            {lists:reverse(Acc), Str}
    end.
%% scan_number(Str) -> {{number, ?L, Number}, Rest}
%%
%% Scans an XPath number from the head of Str. Number is an integer when
%% the text has no decimal point and a float otherwise. Accepted forms
%% are "12", "12.5", "12." and ".5".
scan_number(T) ->
    scan_number(T, []).

%% scan_number(Str, Acc) -> {{number, ?L, Number}, Rest}
%%
%% Acc holds the integer digits read so far, in reverse order.
scan_number([], Acc) ->
    {{number, ?L, list_to_integer(lists:reverse(Acc))}, []};
scan_number("." ++ T, []) ->
    %% Leading-dot form: list_to_float/1 needs a digit before the point,
    %% so the accumulator is seeded with "0." (reversed).
    {Digits, T1} = scan_digits(T, ".0"),
    Number = list_to_float(Digits),
    {{number, ?L, Number}, T1};
scan_number("." ++ T, Acc) ->
    %% The fraction digits are optional in XPath ("12." is a number), but
    %% list_to_float/1 needs at least one digit after the point.
    {Fraction, T1} =
        case scan_digits(T, []) of
            {[], Rest} -> {"0", Rest};
            Scanned -> Scanned
        end,
    Number = list_to_float(lists:reverse(Acc, [$. | Fraction])),
    {{number, ?L, Number}, T1};
scan_number([H|T], Acc) when H >= $0, H =< $9 ->
    scan_number(T, [H|Acc]);
scan_number(T, Acc) ->
    {{number, ?L, list_to_integer(lists:reverse(Acc))}, T}.
%% scan_digits(Str, Acc) -> {Digits, Rest}
%%
%% Consumes the leading run of decimal digits of Str, pushing each digit
%% onto Acc (which is kept in reverse order). Returns the reversed
%% accumulator - i.e. whatever Acc initially held followed by the digits
%% read - and the input starting at the first non-digit.
scan_digits([Digit | Rest], Acc) when Digit >= $0, Digit =< $9 ->
    scan_digits(Rest, [Digit | Acc]);
scan_digits(Rest, Acc) ->
    %% Covers both end of input and the first non-digit character.
    {lists:reverse(Acc), Rest}.
%% strip_ws(Str) -> Str1
%%
%% Drops leading whitespace characters from Str and returns the rest.
%% ?whitespace is not defined in this part of the file - presumably it
%% comes from an included header.
strip_ws([H|T]) when ?whitespace(H) ->
    strip_ws(T);
strip_ws(T) ->
    T.
special_token('@') -> true;
special_token('::') -> true;
special_token(',') -> true;
special_token('(') -> true;
special_token('[') -> true;
special_token('/') -> true;
special_token('//') -> true;
special_token('|') -> true;
special_token('+') -> true;
special_token('-') -> true;
special_token('=') -> true;
special_token('!=') -> true;
special_token('<') -> true;
special_token('<=') -> true;
special_token('>') -> true;
special_token('>=') -> true;
special_token('and') -> true;
special_token('or') -> true;
special_token('mod') -> true;
special_token('div') -> true;
special_token(_) ->