diff options
Diffstat (limited to 'lib/xmerl/src/xmerl_xpath_scan.erl')
-rw-r--r-- | lib/xmerl/src/xmerl_xpath_scan.erl | 308 |
1 files changed, 308 insertions, 0 deletions
diff --git a/lib/xmerl/src/xmerl_xpath_scan.erl b/lib/xmerl/src/xmerl_xpath_scan.erl new file mode 100644 index 0000000000..10e2756e74 --- /dev/null +++ b/lib/xmerl/src/xmerl_xpath_scan.erl @@ -0,0 +1,308 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% Description : Token scanner for XPATH grammar + +%%%---------------------------------------------------------------------- +%%% +%%% The XPATH grammar is a bit tricky, due to operator overloading. +%%% This version of the scanner is based on the XPATH spec: +%%% http://www.w3.org/TR/1999/REC-xpath-19991116 (XPATH version 1.0) +%%% +%%% Quote from the spec: +%%% +%%% "The following special tokenization rules must be applied in the order +%%% specified to disambiguate the ExprToken grammar: +%%% +%%% o If there is a preceding token and the preceding token is not one of +%%% @, ::, (, [, , or an Operator, then a * must be recognized as a +%%% MultiplyOperator and an NCName must be recognized as an OperatorName. +%%% o If the character following an NCName (possibly after intervening +%%% ExprWhiteSpace) is (, then the token must be recognized as a NodeType +%%% or a FunctionName. +%%% o If the two characters following an NCName (possibly after intervening +%%% ExprWhiteSpace) are ::, then the token must be recognized as an +%%% AxisName. 
%%% o Otherwise, the token must not be recognized as a MultiplyOperator, an
%%% OperatorName, a NodeType, a FunctionName, or an AxisName."
%%%----------------------------------------------------------------------

-module(xmerl_xpath_scan).


%% main API
-export([tokens/1]).

%% exported helper functions
-export([scan_number/1]).

-include("xmerl.hrl").

%% Second element of every token tuple. The scanner does not track real
%% positions, so the same constant is used for all tokens.
-define(L, 1).


%% tokens(Str) -> [Token]
%%
%% Scans the XPath expression string Str into a list of
%% {Type, ?L, Value} tokens. The list always ends with
%% {'$end', ?L, '$end'}. Invalid input makes the scanner crash
%% (exit or function_clause) rather than return an error tuple.
tokens(Str) ->
    tokens(strip_ws(Str), []).

%% tokens(Str, Acc) -> [Token]
%%
%% Acc holds the tokens scanned so far, most recent first; scan_token/2
%% inspects its head to apply the special tokenization rules.
tokens([], Acc) ->
    lists:reverse([{'$end', ?L, '$end'}|Acc]);
tokens(Str, Acc) ->
    case scan_token(Str, Acc) of
        {rescan, NewStr} ->
            %% An abbreviation was expanded in place: scan the expanded
            %% text again without emitting a token.
            tokens(NewStr, Acc);
        {Token, T} ->
            tokens(strip_ws(T), [Token|Acc])
    end.

%% scan_token(Str, Acc) -> {Token, Rest} | {rescan, NewStr}
%%
%% Scans one token from the front of the non-empty string Str. Acc is
%% the list of preceding tokens (most recent first). The clause order
%% matters: longer lexemes must be tried before their prefixes
%% (".." before ".", "//" before "/", "<=" before "<", ...).

%% Expr Tokens
scan_token("(" ++ T, _A) -> {{'(', ?L, '('}, T};
scan_token(")" ++ T, _A) -> {{')', ?L, ')'}, T};
scan_token("[" ++ T, _A) -> {{'[', ?L, '['}, T};
scan_token("]" ++ T, _A) -> {{']', ?L, ']'}, T};
%% Abbreviated steps are rewritten to their unabbreviated form and
%% rescanned, so no '..' or '@' token is ever produced.
scan_token(".." ++ T, _A) -> {rescan, "parent::node()" ++ T};
scan_token("@" ++ T, _A) -> {rescan, "attribute::" ++ T};
scan_token("," ++ T, _A) -> {{',', ?L, ','}, T};
scan_token("::" ++ T, _A) -> {{'::', ?L, '::'}, T};

%% operators
%% "//" is likewise expanded and rescanned; no '//' token is produced.
scan_token("//" ++ T, _A) -> {rescan, "/descendant-or-self::node()/" ++ T};
scan_token("/" ++ T, _A) -> {{'/', ?L, '/'}, T};
scan_token("|" ++ T, _A) -> {{'|', ?L, '|'}, T};
scan_token("+" ++ T, _A) -> {{'+', ?L, '+'}, T};
scan_token("-" ++ T, _A) -> {{'-', ?L, '-'}, T};
scan_token("=" ++ T, _A) -> {{'=', ?L, '='}, T};
scan_token("!=" ++ T, _A) -> {{'!=', ?L, '!='}, T};
scan_token("<=" ++ T, _A) -> {{'<=', ?L, '<='}, T};
scan_token("<" ++ T, _A) -> {{'<', ?L, '<'}, T};
scan_token(">=" ++ T, _A) -> {{'>=', ?L, '>='}, T};
scan_token(">" ++ T, _A) -> {{'>', ?L, '>'}, T};

%% "*" is a name test (wildcard) where an operand is expected and the
%% multiply operator everywhere else.
scan_token("*" ++ T, A) ->
    Tok = case operand_expected(A) of
              true  -> {'wildcard', ?L, 'wildcard'};
              false -> {'*', ?L, '*'}
          end,
    {Tok, T};

%% numbers
scan_token(Str = [H|_], _A) when H >= $0, H =< $9 ->
    scan_number(Str);
scan_token(Str = [$., H|_], _A) when H >= $0, H =< $9 ->
    %% Leading-dot number such as ".5". The token list must not be
    %% handed to the number scanner: its accumulator holds digits, and
    %% a non-empty token list there makes list_to_float/1 fail.
    scan_number(Str);
scan_token("." ++ T, _A) ->
    {rescan, "self::node()" ++ T};

%% Variable Reference
%% NOTE(review): the variable name is turned into an atom with
%% list_to_atom/1; atoms are never garbage collected, so scanning
%% untrusted expressions can grow the atom table.
scan_token([$$|T], _A) ->
    {{Prefix, Local}, T1} = scan_name(T),
    case Prefix of
        [] ->
            {{var_reference, ?L, list_to_atom(Local)}, T1};
        _ ->
            {{var_reference, ?L, list_to_atom(Prefix++":"++Local)}, T1}
    end;

%% Literal delimited by a pair of double or single quotes
scan_token([H|T], _A) when H == $" ; H == $' ->
    {Literal, T1} = scan_literal(T, H, []),
    {{literal, ?L, Literal}, T1};

%% Anything else must be a name. Where an operator is expected it has
%% to be an operator name; otherwise it is classified by what follows
%% it, possibly after intervening whitespace. The whitespace is skipped
%% in all operand positions, including the very first token.
scan_token(T, A) ->
    {{Prefix, Local}, T1} = scan_name(T),
    case operand_expected(A) of
        false ->
            operator_name(Prefix, Local, T1);
        true ->
            other_name(Prefix, Local, strip_ws(T1))
    end.

%% operand_expected(Acc) -> boolean()
%%
%% True when there is no preceding token, or when the preceding token
%% is one of those recognised by special_token/1.
operand_expected([{Type, _, _}|_]) ->
    special_token(Type);
operand_expected(_) ->
    true.

%% operator_name(Prefix, Local, Rest) -> {Token, Rest}
%%
%% Only the four unprefixed operator names are accepted; any other name
%% in operator position crashes with function_clause.
operator_name([], "and", T) -> {{'and', ?L, 'and'}, T};
operator_name([], "or", T) -> {{'or', ?L, 'or'}, T};
operator_name([], "mod", T) -> {{'mod', ?L, 'mod'}, T};
operator_name([], "div", T) -> {{'div', ?L, 'div'}, T}.


%% other_name(Prefix, Local, Rest) -> {Token, Rest1}
%%
%% Classifies a name in operand position by the text that follows it:
%% "Prefix:*" is a prefix test, a following "(" makes it a node type or
%% function name, a following "::" makes it an axis name, and anything
%% else is an ordinary (possibly prefixed) name.
other_name(Prefix, [], "*" ++ T) ->
    %% [37] NameTest ::= '*' | NCName ':' '*' | QName
    {{prefix_test, ?L, Prefix}, T};
other_name(Prefix, Local, T = "(" ++ _) ->
    node_type_or_function_name(Prefix, Local, T);
other_name(Prefix, Local, T = "::" ++ _) ->
    axis(Prefix, Local, T);
other_name([], Local, T) ->
    {{name, ?L, {list_to_atom(Local), [], Local}}, T};
other_name(Prefix, Local, T) ->
    {{name, ?L, {list_to_atom(Prefix++":"++Local), Prefix, Local}}, T}.
%% node types
%%
%% node_type_or_function_name(Prefix, Local, Rest) -> {Token, Rest}
%%
%% Called for a name that is followed by "(". The four unprefixed
%% node type names get their own tokens; every other name becomes a
%% function_name token whose value is the atom made from Prefix ++ Local
%% (no ":" separator is inserted).
node_type_or_function_name([], "comment", T) ->
    {{node_type, ?L, comment}, T};
node_type_or_function_name([], "text", T) ->
    {{node_type, ?L, text}, T};
node_type_or_function_name([], "processing-instruction", T) ->
    {{'processing-instruction', ?L, 'processing-instruction'}, T};
node_type_or_function_name([], "node", T) ->
    {{node_type, ?L, node}, T};
node_type_or_function_name(Prefix, Local, T) ->
    {{function_name, ?L, list_to_atom(Prefix ++ Local)}, T}.


%% axis names
%%
%% axis(Prefix, Local, Rest) -> {Token, Rest}
%%
%% Called for a name that is followed by "::". Only the unprefixed axis
%% names below are accepted; anything else crashes with function_clause.
%% Hyphens in the axis name become underscores in the token value.
axis([], "ancestor-or-self", T) -> {{axis, ?L, ancestor_or_self}, T};
axis([], "ancestor", T) -> {{axis, ?L, ancestor}, T};
axis([], "attribute", T) -> {{axis, ?L, attribute}, T};
axis([], "child", T) -> {{axis, ?L, child}, T};
axis([], "descendant-or-self", T) -> {{axis, ?L, descendant_or_self}, T};
axis([], "descendant", T) -> {{axis, ?L, descendant}, T};
axis([], "following-sibling", T) -> {{axis, ?L, following_sibling}, T};
axis([], "following", T) -> {{axis, ?L, following}, T};
axis([], "namespace", T) -> {{axis, ?L, namespace}, T};
axis([], "parent", T) -> {{axis, ?L, parent}, T};
axis([], "preceding-sibling", T) -> {{axis, ?L, preceding_sibling}, T};
axis([], "preceding", T) -> {{axis, ?L, preceding}, T};
axis([], "self", T) -> {{axis, ?L, self}, T}.


%% scan_literal(Str, Delim, Acc) -> {Literal, Rest}
%%
%% Collects characters up to (not including) the closing delimiter
%% Delim and returns them together with the text after the delimiter.
%% Running out of input before the delimiter is found crashes with
%% function_clause.
scan_literal([H|T], H, Acc) ->
    {lists:reverse(Acc), T};
scan_literal([H|T], Delim, Acc) ->
    scan_literal(T, Delim, [H|Acc]).


%% scan_name(Str) -> {{Prefix, Local}, Rest}
%%
%% Scans a (possibly prefixed) name from the front of Str. Exits with
%% {invalid_name, Info} when Str does not start with a name.
scan_name([$_ | T]) ->
    %% "_" is a legal name start character and a complete name on its
    %% own, so whatever follows it is validated by scan_prefix/2 like
    %% any other character instead of being accepted unseen.
    scan_prefix(T, [$_]);
scan_name([$:, H2 | _]) when ?whitespace(H2) ->
    exit({invalid_name, [$:, H2, '...']});
scan_name([$:, H2 | T]) ->
    %% Lenient handling of a leading ":" kept as it has always been:
    %% the following character is taken as part of the name.
    scan_prefix(T, [H2, $:]);
scan_name([H|T]) ->
    case xmerl_lib:is_letter(H) of
        true ->
            scan_prefix(T, [H]);
        false ->
            exit({invalid_name, lists:sublist([H|T], 1, 6)})
    end;
scan_name(Str) ->
    exit({invalid_name, lists:sublist(Str, 1, 6)}).
%% scan_prefix(Str, Acc) -> {{Prefix, Local}, Rest}
%%
%% Continues scanning a name whose first characters are in Acc
%% (reversed). The name ends at end of input, at whitespace, at "::"
%% or at the first character that is not a name character. A single
%% ":" ends the prefix and starts the local part; when no ":" is seen
%% the prefix is [] and the whole name is returned as the local part.
scan_prefix([], Acc) ->
    {{[], lists:reverse(Acc)}, []};
scan_prefix(Str = [H|_], Acc) when ?whitespace(H) ->
    {{[], lists:reverse(Acc)}, Str};
scan_prefix(T = "::" ++ _, Acc) ->
    %% This is the next token
    {{[], lists:reverse(Acc)}, T};
scan_prefix(":" ++ T, Acc) ->
    {LocalPart, T1} = scan_local_part(T, []),
    Prefix = lists:reverse(Acc),
    {{Prefix, LocalPart}, T1};
scan_prefix(Str = [H|T], Acc) ->
    case xmerl_lib:is_namechar(H) of
        true ->
            scan_prefix(T, [H|Acc]);
        false ->
            {{[], lists:reverse(Acc)}, Str}
    end.

%% scan_local_part(Str, Acc) -> {LocalPart, Rest}
%%
%% Collects name characters up to end of input, whitespace or the
%% first character that is not a name character. The result may be
%% empty (e.g. for "prefix:*").
scan_local_part([], Acc) ->
    {lists:reverse(Acc), []};
scan_local_part(Str = [H|_], Acc) when ?whitespace(H) ->
    {lists:reverse(Acc), Str};
scan_local_part(Str = [H|T], Acc) ->
    case xmerl_lib:is_namechar(H) of
        true ->
            scan_local_part(T, [H|Acc]);
        false ->
            {lists:reverse(Acc), Str}
    end.


%% scan_number(Str) -> {{number, ?L, Number}, Rest}
%%
%% Scans an XPath number from the front of Str:
%%     Number ::= Digits ('.' Digits?)? | '.' Digits
%% Number is an integer when no "." was seen and a float otherwise.
%% Input with no digits at all fails with badarg.
scan_number(T) ->
    scan_number(T, []).

%% scan_number(Str, Acc) -> {{number, ?L, Number}, Rest}
%%
%% Acc holds the integer digits seen so far, in reverse order.
scan_number([], Acc) ->
    {{number, ?L, list_to_integer(lists:reverse(Acc))}, []};
scan_number("." ++ T, Acc) ->
    {Fraction, T1} = scan_digits(T, []),
    Number = list_to_float(float_digits(lists:reverse(Acc), Fraction)),
    {{number, ?L, Number}, T1};
scan_number([H|T], Acc) when H >= $0, H =< $9 ->
    scan_number(T, [H|Acc]);
scan_number(T, Acc) ->
    {{number, ?L, list_to_integer(lists:reverse(Acc))}, T}.

%% float_digits(IntDigits, FractionDigits) -> string()
%%
%% Builds a string acceptable to list_to_float/1, which requires digits
%% on both sides of the point: ".5" becomes "0.5" and "1." becomes
%% "1.0". With no digits on either side the result is "0.", which
%% list_to_float/1 rejects with badarg.
float_digits([], Fraction) ->
    "0." ++ Fraction;
float_digits(Int, []) ->
    Int ++ ".0";
float_digits(Int, Fraction) ->
    Int ++ "." ++ Fraction.

%% scan_digits(Str, Acc) -> {Digits, Rest}
%%
%% Collects leading decimal digits of Str on top of Acc (reversed) and
%% returns them in reading order.
scan_digits([], Acc) ->
    {lists:reverse(Acc), []};
scan_digits([H|T], Acc) when H >= $0, H =< $9 ->
    scan_digits(T, [H|Acc]);
scan_digits(T, Acc) ->
    {lists:reverse(Acc), T}.


%% strip_ws(Str) -> Str1
%%
%% Drops leading whitespace characters (as defined by ?whitespace).
strip_ws([H|T]) when ?whitespace(H) ->
    strip_ws(T);
strip_ws(T) ->
    T.
%% special_token(TokenType) -> boolean()
%%
%% True for the token types after which an operand is expected. Per the
%% XPath 1.0 tokenization rule these are  @  ::  (  [  ,  and every
%% Operator (operator names, the multiply operator, / // | + - = != <
%% <= > >=). After any other token a "*" is the multiply operator and a
%% name must be an operator name.
%%
%% '*' here is the multiply operator token; the name test "*" is
%% scanned as a 'wildcard' token and is therefore not special.
special_token('@') -> true;
special_token('::') -> true;
special_token(',') -> true;
special_token('(') -> true;
special_token('[') -> true;
special_token('/') -> true;
special_token('//') -> true;
special_token('|') -> true;
special_token('+') -> true;
special_token('-') -> true;
special_token('*') -> true;
special_token('=') -> true;
special_token('!=') -> true;
special_token('<') -> true;
special_token('<=') -> true;
special_token('>') -> true;
special_token('>=') -> true;
special_token('and') -> true;
special_token('or') -> true;
special_token('mod') -> true;
special_token('div') -> true;
special_token(_) ->
    false.