aboutsummaryrefslogtreecommitdiffstats
path: root/lib/xmerl/src/xmerl_scan.erl
diff options
context:
space:
mode:
Diffstat (limited to 'lib/xmerl/src/xmerl_scan.erl')
-rw-r--r--lib/xmerl/src/xmerl_scan.erl376
1 files changed, 258 insertions, 118 deletions
diff --git a/lib/xmerl/src/xmerl_scan.erl b/lib/xmerl/src/xmerl_scan.erl
index 25c6547497..ec7ea534d6 100644
--- a/lib/xmerl/src/xmerl_scan.erl
+++ b/lib/xmerl/src/xmerl_scan.erl
@@ -100,7 +100,21 @@
%% <dd>Set default character set used (default UTF-8).
%% This character set is used only if not explicitly given by the XML
%% declaration. </dd>
+%% <dt><code>{document, Flag}</code></dt>
+%% <dd>Set to 'true' if xmerl should return a complete XML document
+%% as an xmlDocument record (default 'false').</dd>
+%% <dt><code>{comments, Flag}</code></dt>
+%% <dd>Set to 'false' if xmerl should skip comments otherwise they will
+%% be returned as xmlComment records (default 'true').</dd>
+%% <dt><code>{default_attrs, Flag}</code></dt>
+%% <dd>Set to 'true' if xmerl should add to elements missing attributes
+%% with a defined default value (default 'false').</dd>
%% </dl>
+%% @type document() = xmlElement() | xmlDocument(). <p>
+%% The document returned by <tt>xmerl_scan:string/[1,2]</tt> and
+%% <tt>xmerl_scan:file/[1,2]</tt>. The type of the returned record depends on
+%% the value of the document option passed to the function.
+%% </p>
-module(xmerl_scan).
@@ -224,7 +238,7 @@ cont_state(X, S=#xmerl_scanner{fun_states = FS}) ->
file(F) ->
file(F, []).
-%% @spec file(Filename::string(), Options::option_list()) -> {xmlElement(),Rest}
+%% @spec file(Filename::string(), Options::option_list()) -> {document(),Rest}
%% Rest = list()
%%% @doc Parse file containing an XML document
file(F, Options) ->
@@ -264,7 +278,7 @@ int_file_decl(F, Options,_ExtCharset) ->
string(Str) ->
string(Str, []).
-%% @spec string(Text::list(),Options::option_list()) -> {xmlElement(),Rest}
+%% @spec string(Text::list(),Options::option_list()) -> {document(),Rest}
%% Rest = list()
%%% @doc Parse string containing an XML document
string(Str, Options) ->
@@ -381,6 +395,12 @@ initial_state([{quiet, F}|T], S) when F==true; F==false ->
initial_state(T, S#xmerl_scanner{quiet = F});
initial_state([{doctype_DTD,DTD}|T], S) ->
initial_state(T,S#xmerl_scanner{doctype_DTD = DTD});
+initial_state([{document, F}|T], S) when is_boolean(F) ->
+ initial_state(T,S#xmerl_scanner{document = F});
+initial_state([{comments, F}|T], S) when is_boolean(F) ->
+ initial_state(T,S#xmerl_scanner{comments = F});
+initial_state([{default_attrs, F}|T], S) when is_boolean(F) ->
+ initial_state(T,S#xmerl_scanner{default_attrs = F});
initial_state([{text_decl,Bool}|T], S) ->
initial_state(T,S#xmerl_scanner{text_decl=Bool});
initial_state([{environment,Env}|T], S) ->
@@ -518,6 +538,7 @@ scan_document(Str0, S=#xmerl_scanner{event_fun = Event,
line = L, col = C,
environment=Env,
encoding=Charset,
+ document=Document,
validation=ValidateResult}) ->
S1 = Event(#xmerl_event{event = started,
line = L,
@@ -530,8 +551,8 @@ scan_document(Str0, S=#xmerl_scanner{event_fun = Event,
Str=if
Charset == "utf-8" ->
Str0;
- Charset=/=undefined -> % Default character set is UTF-8
- xmerl_ucs:to_unicode(Str0,list_to_atom(Charset));
+ Charset =/= undefined -> % Default character set is UTF-8
+ xmerl_ucs:to_unicode(Str0, list_to_atom(Charset));
true -> %% Charset is undefined if no external input is
%% given, and no auto detection of character
%% encoding was made.
@@ -539,17 +560,17 @@ scan_document(Str0, S=#xmerl_scanner{event_fun = Event,
end,
%% M1 = erlang:memory(),
%% io:format("Memory status before prolog: ~p~n",[M1]),
- {T1, S2} = scan_prolog(Str, S1, _StartPos = 1),
+ {Prolog, Pos, T1, S2} = scan_prolog(Str, S1, _StartPos = 1),
%% M2 = erlang:memory(),
%% io:format("Memory status after prolog: ~p~n",[M2]),
%%io:format("scan_document 2, prolog parsed~n",[]),
- T2 = scan_mandatory("<",T1,1,S2,expected_element_start_tag),
+ T2 = scan_mandatory("<", T1, 1, S2, expected_element_start_tag),
%% M3 = erlang:memory(),
%% io:format("Memory status before element: ~p~n",[M3]),
- {Res, T3, S3} =scan_element(T2,S2,_StartPos = 1),
+ {Res, T3, S3} = scan_element(T2,S2,Pos),
%% M4 = erlang:memory(),
%% io:format("Memory status after element: ~p~n",[M4]),
- {Tail, S4}=scan_misc(T3, S3, _StartPos = 1),
+ {Misc, _Pos1, Tail, S4}=scan_misc(T3, S3, Pos + 1),
%% M5 = erlang:memory(),
%% io:format("Memory status after misc: ~p~n",[M5]),
@@ -558,44 +579,52 @@ scan_document(Str0, S=#xmerl_scanner{event_fun = Event,
col = S4#xmerl_scanner.col,
data = document}, S4),
- {Res2,S6} = case validation_mode(ValidateResult) of
+ {Res2, S6} = case validation_mode(ValidateResult) of
off ->
- {Res,cleanup(S5)};
+ {Res, cleanup(S5)};
dtd when Env == element; Env == prolog ->
check_decl2(S5),
- case xmerl_validate:validate(S5,Res) of
- {'EXIT',{error,Reason}} ->
- S5b=cleanup(S5),
- ?fatal({failed_validation,Reason}, S5b);
- {'EXIT',Reason} ->
- S5b=cleanup(S5),
- ?fatal({failed_validation,Reason}, S5b);
- {error,Reason} ->
- S5b=cleanup(S5),
- ?fatal({failed_validation,Reason}, S5b);
- {error,Reason,_Next} ->
- S5b=cleanup(S5),
- ?fatal({failed_validation,Reason}, S5b);
+ case xmerl_validate:validate(S5, Res) of
+ {'EXIT', {error, Reason}} ->
+ S5b = cleanup(S5),
+ ?fatal({failed_validation, Reason}, S5b);
+ {'EXIT', Reason} ->
+ S5b = cleanup(S5),
+ ?fatal({failed_validation, Reason}, S5b);
+ {error, Reason} ->
+ S5b = cleanup(S5),
+ ?fatal({failed_validation, Reason}, S5b);
+ {error, Reason, _Next} ->
+ S5b = cleanup(S5),
+ ?fatal({failed_validation, Reason}, S5b);
_XML ->
- {Res,cleanup(S5)}
+ {Res, cleanup(S5)}
end;
schema ->
- case schemaLocations(Res,S5) of
- {ok,Schemas} ->
+ case schemaLocations(Res, S5) of
+ {ok, Schemas} ->
cleanup(S5),
%%io:format("Schemas: ~p~nRes: ~p~ninhertih_options(S): ~p~n",
%% [Schemas,Res,inherit_options(S5)]),
- XSDRes = xmerl_xsd:process_validate(Schemas,Res,
+ XSDRes = xmerl_xsd:process_validate(Schemas, Res,
inherit_options(S5)),
- handle_schema_result(XSDRes,S5);
+ handle_schema_result(XSDRes, S5);
_ ->
- {Res,cleanup(S5)}
+ {Res, cleanup(S5)}
end;
_ ->
- {Res,cleanup(S5)}
+ {Res, cleanup(S5)}
end,
- {Res2, Tail, S6}.
+ Res3 =
+ case Document of
+ true ->
+ Content = lists:reverse(Prolog, [Res2 | lists:reverse(Misc)]),
+ #xmlDocument{content = Content};
+ false ->
+ Res2
+ end,
+ {Res3, Tail, S6}.
scan_decl(Str, S=#xmerl_scanner{event_fun = Event,
@@ -609,11 +638,11 @@ scan_decl(Str, S=#xmerl_scanner{event_fun = Event,
data = document}, S),
case scan_prolog(Str, S1, _StartPos = 1) of
- {T2="<"++_, S2} ->
+ {_,_,T2="<"++_, S2} ->
{{S2#xmerl_scanner.user_state,T2},[],S2};
- {[], S2}->
+ {_,_,[], S2}->
{[],[],S2};
- {T2, S2} ->
+ {_,_,T2, S2} ->
{_,_,S3} = scan_content(T2,S2,[],_Attrs=[],S2#xmerl_scanner.space,
_Lang=[],_Parents=[],#xmlNamespace{}),
{T2,[],S3}
@@ -624,14 +653,17 @@ scan_decl(Str, S=#xmerl_scanner{event_fun = Event,
%%% prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
%%%
%% empty text declarations are handled by the first function clause.
-scan_prolog([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
+scan_prolog(T, S, Pos) ->
+ scan_prolog(T, S, Pos, []).
+scan_prolog([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) ->
?dbg("cont()...~n", []),
- F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos) end,
- fun(S1) -> {[], S1} end,
+ F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos, Acc) end,
+ fun(S1) -> {Acc, Pos, [], S1} end,
S);
-scan_prolog("<?xml"++T,S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},Pos)
- when ?whitespace(hd(T)) ->
- {Charset,T3, S3}=
+scan_prolog("<?xml"++T,
+ S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},
+ Pos,Acc) when ?whitespace(hd(T)) ->
+ {Charset, T3, S3} =
if
Col==1,L==1,S0#xmerl_scanner.text_decl==true ->
?dbg("prolog(\"<?xml\")~n", []),
@@ -639,13 +671,13 @@ scan_prolog("<?xml"++T,S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},Pos)
{_,T1,S1} = mandatory_strip(T,S),
{Decl,T2, S2}=scan_text_decl(T1,S1),
Encoding=Decl#xmlDecl.encoding,
- {Encoding,T2, S2#xmerl_scanner{encoding=Encoding}};
+ {Encoding, T2, S2#xmerl_scanner{encoding=Encoding}};
Col==1,L==1 ->
?dbg("prolog(\"<?xml\")~n", []),
?bump_col(5),
{Decl,T2, S2}=scan_xml_decl(T, S),
Encoding=Decl#xmlDecl.encoding,
- {Encoding,T2, S2#xmerl_scanner{encoding=Encoding}};
+ {Encoding, T2, S2#xmerl_scanner{encoding=Encoding}};
true ->
?fatal({xml_declaration_must_be_first_in_doc,Col,L},S0)
end,
@@ -659,7 +691,7 @@ scan_prolog("<?xml"++T,S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},Pos)
%% Now transform to declared character set.
if
Charset==Charset0 -> % Document already transformed to this charset!
- scan_prolog(T3, S3, Pos);
+ scan_prolog(T3, S3, Pos, Acc);
Charset0=/=undefined ->
%% For example may an external entity
%% have the BOM for utf-16 and the internal
@@ -668,17 +700,18 @@ scan_prolog("<?xml"++T,S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},Pos)
%% 'iso-10646-utf-1', and Charset will be 'utf-16', all
%% legal.
%%
- scan_prolog(T3,S3#xmerl_scanner{encoding=Charset0},Pos);
+ scan_prolog(T3,S3#xmerl_scanner{encoding=Charset0},Pos,Acc);
Charset == "utf-8" ->
- scan_prolog(T3, S3, Pos);
+ scan_prolog(T3, S3, Pos, Acc);
Charset=/=undefined -> % Document not previously transformed
T4=xmerl_ucs:to_unicode(T3,list_to_atom(Charset)),
- scan_prolog(T4, S3, Pos);
+ scan_prolog(T4, S3, Pos, Acc);
true -> % No encoding info given
- scan_prolog(T3, S3, Pos)
+ scan_prolog(T3, S3, Pos, Acc)
end;
-scan_prolog("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog,
- encoding=_Charset}, Pos) ->
+scan_prolog("<!DOCTYPE" ++ T,
+ S0=#xmerl_scanner{environment=prolog,encoding=_Charset},
+ Pos, Acc) ->
?dbg("prolog(\"<!DOCTYPE\")~n", []),
?bump_col(9),
%% If no known character set assume it is UTF-8
@@ -687,10 +720,13 @@ scan_prolog("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog,
true -> T
end,
{T2, S1} = scan_doctype(T1, S),
- scan_misc(T2, S1, Pos);
-scan_prolog(Str="%"++_T,S=#xmerl_scanner{environment={external,_}},_Pos) ->
- scan_ext_subset(Str,S);
-scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=_Charset},Pos) ->
+ scan_misc(T2, S1, Pos, Acc);
+scan_prolog(Str="%"++_T,S=#xmerl_scanner{environment={external,_}},
+ Pos,Acc) ->
+ {T, S1} = scan_ext_subset(Str,S),
+ {Acc, Pos, T, S1};
+scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=_Charset},
+ Pos,Acc) ->
?dbg("prolog(\"<\")~n", []),
%% Check for Comments, PI before possible DOCTYPE declaration
@@ -700,26 +736,28 @@ scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=_Charset},Pos) ->
%% Charset==undefined -> xmerl_ucs:to_unicode(Str,'utf-8');
true -> Str
end,
- {T1, S1}=scan_misc(T, S, Pos),
- scan_prolog2(T1,S1,Pos).
+ {Acc1, Pos1, T1, S1}=scan_misc(T, S, Pos, Acc),
+ scan_prolog2(T1,S1,Pos1,Acc1).
-scan_prolog2([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
+scan_prolog2([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) ->
?dbg("cont()...~n", []),
- F(fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos) end,
- fun(S1) -> {[], S1} end,
+ F(fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos, Acc) end,
+ fun(S1) -> {Acc, Pos, [], S1} end,
S);
-scan_prolog2("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog}, Pos) ->
+scan_prolog2("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog},
+ Pos, Acc) ->
?dbg("prolog(\"<!DOCTYPE\")~n", []),
?bump_col(9),
{T1, S1} = scan_doctype(T, S),
- scan_misc(T1, S1, Pos);
-scan_prolog2(Str = "<!" ++ _, S, _Pos) ->
+ scan_misc(T1, S1, Pos, Acc);
+scan_prolog2(Str = "<!" ++ _, S, Pos, Acc) ->
?dbg("prolog(\"<!\")~n", []),
%% In e.g. a DTD, we jump directly to markup declarations
- scan_ext_subset(Str, S);
-scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos) ->
+ {T, S1} = scan_ext_subset(Str, S),
+ {Acc, Pos, T, S1};
+scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos,Acc) ->
?dbg("prolog(\"<\")~n", []),
%% Here we consider the DTD provided by doctype_DTD option,
@@ -733,7 +771,7 @@ scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos) ->
end,
%% Check for more Comments and PI after DOCTYPE declaration
% ?bump_col(1),
- scan_misc(Str, S1, Pos).
+ scan_misc(Str, S1, Pos, Acc).
@@ -743,26 +781,46 @@ scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos) ->
%% - Neither of Comment and PI are returned in the resulting parsed
%% structure.
%% - scan_misc/3 implements Misc* as that is how the rule is always used
-scan_misc([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
+scan_misc(T, S, Pos) ->
+ scan_misc(T, S, Pos, []).
+scan_misc([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) ->
?dbg("cont()...~n", []),
- F(fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos) end,
- fun(S1) -> {[], S1} end,
+ F(fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos, Acc) end,
+ fun(S1) -> {Acc, Pos, [], S1} end,
S);
-scan_misc("<!--" ++ T, S0, Pos) -> % Comment
+scan_misc("<!--" ++ T, S0=#xmerl_scanner{acc_fun = F, comments=CF}, Pos, Acc) -> % Comment
?bump_col(4),
- {_, T1, S1} = scan_comment(T, S, Pos, _Parents = [], _Lang = []),
- scan_misc(T1,S1,Pos);
-scan_misc("<?" ++ T, S0, Pos) -> % PI
+ {C, T1, S1} = scan_comment(T, S, Pos, _Parents = [], _Lang = []),
+ case CF of
+ true ->
+ {Acc2, Pos2, S3} =
+ case F(C, Acc, S1) of
+ {Acc1, S2} ->
+ {Acc1, Pos + 1, S2};
+ {Acc1, Pos1, S2} ->
+ {Acc1, Pos1, S2}
+ end,
+ scan_misc(T1, S3, Pos2, Acc2);
+ false ->
+ scan_misc(T1, S1, Pos, Acc)
+ end;
+scan_misc("<?" ++ T, S0=#xmerl_scanner{acc_fun = F}, Pos, Acc) -> % PI
?dbg("prolog(\"<?\")~n", []),
?bump_col(2),
- {_PI, T1, S1} = scan_pi(T, S, Pos),
- scan_misc(T1,S1,Pos);
-scan_misc(T=[H|_T], S, Pos) when ?whitespace(H) ->
+ {PI, T1, S1} = scan_pi(T, S, Pos, []),
+ {Acc2, Pos2, S3} = case F(PI, Acc, S1) of
+ {Acc1, S2} ->
+ {Acc1, Pos + 1, S2};
+ {Acc1, Pos1, S2} ->
+ {Acc1, Pos1, S2}
+ end,
+ scan_misc(T1,S3,Pos2,Acc2);
+scan_misc(T=[H|_T], S, Pos, Acc) when ?whitespace(H) ->
?dbg("prolog(whitespace)~n", []),
{_,T1,S1}=strip(T,S),
- scan_misc(T1,S1,Pos);
-scan_misc(T,S,_Pos) ->
- {T,S}.
+ scan_misc(T1,S1,Pos,Acc);
+scan_misc(T,S,Pos,Acc) ->
+ {Acc,Pos,T,S}.
cleanup(S=#xmerl_scanner{keep_rules = false,
@@ -789,7 +847,8 @@ scan_xml_decl(T, S) ->
Attr = #xmlAttribute{name = version,
parents = [{xml, _XMLPos = 1}],
value = Vsn},
- scan_xml_decl(T4, S4, #xmlDecl{attributes = [Attr]}).
+ scan_xml_decl(T4, S4, #xmlDecl{vsn = Vsn,
+ attributes = [Attr]}).
scan_xml_decl([], S=#xmerl_scanner{continuation_fun = F}, Decl) ->
?dbg("cont()...~n", []),
@@ -1025,50 +1084,53 @@ xml_vsn([H|T], S=#xmerl_scanner{col = C}, Delim, Acc) ->
%%%%%%% [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
-scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
+scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Pos, Ps) ->
?dbg("cont()...~n", []),
- F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Pos) end,
+ F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Pos, Ps) end,
fun(S1) -> ?fatal(unexpected_end, S1) end,
S);
-scan_pi(Str = [H1,H2,H3 | T],S0=#xmerl_scanner{line = L, col = C}, Pos)
+scan_pi(Str = [H1,H2,H3 | T],S0=#xmerl_scanner{line = L, col = C}, Pos, Ps)
when H1==$x;H1==$X ->
%% names beginning with [xX][mM][lL] are reserved for future use.
?bump_col(3),
if
((H2==$m) or (H2==$M)) and
((H3==$l) or (H3==$L)) ->
- scan_wellknown_pi(T,S,Pos);
+ scan_wellknown_pi(T,S,Pos,Ps);
true ->
{Target, _NamespaceInfo, T1, S1} = scan_name(Str, S),
- scan_pi(T1, S1, Target, L, C, Pos, [])
+ scan_pi(T1, S1, Target, L, C, Pos, Ps, [])
end;
-scan_pi(Str, S=#xmerl_scanner{line = L, col = C}, Pos) ->
+scan_pi(Str, S=#xmerl_scanner{line = L, col = C}, Pos, Ps) ->
{Target, _NamespaceInfo, T1, S1} = scan_name(Str, S),
- scan_pi(T1, S1, Target, L, C, Pos,[]).
+ scan_pi(T1, S1, Target, L, C, Pos, Ps, []).
%%% More info on xml-stylesheet can be found at:
%%% "Associating Style Sheets with XML documents", Version 1.0,
%%% W3C Recommendation 29 June 1999 (http://www.w3.org/TR/xml-stylesheet/)
-scan_wellknown_pi("-stylesheet"++T, S0=#xmerl_scanner{line=L,col=C},Pos) ->
+scan_wellknown_pi("-stylesheet"++T, S0=#xmerl_scanner{line=L,col=C},Pos,Ps) ->
?dbg("prolog(\"<?xml-stylesheet\")~n", []),
?bump_col(16),
- scan_pi(T, S, "xml-stylesheet",L,C,Pos,[]);
-scan_wellknown_pi(Str,S,_Pos) ->
+ scan_pi(T, S, "xml-stylesheet",L,C,Pos,Ps,[]);
+scan_wellknown_pi(Str,S,_Pos,_Ps) ->
?fatal({invalid_target_name, lists:sublist(Str, 1, 10)}, S).
-scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Target,L, C, Pos, Acc) ->
+scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Target,
+ L, C, Pos, Ps, Acc) ->
?dbg("cont()...~n", []),
- F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Target, L, C, Pos, Acc) end,
+ F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Target,
+ L, C, Pos, Ps, Acc) end,
fun(S1) -> ?fatal(unexpected_end, S1) end,
S);
scan_pi("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
event_fun = Event},
- Target, L, C, Pos, Acc) ->
+ Target, L, C, Pos, Ps, Acc) ->
?bump_col(2),
PI = #xmlPI{name = Target,
+ parents = Ps,
pos = Pos,
value = lists:reverse(Acc)},
S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
@@ -1077,22 +1139,25 @@ scan_pi("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
data = PI}, S),
{Ret, S2} = Hook(PI, S1),
{Ret, T, S2};
-scan_pi([H|T], S, Target, L, C, Pos, Acc) when ?whitespace(H) ->
+scan_pi([H|T], S, Target, L, C, Pos, Ps, Acc) when ?whitespace(H) ->
?strip1,
- scan_pi2(T1, S1, Target, L, C, Pos, Acc);
-scan_pi([H|_T],S,_Target, _L, _C, _Pos, _Acc) ->
+ scan_pi2(T1, S1, Target, L, C, Pos, Ps, Acc);
+scan_pi([H|_T],S,_Target, _L, _C, _Pos, _Ps, _Acc) ->
?fatal({expected_whitespace_OR_end_of_PI,{char,H}}, S).
-scan_pi2([], S=#xmerl_scanner{continuation_fun = F}, Target,L, C, Pos, Acc) ->
+scan_pi2([], S=#xmerl_scanner{continuation_fun = F}, Target,
+ L, C, Pos, Ps, Acc) ->
?dbg("cont()...~n", []),
- F(fun(MoreBytes, S1) -> scan_pi2(MoreBytes, S1, Target, L, C, Pos, Acc) end,
+ F(fun(MoreBytes, S1) -> scan_pi2(MoreBytes, S1, Target,
+ L, C, Pos, Ps, Acc) end,
fun(S1) -> ?fatal(unexpected_end, S1) end,
S);
scan_pi2("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
event_fun = Event},
- Target, L, C, Pos, Acc) ->
+ Target, L, C, Pos, Ps, Acc) ->
?bump_col(2),
PI = #xmlPI{name = Target,
+ parents = Ps,
pos = Pos,
value = lists:reverse(Acc)},
S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
@@ -1101,10 +1166,10 @@ scan_pi2("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
data = PI}, S),
{Ret, S2} = Hook(PI, S1),
{Ret, T, S2};
-scan_pi2(Str, S0, Target, L, C, Pos, Acc) ->
+scan_pi2(Str, S0, Target, L, C, Pos, Ps, Acc) ->
?bump_col(1),
{Ch,T} = wfc_legal_char(Str,S),
- scan_pi2(T, S, Target, L, C, Pos, [Ch|Acc]).
+ scan_pi2(T, S, Target, L, C, Pos, Ps, [Ch|Acc]).
@@ -1575,7 +1640,7 @@ scan_markup_decl("<!--" ++ T, S0) ->
scan_comment(T, S);
scan_markup_decl("<?" ++ T, S0) ->
?bump_col(2),
- {_PI, T1, S1} = scan_pi(T, S,_Pos=markup),
+ {_PI, T1, S1} = scan_pi(T, S,_Pos=markup,[]),
strip(T1, S1);
scan_markup_decl("<!ELEMENT" ++ T,
#xmerl_scanner{rules_read_fun = Read,
@@ -1981,7 +2046,7 @@ scan_element(T, S, Pos) ->
scan_element(T, S=#xmerl_scanner{line=L,col=C},
Pos, SpaceDefault,Lang, Parents, NS) ->
{Name, NamespaceInfo, T1, S1} = scan_name(T, S),
- vc_Element_valid(Name,S),
+ vc_Element_valid(Name,NamespaceInfo,S),
?strip2,
scan_element(T2, S2, Pos, Name, L, C, _Attrs = [],
Lang, Parents, NamespaceInfo, NS,
@@ -2016,7 +2081,8 @@ scan_element("/>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
Attrs = lists:reverse(Attrs0),
E=processed_whole_element(S, Pos, Name, Attrs, Lang, Parents,NSI,Namespace),
- wfc_unique_att_spec(Attrs,S),
+ #xmlElement{attributes = Attrs1} = E,
+ wfc_unique_att_spec(Attrs1,S),
S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
line = L,
col = C,
@@ -2086,9 +2152,10 @@ scan_element(T, S, Pos, Name, StartL, StartC, Attrs, Lang, Parents,
P+1
end,
Attr = #xmlAttribute{name = AttName,
+ parents = [{Name, Pos}|Parents],
pos = AttrPos,
language = Lang,
- namespace = NamespaceInfo,
+ nsinfo = NamespaceInfo,
value = AttValue,
normalized = IsNorm},
XMLBase=if
@@ -2110,6 +2177,14 @@ scan_element(T, S, Pos, Name, StartL, StartC, Attrs, Lang, Parents,
scan_element(T4, S5, Pos, Name, StartL, StartC, [Attr|Attrs],
Lang, Parents, NSI, NewNS, SpaceDefault).
+get_default_attrs(S = #xmerl_scanner{rules_read_fun = Read}, ElemName) ->
+ case Read(elem_def, ElemName, S) of
+ #xmlElement{attributes = Attrs} ->
+ [ {AttName, AttValue} ||
+ {AttName, _, AttValue, _, _} <- Attrs, AttValue =/= no_value ];
+ _ -> []
+ end.
+
get_att_type(S=#xmerl_scanner{rules_read_fun=Read},AttName,ElemName) ->
case Read(elem_def,ElemName,S) of
#xmlElement{attributes = Attrs} ->
@@ -2139,6 +2214,23 @@ processed_whole_element(S=#xmerl_scanner{hook_fun = _Hook,
Pos, Name, Attrs, Lang, Parents, NSI, Namespace) ->
Language = check_language(Attrs, Lang),
+ AllAttrs =
+ case S#xmerl_scanner.default_attrs of
+ true ->
+ [ #xmlAttribute{name = AttName,
+ parents = [{Name, Pos} | Parents],
+ language = Lang,
+ nsinfo = NSI,
+ namespace = Namespace,
+ value = AttValue,
+ normalized = true} ||
+ {AttName, AttValue} <- get_default_attrs(S, Name),
+ AttValue =/= no_value,
+ not lists:keymember(AttName, #xmlAttribute.name, Attrs) ];
+ false ->
+ Attrs
+ end,
+
{ExpName, ExpAttrs} =
case S#xmerl_scanner.namespace_conformant of
true ->
@@ -2153,14 +2245,15 @@ processed_whole_element(S=#xmerl_scanner{hook_fun = _Hook,
TempNamespace = Namespace#xmlNamespace{default = []},
ExpAttrsX =
[A#xmlAttribute{
+ namespace=Namespace,
expanded_name=expanded_name(
A#xmlAttribute.name,
- A#xmlAttribute.namespace,
+ A#xmlAttribute.nsinfo,
% NSI,
- TempNamespace, S)} || A <- Attrs],
+ TempNamespace, S)} || A <- AllAttrs],
{expanded_name(Name, NSI, Namespace, S), ExpAttrsX};
false ->
- {Name, Attrs}
+ {Name, AllAttrs}
end,
#xmlElement{name = Name,
@@ -2194,10 +2287,32 @@ check_namespace(_, _, _, NS) ->
expanded_name(Name, [], #xmlNamespace{default = []}, _S) ->
Name;
-expanded_name(Name, [], #xmlNamespace{default = URI}, _S) ->
- {URI, Name};
-expanded_name(_Name, {"xmlns", Local}, _NS, _S) -> % CHECK THIS /JB
- {"xmlns",Local};
+expanded_name(Name, [], #xmlNamespace{default = URI}, S) ->
+ case URI of
+ 'http://www.w3.org/XML/1998/namespace' ->
+ ?fatal(cannot_bind_default_namespace_to_xml_namespace_name, S);
+ 'http://www.w3.org/2000/xmlns/' ->
+ ?fatal(cannot_bind_default_namespace_to_xmlns_namespace_name, S);
+ _ ->
+ {URI, Name}
+ end;
+expanded_name(Name, N = {"xmlns", Local}, #xmlNamespace{nodes = Ns}, S) ->
+ {_, Value} = lists:keyfind(Local, 1, Ns),
+ case Name of
+ 'xmlns:xml' when Value =/= 'http://www.w3.org/XML/1998/namespace' ->
+ ?fatal({xml_prefix_cannot_be_redeclared, Value}, S);
+ 'xmlns:xmlns' ->
+ ?fatal({xmlns_prefix_cannot_be_declared, Value}, S);
+ _ ->
+ case Value of
+ 'http://www.w3.org/XML/1998/namespace' ->
+ ?fatal({cannot_bind_prefix_to_xml_namespace, Local}, S);
+ 'http://www.w3.org/2000/xmlns/' ->
+ ?fatal({cannot_bind_prefix_to_xmlns_namespace, Local}, S);
+ _ ->
+ N
+ end
+ end;
expanded_name(_Name, {Prefix, Local}, #xmlNamespace{nodes = Ns}, S) ->
case lists:keysearch(Prefix, 1, Ns) of
{value, {_, URI}} ->
@@ -2449,9 +2564,23 @@ scan_content("&" ++ T, S0, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]) -
_ ->
scan_content(string_to_char_set(S1#xmerl_scanner.encoding,ExpRef)++T1,S1,Pos,Name,Attrs,Space,Lang,Parents,NS,Acc,[])
end;
-scan_content("<!--" ++ T, S, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]) ->
- {_, T1, S1} = scan_comment(T, S, Pos, Parents, Lang),
- scan_content(T1, S1, Pos+1, Name, Attrs, Space, Lang, Parents, NS, Acc,[]);
+scan_content("<!--" ++ T, S0=#xmerl_scanner{acc_fun = F, comments=CF}, Pos, Name, Attrs, Space,
+ Lang, Parents, NS, Acc,[]) ->
+ ?bump_col(4),
+ {C, T1, S1} = scan_comment(T, S, Pos, Parents, Lang),
+ case CF of
+ true ->
+ {Acc2, Pos2, S3} =
+ case F(C, Acc, S1) of
+ {Acc1, S2} ->
+ {Acc1, Pos + 1, S2};
+ {Acc1, Pos1, S2} ->
+ {Acc1, Pos1, S2}
+ end,
+ scan_content(T1, S3, Pos2, Name, Attrs, Space, Lang, Parents, NS, Acc2,[]);
+ false ->
+ scan_content(T1, S1, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[])
+ end;
scan_content("<" ++ T, S0, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]) ->
?bump_col(1),
{Markup, T1, S1} =
@@ -2508,9 +2637,9 @@ scan_content_markup("![CDATA[" ++ T, S0, Pos, _Name, _Attrs,
_Space, _Lang, Parents, _NS) ->
?bump_col(8),
scan_cdata(T, S, Pos, Parents);
-scan_content_markup("?"++T,S0,Pos,_Name,_Attrs,_Space,_Lang,_Parents,_NS) ->
+scan_content_markup("?"++T,S0,Pos,_Name,_Attrs,_Space,_Lang,Parents,_NS) ->
?bump_col(1),
- scan_pi(T, S, Pos);
+ scan_pi(T, S, Pos, Parents);
scan_content_markup(T, S, Pos, _Name, _Attrs, Space, Lang, Parents, NS) ->
scan_element(T, S, Pos, Space, Lang, Parents, NS).
@@ -3259,12 +3388,18 @@ mandatory_delimeter_wfc(T,S) ->
wfc_unique_att_spec([],_S) ->
ok;
-wfc_unique_att_spec([#xmlAttribute{name=N}|Atts],S) ->
+wfc_unique_att_spec([#xmlAttribute{name=N,expanded_name=EN}|Atts],S) ->
case lists:keymember(N,#xmlAttribute.name,Atts) of
true ->
?fatal({error,{unique_att_spec_required,N}},S);
_ ->
- wfc_unique_att_spec(Atts,S)
+ case S#xmerl_scanner.namespace_conformant andalso
+ lists:keymember(EN, #xmlAttribute.expanded_name, Atts) of
+ true ->
+ ?fatal({error,{unique_att_spec_required,EN}},S);
+ _ ->
+ wfc_unique_att_spec(Atts,S)
+ end
end.
wfc_legal_char(Chars,S) when is_list(Chars)->
@@ -3313,6 +3448,11 @@ wfc_Internal_parsed_entity(internal,Value,S) ->
wfc_Internal_parsed_entity(_,_,_) ->
ok.
+vc_Element_valid(_Name, {"xmlns", _},
+ S = #xmerl_scanner{namespace_conformant = true}) ->
+ ?fatal({error,{illegal_element_prefix,xmlns}},S);
+vc_Element_valid(Name, _, S) ->
+ vc_Element_valid(Name, S).
vc_Element_valid(_Name,#xmerl_scanner{environment=internal_parsed_entity}) ->
ok;
@@ -3917,7 +4057,7 @@ schemaLocations(El,#xmerl_scanner{schemaLocation=SL}) ->
schemaLocations(#xmlElement{attributes=Atts,xmlbase=_Base}) ->
Pred = fun(#xmlAttribute{name=schemaLocation}) -> false;
- (#xmlAttribute{namespace={_,"schemaLocation"}}) -> false;
+ (#xmlAttribute{nsinfo={_,"schemaLocation"}}) -> false;
(_) -> true
end,
case lists:dropwhile(Pred,Atts) of