From 7fc95c00764fc13d2e3e676cca1a66be5d672c41 Mon Sep 17 00:00:00 2001 From: Anthony Ramine Date: Tue, 7 Dec 2010 17:28:56 +0100 Subject: Allow whole documents to be returned Functions `xmerl_scan:file/2` and `xmerl_scan:string/2` now accepts a new option `{document, true}` to produce a whole document as a `xmlDocument` record instead of just the root element node. You may wonder why this would be useful, this option is the only way to get to the top-level comments and processing instructions without hooking through the customization functions. Those nodes are needed to implement [Canonical XML][c14n-xml] support. [c14n-xml]: http://www.w3.org/TR/2008/PR-xml-c14n11-20080129/ "Canonical XML" --- lib/xmerl/src/xmerl_scan.erl | 140 ++++++++++++++++++++++++++++--------------- 1 file changed, 91 insertions(+), 49 deletions(-) (limited to 'lib/xmerl/src/xmerl_scan.erl') diff --git a/lib/xmerl/src/xmerl_scan.erl b/lib/xmerl/src/xmerl_scan.erl index 740f825053..303fc26550 100644 --- a/lib/xmerl/src/xmerl_scan.erl +++ b/lib/xmerl/src/xmerl_scan.erl @@ -100,7 +100,15 @@ %%
Set default character set used (default UTF-8). %% This character set is used only if not explicitly given by the XML %% declaration.
+%%
{document, Flag}
+%%
Set to 'true' if xmerl should return a complete XML document +%% as an xmlDocument record (default 'false').
%% +%% @type document() = xmlElement() | xmlDocument().

+%% The document returned by xmerl_scan:string/[1,2] and +%% xmerl_scan:file/[1,2]. The type of the returned record depends on +%% the value of the document option passed to the function. +%%

-module(xmerl_scan). @@ -224,7 +232,7 @@ cont_state(X, S=#xmerl_scanner{fun_states = FS}) -> file(F) -> file(F, []). -%% @spec file(Filename::string(), Options::option_list()) -> {xmlElement(),Rest} +%% @spec file(Filename::string(), Options::option_list()) -> {document(),Rest} %% Rest = list() %%% @doc Parse file containing an XML document file(F, Options) -> @@ -264,7 +272,7 @@ int_file_decl(F, Options,_ExtCharset) -> string(Str) -> string(Str, []). -%% @spec string(Text::list(),Options::option_list()) -> {xmlElement(),Rest} +%% @spec string(Text::list(),Options::option_list()) -> {document(),Rest} %% Rest = list() %%% @doc Parse string containing an XML document string(Str, Options) -> @@ -381,6 +389,8 @@ initial_state([{quiet, F}|T], S) when F==true; F==false -> initial_state(T, S#xmerl_scanner{quiet = F}); initial_state([{doctype_DTD,DTD}|T], S) -> initial_state(T,S#xmerl_scanner{doctype_DTD = DTD}); +initial_state([{document, F}|T], S) when is_boolean(F) -> + initial_state(T,S#xmerl_scanner{document = F}); initial_state([{text_decl,Bool}|T], S) -> initial_state(T,S#xmerl_scanner{text_decl=Bool}); initial_state([{environment,Env}|T], S) -> @@ -518,6 +528,7 @@ scan_document(Str0, S=#xmerl_scanner{event_fun = Event, line = L, col = C, environment=Env, encoding=Charset, + document=Document, validation=ValidateResult}) -> S1 = Event(#xmerl_event{event = started, line = L, @@ -539,17 +550,17 @@ scan_document(Str0, S=#xmerl_scanner{event_fun = Event, end, %% M1 = erlang:memory(), %% io:format("Memory status before prolog: ~p~n",[M1]), - {T1, S2} = scan_prolog(Str, S1, _StartPos = 1), + {Prolog, Pos, T1, S2} = scan_prolog(Str, S1, _StartPos = 1), %% M2 = erlang:memory(), %% io:format("Memory status after prolog: ~p~n",[M2]), %%io:format("scan_document 2, prolog parsed~n",[]), T2 = scan_mandatory("<",T1,1,S2,expected_element_start_tag), %% M3 = erlang:memory(), %% io:format("Memory status before element: ~p~n",[M3]), - {Res, T3, S3} =scan_element(T2,S2,_StartPos = 1), + {Res, T3, S3} =scan_element(T2,S2,Pos), %% M4 = erlang:memory(), %% io:format("Memory status after element: ~p~n",[M4]), - {Tail, S4}=scan_misc(T3, S3, _StartPos = 1), + {Misc, _Pos1, Tail, S4}=scan_misc(T3, S3, Pos + 1), %% M5 = erlang:memory(), %% io:format("Memory status after misc: ~p~n",[M5]), @@ -595,7 +606,15 @@ scan_document(Str0, S=#xmerl_scanner{event_fun = Event, {Res,cleanup(S5)} end, - {Res2, Tail, S6}. + Res3 = + case Document of + true -> + Content = lists:reverse(Prolog, [Res2 | lists:reverse(Misc)]), + #xmlDocument{content = Content}; + false -> + Res2#xmlElement{pos = 1} + end, + {Res3, Tail, S6}. scan_decl(Str, S=#xmerl_scanner{event_fun = Event, @@ -624,14 +643,17 @@ scan_decl(Str, S=#xmerl_scanner{event_fun = Event, %%% prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? %%% %% empty text declarations are handled by the first function clause. -scan_prolog([], S=#xmerl_scanner{continuation_fun = F}, Pos) -> +scan_prolog(T, S, Pos) -> + scan_prolog(T, S, Pos, []). +scan_prolog([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) -> ?dbg("cont()...~n", []), - F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos) end, - fun(S1) -> {[], S1} end, + F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos, Acc) end, + fun(S1) -> {Acc, Pos, [], S1} end, S); -scan_prolog(" - {Charset,T3, S3}= +scan_prolog(" + {Charset, T3, S3} = if Col==1,L==1,S0#xmerl_scanner.text_decl==true -> ?dbg("prolog(\" ?dbg("prolog(\" ?fatal({xml_declaration_must_be_first_in_doc,Col,L},S0) end, @@ -659,7 +681,7 @@ scan_prolog(" % Document already transformed to this charset! - scan_prolog(T3, S3, Pos); + scan_prolog(T3, S3, Pos, Acc); Charset0=/=undefined -> %% For example may an external entity %% have the BOM for utf-16 and the internal @@ -668,17 +690,18 @@ scan_prolog(" - scan_prolog(T3, S3, Pos); + scan_prolog(T3, S3, Pos, Acc); Charset=/=undefined -> % Document not previously transformed T4=xmerl_ucs:to_unicode(T3,list_to_atom(Charset)), - scan_prolog(T4, S3, Pos); + scan_prolog(T4, S3, Pos, Acc); true -> % No encoding info given - scan_prolog(T3, S3, Pos) + scan_prolog(T3, S3, Pos, Acc) end; -scan_prolog(" +scan_prolog(" ?dbg("prolog(\" T end, {T2, S1} = scan_doctype(T1, S), - scan_misc(T2, S1, Pos); -scan_prolog(Str="%"++_T,S=#xmerl_scanner{environment={external,_}},_Pos) -> - scan_ext_subset(Str,S); -scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=_Charset},Pos) -> + scan_misc(T2, S1, Pos, Acc); +scan_prolog(Str="%"++_T,S=#xmerl_scanner{environment={external,_}}, + Pos,Acc) -> + {T, S1} = scan_ext_subset(Str,S), + {Acc, Pos, T, S1}; +scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=_Charset}, + Pos,Acc) -> ?dbg("prolog(\"<\")~n", []), %% Check for Comments, PI before possible DOCTYPE declaration @@ -700,26 +726,28 @@ scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=_Charset},Pos) -> %% Charset==undefined -> xmerl_ucs:to_unicode(Str,'utf-8'); true -> Str end, - {T1, S1}=scan_misc(T, S, Pos), - scan_prolog2(T1,S1,Pos). + {Acc1, Pos1, T1, S1}=scan_misc(T, S, Pos, Acc), + scan_prolog2(T1,S1,Pos1,Acc1). -scan_prolog2([], S=#xmerl_scanner{continuation_fun = F}, Pos) -> +scan_prolog2([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) -> ?dbg("cont()...~n", []), - F(fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos) end, - fun(S1) -> {[], S1} end, + F(fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos, Acc) end, + fun(S1) -> {Acc, Pos, [], S1} end, S); -scan_prolog2(" +scan_prolog2(" ?dbg("prolog(\" + scan_misc(T1, S1, Pos, Acc); +scan_prolog2(Str = " ?dbg("prolog(\" + {T, S1} = scan_ext_subset(Str, S), + {Acc, Pos, T, S1}; +scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos,Acc) -> ?dbg("prolog(\"<\")~n", []), %% Here we consider the DTD provided by doctype_DTD option, @@ -733,7 +761,7 @@ scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos) -> end, %% Check for more Comments and PI after DOCTYPE declaration % ?bump_col(1), - scan_misc(Str, S1, Pos). + scan_misc(Str, S1, Pos, Acc). @@ -743,26 +771,40 @@ scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos) -> %% - Neither of Comment and PI are returned in the resulting parsed %% structure. %% - scan_misc/3 implements Misc* as that is how the rule is always used -scan_misc([], S=#xmerl_scanner{continuation_fun = F}, Pos) -> +scan_misc(T, S, Pos) -> + scan_misc(T, S, Pos, []). +scan_misc([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) -> ?dbg("cont()...~n", []), - F(fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos) end, - fun(S1) -> {[], S1} end, + F(fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos, Acc) end, + fun(S1) -> {Acc, Pos, [], S1} end, S); -scan_misc("