diff options
author | Anthony Ramine <nox@dev-extend.eu> | 2010-12-07 17:28:56 +0100 |
---|---|---|
committer | Lars Thorsen <lars@erlang.org> | 2011-11-11 11:58:42 +0100 |
commit | 7fc95c00764fc13d2e3e676cca1a66be5d672c41 (patch) | |
tree | ccc9c449a581779dfc2593aebe7977ccb7643276 /lib | |
parent | 64a41197d8d5a292348d38467b28a041a95c500b (diff) | |
download | otp-7fc95c00764fc13d2e3e676cca1a66be5d672c41.tar.gz otp-7fc95c00764fc13d2e3e676cca1a66be5d672c41.tar.bz2 otp-7fc95c00764fc13d2e3e676cca1a66be5d672c41.zip |
Allow whole documents to be returned
Functions `xmerl_scan:file/2` and `xmerl_scan:string/2` now accepts a
new option `{document, true}` to produce a whole document as a
`xmlDocument` record instead of just the root element node.
You may wonder why this would be useful, this option is the only way to
get to the top-level comments and processing instructions without
hooking through the customization functions. Those nodes are needed to
implement [Canonical XML][c14n-xml] support.
[c14n-xml]: http://www.w3.org/TR/2008/PR-xml-c14n11-20080129/
"Canonical XML"
Diffstat (limited to 'lib')
-rw-r--r-- | lib/xmerl/include/xmerl.hrl | 1 | ||||
-rw-r--r-- | lib/xmerl/src/xmerl_scan.erl | 140 |
2 files changed, 92 insertions, 49 deletions
diff --git a/lib/xmerl/include/xmerl.hrl b/lib/xmerl/include/xmerl.hrl index fb9b00c73c..9c471c437f 100644 --- a/lib/xmerl/include/xmerl.hrl +++ b/lib/xmerl/include/xmerl.hrl @@ -155,6 +155,7 @@ declarations = [], % [{Name, Attrs}] doctype_name, doctype_DTD = internal, % internal | DTDId + document = false, rules, keep_rules = false, % delete (ets) tab if false namespace_conformant = false, % true | false diff --git a/lib/xmerl/src/xmerl_scan.erl b/lib/xmerl/src/xmerl_scan.erl index 740f825053..303fc26550 100644 --- a/lib/xmerl/src/xmerl_scan.erl +++ b/lib/xmerl/src/xmerl_scan.erl @@ -100,7 +100,15 @@ %% <dd>Set default character set used (default UTF-8). %% This character set is used only if not explicitly given by the XML %% declaration. </dd> +%% <dt><code>{document, Flag}</code></dt> +%% <dd>Set to 'true' if xmerl should return a complete XML document +%% as an xmlDocument record (default 'false').</dd> %% </dl> +%% @type document() = xmlElement() | xmlDocument(). <p> +%% The document returned by <tt>xmerl_scan:string/[1,2]</tt> and +%% <tt>xmerl_scan:file/[1,2]</tt>. The type of the returned record depends on +%% the value of the document option passed to the function. +%% </p> -module(xmerl_scan). @@ -224,7 +232,7 @@ cont_state(X, S=#xmerl_scanner{fun_states = FS}) -> file(F) -> file(F, []). -%% @spec file(Filename::string(), Options::option_list()) -> {xmlElement(),Rest} +%% @spec file(Filename::string(), Options::option_list()) -> {document(),Rest} %% Rest = list() %%% @doc Parse file containing an XML document file(F, Options) -> @@ -264,7 +272,7 @@ int_file_decl(F, Options,_ExtCharset) -> string(Str) -> string(Str, []). -%% @spec string(Text::list(),Options::option_list()) -> {xmlElement(),Rest} +%% @spec string(Text::list(),Options::option_list()) -> {document(),Rest} %% Rest = list() %%% @doc Parse string containing an XML document string(Str, Options) -> @@ -381,6 +389,8 @@ initial_state([{quiet, F}|T], S) when F==true; F==false -> initial_state(T, S#xmerl_scanner{quiet = F}); initial_state([{doctype_DTD,DTD}|T], S) -> initial_state(T,S#xmerl_scanner{doctype_DTD = DTD}); +initial_state([{document, F}|T], S) when is_boolean(F) -> + initial_state(T,S#xmerl_scanner{document = F}); initial_state([{text_decl,Bool}|T], S) -> initial_state(T,S#xmerl_scanner{text_decl=Bool}); initial_state([{environment,Env}|T], S) -> @@ -518,6 +528,7 @@ scan_document(Str0, S=#xmerl_scanner{event_fun = Event, line = L, col = C, environment=Env, encoding=Charset, + document=Document, validation=ValidateResult}) -> S1 = Event(#xmerl_event{event = started, line = L, @@ -539,17 +550,17 @@ scan_document(Str0, S=#xmerl_scanner{event_fun = Event, end, %% M1 = erlang:memory(), %% io:format("Memory status before prolog: ~p~n",[M1]), - {T1, S2} = scan_prolog(Str, S1, _StartPos = 1), + {Prolog, Pos, T1, S2} = scan_prolog(Str, S1, _StartPos = 1), %% M2 = erlang:memory(), %% io:format("Memory status after prolog: ~p~n",[M2]), %%io:format("scan_document 2, prolog parsed~n",[]), T2 = scan_mandatory("<",T1,1,S2,expected_element_start_tag), %% M3 = erlang:memory(), %% io:format("Memory status before element: ~p~n",[M3]), - {Res, T3, S3} =scan_element(T2,S2,_StartPos = 1), + {Res, T3, S3} =scan_element(T2,S2,Pos), %% M4 = erlang:memory(), %% io:format("Memory status after element: ~p~n",[M4]), - {Tail, S4}=scan_misc(T3, S3, _StartPos = 1), + {Misc, _Pos1, Tail, S4}=scan_misc(T3, S3, Pos + 1), %% M5 = erlang:memory(), %% io:format("Memory status after misc: ~p~n",[M5]), @@ -595,7 +606,15 @@ scan_document(Str0, S=#xmerl_scanner{event_fun = Event, {Res,cleanup(S5)} end, - {Res2, Tail, S6}. + Res3 = + case Document of + true -> + Content = lists:reverse(Prolog, [Res2 | lists:reverse(Misc)]), + #xmlDocument{content = Content}; + false -> + Res2#xmlElement{pos = 1} + end, + {Res3, Tail, S6}. scan_decl(Str, S=#xmerl_scanner{event_fun = Event, @@ -624,14 +643,17 @@ scan_decl(Str, S=#xmerl_scanner{event_fun = Event, %%% prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? %%% %% empty text declarations are handled by the first function clause. -scan_prolog([], S=#xmerl_scanner{continuation_fun = F}, Pos) -> +scan_prolog(T, S, Pos) -> + scan_prolog(T, S, Pos, []). +scan_prolog([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) -> ?dbg("cont()...~n", []), - F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos) end, - fun(S1) -> {[], S1} end, + F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos, Acc) end, + fun(S1) -> {Acc, Pos, [], S1} end, S); -scan_prolog("<?xml"++T,S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},Pos) - when ?whitespace(hd(T)) -> - {Charset,T3, S3}= +scan_prolog("<?xml"++T, + S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L}, + Pos,Acc) when ?whitespace(hd(T)) -> + {Charset, T3, S3} = if Col==1,L==1,S0#xmerl_scanner.text_decl==true -> ?dbg("prolog(\"<?xml\")~n", []), @@ -639,13 +661,13 @@ scan_prolog("<?xml"++T,S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},Pos) {_,T1,S1} = mandatory_strip(T,S), {Decl,T2, S2}=scan_text_decl(T1,S1), Encoding=Decl#xmlDecl.encoding, - {Encoding,T2, S2#xmerl_scanner{encoding=Encoding}}; + {Encoding, T2, S2#xmerl_scanner{encoding=Encoding}}; Col==1,L==1 -> ?dbg("prolog(\"<?xml\")~n", []), ?bump_col(5), {Decl,T2, S2}=scan_xml_decl(T, S), Encoding=Decl#xmlDecl.encoding, - {Encoding,T2, S2#xmerl_scanner{encoding=Encoding}}; + {Encoding, T2, S2#xmerl_scanner{encoding=Encoding}}; true -> ?fatal({xml_declaration_must_be_first_in_doc,Col,L},S0) end, @@ -659,7 +681,7 @@ scan_prolog("<?xml"++T,S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},Pos) %% Now transform to declared character set. if Charset==Charset0 -> % Document already transformed to this charset! - scan_prolog(T3, S3, Pos); + scan_prolog(T3, S3, Pos, Acc); Charset0=/=undefined -> %% For example may an external entity %% have the BOM for utf-16 and the internal @@ -668,17 +690,18 @@ scan_prolog("<?xml"++T,S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},Pos) %% 'iso-10646-utf-1', and Charset will be 'utf-16', all %% legal. %% - scan_prolog(T3,S3#xmerl_scanner{encoding=Charset0},Pos); + scan_prolog(T3,S3#xmerl_scanner{encoding=Charset0},Pos,Acc); Charset == "utf-8" -> - scan_prolog(T3, S3, Pos); + scan_prolog(T3, S3, Pos, Acc); Charset=/=undefined -> % Document not previously transformed T4=xmerl_ucs:to_unicode(T3,list_to_atom(Charset)), - scan_prolog(T4, S3, Pos); + scan_prolog(T4, S3, Pos, Acc); true -> % No encoding info given - scan_prolog(T3, S3, Pos) + scan_prolog(T3, S3, Pos, Acc) end; -scan_prolog("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog, - encoding=_Charset}, Pos) -> +scan_prolog("<!DOCTYPE" ++ T, + S0=#xmerl_scanner{environment=prolog,encoding=_Charset}, + Pos, Acc) -> ?dbg("prolog(\"<!DOCTYPE\")~n", []), ?bump_col(9), %% If no known character set assume it is UTF-8 @@ -687,10 +710,13 @@ scan_prolog("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog, true -> T end, {T2, S1} = scan_doctype(T1, S), - scan_misc(T2, S1, Pos); -scan_prolog(Str="%"++_T,S=#xmerl_scanner{environment={external,_}},_Pos) -> - scan_ext_subset(Str,S); -scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=_Charset},Pos) -> + scan_misc(T2, S1, Pos, Acc); +scan_prolog(Str="%"++_T,S=#xmerl_scanner{environment={external,_}}, + Pos,Acc) -> + {T, S1} = scan_ext_subset(Str,S), + {Acc, Pos, T, S1}; +scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=_Charset}, + Pos,Acc) -> ?dbg("prolog(\"<\")~n", []), %% Check for Comments, PI before possible DOCTYPE declaration @@ -700,26 +726,28 @@ scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=_Charset},Pos) -> %% Charset==undefined -> xmerl_ucs:to_unicode(Str,'utf-8'); true -> Str end, - {T1, S1}=scan_misc(T, S, Pos), - scan_prolog2(T1,S1,Pos). + {Acc1, Pos1, T1, S1}=scan_misc(T, S, Pos, Acc), + scan_prolog2(T1,S1,Pos1,Acc1). -scan_prolog2([], S=#xmerl_scanner{continuation_fun = F}, Pos) -> +scan_prolog2([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) -> ?dbg("cont()...~n", []), - F(fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos) end, - fun(S1) -> {[], S1} end, + F(fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos, Acc) end, + fun(S1) -> {Acc, Pos, [], S1} end, S); -scan_prolog2("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog}, Pos) -> +scan_prolog2("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog}, + Pos, Acc) -> ?dbg("prolog(\"<!DOCTYPE\")~n", []), ?bump_col(9), {T1, S1} = scan_doctype(T, S), - scan_misc(T1, S1, Pos); -scan_prolog2(Str = "<!" ++ _, S, _Pos) -> + scan_misc(T1, S1, Pos, Acc); +scan_prolog2(Str = "<!" ++ _, S, Pos, Acc) -> ?dbg("prolog(\"<!\")~n", []), %% In e.g. a DTD, we jump directly to markup declarations - scan_ext_subset(Str, S); -scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos) -> + {T, S1} = scan_ext_subset(Str, S), + {Acc, Pos, T, S1}; +scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos,Acc) -> ?dbg("prolog(\"<\")~n", []), %% Here we consider the DTD provided by doctype_DTD option, @@ -733,7 +761,7 @@ scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos) -> end, %% Check for more Comments and PI after DOCTYPE declaration % ?bump_col(1), - scan_misc(Str, S1, Pos). + scan_misc(Str, S1, Pos, Acc). @@ -743,26 +771,40 @@ scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos) -> %% - Neither of Comment and PI are returned in the resulting parsed %% structure. %% - scan_misc/3 implements Misc* as that is how the rule is always used -scan_misc([], S=#xmerl_scanner{continuation_fun = F}, Pos) -> +scan_misc(T, S, Pos) -> + scan_misc(T, S, Pos, []). +scan_misc([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) -> ?dbg("cont()...~n", []), - F(fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos) end, - fun(S1) -> {[], S1} end, + F(fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos, Acc) end, + fun(S1) -> {Acc, Pos, [], S1} end, S); -scan_misc("<!--" ++ T, S0, Pos) -> % Comment +scan_misc("<!--" ++ T, S0=#xmerl_scanner{acc_fun = F}, Pos, Acc) -> % Comment ?bump_col(4), - {_, T1, S1} = scan_comment(T, S, Pos, _Parents = [], _Lang = []), - scan_misc(T1,S1,Pos); -scan_misc("<?" ++ T, S0, Pos) -> % PI + {C, T1, S1} = scan_comment(T, S, Pos, _Parents = [], _Lang = []), + {Acc2, Pos2, S3} = case F(C, Acc, S1) of + {Acc1, S2} -> + {Acc1, Pos + 1, S2}; + {Acc1, Pos1, S2} -> + {Acc1, Pos1, S2} + end, + scan_misc(T1,S3,Pos2,Acc2); +scan_misc("<?" ++ T, S0=#xmerl_scanner{acc_fun = F}, Pos, Acc) -> % PI ?dbg("prolog(\"<?\")~n", []), ?bump_col(2), - {_PI, T1, S1} = scan_pi(T, S, Pos, []), - scan_misc(T1,S1,Pos); -scan_misc(T=[H|_T], S, Pos) when ?whitespace(H) -> + {PI, T1, S1} = scan_pi(T, S, Pos, []), + {Acc2, Pos2, S3} = case F(PI, Acc, S1) of + {Acc1, S2} -> + {Acc1, Pos + 1, S2}; + {Acc1, Pos1, S2} -> + {Acc1, Pos1, S2} + end, + scan_misc(T1,S3,Pos2,Acc2); +scan_misc(T=[H|_T], S, Pos, Acc) when ?whitespace(H) -> ?dbg("prolog(whitespace)~n", []), {_,T1,S1}=strip(T,S), - scan_misc(T1,S1,Pos); -scan_misc(T,S,_Pos) -> - {T,S}. + scan_misc(T1,S1,Pos,Acc); +scan_misc(T,S,Pos,Acc) -> + {Acc,Pos,T,S}. cleanup(S=#xmerl_scanner{keep_rules = false, |