%% %% %CopyrightBegin% %% %% Copyright Ericsson AB 2003-2018. All Rights Reserved. %% %% Licensed under the Apache License, Version 2.0 (the "License"); %% you may not use this file except in compliance with the License. %% You may obtain a copy of the License at %% %% http://www.apache.org/licenses/LICENSE-2.0 %% %% Unless required by applicable law or agreed to in writing, software %% distributed under the License is distributed on an "AS IS" BASIS, %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. %% See the License for the specific language governing permissions and %% limitations under the License. %% %% %CopyrightEnd% %% %% Description : Simgle-pass XML scanner. See xmerl.hrl for data defs. %% @doc This module is the interface to the XML parser, it handles XML 1.0. %% The XML parser is activated through %% xmerl_scan:string/[1,2] or %% xmerl_scan:file/[1,2]. %% It returns records of the type defined in xmerl.hrl. %% See also tutorial on customization %% functions. %% @type global_state().

%% The global state of the scanner, represented by the #xmerl_scanner{} record. %%

%% @type option_list().

Options allow to customize the behaviour of the %% scanner. %% See also tutorial on customization %% functions. %%

%%

%% Possible options are: %%

%%
%%
{acc_fun, Fun}
%%
Call back function to accumulate contents of entity.
%%
{continuation_fun, Fun} | %% {continuation_fun, Fun, ContinuationState}
%%
Call back function to decide what to do if the scanner runs into EOF %% before the document is complete.
%%
{event_fun, Fun} | %% {event_fun, Fun, EventState}
%%
Call back function to handle scanner events.
%%
{fetch_fun, Fun} | %% {fetch_fun, Fun, FetchState}
%%
Call back function to fetch an external resource.
%%
{hook_fun, Fun} | %% {hook_fun, Fun, HookState}
%%
Call back function to process the document entities once %% identified.
%%
{close_fun, Fun}
%%
Called when document has been completely parsed.
%%
{rules, ReadFun, WriteFun, RulesState} | %% {rules, Rules}
%%
Handles storing of scanner information when parsing.
%%
{user_state, UserState}
%%
Global state variable accessible from all customization functions
%% %%
{fetch_path, PathList}
%%
PathList is a list of %% directories to search when fetching files. If the file in question %% is not in the fetch_path, the URI will be used as a file %% name.
%%
{space, Flag}
%%
'preserve' (default) to preserve spaces, 'normalize' to %% accumulate consecutive whitespace and replace it with one space.
%%
{line, Line}
%%
To specify starting line for scanning in document which contains %% fragments of XML.
%%
{namespace_conformant, Flag}
%%
Controls whether to behave as a namespace conformant XML parser, %% 'false' (default) to not otherwise 'true'.
%%
{validation, Flag}
%%
Controls whether to process as a validating XML parser: %% 'off' (default) no validation, or validation 'dtd' by DTD or 'schema' %% by XML Schema. 'false' and 'true' options are obsolete %% (i.e. they may be removed in a future release), if used 'false' %% equals 'off' and 'true' equals 'dtd'.
%%
{schemaLocation, [{Namespace,Link}|...]}
%%
Tells explicitly which XML Schema documents to use to validate %% the XML document. Used together with the %% {validation,schema} option.
%%
{quiet, Flag}
%%
Set to 'true' if xmerl should behave quietly and not output any %% information to standard output (default 'false').
%%
{doctype_DTD, DTD}
%%
Allows to specify DTD name when it isn't available in the XML %% document. This option has effect only together with %% {validation,'dtd' option.
%%
{xmlbase, Dir}
%%
XML Base directory. If using string/1 default is current directory. %% If using file/1 default is directory of given file.
%%
{encoding, Enc}
%%
Set default character set used (default UTF-8). %% This character set is used only if not explicitly given by the XML %% declaration.
%%
{document, Flag}
%%
Set to 'true' if xmerl should return a complete XML document %% as an xmlDocument record (default 'false').
%%
{comments, Flag}
%%
Set to 'false' if xmerl should skip comments otherwise they will %% be returned as xmlComment records (default 'true').
%%
{default_attrs, Flag}
%%
Set to 'true' if xmerl should add to elements missing attributes %% with a defined default value (default 'false').
%%
%% @type xmlElement() = #xmlElement{}. %% The record definition is found in xmerl.hrl. %% @type xmlDocument() = #xmlDocument{}. %% The record definition is found in xmerl.hrl. %% @type document() = xmlElement() | xmlDocument().

%% The document returned by xmerl_scan:string/[1,2] and %% xmerl_scan:file/[1,2]. The type of the returned record depends on %% the value of the document option passed to the function. %%

-module(xmerl_scan). -vsn('0.20'). -date('03-09-16'). %% main API -export([string/1, string/2, file/1, file/2]). %% access functions for various states -export([user_state/1, user_state/2, event_state/1, event_state/2, hook_state/1, hook_state/2, rules_state/1, rules_state/2, fetch_state/1, fetch_state/2, cont_state/1, cont_state/2]). %% helper functions. To xmerl_lib ?? -export([accumulate_whitespace/4]). %-define(debug, 1). -include("xmerl.hrl"). % record def, macros -include("xmerl_internal.hrl"). -include_lib("kernel/include/file.hrl"). -define(fatal(Reason, S), if S#xmerl_scanner.quiet -> ok; true -> error_logger:error_msg("~p- fatal: ~p~n", [?LINE, Reason]), ok end, fatal(Reason, S)). -define(ustate(U, S), S#xmerl_scanner{user_state = U}). %% Functions to access the various states %%% @spec user_state(S::global_state()) -> global_state() %%% @equiv user_state(UserState,S) user_state(#xmerl_scanner{user_state = S}) -> S. %%% @spec event_state(S::global_state()) -> global_state() %%% @equiv event_state(EventState,S) event_state(#xmerl_scanner{fun_states = #xmerl_fun_states{event = S}}) -> S. %%% @spec hook_state(S::global_state()) -> global_state() %%% @equiv hook_state(HookState,S) hook_state(#xmerl_scanner{fun_states = #xmerl_fun_states{hook = S}}) -> S. %%% @spec rules_state(S::global_state()) -> global_state() %%% @equiv rules_state(RulesState,S) rules_state(#xmerl_scanner{fun_states = #xmerl_fun_states{rules = S}}) -> S. %%% @spec fetch_state(S::global_state()) -> global_state() %%% @equiv fetch_state(FetchState,S) fetch_state(#xmerl_scanner{fun_states = #xmerl_fun_states{fetch = S}}) -> S. %%% @spec cont_state(S::global_state()) -> global_state() %%% @equiv cont_state(ContinuationState,S) cont_state(#xmerl_scanner{fun_states = #xmerl_fun_states{cont = S}}) -> S. %%%% Functions to modify the various states %%% @spec user_state(UserState, S::global_state()) -> global_state() %%% @doc For controlling the UserState, to be used in a user function. %%% See tutorial on customization functions. user_state(X, S) -> S#xmerl_scanner{user_state = X}. %%% @spec event_state(EventState, S::global_state()) -> global_state() %%% @doc For controlling the EventState, to be used in an event %%% function, and called at the beginning and at the end of a parsed entity. %%% See tutorial on customization functions. event_state(X, S=#xmerl_scanner{fun_states = FS}) -> FS1 = FS#xmerl_fun_states{event = X}, S#xmerl_scanner{fun_states = FS1}. %%% @spec hook_state(HookState, S::global_state()) -> global_state() %%% @doc For controlling the HookState, to be used in a hook %%% function, and called when the parser has parsed a complete entity. %%% See tutorial on customization functions. hook_state(X, S=#xmerl_scanner{fun_states = FS}) -> FS1 = FS#xmerl_fun_states{hook = X}, S#xmerl_scanner{fun_states = FS1}. %%% @spec rules_state(RulesState, S::global_state()) -> global_state() %%% @doc For controlling the RulesState, to be used in a rules %%% function, and called when the parser store scanner information in a rules %%% database. %%% See tutorial on customization functions. rules_state(X, S=#xmerl_scanner{fun_states = FS}) -> FS1 = FS#xmerl_fun_states{rules = X}, S#xmerl_scanner{fun_states = FS1}. %%% @spec fetch_state(FetchState, S::global_state()) -> global_state() %%% @doc For controlling the FetchState, to be used in a fetch %%% function, and called when the parser fetch an external resource (eg. a DTD). %%% See tutorial on customization functions. fetch_state(X, S=#xmerl_scanner{fun_states = FS}) -> FS1 = FS#xmerl_fun_states{fetch = X}, S#xmerl_scanner{fun_states = FS1}. %%% @spec cont_state(ContinuationState, S::global_state()) -> global_state() %%% @doc For controlling the ContinuationState, to be used in a continuation %%% function, and called when the parser encounters the end of the byte stream. %%% See tutorial on customization functions. cont_state(X, S=#xmerl_scanner{fun_states = FS}) -> FS1 = FS#xmerl_fun_states{cont = X}, S#xmerl_scanner{fun_states = FS1}. %% @spec file(Filename::string()) -> {xmlElement(),Rest} %% Rest = list() %% @equiv file(Filename, []) file(F) -> file(F, []). %% @spec file(Filename::string(), Options::option_list()) -> {document(),Rest} %% Rest = list() %%% @doc Parse file containing an XML document file(F, Options) -> ExtCharset=case lists:keysearch(encoding,1,Options) of {value,{_,Val}} -> Val; false -> undefined end, case int_file(F,Options,ExtCharset) of {Res, Tail,S=#xmerl_scanner{close_fun=Close}} -> Close(S), % for side effects only - final state is dropped {Res,Tail}; {error, Reason} -> {error, Reason} end. int_file(F, Options,_ExtCharset) -> %%?dbg("int_file F=~p~n",[F]), case file:read_file(F) of {ok, Bin} -> int_string(binary_to_list(Bin), Options, filename:dirname(F),F); Error -> Error end. int_file_decl(F, Options,_ExtCharset) -> % ?dbg("int_file_decl F=~p~n",[F]), case file:read_file(F) of {ok, Bin} -> int_string_decl(binary_to_list(Bin), Options, filename:dirname(F),F); Error -> Error end. %% @spec string(Text::list()) -> {xmlElement(),Rest} %% Rest = list() %% @equiv string(Text, []) string(Str) -> string(Str, []). %% @spec string(Text::list(),Options::option_list()) -> {document(),Rest} %% Rest = list() %%% @doc Parse string containing an XML document string(Str, Options) -> {Res, Tail, S=#xmerl_scanner{close_fun = Close}} = int_string(Str, Options,file_name_unknown), Close(S), % for side effects only - final state is dropped {Res,Tail}. int_string(Str, Options,FileName) -> {ok, XMLBase} = file:get_cwd(), int_string(Str, Options, XMLBase, FileName). int_string(Str, Options, XMLBase, FileName) -> S0=initial_state0(Options,XMLBase), S = S0#xmerl_scanner{filename=FileName}, %%?dbg("int_string1, calling xmerl_lib:detect_charset~n",[]), %% In case of no encoding attribute in document utf-8 is default, but %% another character set may be detected with help of Byte Order Marker or %% with help of the encoding of the first 4 bytes. case xmerl_lib:detect_charset(S#xmerl_scanner.encoding,Str) of {auto,'iso-10646-utf-1',Str2} -> scan_document(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"}); {external,'iso-10646-utf-1',Str2} -> scan_document(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"}); {undefined,undefined,Str2} -> %% no auto detection scan_document(Str2, S); {external,ExtCharset,Str2} -> %% no auto detection, ExtCharset is an explicitly provided %% 7 bit,8 bit or utf-8 encoding scan_document(Str2, S#xmerl_scanner{encoding=atom_to_list(ExtCharset)}) end. int_string_decl(Str, Options, XMLBase, FileName) -> S0=initial_state0(Options,XMLBase), S = S0#xmerl_scanner{filename=FileName}, case xmerl_lib:detect_charset(S#xmerl_scanner.encoding,Str) of {auto,'iso-10646-utf-1',Str2} -> scan_decl(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"}); {external,'iso-10646-utf-1',Str2} -> scan_decl(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"}); {undefined,undefined,Str2} -> scan_decl(Str2, S); {external,ExtCharset,Str2} -> scan_decl(Str2, S#xmerl_scanner{encoding=atom_to_list(ExtCharset)}) end. initial_state0(Options,XMLBase) -> CommonData = common_data(), initial_state(Options, #xmerl_scanner{ event_fun = fun event/2, hook_fun = fun hook/2, acc_fun = fun acc/3, fetch_fun = fun fetch/2, close_fun = fun close/1, continuation_fun = fun cont/3, rules_read_fun = fun rules_read/3, rules_write_fun = fun rules_write/4, rules_delete_fun= fun rules_delete/3, xmlbase = XMLBase, common_data = CommonData }). initial_state([{event_fun, F}|T], S) -> initial_state(T, S#xmerl_scanner{event_fun = F}); initial_state([{event_fun, F, ES}|T], S) -> S1 = event_state(ES, S#xmerl_scanner{event_fun = F}), initial_state(T, S1); initial_state([{acc_fun, F}|T], S) -> initial_state(T, S#xmerl_scanner{acc_fun = F}); initial_state([{hook_fun, F}|T], S) -> initial_state(T, S#xmerl_scanner{hook_fun = F}); initial_state([{hook_fun, F, HS}|T], S) -> S1 = hook_state(HS, S#xmerl_scanner{hook_fun = F}), initial_state(T, S1); initial_state([{close_fun, F}|T], S) -> initial_state(T, S#xmerl_scanner{close_fun = F}); initial_state([{fetch_fun, F}|T], S) -> initial_state(T, S#xmerl_scanner{fetch_fun = F}); initial_state([{fetch_fun, F, FS}|T], S) -> S1 = fetch_state(FS, S#xmerl_scanner{fetch_fun = F}), initial_state(T, S1); initial_state([{fetch_path, P}|T], S) -> initial_state(T, S#xmerl_scanner{fetch_path = P}); initial_state([{continuation_fun, F}|T], S) -> initial_state(T, S#xmerl_scanner{continuation_fun = F}); initial_state([{continuation_fun, F, CS}|T], S) -> S1 = cont_state(CS, S#xmerl_scanner{continuation_fun = F}), initial_state(T, S1); initial_state([{rules, R}|T], S) -> initial_state(T, S#xmerl_scanner{rules = R, keep_rules = true}); initial_state([{rules, Read, Write, RS}|T], S) -> S1 = rules_state(RS, S#xmerl_scanner{rules_read_fun = Read, rules_write_fun = Write, keep_rules = true}), initial_state(T, S1); initial_state([{user_state, F}|T], S) -> initial_state(T, S#xmerl_scanner{user_state = F}); initial_state([{space, L}|T], S) -> initial_state(T, S#xmerl_scanner{space = L}); initial_state([{line, L}|T], S) -> initial_state(T, S#xmerl_scanner{line = L}); initial_state([{namespace_conformant, F}|T], S) when F==true; F==false -> initial_state(T, S#xmerl_scanner{namespace_conformant = F}); initial_state([{validation, F}|T], S) when F==off; F==dtd; F==schema; F==true; F==false -> initial_state(T, S#xmerl_scanner{validation = validation_value(F)}); initial_state([{schemaLocation, SL}|T], S) when is_list(SL) -> initial_state(T, S#xmerl_scanner{schemaLocation=SL}); initial_state([{quiet, F}|T], S) when F==true; F==false -> initial_state(T, S#xmerl_scanner{quiet = F}); initial_state([{doctype_DTD,DTD}|T], S) -> initial_state(T,S#xmerl_scanner{doctype_DTD = DTD}); initial_state([{document, F}|T], S) when is_boolean(F) -> initial_state(T,S#xmerl_scanner{document = F}); initial_state([{comments, F}|T], S) when is_boolean(F) -> initial_state(T,S#xmerl_scanner{comments = F}); initial_state([{default_attrs, F}|T], S) when is_boolean(F) -> initial_state(T,S#xmerl_scanner{default_attrs = F}); initial_state([{text_decl,Bool}|T], S) -> initial_state(T,S#xmerl_scanner{text_decl=Bool}); initial_state([{environment,Env}|T], S) -> initial_state(T,S#xmerl_scanner{environment=Env}); initial_state([{xmlbase, D}|T], S) -> initial_state(T, S#xmerl_scanner{xmlbase = D}); initial_state([{encoding, Enc}|T], S) -> initial_state(T, S#xmerl_scanner{encoding = Enc}); initial_state([], S=#xmerl_scanner{rules = undefined}) -> Tab = ets:new(rules, [set, public]), S#xmerl_scanner{rules = Tab}; initial_state([], S) -> S. validation_value(true) -> dtd; validation_value(false) -> off; validation_value(F) -> F. %% Used for compacting (some) indentations. %% See also fast_accumulate_whitespace(). common_data() -> {comdata(lists:duplicate(60, $\s), []), comdata(lists:duplicate(15, $\t), []), "\n"}. comdata([], CD)-> list_to_tuple(CD); comdata([_ | T]=L, CD) -> comdata(T, [[$\n | L] | CD]). %%% ----------------------------------------------------- %%% Default modifier functions %%% Hooks: %%% - {element, Line, Name, Attrs, Content} %%% - {processing_instruction, Line, Data} hook(X, State) -> {X, State}. %%% Events: %%% %%% #xmerl_event{event : started | ended, %%% line : integer(), %%% col : integer(), %%% data} %%% %%% Data Events %%% document started, ended %%% #xmlElement started, ended %%% #xmlAttribute ended %%% #xmlPI ended %%% #xmlComment ended %%% #xmlText ended event(_X, S) -> S. %% The acc/3 function can return either {Acc´, S'} or {Acc', Pos', S'}, %% where Pos' can be derived from X#xmlElement.pos, X#xmlText.pos, or %% X#xmlAttribute.pos (whichever is the current object type.) %% The acc/3 function is not allowed to redefine the type of object %% being defined, but _is_ allowed to either ignore it or split it %% into multiple objects (in which case {Acc',Pos',S'} should be returned.) %% If {Acc',S'} is returned, Pos will be incremented by 1 by default. %% Below is an example of an acceptable operation acc(#xmlText{value = Text}, [X = #xmlText{value = AccText}], S) -> {[X#xmlText{value = AccText ++ Text}], S}; acc(X, Acc, S) -> {[X|Acc], S}. fetch({system, URI}, S) -> fetch_URI(URI, S); fetch({public, _PublicID, URI}, S) -> fetch_URI(URI, S). %%% Always assume an external resource can be found locally! Thus %%% don't bother fetching with e.g. HTTP. Returns the path where the %%% resource is found. The path to the external resource is given by %%% URI directly or the option fetch_path (additional paths) or %%% directory (base path to external resource) fetch_URI(URI, S) -> %% assume URI is a filename Split = filename:split(URI), Filename = fun([])->[];(X)->lists:last(X) end (Split), Fullname = case Split of %% how about Windows systems? ["file:"|Name]-> %% absolute path, see RFC2396 sect 3 %% file:/dtd_name filename:join(["/"|Name]); ["/"|Rest] when Rest /= [] -> %% absolute path name URI; ["http:"|_Rest] -> {http,URI}; [] -> %% empty systemliteral []; _ -> filename:join(S#xmerl_scanner.xmlbase, URI) end, Path = path_locate(S#xmerl_scanner.fetch_path, Filename, Fullname), ?dbg("fetch(~p) -> {file, ~p}.~n", [URI, Path]), {ok, Path, S}. path_locate(_, _, {http,_}=URI) -> URI; path_locate(_, _, []) -> []; path_locate([Dir|Dirs], FN, FullName) -> F = filename:join(Dir, FN), case file:read_file_info(F) of {ok, #file_info{type = regular}} -> {file,F}; _ -> path_locate(Dirs, FN, FullName) end; path_locate([], _FN, FullName) -> {file,FullName}. cont(_F, Exception, US) -> Exception(US). close(S) -> S. %%% ----------------------------------------------------- %%% Scanner %%% [1] document ::= prolog element Misc* scan_document(Str0, S=#xmerl_scanner{event_fun = Event, line = L, col = C, environment=Env, encoding=Charset, document=Document, validation=ValidateResult}) -> S1 = Event(#xmerl_event{event = started, line = L, col = C, data = document}, S), %% Transform to given character set. %% Note that if another character set is given in the encoding %% attribute in a XML declaration that one will be used later Str=if Charset == "utf-8" -> Str0; Charset =/= undefined -> % Default character set is UTF-8 xmerl_ucs:to_unicode(Str0, list_to_atom(Charset)); true -> %% Charset is undefined if no external input is %% given, and no auto detection of character %% encoding was made. Str0 end, %% M1 = erlang:memory(), %% ?dbg("Memory status before prolog: ~p~n",[M1]), {Prolog, Pos, T1, S2} = scan_prolog(Str, S1, _StartPos = 1), %% M2 = erlang:memory(), %% ?dbg("Memory status after prolog: ~p~n",[M2]), %%?dbg("scan_document 2, prolog parsed~n",[]), T2 = scan_mandatory("<", T1, 1, S2, expected_element_start_tag), %% M3 = erlang:memory(), %% ?dbg("Memory status before element: ~p~n",[M3]), {Res, T3, S3} = scan_element(T2,S2,Pos), %% M4 = erlang:memory(), %% ?dbg("Memory status after element: ~p~n",[M4]), {Misc, _Pos1, Tail, S4}=scan_misc(T3, S3, Pos + 1), %% M5 = erlang:memory(), %% ?dbg("Memory status after misc: ~p~n",[M5]), S5 = #xmerl_scanner{} = Event(#xmerl_event{event = ended, line = S4#xmerl_scanner.line, col = S4#xmerl_scanner.col, data = document}, S4), {Res2, S6} = case validation_mode(ValidateResult) of off -> {Res, cleanup(S5)}; dtd when Env == element; Env == prolog -> check_decl2(S5), case xmerl_validate:validate(S5, Res) of {'EXIT', {error, Reason}} -> S5b = cleanup(S5), ?fatal({failed_validation, Reason}, S5b); {'EXIT', Reason} -> S5b = cleanup(S5), ?fatal({failed_validation, Reason}, S5b); {error, Reason} -> S5b = cleanup(S5), ?fatal({failed_validation, Reason}, S5b); {error, Reason, _Next} -> S5b = cleanup(S5), ?fatal({failed_validation, Reason}, S5b); _XML -> {Res, cleanup(S5)} end; schema -> case schemaLocations(Res, S5) of {ok, Schemas} -> _ = cleanup(S5), %%?dbg("Schemas: ~p~nRes: ~p~ninhertih_options(S): ~p~n", %% [Schemas,Res,inherit_options(S5)]), XSDRes = xmerl_xsd:process_validate(Schemas, Res, inherit_options(S5)), handle_schema_result(XSDRes, S5); _ -> {Res, cleanup(S5)} end; _ -> {Res, cleanup(S5)} end, Res3 = case Document of true -> Content = lists:reverse(Prolog, [Res2 | lists:reverse(Misc)]), #xmlDocument{content = Content}; false -> Res2 end, {Res3, Tail, S6}. scan_decl(Str, S=#xmerl_scanner{event_fun = Event, line = L, col = C, environment=_Env, encoding=_Charset, validation=_ValidateResult}) -> S1 = Event(#xmerl_event{event = started, line = L, col = C, data = document}, S), case scan_prolog(Str, S1, _StartPos = 1) of {_,_,T2="<"++_, S2} -> {{S2#xmerl_scanner.user_state,T2},[],S2}; {_,_,[], S2}-> {[],[],S2}; {_,_,T2, S2} -> {_,_,S3} = scan_content(T2,S2,[],_Attrs=[],S2#xmerl_scanner.space, _Lang=[],_Parents=[],#xmlNamespace{}), {T2,[],S3} end. %%% [22] Prolog %%% prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? %%% %% empty text declarations are handled by the first function clause. scan_prolog(T, S, Pos) -> scan_prolog(T, S, Pos, []). scan_prolog([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) -> ?dbg("cont()...~n", []), F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos, Acc) end, fun(S1) -> {Acc, Pos, [], S1} end, S); scan_prolog(" {Charset, T3, S3} = if Col==1,L==1,S0#xmerl_scanner.text_decl==true -> ?dbg("prolog(\" ?dbg("prolog(\" ?fatal({xml_declaration_must_be_first_in_doc,Col,L},S0) end, %% Charset0 is either (1) 'iso-10646-utf-1' (transformation by %% auto detection), (2) undefined (no auto detection and no %% external encoding), (3) any other encoding format that must be %% conformant to the internal explicitly given encoding. The two %% former cases implies that the explicit internal encoding %% (Charset) may be different from Charset0. %% Now transform to declared character set. if Charset==Charset0 -> % Document already transformed to this charset! scan_prolog(T3, S3, Pos, Acc); Charset0=/=undefined -> %% For example may an external entity %% have the BOM for utf-16 and the internal %% explicit encoding='utf-16', then it will be auto %% detected and transformed, Charset0 will be %% 'iso-10646-utf-1', and Charset will be 'utf-16', all %% legal. %% scan_prolog(T3,S3#xmerl_scanner{encoding=Charset0},Pos,Acc); Charset == "utf-8" -> scan_prolog(T3, S3, Pos, Acc); Charset=/=undefined -> % Document not previously transformed T4=xmerl_ucs:to_unicode(T3,list_to_atom(Charset)), scan_prolog(T4, S3, Pos, Acc); true -> % No encoding info given scan_prolog(T3, S3, Pos, Acc) end; scan_prolog(" ?dbg("prolog(\" xmerl_ucs:to_unicode(T,'utf-8'); true -> T end, {T2, S1} = scan_doctype(T1, S), scan_misc(T2, S1, Pos, Acc); scan_prolog(Str="%"++_T,S=#xmerl_scanner{environment={external,_}}, Pos,Acc) -> {T, S1} = scan_ext_subset(Str,S), {Acc, Pos, T, S1}; scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=_Charset}, Pos,Acc) -> ?dbg("prolog(\"<\")~n", []), %% Check for Comments, PI before possible DOCTYPE declaration ?bump_col(1), %% If no known character set assume it is UTF-8 T=if %% Charset==undefined -> xmerl_ucs:to_unicode(Str,'utf-8'); true -> Str end, {Acc1, Pos1, T1, S1}=scan_misc(T, S, Pos, Acc), scan_prolog2(T1,S1,Pos1,Acc1). scan_prolog2([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) -> ?dbg("cont()...~n", []), F(fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos, Acc) end, fun(S1) -> {Acc, Pos, [], S1} end, S); scan_prolog2(" ?dbg("prolog(\" ?dbg("prolog(\" ?dbg("prolog(\"<\")~n", []), %% Here we consider the DTD provided by doctype_DTD option, S1 = case S0 of #xmerl_scanner{validation=dtd,doctype_DTD=DTD} when is_list(DTD) -> S=fetch_DTD(undefined,S0), check_decl(S), S; _ -> S0 end, %% Check for more Comments and PI after DOCTYPE declaration % ?bump_col(1), scan_misc(Str, S1, Pos, Acc). %%% [27] Misc ::= Comment | PI | S %% Note: %% - Neither of Comment and PI are returned in the resulting parsed %% structure. %% - scan_misc/3 implements Misc* as that is how the rule is always used scan_misc(T, S, Pos) -> scan_misc(T, S, Pos, []). scan_misc([], S=#xmerl_scanner{continuation_fun = F}, Pos, Acc) -> ?dbg("cont()...~n", []), F(fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos, Acc) end, fun(S1) -> {Acc, Pos, [], S1} end, S); scan_misc(""++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> ?bump_col(3), scan_entity_value(T,S,Delim,["-->"|Acc],PEName,NS, pe_pop("-->",PENesting,S)); %% Stop delimeter for ConditionalSection scan_entity_value("]]>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> ?bump_col(3), scan_entity_value(T,S,Delim,["]]>"|Acc],PEName,NS, pe_pop("]]>",PENesting,S)); %% Stop delimeter added to match a content start delimeter included scan_entity_value("/>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> ?bump_col(2), scan_entity_value(T,S,Delim,["/>"|Acc],PEName,NS, pe_pop("/>",PENesting,S)); scan_entity_value(")"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> ?bump_col(1), scan_entity_value(T,S,Delim,[")"|Acc],PEName,NS, pe_pop(")",PENesting,S)); scan_entity_value("\n"++T, S, Delim, Acc, PEName,Namespace,PENesting) -> scan_entity_value(T, S#xmerl_scanner{line=S#xmerl_scanner.line+1}, Delim, ["\n"|Acc], PEName,Namespace,PENesting); scan_entity_value(Str, S0, Delim, Acc, PEName,Namespace,PENesting) -> {Ch,T} = to_ucs(S0#xmerl_scanner.encoding,Str), case xmerl_lib:is_char(Ch) of true -> ?bump_col(1), scan_entity_value(T, S, Delim, [Ch|Acc], PEName,Namespace,PENesting); false -> ?fatal({unexpected_char,Ch}, S0) end. save_refed_entity_name(Name,PEName,S) -> case predefined_entity(Name) of true -> S; _ -> save_refed_entity_name1(Name,PEName,S) end. save_refed_entity_name1(Name,PEName, S=#xmerl_scanner{entity_references=ERefs}) -> case lists:keysearch(PEName,1,ERefs) of {value,{_,Refs}} -> NewRefs = case lists:member(Name,Refs) of true ->Refs; _ -> [Name|Refs] end, S#xmerl_scanner{entity_references=lists:keyreplace(PEName,1,ERefs, {PEName,NewRefs}) }; _ -> S#xmerl_scanner{entity_references=[{PEName,[Name]}|ERefs]} end. pe_push(Tok,Stack,_S) when Tok==" [Tok|Stack]; pe_push(Tok,Stack,#xmerl_scanner{validation=dtd}) when Tok==")";Tok==">";Tok=="?>";Tok=="]]>";Tok=="-->";Tok=="/>"-> [Tok|Stack]; pe_push(_,Stack,_S) -> Stack. pe_pop(">",[" Rest; pe_pop("?>",[" Rest; pe_pop("-->",[""++_T,parameter,dtd) -> "-->"; pe_nesting_token("]]>"++_T,parameter,dtd) -> "]]>"; pe_nesting_token(")"++_T,parameter,dtd) -> ")"; pe_nesting_token("/>"++_T,parameter,dtd) -> "/>"; pe_nesting_token(_,_,_) -> false. predefined_entity(amp) -> true; predefined_entity(lt) -> true; predefined_entity(gt) -> true; predefined_entity(apos) -> true; predefined_entity(quot) -> true; predefined_entity(_) -> false. check_entity_recursion(EName, S=#xmerl_scanner{entity_references=EntityRefList}) -> Set = sofs:family(EntityRefList), case catch sofs:family_to_digraph(Set, [acyclic]) of {'EXIT',{cyclic,_}} -> ?fatal({illegal_recursion_in_Entity, EName}, S); DG -> digraph:delete(DG), ok end. %%%%%%% [15] Comment scan_comment(Str, S) -> scan_comment(Str, S, _Pos = undefined, _Parents = [], _Lang = []). scan_comment(Str,S=#xmerl_scanner{col=C,event_fun=Event}, Pos, Parents, Lang) -> Comment = #xmlComment{pos = Pos, parents = Parents, language = Lang, value = undefined}, S1 = #xmerl_scanner{} = Event(#xmerl_event{event = started, line = S#xmerl_scanner.line, col = C, pos = Pos, data = Comment}, S), scan_comment1(Str, S1, Pos, Comment, _Acc = []). scan_comment1([], S=#xmerl_scanner{continuation_fun = F}, Pos, Comment, Acc) -> ?dbg("cont()...~n", []), F(fun(MoreBytes, S1) -> scan_comment1(MoreBytes, S1, Pos, Comment, Acc) end, fatal_fun(unexpected_end), S); scan_comment1("-->" ++ T, S0 = #xmerl_scanner{col = C, event_fun = Event, hook_fun = Hook}, _Pos, Comment, Acc) -> ?bump_col(3), Comment1 = Comment#xmlComment{value = lists:reverse(Acc)}, S1=#xmerl_scanner{}=Event(#xmerl_event{event = ended, line=S#xmerl_scanner.line, col = C, data = Comment1}, S), {Ret, S2} = Hook(Comment1, S1), {_,T3,S3}=strip(T,S2), {Ret,T3,S3}; scan_comment1("--"++T,S,_Pos,_Comment,_Acc) -> ?fatal({invalid_comment,"--"++[hd(T)]}, S); scan_comment1("\n" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) -> scan_comment1(T, S#xmerl_scanner{line=L+1,col=1},Pos, Cmt, "\n" ++ Acc); scan_comment1("\r\n" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) -> %% CR followed by LF is read as a single LF scan_comment1(T, S#xmerl_scanner{line=L+1,col=1}, Pos, Cmt, "\n" ++ Acc); scan_comment1("\r" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) -> %% CR not followed by LF is read as a LF scan_comment1(T, S#xmerl_scanner{line=L+1,col=1}, Pos, Cmt, "\n" ++ Acc); scan_comment1(Str, S=#xmerl_scanner{col = C}, Pos, Cmt, Acc) -> {Ch,T} = wfc_legal_char(Str,S), scan_comment1(T, S#xmerl_scanner{col=C+1}, Pos, Cmt, [Ch|Acc]). %%%%%%% scan_markup_completion_gt([$>|_R]=T,S) -> {T,S}; scan_markup_completion_gt([$%|T],S0) -> ?bump_col(1), {Name,T1,S1} = scan_pe_reference(T,S), ExpandedRef = expand_pe_reference(Name,S1,as_PE), {_,T2,S2} = strip(ExpandedRef++T1,S1), scan_markup_completion_gt(T2,S2); scan_markup_completion_gt(T,S) -> ?fatal({error,{malformed_syntax_entity_completion,T}},S). scan_mandatory(Pattern,T,N,S,ErrorMsg) -> case lists:prefix(Pattern,T) of true -> lists:nthtail(N,T); _ -> ?fatal(ErrorMsg,S) end. strip(Str,S) -> strip(Str,S,all). strip([], S=#xmerl_scanner{continuation_fun = F},_) -> ?dbg("cont()... stripping whitespace~n", []), F(fun(MoreBytes, S1) -> strip(MoreBytes, S1) end, fun(S1) -> {[], [], S1} end, S); strip("\s" ++ T, S=#xmerl_scanner{col = C},Lim) -> strip(T, S#xmerl_scanner{col = C+1},Lim); strip("\t" ++ _T, S ,no_tab) -> ?fatal({error,{no_tab_allowed}},S); strip("\t" ++ T, S=#xmerl_scanner{col = C},Lim) -> strip(T, S#xmerl_scanner{col = expand_tab(C)},Lim); strip("\n" ++ T, S=#xmerl_scanner{line = L},Lim) -> strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim); strip("\r\n" ++ T, S=#xmerl_scanner{line = L},Lim) -> %% CR followed by LF is read as a single LF strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim); strip("\r" ++ T, S=#xmerl_scanner{line = L},Lim) -> %% CR not followed by LF is read as a LF strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim); strip(Str, S,_Lim) -> {[], Str, S}. %% demands a whitespace, though a parameter entity is ok, it will %% expand with a whitespace on each side. mandatory_strip([],S) -> ?fatal({error,{whitespace_was_expected}},S); mandatory_strip(T,S) when ?whitespace(hd(T)) -> strip(T,S,all); mandatory_strip([$%|T],S) when ?whitespace(hd(T)) -> %this is not a PERefence, but an PEDeclaration ?fatal({error,{whitespace_was_expected}},S); mandatory_strip([$%|_T]=T,S) -> {[],T,S}; mandatory_strip(_T,S) -> ?fatal({error,{whitespace_was_expected}},S). %% strip but don't accept tab pub_id_strip(Str, S) -> strip(Str,S,no_tab). normalize("&"++T,S,IsNorm) -> case scan_reference(T, S) of {ExpRef, T1, S1} when ?whitespace(hd(ExpRef)) -> ExpRef2 = string_to_char_set(S#xmerl_scanner.encoding,ExpRef), normalize(ExpRef2++T1,S1,IsNorm); _ -> {"&"++T,S,IsNorm} end; normalize(T,S,IsNorm) -> case strip(T,S) of {_,T,S} -> {T,S,IsNorm}; {_,T1,S1} -> normalize(T1,S1,true) end. %% Optimization: %% - avoid building list of spaces or tabs; %% - avoid reverse; %% - compact two common indentation patterns. %% Note: only to be called when a \n was found. fast_accumulate_whitespace(" " ++ T, S, _) -> fast_acc_spaces(T, S, 1); fast_accumulate_whitespace("\t"++T, S, _) -> fast_acc_tabs(T, S, 1); fast_accumulate_whitespace("<"++_=R, S, _T) -> #xmerl_scanner{common_data = CD, line = Line} = S, {done, {element(3, CD), R, S#xmerl_scanner{col = 1, line = Line + 1}}}; fast_accumulate_whitespace(_, S, T) -> accumulate_whitespace(T, S, []). fast_acc_spaces(" " ++ T, S, N) -> fast_acc_spaces(T, S, N + 1); fast_acc_spaces(T, S, N) -> fast_acc_end(T, S, N, N, $\s, 1). fast_acc_tabs("\t" ++ T, S, N) -> fast_acc_tabs(T, S, N + 1); fast_acc_tabs(T, S, N) -> fast_acc_end(T, S, N, N * 8 + 1, $\t, 2). fast_acc_end(T, S, N, Col, C, CD_I) -> #xmerl_scanner{common_data = CD, line = Line0} = S, Line = Line0 + 1, try $< = hd(T), {done,{element(N, element(CD_I, CD)), T, S#xmerl_scanner{col = Col, line = Line}}} catch _:_ -> accumulate_whitespace(T, S, Line, Col, lists:duplicate(N, C)++"\n") end. %%% @spec accumulate_whitespace(T::string(),S::global_state(), %%% atom(),Acc::string()) -> {Acc, T1, S1} %%% %%% @doc Function to accumulate and normalize whitespace. accumulate_whitespace(T, S, preserve, Acc) -> accumulate_whitespace(T, S, Acc); accumulate_whitespace(T, S, normalize, Acc) -> {_WsAcc, T1, S1} = accumulate_whitespace(T, S, []), {[$\s|Acc], T1, S1}. accumulate_whitespace(T, S, Acc) -> #xmerl_scanner{line = Line, col = Col} = S, accumulate_whitespace(T, S, Line, Col, Acc). accumulate_whitespace([], S0, Line, Col, Acc) -> #xmerl_scanner{continuation_fun = F} = S0, S = S0#xmerl_scanner{line = Line, col = Col}, ?dbg("cont()...~n", []), F(fun(MoreBytes, S1) -> accumulate_whitespace(MoreBytes, S1, Acc) end, fun(S1) -> {Acc, [], S1} end, S); accumulate_whitespace("\s" ++ T, S, Line, Col, Acc) -> accumulate_whitespace(T, S, Line, Col+1, [$\s|Acc]); accumulate_whitespace("\t" ++ T, S, Line, Col, Acc) -> accumulate_whitespace(T, S, Line, expand_tab(Col), [$\t|Acc]); accumulate_whitespace("\n" ++ T, S, Line, _Col, Acc) -> accumulate_whitespace(T, S, Line+1, 1, [$\n|Acc]); accumulate_whitespace("\r\n" ++ T, S, Line, _Col, Acc) -> %% CR followed by LF is read as a single LF accumulate_whitespace(T, S, Line+1, 1, [$\n|Acc]); accumulate_whitespace("\r" ++ T, S, Line, _Col, Acc) -> %% CR not followed by LF is read as a LF accumulate_whitespace(T, S, Line+1, 1, [$\n|Acc]); accumulate_whitespace(Str, S, Line, Col, Acc) -> {Acc, Str, S#xmerl_scanner{line = Line, col = Col}}. expand_tab(Col) -> Rem = (Col-1) rem 8, _NewCol = Col + 8 - Rem. %% validation_mode(Validation) %% Validation = off | dtd | schema | true | false %% true and false are obsolete validation_mode(false) -> off; validation_mode(true) -> dtd; validation_mode(Other) -> Other. schemaLocations(El,#xmerl_scanner{schemaLocation=[]}) -> schemaLocations(El); schemaLocations(El,#xmerl_scanner{schemaLocation=SL}) -> case SL of [{_,_}|_] -> {ok,SL}; _ -> schemaLocations(El) end. schemaLocations(#xmlElement{attributes=Atts,xmlbase=_Base}) -> Pred = fun(#xmlAttribute{name=schemaLocation}) -> false; (#xmlAttribute{nsinfo={_,"schemaLocation"}}) -> false; (_) -> true end, case lists:dropwhile(Pred,Atts) of [#xmlAttribute{value=Paths}|_] -> case string:tokens(Paths," \n\t\r") of L when length(L) > 0 -> case length(L) rem 2 of 0 -> PairList = fun([],_Fun) -> []; ([SLNS,SLLoc|Rest],Fun) -> [{SLNS,SLLoc}|Fun(Rest,Fun)] end, {ok,PairList(L,PairList)}; _ -> {error,{schemaLocation_attribute,namespace_location_not_in_pair}} end; _ -> {error,{missing_schemaLocation}} end; [] -> {error,{missing_schemaLocation}} end. inherit_options(S) -> %%?dbg("xsdbase: ~p~n",[S#xmerl_scanner.xmlbase]), [{xsdbase,S#xmerl_scanner.xmlbase}]. handle_schema_result({XSDRes=#xmlElement{},_},S5) -> {XSDRes,S5}; handle_schema_result({error,Reason},S5) -> ?fatal({failed_schema_validation,Reason},S5). %%% Helper functions -compile({inline, [fatal_fun/1]}). -spec fatal_fun(_) -> fun((_) -> no_return()). fatal_fun(Reason) -> fun(S) -> ?fatal(Reason, S) end. fatal(Reason, S) -> exit({fatal, {Reason, {file,S#xmerl_scanner.filename}, {line,S#xmerl_scanner.line}, {col,S#xmerl_scanner.col}}}). %% preformat formats tokens in L1 and L2, L2 separated by Sep into a %% list preformat(L1,L2,Sep) -> Format1= lists:flatten(lists:duplicate(length(L1)-1,"~s ")++"~s"), Format2 = lists:flatten(lists:duplicate(length(L2)-1, " ~s"++Sep)++" ~s"), lists:flatten(io_lib:format(Format1++Format2,L1++L2)). %% BUG when we are many balise none attributes has save in rules rules_write(Context, Name, Value, #xmerl_scanner{rules = T} = S) -> case ets:lookup(T, {Context, Name}) of [] -> ets:insert(T, {{Context, Name}, Value}); _ -> ok end, S. rules_read(Context, Name, #xmerl_scanner{rules = T}) -> case ets:lookup(T, {Context, Name}) of [] -> undefined; [{_, V}] -> V end. rules_delete(Context,Name,#xmerl_scanner{rules = T}) -> ets:delete(T,{Context,Name}). to_ucs(Encoding, Chars) when Encoding=="utf-8"; Encoding == undefined -> utf8_2_ucs(Chars); to_ucs(_,[C|Rest]) -> {C,Rest}. utf8_2_ucs([A,B,C,D|Rest]) when A band 16#f8 =:= 16#f0, B band 16#c0 =:= 16#80, C band 16#c0 =:= 16#80, D band 16#c0 =:= 16#80 -> %% 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv case ((D band 16#3f) bor ((C band 16#3f) bsl 6) bor ((B band 16#3f) bsl 12) bor ((A band 16#07) bsl 18)) of Ch when Ch >= 16#10000 -> {Ch,Rest}; Ch -> {{error,{bad_character,Ch}},Rest} end; utf8_2_ucs([A,B,C|Rest]) when A band 16#f0 =:= 16#e0, B band 16#c0 =:= 16#80, C band 16#c0 =:= 16#80 -> %% 1110vvvv 10vvvvvv 10vvvvvv case ((C band 16#3f) bor ((B band 16#3f) bsl 6) bor ((A band 16#0f) bsl 12)) of Ch when Ch >= 16#800 -> {Ch,Rest}; Ch -> {{error,{bad_character,Ch}},Rest} end; utf8_2_ucs([A,B|Rest]) when A band 16#e0 =:= 16#c0, B band 16#c0 =:= 16#80 -> %% 110vvvvv 10vvvvvv case ((B band 16#3f) bor ((A band 16#1f) bsl 6)) of Ch when Ch >= 16#80 -> {Ch,Rest}; Ch -> {{error,{bad_character,Ch}},Rest} end; utf8_2_ucs([A|Rest]) when A < 16#80 -> {A,Rest}; utf8_2_ucs([A|Rest]) -> {{error,{bad_character,A}},Rest}. %% to_char_set("iso-10646-utf-1",Ch) -> %% [Ch]; %% to_char_set(UTF8,Ch) when UTF8 =:= "utf-8"; UTF8 =:= undefined -> %% ucs_2_utf8(Ch); %% to_char_set(_,Ch) -> %% [Ch]. ucs_2_utf8(Ch) when Ch < 128 -> %% 0vvvvvvv [Ch]; ucs_2_utf8(Ch) when Ch < 16#0800 -> %% Ch: -----vvv vvvvvvvv %% 110vvvvv 10vvvvvv %% O1 = (Ch band 16#07c0) bsr 6, %% O2 = (Ch band 16#003f), [((Ch band 16#07c0) bsr 6) bor 16#c0,(Ch band 16#003f) bor 16#80]; ucs_2_utf8(Ch) when Ch < 16#10000 -> %% Ch: vvvvvvvv vvvvvvvv %% 1110vvvv 10vvvvvv 10vvvvvv %% O1 = (Ch band 16#f000) bsr 12 %% O2 = (Ch band 16#0fc0) bsr 6 %% O3 = (Ch band 16#003f) [((Ch band 16#f000) bsr 12) bor 16#e0, ((Ch band 16#0fc0) bsr 6) bor 16#80, (Ch band 16#003f) bor 16#80]; ucs_2_utf8(Ch) when Ch < 16#200000 -> %% Ch: ---vvvvv vvvvvvvv vvvvvvvv %% 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv %% O1 = (Ch band 16#1c0000) bsr 18 %% O2 = (Ch band 16#03f000) bsr 12 %% O3 = (Ch band 16#000fc0) bsr 6 %% O4 = (Ch band 16#00003f) [((Ch band 16#1c0000) bsr 18) bor 16#f0, ((Ch band 16#03f000) bsr 12) bor 16#80, ((Ch band 16#000fc0) bsr 6) bor 16#80, (Ch band 16#00003f) bor 16#80]. string_to_char_set(Enc,Str) when Enc =:= "utf-8"; Enc =:= undefined -> lists:flatten([ucs_2_utf8(X)||X <- Str]); string_to_char_set(_,Str) -> Str. %% diagnose(Line) -> %% Mem=erlang:memory(), %% {OldTot,OldLine} = get_total(), %% NewTot = %% case {lists:keysearch(total,1,Mem),OldTot*1.1} of %% {{_,{_,Tot}},Tot110} when Tot > Tot110 -> %% ?dbg("From ~p to ~p, total memory: ~p (~p)~n",[OldLine,Line,Tot,OldTot]), %% Tot; %% {{_,{_,Tot}},_} -> %% Tot %% end, %% put_total({NewTot,Line}). %% get_total() -> %% case get(xmerl_mem) of %% undefined -> %% put(xmerl_mem,{0,0}), %% {0,0}; %% M -> M %% end. %% put_total(M) -> %% put(xmerl_mem,M).