%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2003-2010. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%
%% Description : Simgle-pass XML scanner. See xmerl.hrl for data defs.
%% @doc This module is the interface to the XML parser, it handles XML 1.0.
%% The XML parser is activated through
%% xmerl_scan:string/[1,2] or
%% xmerl_scan:file/[1,2].
%% It returns records of the type defined in xmerl.hrl.
%% See also tutorial on customization
%% functions.
%% @type global_state().
%% The global state of the scanner, represented by the #xmerl_scanner{} record.
%%
%% @type option_list(). Options allow to customize the behaviour of the
%% scanner.
%% See also tutorial on customization
%% functions.
%%
%% Possible options are:
%%
%% {acc_fun, Fun}
%% - Call back function to accumulate contents of entity.
%% {continuation_fun, Fun} |
%% {continuation_fun, Fun, ContinuationState}
%% - Call back function to decide what to do if the scanner runs into EOF
%% before the document is complete.
%% {event_fun, Fun} |
%% {event_fun, Fun, EventState}
%% - Call back function to handle scanner events.
%% {fetch_fun, Fun} |
%% {fetch_fun, Fun, FetchState}
%% - Call back function to fetch an external resource.
%% {hook_fun, Fun} |
%% {hook_fun, Fun, HookState}
%% - Call back function to process the document entities once
%% identified.
%% {close_fun, Fun}
%% - Called when document has been completely parsed.
%% {rules, ReadFun, WriteFun, RulesState} |
%% {rules, Rules}
%% - Handles storing of scanner information when parsing.
%% {user_state, UserState}
%% - Global state variable accessible from all customization functions
%%
%% {fetch_path, PathList}
%% - PathList is a list of
%% directories to search when fetching files. If the file in question
%% is not in the fetch_path, the URI will be used as a file
%% name.
%% {space, Flag}
%% - 'preserve' (default) to preserve spaces, 'normalize' to
%% accumulate consecutive whitespace and replace it with one space.
%% {line, Line}
%% - To specify starting line for scanning in document which contains
%% fragments of XML.
%% {namespace_conformant, Flag}
%% - Controls whether to behave as a namespace conformant XML parser,
%% 'false' (default) to not otherwise 'true'.
%% {validation, Flag}
%% - Controls whether to process as a validating XML parser:
%% 'off' (default) no validation, or validation 'dtd' by DTD or 'schema'
%% by XML Schema. 'false' and 'true' options are obsolete
%% (i.e. they may be removed in a future release), if used 'false'
%% equals 'off' and 'true' equals 'dtd'.
%% {schemaLocation, [{Namespace,Link}|...]}
%% - Tells explicitly which XML Schema documents to use to validate
%% the XML document. Used together with the
%%
{validation,schema}
option.
%% {quiet, Flag}
%% - Set to 'true' if xmerl should behave quietly and not output any
%% information to standard output (default 'false').
%% {doctype_DTD, DTD}
%% - Allows to specify DTD name when it isn't available in the XML
%% document. This option has effect only together with
%%
{validation,'dtd'
option.
%% {xmlbase, Dir}
%% - XML Base directory. If using string/1 default is current directory.
%% If using file/1 default is directory of given file.
%% {encoding, Enc}
%% - Set default character set used (default UTF-8).
%% This character set is used only if not explicitly given by the XML
%% declaration.
%%
-module(xmerl_scan).
-vsn('0.20').
-date('03-09-16').
%% main API
-export([string/1, string/2,
file/1, file/2]).
%% access functions for various states
-export([user_state/1, user_state/2,
event_state/1, event_state/2,
hook_state/1, hook_state/2,
rules_state/1, rules_state/2,
fetch_state/1, fetch_state/2,
cont_state/1, cont_state/2]).
%% helper functions. To xmerl_lib ??
-export([accumulate_whitespace/4]).
%-define(debug, 1).
-include("xmerl.hrl"). % record def, macros
-include("xmerl_internal.hrl").
-include_lib("kernel/include/file.hrl").
-define(fatal(Reason, S),
if
S#xmerl_scanner.quiet ->
ok;
true ->
ok=io:format("~p- fatal: ~p~n", [?LINE, Reason])
end,
fatal(Reason, S)).
-define(ustate(U, S), S#xmerl_scanner{user_state = U}).
%% Functions to access the various states
%%% @spec user_state(S::global_state()) -> global_state()
%%% @equiv user_state(UserState,S)
user_state(#xmerl_scanner{user_state = S}) -> S.
%%% @spec event_state(S::global_state()) -> global_state()
%%% @equiv event_state(EventState,S)
event_state(#xmerl_scanner{fun_states = #xmerl_fun_states{event = S}}) -> S.
%%% @spec hook_state(S::global_state()) -> global_state()
%%% @equiv hook_state(HookState,S)
hook_state(#xmerl_scanner{fun_states = #xmerl_fun_states{hook = S}}) -> S.
%%% @spec rules_state(S::global_state()) -> global_state()
%%% @equiv rules_state(RulesState,S)
rules_state(#xmerl_scanner{fun_states = #xmerl_fun_states{rules = S}}) -> S.
%%% @spec fetch_state(S::global_state()) -> global_state()
%%% @equiv fetch_state(FetchState,S)
fetch_state(#xmerl_scanner{fun_states = #xmerl_fun_states{fetch = S}}) -> S.
%%% @spec cont_state(S::global_state()) -> global_state()
%%% @equiv cont_state(ContinuationState,S)
cont_state(#xmerl_scanner{fun_states = #xmerl_fun_states{cont = S}}) -> S.
%%%% Functions to modify the various states
%%% @spec user_state(UserState, S::global_state()) -> global_state()
%%% @doc For controlling the UserState, to be used in a user function.
%%% See tutorial on customization functions.
user_state(X, S) ->
S#xmerl_scanner{user_state = X}.
%%% @spec event_state(EventState, S::global_state()) -> global_state()
%%% @doc For controlling the EventState, to be used in an event
%%% function, and called at the beginning and at the end of a parsed entity.
%%% See tutorial on customization functions.
event_state(X, S=#xmerl_scanner{fun_states = FS}) ->
FS1 = FS#xmerl_fun_states{event = X},
S#xmerl_scanner{fun_states = FS1}.
%%% @spec hook_state(HookState, S::global_state()) -> global_state()
%%% @doc For controlling the HookState, to be used in a hook
%%% function, and called when the parser has parsed a complete entity.
%%% See tutorial on customization functions.
hook_state(X, S=#xmerl_scanner{fun_states = FS}) ->
FS1 = FS#xmerl_fun_states{hook = X},
S#xmerl_scanner{fun_states = FS1}.
%%% @spec rules_state(RulesState, S::global_state()) -> global_state()
%%% @doc For controlling the RulesState, to be used in a rules
%%% function, and called when the parser store scanner information in a rules
%%% database.
%%% See tutorial on customization functions.
rules_state(X, S=#xmerl_scanner{fun_states = FS}) ->
FS1 = FS#xmerl_fun_states{rules = X},
S#xmerl_scanner{fun_states = FS1}.
%%% @spec fetch_state(FetchState, S::global_state()) -> global_state()
%%% @doc For controlling the FetchState, to be used in a fetch
%%% function, and called when the parser fetch an external resource (eg. a DTD).
%%% See tutorial on customization functions.
fetch_state(X, S=#xmerl_scanner{fun_states = FS}) ->
FS1 = FS#xmerl_fun_states{fetch = X},
S#xmerl_scanner{fun_states = FS1}.
%%% @spec cont_state(ContinuationState, S::global_state()) -> global_state()
%%% @doc For controlling the ContinuationState, to be used in a continuation
%%% function, and called when the parser encounters the end of the byte stream.
%%% See tutorial on customization functions.
cont_state(X, S=#xmerl_scanner{fun_states = FS}) ->
FS1 = FS#xmerl_fun_states{cont = X},
S#xmerl_scanner{fun_states = FS1}.
%% @spec file(Filename::string()) -> {xmlElement(),Rest}
%% Rest = list()
%% @equiv file(Filename, [])
file(F) ->
file(F, []).
%% @spec file(Filename::string(), Options::option_list()) -> {xmlElement(),Rest}
%% Rest = list()
%%% @doc Parse file containing an XML document
file(F, Options) ->
ExtCharset=case lists:keysearch(encoding,1,Options) of
{value,{_,Val}} -> Val;
false -> undefined
end,
case int_file(F,Options,ExtCharset) of
{Res, Tail,S=#xmerl_scanner{close_fun=Close}} ->
Close(S), % for side effects only - final state is dropped
{Res,Tail};
{error, Reason} ->
{error, Reason}
end.
int_file(F, Options,_ExtCharset) ->
%%io:format("int_file F=~p~n",[F]),
case file:read_file(F) of
{ok, Bin} ->
int_string(binary_to_list(Bin), Options, filename:dirname(F),F);
Error ->
Error
end.
int_file_decl(F, Options,_ExtCharset) ->
% io:format("int_file_decl F=~p~n",[F]),
case file:read_file(F) of
{ok, Bin} ->
int_string_decl(binary_to_list(Bin), Options, filename:dirname(F),F);
Error ->
Error
end.
%% @spec string(Text::list()) -> {xmlElement(),Rest}
%% Rest = list()
%% @equiv string(Test, [])
string(Str) ->
string(Str, []).
%% @spec string(Text::list(),Options::option_list()) -> {xmlElement(),Rest}
%% Rest = list()
%%% @doc Parse string containing an XML document
string(Str, Options) ->
{Res, Tail, S=#xmerl_scanner{close_fun = Close}} =
int_string(Str, Options,file_name_unknown),
Close(S), % for side effects only - final state is dropped
{Res,Tail}.
int_string(Str, Options,FileName) ->
{ok, XMLBase} = file:get_cwd(),
int_string(Str, Options, XMLBase, FileName).
int_string(Str, Options, XMLBase, FileName) ->
S0=initial_state0(Options,XMLBase),
S = S0#xmerl_scanner{filename=FileName},
%%io:format("int_string1, calling xmerl_lib:detect_charset~n",[]),
%% In case of no encoding attribute in document utf-8 is default, but
%% another character set may be detected with help of Byte Order Marker or
%% with help of the encoding of the first 4 bytes.
case xmerl_lib:detect_charset(S#xmerl_scanner.encoding,Str) of
{auto,'iso-10646-utf-1',Str2} ->
scan_document(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"});
{external,'iso-10646-utf-1',Str2} ->
scan_document(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"});
{undefined,undefined,Str2} -> %% no auto detection
scan_document(Str2, S);
{external,ExtCharset,Str2} ->
%% no auto detection, ExtCharset is an explicitly provided
%% 7 bit,8 bit or utf-8 encoding
scan_document(Str2, S#xmerl_scanner{encoding=atom_to_list(ExtCharset)})
end.
int_string_decl(Str, Options, XMLBase, FileName) ->
S0=initial_state0(Options,XMLBase),
S = S0#xmerl_scanner{filename=FileName},
case xmerl_lib:detect_charset(S#xmerl_scanner.encoding,Str) of
{auto,'iso-10646-utf-1',Str2} ->
scan_decl(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"});
{external,'iso-10646-utf-1',Str2} ->
scan_decl(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"});
{undefined,undefined,Str2} ->
scan_decl(Str2, S);
{external,ExtCharset,Str2} ->
scan_decl(Str2, S#xmerl_scanner{encoding=atom_to_list(ExtCharset)})
end.
initial_state0(Options,XMLBase) ->
CommonData = common_data(),
initial_state(Options, #xmerl_scanner{
event_fun = fun event/2,
hook_fun = fun hook/2,
acc_fun = fun acc/3,
fetch_fun = fun fetch/2,
close_fun = fun close/1,
continuation_fun = fun cont/3,
rules_read_fun = fun rules_read/3,
rules_write_fun = fun rules_write/4,
rules_delete_fun= fun rules_delete/3,
xmlbase = XMLBase,
common_data = CommonData
}).
initial_state([{event_fun, F}|T], S) ->
initial_state(T, S#xmerl_scanner{event_fun = F});
initial_state([{event_fun, F, ES}|T], S) ->
S1 = event_state(ES, S#xmerl_scanner{event_fun = F}),
initial_state(T, S1);
initial_state([{acc_fun, F}|T], S) ->
initial_state(T, S#xmerl_scanner{acc_fun = F});
initial_state([{hook_fun, F}|T], S) ->
initial_state(T, S#xmerl_scanner{hook_fun = F});
initial_state([{hook_fun, F, HS}|T], S) ->
S1 = hook_state(HS, S#xmerl_scanner{hook_fun = F}),
initial_state(T, S1);
initial_state([{close_fun, F}|T], S) ->
initial_state(T, S#xmerl_scanner{close_fun = F});
initial_state([{fetch_fun, F}|T], S) ->
initial_state(T, S#xmerl_scanner{fetch_fun = F});
initial_state([{fetch_fun, F, FS}|T], S) ->
S1 = fetch_state(FS, S#xmerl_scanner{fetch_fun = F}),
initial_state(T, S1);
initial_state([{fetch_path, P}|T], S) ->
initial_state(T, S#xmerl_scanner{fetch_path = P});
initial_state([{continuation_fun, F}|T], S) ->
initial_state(T, S#xmerl_scanner{continuation_fun = F});
initial_state([{continuation_fun, F, CS}|T], S) ->
S1 = cont_state(CS, S#xmerl_scanner{continuation_fun = F}),
initial_state(T, S1);
initial_state([{rules, R}|T], S) ->
initial_state(T, S#xmerl_scanner{rules = R,
keep_rules = true});
initial_state([{rules, Read, Write, RS}|T], S) ->
S1 = rules_state(RS, S#xmerl_scanner{rules_read_fun = Read,
rules_write_fun = Write,
keep_rules = true}),
initial_state(T, S1);
initial_state([{user_state, F}|T], S) ->
initial_state(T, S#xmerl_scanner{user_state = F});
initial_state([{space, L}|T], S) ->
initial_state(T, S#xmerl_scanner{space = L});
initial_state([{line, L}|T], S) ->
initial_state(T, S#xmerl_scanner{line = L});
initial_state([{namespace_conformant, F}|T], S) when F==true; F==false ->
initial_state(T, S#xmerl_scanner{namespace_conformant = F});
initial_state([{validation, F}|T], S)
when F==off; F==dtd; F==schema; F==true; F==false ->
initial_state(T, S#xmerl_scanner{validation = validation_value(F)});
initial_state([{schemaLocation, SL}|T], S) when is_list(SL) ->
initial_state(T, S#xmerl_scanner{schemaLocation=SL});
initial_state([{quiet, F}|T], S) when F==true; F==false ->
initial_state(T, S#xmerl_scanner{quiet = F});
initial_state([{doctype_DTD,DTD}|T], S) ->
initial_state(T,S#xmerl_scanner{doctype_DTD = DTD});
initial_state([{text_decl,Bool}|T], S) ->
initial_state(T,S#xmerl_scanner{text_decl=Bool});
initial_state([{environment,Env}|T], S) ->
initial_state(T,S#xmerl_scanner{environment=Env});
initial_state([{xmlbase, D}|T], S) ->
initial_state(T, S#xmerl_scanner{xmlbase = D});
initial_state([{encoding, Enc}|T], S) ->
initial_state(T, S#xmerl_scanner{encoding = Enc});
initial_state([], S=#xmerl_scanner{rules = undefined}) ->
Tab = ets:new(rules, [set, public]),
S#xmerl_scanner{rules = Tab};
initial_state([], S) ->
S.
validation_value(true) ->
dtd;
validation_value(false) ->
off;
validation_value(F) ->
F.
%% Used for compacting (some) indentations.
%% See also fast_accumulate_whitespace().
common_data() ->
{comdata(lists:duplicate(60, $\s), []),
comdata(lists:duplicate(15, $\t), []),
"\n"}.
comdata([], CD)->
list_to_tuple(CD);
comdata([_ | T]=L, CD) ->
comdata(T, [[$\n | L] | CD]).
%%% -----------------------------------------------------
%%% Default modifier functions
%%% Hooks:
%%% - {element, Line, Name, Attrs, Content}
%%% - {processing_instruction, Line, Data}
hook(X, State) ->
{X, State}.
%%% Events:
%%%
%%% #xmerl_event{event : started | ended,
%%% line : integer(),
%%% col : integer(),
%%% data}
%%%
%%% Data Events
%%% document started, ended
%%% #xmlElement started, ended
%%% #xmlAttribute ended
%%% #xmlPI ended
%%% #xmlComment ended
%%% #xmlText ended
event(_X, S) ->
S.
%% The acc/3 function can return either {Acc´, S'} or {Acc', Pos', S'},
%% where Pos' can be derived from X#xmlElement.pos, X#xmlText.pos, or
%% X#xmlAttribute.pos (whichever is the current object type.)
%% The acc/3 function is not allowed to redefine the type of object
%% being defined, but _is_ allowed to either ignore it or split it
%% into multiple objects (in which case {Acc',Pos',S'} should be returned.)
%% If {Acc',S'} is returned, Pos will be incremented by 1 by default.
%% Below is an example of an acceptable operation
acc(X = #xmlText{value = Text}, Acc, S) ->
{[X#xmlText{value = Text}|Acc], S};
acc(X, Acc, S) ->
{[X|Acc], S}.
fetch({system, URI}, S) ->
fetch_URI(URI, S);
fetch({public, _PublicID, URI}, S) ->
fetch_URI(URI, S).
%%% Always assume an external resource can be found locally! Thus
%%% don't bother fetching with e.g. HTTP. Returns the path where the
%%% resource is found. The path to the external resource is given by
%%% URI directly or the option fetch_path (additional paths) or
%%% directory (base path to external resource)
fetch_URI(URI, S) ->
%% assume URI is a filename
Split = filename:split(URI),
Filename = fun([])->[];(X)->lists:last(X) end (Split),
Fullname =
case Split of %% how about Windows systems?
["file:"|Name]-> %% absolute path, see RFC2396 sect 3
%% file:/dtd_name
filename:join(["/"|Name]);
["/"|Rest] when Rest /= [] ->
%% absolute path name
URI;
["http:"|_Rest] ->
{http,URI};
[] -> %% empty systemliteral
[];
_ ->
filename:join(S#xmerl_scanner.xmlbase, URI)
end,
Path = path_locate(S#xmerl_scanner.fetch_path, Filename, Fullname),
?dbg("fetch(~p) -> {file, ~p}.~n", [URI, Path]),
{ok, Path, S}.
path_locate(_, _, {http,_}=URI) ->
URI;
path_locate(_, _, []) ->
[];
path_locate([Dir|Dirs], FN, FullName) ->
F = filename:join(Dir, FN),
case file:read_file_info(F) of
{ok, #file_info{type = regular}} ->
{file,F};
_ ->
path_locate(Dirs, FN, FullName)
end;
path_locate([], _FN, FullName) ->
{file,FullName}.
cont(_F, Exception, US) ->
Exception(US).
close(S) ->
S.
%%% -----------------------------------------------------
%%% Scanner
%%% [1] document ::= prolog element Misc*
scan_document(Str0, S=#xmerl_scanner{event_fun = Event,
line = L, col = C,
environment=Env,
encoding=Charset,
validation=ValidateResult}) ->
S1 = Event(#xmerl_event{event = started,
line = L,
col = C,
data = document}, S),
%% Transform to given character set.
%% Note that if another character set is given in the encoding
%% attribute in a XML declaration that one will be used later
Str=if
Charset == "utf-8" ->
Str0;
Charset=/=undefined -> % Default character set is UTF-8
xmerl_ucs:to_unicode(Str0,list_to_atom(Charset));
true -> %% Charset is undefined if no external input is
%% given, and no auto detection of character
%% encoding was made.
Str0
end,
%% M1 = erlang:memory(),
%% io:format("Memory status before prolog: ~p~n",[M1]),
{T1, S2} = scan_prolog(Str, S1, _StartPos = 1),
%% M2 = erlang:memory(),
%% io:format("Memory status after prolog: ~p~n",[M2]),
%%io:format("scan_document 2, prolog parsed~n",[]),
T2 = scan_mandatory("<",T1,1,S2,expected_element_start_tag),
%% M3 = erlang:memory(),
%% io:format("Memory status before element: ~p~n",[M3]),
{Res, T3, S3} =scan_element(T2,S2,_StartPos = 1),
%% M4 = erlang:memory(),
%% io:format("Memory status after element: ~p~n",[M4]),
{Tail, S4}=scan_misc(T3, S3, _StartPos = 1),
%% M5 = erlang:memory(),
%% io:format("Memory status after misc: ~p~n",[M5]),
S5 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
line = S4#xmerl_scanner.line,
col = S4#xmerl_scanner.col,
data = document}, S4),
{Res2,S6} = case validation_mode(ValidateResult) of
off ->
{Res,cleanup(S5)};
dtd when Env == element; Env == prolog ->
check_decl2(S5),
case xmerl_validate:validate(S5,Res) of
{'EXIT',{error,Reason}} ->
S5b=cleanup(S5),
?fatal({failed_validation,Reason}, S5b);
{'EXIT',Reason} ->
S5b=cleanup(S5),
?fatal({failed_validation,Reason}, S5b);
{error,Reason} ->
S5b=cleanup(S5),
?fatal({failed_validation,Reason}, S5b);
{error,Reason,_Next} ->
S5b=cleanup(S5),
?fatal({failed_validation,Reason}, S5b);
_XML ->
{Res,cleanup(S5)}
end;
schema ->
case schemaLocations(Res,S5) of
{ok,Schemas} ->
cleanup(S5),
%%io:format("Schemas: ~p~nRes: ~p~ninhertih_options(S): ~p~n",
%% [Schemas,Res,inherit_options(S5)]),
XSDRes = xmerl_xsd:process_validate(Schemas,Res,
inherit_options(S5)),
handle_schema_result(XSDRes,S5);
_ ->
{Res,cleanup(S5)}
end;
_ ->
{Res,cleanup(S5)}
end,
{Res2, Tail, S6}.
scan_decl(Str, S=#xmerl_scanner{event_fun = Event,
line = L, col = C,
environment=_Env,
encoding=_Charset,
validation=_ValidateResult}) ->
S1 = Event(#xmerl_event{event = started,
line = L,
col = C,
data = document}, S),
case scan_prolog(Str, S1, _StartPos = 1) of
{T2="<"++_, S2} ->
{{S2#xmerl_scanner.user_state,T2},[],S2};
{[], S2}->
{[],[],S2};
{T2, S2} ->
{_,_,S3} = scan_content(T2,S2,[],_Attrs=[],S2#xmerl_scanner.space,
_Lang=[],_Parents=[],#xmlNamespace{}),
{T2,[],S3}
end.
%%% [22] Prolog
%%% prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
%%%
%% empty text declarations are handled by the first function clause.
scan_prolog([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
?dbg("cont()...~n", []),
F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos) end,
fun(S1) -> {[], S1} end,
S);
scan_prolog("
{Charset,T3, S3}=
if
Col==1,L==1,S0#xmerl_scanner.text_decl==true ->
?dbg("prolog(\"
?dbg("prolog(\"
?fatal({xml_declaration_must_be_first_in_doc,Col,L},S0)
end,
%% Charset0 is either (1) 'iso-10646-utf-1' (transformation by
%% auto detection), (2) undefined (no auto detection and no
%% external encoding), (3) any other encoding format that must be
%% conformant to the internal explicitly given encoding. The two
%% former cases implies that the explicit internal encoding
%% (Charset) may be different from Charset0.
%% Now transform to declared character set.
if
Charset==Charset0 -> % Document already transformed to this charset!
scan_prolog(T3, S3, Pos);
Charset0=/=undefined ->
%% For example may an external entity
%% have the BOM for utf-16 and the internal
%% explicit encoding='utf-16', then it will be auto
%% detected and transformed, Charset0 will be
%% 'iso-10646-utf-1', and Charset will be 'utf-16', all
%% legal.
%%
scan_prolog(T3,S3#xmerl_scanner{encoding=Charset0},Pos);
Charset == "utf-8" ->
scan_prolog(T3, S3, Pos);
Charset=/=undefined -> % Document not previously transformed
T4=xmerl_ucs:to_unicode(T3,list_to_atom(Charset)),
scan_prolog(T4, S3, Pos);
true -> % No encoding info given
scan_prolog(T3, S3, Pos)
end;
scan_prolog("
?dbg("prolog(\" xmerl_ucs:to_unicode(T,'utf-8');
true -> T
end,
{T2, S1} = scan_doctype(T1, S),
scan_misc(T2, S1, Pos);
scan_prolog(Str="%"++_T,S=#xmerl_scanner{environment={external,_}},_Pos) ->
scan_ext_subset(Str,S);
scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=_Charset},Pos) ->
?dbg("prolog(\"<\")~n", []),
%% Check for Comments, PI before possible DOCTYPE declaration
?bump_col(1),
%% If no known character set assume it is UTF-8
T=if
%% Charset==undefined -> xmerl_ucs:to_unicode(Str,'utf-8');
true -> Str
end,
{T1, S1}=scan_misc(T, S, Pos),
scan_prolog2(T1,S1,Pos).
scan_prolog2([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
?dbg("cont()...~n", []),
F(fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos) end,
fun(S1) -> {[], S1} end,
S);
scan_prolog2("
?dbg("prolog(\"
?dbg("prolog(\"
?dbg("prolog(\"<\")~n", []),
%% Here we consider the DTD provided by doctype_DTD option,
S1 =
case S0 of
#xmerl_scanner{validation=dtd,doctype_DTD=DTD} when is_list(DTD) ->
S=fetch_DTD(undefined,S0),
check_decl(S),
S;
_ -> S0
end,
%% Check for more Comments and PI after DOCTYPE declaration
% ?bump_col(1),
scan_misc(Str, S1, Pos).
%%% [27] Misc ::= Comment | PI | S
%% Note:
%% - Neither of Comment and PI are returned in the resulting parsed
%% structure.
%% - scan_misc/3 implements Misc* as that is how the rule is always used
scan_misc([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
?dbg("cont()...~n", []),
F(fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos) end,
fun(S1) -> {[], S1} end,
S);
scan_misc(""++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
?bump_col(3),
scan_entity_value(T,S,Delim,["-->"|Acc],PEName,NS,
pe_pop("-->",PENesting,S));
%% Stop delimeter for ConditionalSection
scan_entity_value("]]>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
?bump_col(3),
scan_entity_value(T,S,Delim,["]]>"|Acc],PEName,NS,
pe_pop("]]>",PENesting,S));
%% Stop delimeter added to match a content start delimeter included
scan_entity_value("/>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
?bump_col(2),
scan_entity_value(T,S,Delim,["/>"|Acc],PEName,NS,
pe_pop("/>",PENesting,S));
scan_entity_value(")"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
?bump_col(1),
scan_entity_value(T,S,Delim,[")"|Acc],PEName,NS,
pe_pop(")",PENesting,S));
scan_entity_value("\n"++T, S, Delim, Acc, PEName,Namespace,PENesting) ->
scan_entity_value(T, S#xmerl_scanner{line=S#xmerl_scanner.line+1},
Delim, ["\n"|Acc], PEName,Namespace,PENesting);
scan_entity_value(Str, S0, Delim, Acc, PEName,Namespace,PENesting) ->
{Ch,T} = to_ucs(S0#xmerl_scanner.encoding,Str),
case xmerl_lib:is_char(Ch) of
true ->
?bump_col(1),
scan_entity_value(T, S, Delim, [Ch|Acc], PEName,Namespace,PENesting);
false ->
?fatal({unexpected_char,Ch}, S0)
end.
save_refed_entity_name(Name,PEName,S) ->
case predefined_entity(Name) of
true ->
S;
_ ->
save_refed_entity_name1(Name,PEName,S)
end.
save_refed_entity_name1(Name,PEName,
S=#xmerl_scanner{entity_references=ERefs}) ->
case lists:keysearch(PEName,1,ERefs) of
{value,{_,Refs}} ->
NewRefs =
case lists:member(Name,Refs) of
true ->Refs;
_ -> [Name|Refs]
end,
S#xmerl_scanner{entity_references=lists:keyreplace(PEName,1,ERefs,
{PEName,NewRefs})
};
_ ->
S#xmerl_scanner{entity_references=[{PEName,[Name]}|ERefs]}
end.
pe_push(Tok,Stack,_S) when Tok=="
[Tok|Stack];
pe_push(Tok,Stack,#xmerl_scanner{validation=dtd})
when Tok==")";Tok==">";Tok=="?>";Tok=="]]>";Tok=="-->";Tok=="/>"->
[Tok|Stack];
pe_push(_,Stack,_S) ->
Stack.
pe_pop(">",[" Rest;
pe_pop("?>",[""|Rest],_S) -> Rest;
pe_pop("-->",[""++_T,parameter,dtd) -> "-->";
pe_nesting_token("]]>"++_T,parameter,dtd) -> "]]>";
pe_nesting_token(")"++_T,parameter,dtd) -> ")";
pe_nesting_token("/>"++_T,parameter,dtd) -> "/>";
pe_nesting_token(_,_,_) -> false.
predefined_entity(amp) -> true;
predefined_entity(lt) -> true;
predefined_entity(gt) -> true;
predefined_entity(apos) -> true;
predefined_entity(quot) -> true;
predefined_entity(_) -> false.
check_entity_recursion(EName,
S=#xmerl_scanner{entity_references=EntityRefList}) ->
Set = sofs:family(EntityRefList),
case catch sofs:family_to_digraph(Set, [acyclic]) of
{'EXIT',{cyclic,_}} ->
?fatal({illegal_recursion_in_Entity, EName}, S);
DG ->
digraph:delete(DG),
ok
end.
%%%%%%% [15] Comment
scan_comment(Str, S) ->
scan_comment(Str, S, _Pos = undefined, _Parents = [], _Lang = []).
scan_comment(Str,S=#xmerl_scanner{col=C,event_fun=Event}, Pos, Parents, Lang) ->
Comment = #xmlComment{pos = Pos,
parents = Parents,
language = Lang,
value = undefined},
S1 = #xmerl_scanner{} = Event(#xmerl_event{event = started,
line = S#xmerl_scanner.line,
col = C,
pos = Pos,
data = Comment}, S),
scan_comment1(Str, S1, Pos, Comment, _Acc = []).
scan_comment1([], S=#xmerl_scanner{continuation_fun = F},
Pos, Comment, Acc) ->
?dbg("cont()...~n", []),
F(fun(MoreBytes, S1) -> scan_comment1(MoreBytes, S1, Pos, Comment, Acc) end,
fun(S1) -> ?fatal(unexpected_end, S1) end,
S);
scan_comment1("-->" ++ T, S0 = #xmerl_scanner{col = C,
event_fun = Event,
hook_fun = Hook},
_Pos, Comment, Acc) ->
?bump_col(3),
Comment1 = Comment#xmlComment{value = lists:reverse(Acc)},
S1=#xmerl_scanner{}=Event(#xmerl_event{event = ended,
line=S#xmerl_scanner.line,
col = C,
data = Comment1}, S),
{Ret, S2} = Hook(Comment1, S1),
{_,T3,S3}=strip(T,S2),
{Ret,T3,S3};
scan_comment1("--"++T,S,_Pos,_Comment,_Acc) ->
?fatal({invalid_comment,"--"++[hd(T)]}, S);
scan_comment1("\n" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) ->
scan_comment1(T, S#xmerl_scanner{line=L+1,col=1},Pos, Cmt, "\n" ++ Acc);
scan_comment1("\r\n" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) ->
%% CR followed by LF is read as a single LF
scan_comment1(T, S#xmerl_scanner{line=L+1,col=1}, Pos, Cmt, "\n" ++ Acc);
scan_comment1("\r" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) ->
%% CR not followed by LF is read as a LF
scan_comment1(T, S#xmerl_scanner{line=L+1,col=1}, Pos, Cmt, "\n" ++ Acc);
scan_comment1(Str, S=#xmerl_scanner{col = C}, Pos, Cmt, Acc) ->
{Ch,T} = wfc_legal_char(Str,S),
scan_comment1(T, S#xmerl_scanner{col=C+1}, Pos, Cmt, [Ch|Acc]).
%%%%%%%
scan_markup_completion_gt([$>|_R]=T,S) ->
{T,S};
scan_markup_completion_gt([$%|T],S0) ->
?bump_col(1),
{Name,T1,S1} = scan_pe_reference(T,S),
ExpandedRef = expand_pe_reference(Name,S1,as_PE),
{_,T2,S2} = strip(ExpandedRef++T1,S1),
scan_markup_completion_gt(T2,S2);
scan_markup_completion_gt(T,S) ->
?fatal({error,{malformed_syntax_entity_completion,T}},S).
scan_mandatory(Pattern,T,N,S,ErrorMsg) ->
case lists:prefix(Pattern,T) of
true ->
lists:nthtail(N,T);
_ ->
?fatal(ErrorMsg,S)
end.
strip(Str,S) ->
strip(Str,S,all).
strip([], S=#xmerl_scanner{continuation_fun = F},_) ->
?dbg("cont()... stripping whitespace~n", []),
F(fun(MoreBytes, S1) -> strip(MoreBytes, S1) end,
fun(S1) -> {[], [], S1} end,
S);
strip("\s" ++ T, S=#xmerl_scanner{col = C},Lim) ->
strip(T, S#xmerl_scanner{col = C+1},Lim);
strip("\t" ++ _T, S ,no_tab) ->
?fatal({error,{no_tab_allowed}},S);
strip("\t" ++ T, S=#xmerl_scanner{col = C},Lim) ->
strip(T, S#xmerl_scanner{col = expand_tab(C)},Lim);
strip("\n" ++ T, S=#xmerl_scanner{line = L},Lim) ->
strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim);
strip("\r\n" ++ T, S=#xmerl_scanner{line = L},Lim) ->
%% CR followed by LF is read as a single LF
strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim);
strip("\r" ++ T, S=#xmerl_scanner{line = L},Lim) ->
%% CR not followed by LF is read as a LF
strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim);
strip(Str, S,_Lim) ->
{[], Str, S}.
%% demands a whitespace, though a parameter entity is ok, it will
%% expand with a whitespace on each side.
mandatory_strip([],S) ->
?fatal({error,{whitespace_was_expected}},S);
mandatory_strip(T,S) when ?whitespace(hd(T)) ->
strip(T,S,all);
mandatory_strip([$%|T],S) when ?whitespace(hd(T)) -> %this is not a PERefence, but an PEDeclaration
?fatal({error,{whitespace_was_expected}},S);
mandatory_strip([$%|_T]=T,S) ->
{[],T,S};
mandatory_strip(_T,S) ->
?fatal({error,{whitespace_was_expected}},S).
%% strip but don't accept tab
pub_id_strip(Str, S) ->
strip(Str,S,no_tab).
normalize("&"++T,S,IsNorm) ->
case scan_reference(T, S) of
{ExpRef, T1, S1} when ?whitespace(hd(ExpRef)) ->
ExpRef2 = string_to_char_set(S#xmerl_scanner.encoding,ExpRef),
normalize(ExpRef2++T1,S1,IsNorm);
_ ->
{"&"++T,S,IsNorm}
end;
normalize(T,S,IsNorm) ->
case strip(T,S) of
{_,T,S} ->
{T,S,IsNorm};
{_,T1,S1} ->
{T1,S1,true}
end.
%% Optimization:
%% - avoid building list of spaces or tabs;
%% - avoid reverse;
%% - compact two common indentation patterns.
%% Note: only to be called when a \n was found.
fast_accumulate_whitespace(" " ++ T, S, _) ->
fast_acc_spaces(T, S, 1);
fast_accumulate_whitespace("\t"++T, S, _) ->
fast_acc_tabs(T, S, 1);
fast_accumulate_whitespace("<"++_=R, S, _T) ->
#xmerl_scanner{common_data = CD, line = Line} = S,
{done, {element(3, CD), R, S#xmerl_scanner{col = 1, line = Line + 1}}};
fast_accumulate_whitespace(_, S, T) ->
accumulate_whitespace(T, S, []).
fast_acc_spaces(" " ++ T, S, N) ->
fast_acc_spaces(T, S, N + 1);
fast_acc_spaces(T, S, N) ->
fast_acc_end(T, S, N, N, $\s, 1).
fast_acc_tabs("\t" ++ T, S, N) ->
fast_acc_tabs(T, S, N + 1);
fast_acc_tabs(T, S, N) ->
fast_acc_end(T, S, N, N * 8 + 1, $\t, 2).
fast_acc_end(T, S, N, Col, C, CD_I) ->
#xmerl_scanner{common_data = CD, line = Line0} = S,
Line = Line0 + 1,
try
$< = hd(T),
{done,{element(N, element(CD_I, CD)), T,
S#xmerl_scanner{col = Col, line = Line}}}
catch _:_ ->
accumulate_whitespace(T, S, Line, Col, lists:duplicate(N, C)++"\n")
end.
%%% @spec accumulate_whitespace(T::string(),S::global_state(),
%%% atom(),Acc::string()) -> {Acc, T1, S1}
%%%
%%% @doc Function to accumulate and normalize whitespace.
accumulate_whitespace(T, S, preserve, Acc) ->
accumulate_whitespace(T, S, Acc);
accumulate_whitespace(T, S, normalize, Acc) ->
{_WsAcc, T1, S1} = accumulate_whitespace(T, S, []),
{[$\s|Acc], T1, S1}.
accumulate_whitespace(T, S, Acc) ->
#xmerl_scanner{line = Line, col = Col} = S,
accumulate_whitespace(T, S, Line, Col, Acc).
accumulate_whitespace([], S0, Line, Col, Acc) ->
#xmerl_scanner{continuation_fun = F} = S0,
S = S0#xmerl_scanner{line = Line, col = Col},
?dbg("cont()...~n", []),
F(fun(MoreBytes, S1) -> accumulate_whitespace(MoreBytes, S1, Acc) end,
fun(S1) -> {Acc, [], S1} end,
S);
accumulate_whitespace("\s" ++ T, S, Line, Col, Acc) ->
accumulate_whitespace(T, S, Line, Col+1, [$\s|Acc]);
accumulate_whitespace("\t" ++ T, S, Line, Col, Acc) ->
accumulate_whitespace(T, S, Line, expand_tab(Col), [$\t|Acc]);
accumulate_whitespace("\n" ++ T, S, Line, _Col, Acc) ->
accumulate_whitespace(T, S, Line+1, 1, [$\n|Acc]);
accumulate_whitespace("\r\n" ++ T, S, Line, _Col, Acc) ->
%% CR followed by LF is read as a single LF
accumulate_whitespace(T, S, Line+1, 1, [$\n|Acc]);
accumulate_whitespace("\r" ++ T, S, Line, _Col, Acc) ->
%% CR not followed by LF is read as a LF
accumulate_whitespace(T, S, Line+1, 1, [$\n|Acc]);
accumulate_whitespace(Str, S, Line, Col, Acc) ->
{Acc, Str, S#xmerl_scanner{line = Line, col = Col}}.
expand_tab(Col) ->
Rem = (Col-1) rem 8,
_NewCol = Col + 8 - Rem.
%% validation_mode(Validation)
%% Validation = off | dtd | schema | true | false
%% true and false are obsolete
validation_mode(false) ->
off;
validation_mode(true) ->
dtd;
validation_mode(Other) ->
Other.
schemaLocations(El,#xmerl_scanner{schemaLocation=[]}) ->
schemaLocations(El);
schemaLocations(El,#xmerl_scanner{schemaLocation=SL}) ->
case SL of
[{_,_}|_] ->
{ok,SL};
_ ->
schemaLocations(El)
end.
schemaLocations(#xmlElement{attributes=Atts,xmlbase=_Base}) ->
Pred = fun(#xmlAttribute{name=schemaLocation}) -> false;
(#xmlAttribute{namespace={_,"schemaLocation"}}) -> false;
(_) -> true
end,
case lists:dropwhile(Pred,Atts) of
[#xmlAttribute{value=Paths}|_] ->
case string:tokens(Paths," ") of
L when length(L) > 0 ->
case length(L) rem 2 of
0 ->
PairList =
fun([],_Fun) ->
[];
([SLNS,SLLoc|Rest],Fun) ->
[{SLNS,SLLoc}|Fun(Rest,Fun)]
end,
{ok,PairList(L,PairList)};
_ ->
{error,{schemaLocation_attribute,namespace_location_not_in_pair}}
end;
_ ->
{error,{missing_schemaLocation}}
end;
[] ->
{error,{missing_schemaLocation}}
end.
inherit_options(S) ->
%%io:format("xsdbase: ~p~n",[S#xmerl_scanner.xmlbase]),
[{xsdbase,S#xmerl_scanner.xmlbase}].
handle_schema_result({XSDRes=#xmlElement{},_},S5) ->
{XSDRes,S5};
handle_schema_result({error,Reason},S5) ->
?fatal({failed_schema_validation,Reason},S5).
%%% Helper functions
fatal(Reason, S) ->
exit({fatal, {Reason,
{file,S#xmerl_scanner.filename},
{line,S#xmerl_scanner.line},
{col,S#xmerl_scanner.col}}}).
%% preformat formats tokens in L1 and L2, L2 separated by Sep into a
%% list
preformat(L1,L2,Sep) ->
Format1= lists:flatten(lists:duplicate(length(L1)-1,"~s ")++"~s"),
Format2 = lists:flatten(lists:duplicate(length(L2)-1,
" ~s"++Sep)++" ~s"),
lists:flatten(io_lib:format(Format1++Format2,L1++L2)).
%% BUG when we are many balise none attributes has save in rules
rules_write(Context, Name, Value, #xmerl_scanner{rules = T} = S) ->
case ets:lookup(T, {Context, Name}) of
[] ->
ets:insert(T, {{Context, Name}, Value});
_ ->
ok
end,
S.
rules_read(Context, Name, #xmerl_scanner{rules = T}) ->
case ets:lookup(T, {Context, Name}) of
[] ->
undefined;
[{_, V}] ->
V
end.
rules_delete(Context,Name,#xmerl_scanner{rules = T}) ->
ets:delete(T,{Context,Name}).
to_ucs(Encoding, Chars) when Encoding=="utf-8"; Encoding == undefined ->
utf8_2_ucs(Chars);
to_ucs(_,[C|Rest]) ->
{C,Rest}.
utf8_2_ucs([A,B,C,D|Rest]) when A band 16#f8 =:= 16#f0,
B band 16#c0 =:= 16#80,
C band 16#c0 =:= 16#80,
D band 16#c0 =:= 16#80 ->
%% 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
case ((D band 16#3f) bor ((C band 16#3f) bsl 6) bor
((B band 16#3f) bsl 12) bor ((A band 16#07) bsl 18)) of
Ch when Ch >= 16#10000 ->
{Ch,Rest};
Ch ->
{{error,{bad_character,Ch}},Rest}
end;
utf8_2_ucs([A,B,C|Rest]) when A band 16#f0 =:= 16#e0,
B band 16#c0 =:= 16#80,
C band 16#c0 =:= 16#80 ->
%% 1110vvvv 10vvvvvv 10vvvvvv
case ((C band 16#3f) bor ((B band 16#3f) bsl 6) bor
((A band 16#0f) bsl 12)) of
Ch when Ch >= 16#800 ->
{Ch,Rest};
Ch ->
{{error,{bad_character,Ch}},Rest}
end;
utf8_2_ucs([A,B|Rest]) when A band 16#e0 =:= 16#c0,
B band 16#c0 =:= 16#80 ->
%% 110vvvvv 10vvvvvv
case ((B band 16#3f) bor ((A band 16#1f) bsl 6)) of
Ch when Ch >= 16#80 ->
{Ch,Rest};
Ch ->
{{error,{bad_character,Ch}},Rest}
end;
utf8_2_ucs([A|Rest]) when A < 16#80 ->
{A,Rest};
utf8_2_ucs([A|Rest]) ->
{{error,{bad_character,A}},Rest}.
%% to_char_set("iso-10646-utf-1",Ch) ->
%% [Ch];
%% to_char_set(UTF8,Ch) when UTF8 =:= "utf-8"; UTF8 =:= undefined ->
%% ucs_2_utf8(Ch);
%% to_char_set(_,Ch) ->
%% [Ch].
ucs_2_utf8(Ch) when Ch < 128 ->
%% 0vvvvvvv
[Ch];
ucs_2_utf8(Ch) when Ch < 16#0800 ->
%% Ch: -----vvv vvvvvvvv
%% 110vvvvv 10vvvvvv
%% O1 = (Ch band 16#07c0) bsr 6,
%% O2 = (Ch band 16#003f),
[((Ch band 16#07c0) bsr 6) bor 16#c0,(Ch band 16#003f) bor 16#80];
ucs_2_utf8(Ch) when Ch < 16#10000 ->
%% Ch: vvvvvvvv vvvvvvvv
%% 1110vvvv 10vvvvvv 10vvvvvv
%% O1 = (Ch band 16#f000) bsr 12
%% O2 = (Ch band 16#0fc0) bsr 6
%% O3 = (Ch band 16#003f)
[((Ch band 16#f000) bsr 12) bor 16#e0,
((Ch band 16#0fc0) bsr 6) bor 16#80,
(Ch band 16#003f) bor 16#80];
ucs_2_utf8(Ch) when Ch < 16#200000 ->
%% Ch: ---vvvvv vvvvvvvv vvvvvvvv
%% 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
%% O1 = (Ch band 16#1c0000) bsr 18
%% O2 = (Ch band 16#03f000) bsr 12
%% O3 = (Ch band 16#000fc0) bsr 6
%% O4 = (Ch band 16#00003f)
[((Ch band 16#1c0000) bsr 18) bor 16#f0,
((Ch band 16#03f000) bsr 12) bor 16#80,
((Ch band 16#000fc0) bsr 6) bor 16#80,
(Ch band 16#00003f) bor 16#80].
string_to_char_set(Enc,Str) when Enc =:= "utf-8"; Enc =:= undefined ->
lists:flatten([ucs_2_utf8(X)||X <- Str]);
string_to_char_set(_,Str) ->
Str.
%% diagnose(Line) ->
%% Mem=erlang:memory(),
%% {OldTot,OldLine} = get_total(),
%% NewTot =
%% case {lists:keysearch(total,1,Mem),OldTot*1.1} of
%% {{_,{_,Tot}},Tot110} when Tot > Tot110 ->
%% io:format("From ~p to ~p, total memory: ~p (~p)~n",[OldLine,Line,Tot,OldTot]),
%% Tot;
%% {{_,{_,Tot}},_} ->
%% Tot
%% end,
%% put_total({NewTot,Line}).
%% get_total() ->
%% case get(xmerl_mem) of
%% undefined ->
%% put(xmerl_mem,{0,0}),
%% {0,0};
%% M -> M
%% end.
%% put_total(M) ->
%% put(xmerl_mem,M).