diff options
Diffstat (limited to 'lib/xmerl/src/xmerl_scan.erl')
-rw-r--r-- | lib/xmerl/src/xmerl_scan.erl | 4088 |
1 files changed, 4088 insertions, 0 deletions
diff --git a/lib/xmerl/src/xmerl_scan.erl b/lib/xmerl/src/xmerl_scan.erl new file mode 100644 index 0000000000..4e5cc59d8f --- /dev/null +++ b/lib/xmerl/src/xmerl_scan.erl @@ -0,0 +1,4088 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% Description : Single-pass XML scanner. See xmerl.hrl for data defs. + +%% @doc This module is the interface to the XML parser, it handles XML 1.0. +%% The XML parser is activated through +%% <tt>xmerl_scan:string/[1,2]</tt> or +%% <tt>xmerl_scan:file/[1,2]</tt>. +%% It returns records of the type defined in xmerl.hrl. +%% See also <a href="xmerl_examples.html">tutorial</a> on customization +%% functions. +%% @type global_state(). <p> +%% The global state of the scanner, represented by the #xmerl_scanner{} record. +%% </p> +%% @type option_list(). <p>Options allow to customize the behaviour of the +%% scanner. +%% See also <a href="xmerl_examples.html">tutorial</a> on customization +%% functions. 
+%% </p> +%% Possible options are: +%% <dl> +%% <dt><code>{acc_fun, Fun}</code></dt> +%% <dd>Call back function to accumulate contents of entity.</dd> +%% <dt><code>{continuation_fun, Fun} | +%% {continuation_fun, Fun, ContinuationState}</code></dt> +%% <dd>Call back function to decide what to do if the scanner runs into EOF +%% before the document is complete.</dd> +%% <dt><code>{event_fun, Fun} | +%% {event_fun, Fun, EventState}</code></dt> +%% <dd>Call back function to handle scanner events.</dd> +%% <dt><code>{fetch_fun, Fun} | +%% {fetch_fun, Fun, FetchState}</code></dt> +%% <dd>Call back function to fetch an external resource.</dd> +%% <dt><code>{hook_fun, Fun} | +%% {hook_fun, Fun, HookState}</code></dt> +%% <dd>Call back function to process the document entities once +%% identified.</dd> +%% <dt><code>{close_fun, Fun}</code></dt> +%% <dd>Called when document has been completely parsed.</dd> +%% <dt><code>{rules, ReadFun, WriteFun, RulesState} | +%% {rules, Rules}</code></dt> +%% <dd>Handles storing of scanner information when parsing.</dd> +%% <dt><code>{user_state, UserState}</code></dt> +%% <dd>Global state variable accessible from all customization functions</dd> +%% +%% <dt><code>{fetch_path, PathList}</code></dt> +%% <dd>PathList is a list of +%% directories to search when fetching files. 
If the file in question +%% is not in the fetch_path, the URI will be used as a file +%% name.</dd> +%% <dt><code>{space, Flag}</code></dt> +%% <dd>'preserve' (default) to preserve spaces, 'normalize' to +%% accumulate consecutive whitespace and replace it with one space.</dd> +%% <dt><code>{line, Line}</code></dt> +%% <dd>To specify starting line for scanning in document which contains +%% fragments of XML.</dd> +%% <dt><code>{namespace_conformant, Flag}</code></dt> +%% <dd>Controls whether to behave as a namespace conformant XML parser, +%% 'false' (default) to not otherwise 'true'.</dd> +%% <dt><code>{validation, Flag}</code></dt> +%% <dd>Controls whether to process as a validating XML parser: +%% 'off' (default) no validation, or validation 'dtd' by DTD or 'schema' +%% by XML Schema. 'false' and 'true' options are obsolete +%% (i.e. they may be removed in a future release), if used 'false' +%% equals 'off' and 'true' equals 'dtd'.</dd> +%% <dt><code>{schemaLocation, [{Namespace,Link}|...]}</code></dt> +%% <dd>Tells explicitly which XML Schema documents to use to validate +%% the XML document. Used together with the +%% <code>{validation,schema}</code> option.</dd> +%% <dt><code>{quiet, Flag}</code></dt> +%% <dd>Set to 'true' if xmerl should behave quietly and not output any +%% information to standard output (default 'false').</dd> +%% <dt><code>{doctype_DTD, DTD}</code></dt> +%% <dd>Allows to specify DTD name when it isn't available in the XML +%% document. This option has effect only together with +%% <code>{validation,'dtd'}</code> option.</dd> +%% <dt><code>{xmlbase, Dir}</code></dt> +%% <dd>XML Base directory. If using string/1 default is current directory. +%% If using file/1 default is directory of given file.</dd> +%% <dt><code>{encoding, Enc}</code></dt> +%% <dd>Set default character set used (default UTF-8). +%% This character set is used only if not explicitly given by the XML +%% declaration. </dd> +%% </dl> + +-module(xmerl_scan). +-vsn('0.20'). 
-date('03-09-16').

%% main API
-export([string/1, string/2,
         file/1, file/2]).

%% access functions for various states
-export([user_state/1, user_state/2,
         event_state/1, event_state/2,
         hook_state/1, hook_state/2,
         rules_state/1, rules_state/2,
         fetch_state/1, fetch_state/2,
         cont_state/1, cont_state/2]).

%% helper functions. To xmerl_lib ??
-export([accumulate_whitespace/4]).

%-define(debug, 1).
-include("xmerl.hrl").          % record def, macros
-include("xmerl_internal.hrl").
-include_lib("kernel/include/file.hrl").


%% Prints the reason (unless the scanner is in quiet mode) and then
%% hands over to fatal/2.
-define(fatal(Reason, S),
        if
            S#xmerl_scanner.quiet ->
                ok;
            true ->
                ok=io:format("~p- fatal: ~p~n", [?LINE, Reason])
        end,
        fatal(Reason, S)).


-define(ustate(U, S), S#xmerl_scanner{user_state = U}).


%% Functions to access the various states

%%% @spec user_state(S::global_state()) -> UserState
%%% @doc Returns the user state stored in the scanner state.
user_state(#xmerl_scanner{user_state = UserState}) ->
    UserState.

%%% @spec event_state(S::global_state()) -> EventState
%%% @doc Returns the state belonging to the event function.
event_state(#xmerl_scanner{fun_states = #xmerl_fun_states{event = EventState}}) ->
    EventState.

%%% @spec hook_state(S::global_state()) -> HookState
%%% @doc Returns the state belonging to the hook function.
hook_state(#xmerl_scanner{fun_states = #xmerl_fun_states{hook = HookState}}) ->
    HookState.

%%% @spec rules_state(S::global_state()) -> RulesState
%%% @doc Returns the state belonging to the rules functions.
rules_state(#xmerl_scanner{fun_states = #xmerl_fun_states{rules = RulesState}}) ->
    RulesState.

%%% @spec fetch_state(S::global_state()) -> FetchState
%%% @doc Returns the state belonging to the fetch function.
fetch_state(#xmerl_scanner{fun_states = #xmerl_fun_states{fetch = FetchState}}) ->
    FetchState.

%%% @spec cont_state(S::global_state()) -> ContinuationState
%%% @doc Returns the state belonging to the continuation function.
cont_state(#xmerl_scanner{fun_states = #xmerl_fun_states{cont = ContState}}) ->
    ContState.


%%%% Functions to modify the various states

%%% @spec user_state(UserState, S::global_state()) -> global_state()
%%% @doc Stores UserState in the scanner state; meant to be called from a
%%% user-supplied customization function.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
user_state(UserState, Scanner) ->
    Scanner#xmerl_scanner{user_state = UserState}.

%%% @spec event_state(EventState, S::global_state()) -> global_state()
%%% @doc Stores EventState in the scanner state; meant to be called from an
%%% event function, which runs at the beginning and at the end of a parsed
%%% entity.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
event_state(EventState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{event = EventState}}.

%%% @spec hook_state(HookState, S::global_state()) -> global_state()
%%% @doc Stores HookState in the scanner state; meant to be called from a
%%% hook function, which runs when the parser has parsed a complete entity.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
hook_state(HookState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{hook = HookState}}.

%%% @spec rules_state(RulesState, S::global_state()) -> global_state()
%%% @doc Stores RulesState in the scanner state; meant to be called from a
%%% rules function, which runs when the parser stores scanner information in
%%% a rules database.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
rules_state(RulesState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{rules = RulesState}}.

%%% @spec fetch_state(FetchState, S::global_state()) -> global_state()
%%% @doc For controlling the FetchState, to be used in a fetch
%%% function, and called when the parser fetch an external resource (eg. a DTD).
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
fetch_state(FetchState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    %% Only the fetch slot of the nested fun_states record is replaced.
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{fetch = FetchState}}.

%%% @spec cont_state(ContinuationState, S::global_state()) -> global_state()
%%% @doc Stores ContinuationState in the scanner state; meant to be called
%%% from a continuation function, which runs when the parser encounters the
%%% end of the byte stream.
%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions.
cont_state(ContState, #xmerl_scanner{fun_states = FunStates} = Scanner) ->
    Scanner#xmerl_scanner{
      fun_states = FunStates#xmerl_fun_states{cont = ContState}}.


%% @spec file(Filename::string()) -> {xmlElement(),Rest}
%%   Rest = list()
%% @equiv file(Filename, [])
file(Filename) ->
    file(Filename, []).

%% @spec file(Filename::string(), Options::option_list()) -> {xmlElement(),Rest}
%%   Rest = list()
%%% @doc Parse file containing an XML document. If the file cannot be
%%% read, the <code>{error, Reason}</code> tuple from int_file/3 is returned
%%% unchanged.
file(Filename, Options) ->
    %% The externally supplied encoding is looked up here and passed on,
    %% although int_file/3 currently ignores it.
    ExtCharset =
        case lists:keysearch(encoding, 1, Options) of
            {value, {_, Enc}} -> Enc;
            false -> undefined
        end,
    case int_file(Filename, Options, ExtCharset) of
        {Result, Rest, #xmerl_scanner{close_fun = CloseFun} = Final} ->
            CloseFun(Final), % for side effects only - final state is dropped
            {Result, Rest};
        {error, Reason} ->
            {error, Reason}
    end.

%% Reads the whole file and scans it as a document, using the directory
%% of the file as XML base. Any non-ok result from file:read_file/1 is
%% returned as is.
int_file(Filename, Options, _ExtCharset) ->
    case file:read_file(Filename) of
        {ok, Contents} ->
            int_string(binary_to_list(Contents), Options,
                       filename:dirname(Filename), Filename);
        ReadError ->
            ReadError
    end.

%% Same as int_file/3, but hands the contents to int_string_decl/4.
int_file_decl(Filename, Options, _ExtCharset) ->
    case file:read_file(Filename) of
        {ok, Contents} ->
            int_string_decl(binary_to_list(Contents), Options,
                            filename:dirname(Filename), Filename);
        ReadError ->
            ReadError
    end.

%% @spec string(Text::list()) -> {xmlElement(),Rest}
%%   Rest = list()
%% @equiv string(Text, [])
string(Text) ->
    string(Text, []).

%% @spec string(Text::list(),Options::option_list()) -> {xmlElement(),Rest}
%%   Rest = list()
%%% @doc Parse string containing an XML document
string(Text, Options) ->
    {Result, Rest, #xmerl_scanner{close_fun = CloseFun} = Final} =
        int_string(Text, Options, file_name_unknown),
    CloseFun(Final), % for side effects only - final state is dropped
    {Result, Rest}.

%% Scans Str with the current working directory as XML base.
int_string(Str, Options, FileName) ->
    {ok, Cwd} = file:get_cwd(),
    int_string(Str, Options, Cwd, FileName).

%% Builds the initial scanner state and scans Str as a complete document.
%%
%% In case of no encoding attribute in document utf-8 is default, but
%% another character set may be detected with help of Byte Order Marker or
%% with help of the encoding of the first 4 bytes.
int_string(Str, Options, XMLBase, FileName) ->
    Initial = initial_state0(Options, XMLBase),
    Scanner = Initial#xmerl_scanner{filename = FileName},
    case xmerl_lib:detect_charset(Scanner#xmerl_scanner.encoding, Str) of
        {How, 'iso-10646-utf-1', Decoded} when How =:= auto; How =:= external ->
            scan_document(Decoded,
                          Scanner#xmerl_scanner{encoding = "iso-10646-utf-1"});
        {undefined, undefined, Decoded} -> %% no auto detection
            scan_document(Decoded, Scanner);
        {external, ExtCharset, Decoded} ->
            %% no auto detection, ExtCharset is an explicitly provided
            %% 7 bit,8 bit or utf-8 encoding
            scan_document(Decoded,
                          Scanner#xmerl_scanner{
                            encoding = atom_to_list(ExtCharset)})
    end.

%% Same charset handling as int_string/4, but the input is handed to
%% scan_decl/2 instead of scan_document/2.
int_string_decl(Str, Options, XMLBase, FileName) ->
    Initial = initial_state0(Options, XMLBase),
    Scanner = Initial#xmerl_scanner{filename = FileName},
    case xmerl_lib:detect_charset(Scanner#xmerl_scanner.encoding, Str) of
        {How, 'iso-10646-utf-1', Decoded} when How =:= auto; How =:= external ->
            scan_decl(Decoded,
                      Scanner#xmerl_scanner{encoding = "iso-10646-utf-1"});
        {undefined, undefined, Decoded} ->
            scan_decl(Decoded, Scanner);
        {external, ExtCharset, Decoded} ->
            scan_decl(Decoded,
                      Scanner#xmerl_scanner{
                        encoding = atom_to_list(ExtCharset)})
    end.



%% Creates a scanner state holding the default callback funs, the given
%% XML base and the pre-computed whitespace data, then applies Options.
initial_state0(Options, XMLBase) ->
    Defaults = #xmerl_scanner{
                  event_fun = fun event/2,
                  hook_fun = fun hook/2,
                  acc_fun = fun acc/3,
                  fetch_fun = fun fetch/2,
                  close_fun = fun close/1,
                  continuation_fun = fun cont/3,
                  rules_read_fun = fun rules_read/3,
                  rules_write_fun = fun rules_write/4,
                  rules_delete_fun = fun rules_delete/3,
                  xmlbase = XMLBase,
                  common_data = common_data()
                 },
    initial_state(Options, Defaults).

%% Folds the option list into the scanner state, one option per clause.
%% An option that matches no clause (unknown tag or a value rejected by a
%% guard) makes the call fail. When the list is exhausted and no rules
%% table was supplied, a new public ETS table is created for the rules.
initial_state([{event_fun, Fun}|Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{event_fun = Fun});
initial_state([{event_fun, Fun, EventState}|Opts], S) ->
    initial_state(Opts, event_state(EventState, S#xmerl_scanner{event_fun = Fun}));
initial_state([{acc_fun, Fun}|Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{acc_fun = Fun});
initial_state([{hook_fun, Fun}|Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{hook_fun = Fun});
initial_state([{hook_fun, Fun, HookState}|Opts], S) ->
    initial_state(Opts, hook_state(HookState, S#xmerl_scanner{hook_fun = Fun}));
initial_state([{close_fun, Fun}|Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{close_fun = Fun});
initial_state([{fetch_fun, Fun}|Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{fetch_fun = Fun});
initial_state([{fetch_fun, Fun, FetchState}|Opts], S) ->
    initial_state(Opts, fetch_state(FetchState, S#xmerl_scanner{fetch_fun = Fun}));
initial_state([{fetch_path, Path}|Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{fetch_path = Path});
initial_state([{continuation_fun, Fun}|Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{continuation_fun = Fun});
initial_state([{continuation_fun, Fun, ContState}|Opts], S) ->
    initial_state(Opts,
                  cont_state(ContState, S#xmerl_scanner{continuation_fun = Fun}));
initial_state([{rules, Rules}|Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{rules = Rules,
                                        keep_rules = true});
initial_state([{rules, Read, Write, RulesState}|Opts], S) ->
    WithRules = S#xmerl_scanner{rules_read_fun = Read,
                                rules_write_fun = Write,
                                keep_rules = true},
    initial_state(Opts, rules_state(RulesState, WithRules));
initial_state([{user_state, UserState}|Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{user_state = UserState});
initial_state([{space, Space}|Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{space = Space});
initial_state([{line, Line}|Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{line = Line});
initial_state([{namespace_conformant, Flag}|Opts], S) when is_boolean(Flag) ->
    initial_state(Opts, S#xmerl_scanner{namespace_conformant = Flag});
initial_state([{validation, Flag}|Opts], S)
  when Flag==off; Flag==dtd; Flag==schema; Flag==true; Flag==false ->
    initial_state(Opts, S#xmerl_scanner{validation = validation_value(Flag)});
initial_state([{schemaLocation, Locations}|Opts], S) when is_list(Locations) ->
    initial_state(Opts, S#xmerl_scanner{schemaLocation = Locations});
initial_state([{quiet, Flag}|Opts], S) when is_boolean(Flag) ->
    initial_state(Opts, S#xmerl_scanner{quiet = Flag});
initial_state([{doctype_DTD, DTD}|Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{doctype_DTD = DTD});
initial_state([{text_decl, Bool}|Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{text_decl = Bool});
initial_state([{environment, Env}|Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{environment = Env});
initial_state([{xmlbase, Dir}|Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{xmlbase = Dir});
initial_state([{encoding, Enc}|Opts], S) ->
    initial_state(Opts, S#xmerl_scanner{encoding = Enc});
initial_state([], #xmerl_scanner{rules = undefined} = S) ->
    S#xmerl_scanner{rules = ets:new(rules, [set, public])};
initial_state([], S) ->
    S.

%% Maps the obsolete boolean validation flags onto their replacements.
validation_value(true) -> dtd;
validation_value(false) -> off;
validation_value(Other) -> Other.

%% Used for compacting (some) indentations.
%% See also fast_accumulate_whitespace().
common_data() ->
    Spaces = comdata(lists:duplicate(60, $\s), []),
    Tabs = comdata(lists:duplicate(15, $\t), []),
    {Spaces, Tabs, "\n"}.

%% Builds a tuple of strings, each a newline followed by a run of the
%% given character; element N holds N of them.
comdata([], Collected) ->
    list_to_tuple(Collected);
comdata([_ | Shorter] = Run, Collected) ->
    comdata(Shorter, [[$\n | Run] | Collected]).

%%% -----------------------------------------------------
%%% Default modifier functions

%%% Hooks:
%%% - {element, Line, Name, Attrs, Content}
%%% - {processing_instruction, Line, Data}

%% Default hook: passes the entity and the state through untouched.
hook(Entity, State) ->
    {Entity, State}.

%%% Events:
%%%
%%% #xmerl_event{event : started | ended,
%%%              line : integer(),
%%%              col : integer(),
%%%              data}
%%%
%%% Data          Events
%%% document      started, ended
%%% #xmlElement   started, ended
%%% #xmlAttribute ended
%%% #xmlPI        ended
%%% #xmlComment   ended
%%% #xmlText      ended

%% Default event function: ignores the event.
event(_X, S) ->
    S.

%% The acc/3 function can return either {Acc', S'} or {Acc', Pos', S'},
%% where Pos' can be derived from X#xmlElement.pos, X#xmlText.pos, or
%% X#xmlAttribute.pos (whichever is the current object type.)
%% The acc/3 function is not allowed to redefine the type of object
%% being defined, but _is_ allowed to either ignore it or split it
%% into multiple objects (in which case {Acc',Pos',S'} should be returned.)
%% If {Acc',S'} is returned, Pos will be incremented by 1 by default.
%% Below is an example of an acceptable operation
acc(X = #xmlText{value = Text}, Acc, S) ->
    {[X#xmlText{value = Text}|Acc], S};
acc(X, Acc, S) ->
    {[X|Acc], S}.

%% Default fetch function: resolves the system URI; a public identifier
%% is ignored.
fetch({system, URI}, S) ->
    fetch_URI(URI, S);
fetch({public, _PublicID, URI}, S) ->
    fetch_URI(URI, S).

%%% Always assume an external resource can be found locally! Thus
%%% don't bother fetching with e.g. HTTP. Returns the path where the
%%% resource is found. The path to the external resource is given by
%%% URI directly or the option fetch_path (additional paths) or
%%% directory (base path to external resource)
fetch_URI(URI, S) ->
    %% assume URI is a filename
    Split = filename:split(URI),
    Filename =
        case Split of
            [] -> [];
            _ -> lists:last(Split)
        end,
    Fullname =
        case Split of %% how about Windows systems?
            ["file:"|Name]-> %% absolute path, see RFC2396 sect 3
                %% file:/dtd_name
                filename:join(["/"|Name]);
            ["/"|Rest] when Rest /= [] ->
                %% absolute path name
                URI;
            ["http:"|_Rest] ->
                {http,URI};
            [] -> %% empty systemliteral
                [];
            _ ->
                filename:join(S#xmerl_scanner.xmlbase, URI)
        end,
    Path = path_locate(S#xmerl_scanner.fetch_path, Filename, Fullname),
    ?dbg("fetch(~p) -> {file, ~p}.~n", [URI, Path]),
    {ok, Path, S}.

%% Returns {http,URI} or [] unchanged; otherwise the first directory of
%% the fetch path that holds a regular file named FN wins, and FullName
%% is the fallback when none does.
path_locate(_, _, {http,_}=URI) ->
    URI;
path_locate(_, _, []) ->
    [];
path_locate([Dir|Dirs], FN, FullName) ->
    F = filename:join(Dir, FN),
    case file:read_file_info(F) of
        {ok, #file_info{type = regular}} ->
            {file,F};
        _ ->
            path_locate(Dirs, FN, FullName)
    end;
path_locate([], _FN, FullName) ->
    {file,FullName}.


%% Default continuation function: no more input is available, so the
%% exception fun is applied.
cont(_F, Exception, US) ->
    Exception(US).

%% Default close function: returns the state unchanged.
close(S) ->
    S.


%%% -----------------------------------------------------
%%% Scanner

%%% [1] document ::= prolog element Misc*
%%
%% Scans a complete document and returns {Result, Tail, FinalState}.
%% Emits a 'started' and an 'ended' document event, converts the input
%% according to the encoding in the state, scans prolog, root element and
%% trailing Misc, and finally validates according to the validation mode.
scan_document(Str0, S=#xmerl_scanner{event_fun = Event,
                                     line = L, col = C,
                                     environment=Env,
                                     encoding=Charset,
                                     validation=ValidateResult}) ->
    S1 = Event(#xmerl_event{event = started,
                            line = L,
                            col = C,
                            data = document}, S),

    %% Transform to given character set.
    %% Note that if another character set is given in the encoding
    %% attribute in a XML declaration that one will be used later
    %% NOTE(review): list_to_atom/1 creates an atom from the encoding
    %% string; confirm that the value cannot be chosen freely by
    %% untrusted input.
    Str=if
            Charset == "utf-8" ->
                Str0;
            Charset=/=undefined -> % Default character set is UTF-8
                xmerl_ucs:to_unicode(Str0,list_to_atom(Charset));
            true -> %% Charset is undefined if no external input is
                    %% given, and no auto detection of character
                    %% encoding was made.
                Str0
        end,
    {T1, S2} = scan_prolog(Str, S1, _StartPos = 1),
    T2 = scan_mandatory("<",T1,1,S2,expected_element_start_tag),
    {Res, T3, S3} =scan_element(T2,S2,_StartPos = 1),
    {Tail, S4}=scan_misc(T3, S3, _StartPos = 1),

    S5 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = S4#xmerl_scanner.line,
                                               col = S4#xmerl_scanner.col,
                                               data = document}, S4),

    {Res2,S6} = case validation_mode(ValidateResult) of
                    off ->
                        {Res,cleanup(S5)};
                    dtd when Env == element; Env == prolog ->
                        check_decl2(S5),
                        case xmerl_validate:validate(S5,Res) of
                            {'EXIT',{error,Reason}} ->
                                S5b=cleanup(S5),
                                ?fatal({failed_validation,Reason}, S5b);
                            {'EXIT',Reason} ->
                                S5b=cleanup(S5),
                                ?fatal({failed_validation,Reason}, S5b);
                            {error,Reason} ->
                                S5b=cleanup(S5),
                                ?fatal({failed_validation,Reason}, S5b);
                            {error,Reason,_Next} ->
                                S5b=cleanup(S5),
                                ?fatal({failed_validation,Reason}, S5b);
                            _XML ->
                                {Res,cleanup(S5)}
                        end;
                    schema ->
                        case schemaLocations(Res,S5) of
                            {ok,Schemas} ->
                                %% Keep the cleaned-up state: cleanup/1 may
                                %% delete the rules table, and the state
                                %% handed on must not keep referring to it
                                %% (all other branches already do this).
                                S5c = cleanup(S5),
                                XSDRes = xmerl_xsd:process_validate(Schemas,Res,
                                                                    inherit_options(S5)),
                                handle_schema_result(XSDRes,S5c);
                            _ ->
                                {Res,cleanup(S5)}
                        end;
                    _ ->
                        {Res,cleanup(S5)}
                end,

    {Res2, Tail, S6}.


%% Emits the 'started' document event and scans only the prolog of Str.
%% The shape of the result depends on what follows the prolog: markup
%% ("<"), nothing, or other content (which is scanned for its effect on
%% the state only).
scan_decl(Str, S=#xmerl_scanner{event_fun = Event,
                                line = L, col = C}) ->
    Started = Event(#xmerl_event{event = started,
                                 line = L,
                                 col = C,
                                 data = document}, S),

    case scan_prolog(Str, Started, _StartPos = 1) of
        {"<"++_ = Rest, AfterProlog} ->
            {{AfterProlog#xmerl_scanner.user_state,Rest},[],AfterProlog};
        {[], AfterProlog} ->
            {[],[],AfterProlog};
        {Rest, AfterProlog} ->
            {_,_,AfterContent} =
                scan_content(Rest,AfterProlog,[],_Attrs=[],
                             AfterProlog#xmerl_scanner.space,
                             _Lang=[],_Parents=[],#xmlNamespace{}),
            {Rest,[],AfterContent}
    end.


%%% [22] Prolog
%%% prolog    ::=    XMLDecl? Misc* (doctypedecl Misc*)?
%%%
%% empty text declarations are handled by the first function clause.
%%
%% Note: ?bump_col(N) (defined in an included header) is what introduces
%% the variable S in the clauses whose head binds S0.
scan_prolog([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos) end,
      fun(S1) -> {[], S1} end,
      S);
scan_prolog("<?xml"++T,S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},Pos)
  when ?whitespace(hd(T)) ->
    {Charset,T3, S3}=
        if
            Col==1,L==1,S0#xmerl_scanner.text_decl==true ->
                ?dbg("prolog(\"<?xml\")~n", []),
                ?bump_col(5),
                {_,T1,S1} = mandatory_strip(T,S),
                {Decl,T2, S2}=scan_text_decl(T1,S1),
                Encoding=Decl#xmlDecl.encoding,
                {Encoding,T2, S2#xmerl_scanner{encoding=Encoding}};
            Col==1,L==1 ->
                ?dbg("prolog(\"<?xml\")~n", []),
                ?bump_col(5),
                {Decl,T2, S2}=scan_xml_decl(T, S),
                Encoding=Decl#xmlDecl.encoding,
                {Encoding,T2, S2#xmerl_scanner{encoding=Encoding}};
            true ->
                ?fatal({xml_declaration_must_be_first_in_doc,Col,L},S0)
        end,
    %% Charset0 is either (1) 'iso-10646-utf-1' (transformation by
    %% auto detection), (2) undefined (no auto detection and no
    %% external encoding), (3) any other encoding format that must be
    %% conformant to the internal explicitly given encoding. The two
    %% former cases implies that the explicit internal encoding
    %% (Charset) may be different from Charset0.

    %% Now transform to declared character set.
    if
        Charset==Charset0 -> % Document already transformed to this charset!
            scan_prolog(T3, S3, Pos);
        Charset0=/=undefined ->
            %% For example may an external entity
            %% have the BOM for utf-16 and the internal
            %% explicit encoding='utf-16', then it will be auto
            %% detected and transformed, Charset0 will be
            %% 'iso-10646-utf-1', and Charset will be 'utf-16', all
            %% legal.
            %%
            scan_prolog(T3,S3#xmerl_scanner{encoding=Charset0},Pos);
        Charset == "utf-8" ->
            scan_prolog(T3, S3, Pos);
        Charset=/=undefined -> % Document not previously transformed
            %% NOTE(review): Charset is the encoding name declared in the
            %% document, so list_to_atom/1 here creates an atom from
            %% document-supplied text.
            T4=xmerl_ucs:to_unicode(T3,list_to_atom(Charset)),
            scan_prolog(T4, S3, Pos);
        true -> % No encoding info given
            scan_prolog(T3, S3, Pos)
    end;
scan_prolog("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog}, Pos) ->
    ?dbg("prolog(\"<!DOCTYPE\")~n", []),
    ?bump_col(9),
    %% The input is used as it is; no character set conversion is done here.
    {T2, S1} = scan_doctype(T, S),
    scan_misc(T2, S1, Pos);
scan_prolog(Str="%"++_T,S=#xmerl_scanner{environment={external,_}},_Pos) ->
    scan_ext_subset(Str,S);
scan_prolog(Str, S0 = #xmerl_scanner{},Pos) ->
    ?dbg("prolog(\"<\")~n", []),

    %% Check for Comments, PI before possible DOCTYPE declaration
    ?bump_col(1),
    %% The input is used as it is; no character set conversion is done here.
    {T1, S1}=scan_misc(Str, S, Pos),
    scan_prolog2(T1,S1,Pos).



%% Continues the prolog after leading Misc: an optional DOCTYPE
%% declaration, markup declarations (external subset), or more Misc.
scan_prolog2([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos) end,
      fun(S1) -> {[], S1} end,
      S);
scan_prolog2("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog}, Pos) ->
    ?dbg("prolog(\"<!DOCTYPE\")~n", []),
    ?bump_col(9),
    {T1, S1} = scan_doctype(T, S),
    scan_misc(T1, S1, Pos);
scan_prolog2(Str = "<!" ++ _, S, _Pos) ->
    ?dbg("prolog(\"<!\")~n", []),
    %% In e.g. a DTD, we jump directly to markup declarations
    scan_ext_subset(Str, S);
scan_prolog2(Str, S0 = #xmerl_scanner{},Pos) ->
    ?dbg("prolog(\"<\")~n", []),

    %% Here we consider the DTD provided by doctype_DTD option,
    S1 =
        case S0 of
            #xmerl_scanner{validation=dtd,doctype_DTD=DTD} when is_list(DTD) ->
                WithDTD=fetch_DTD(undefined,S0),
                check_decl(WithDTD),
                WithDTD;
            _ -> S0
        end,
    %% Check for more Comments and PI after DOCTYPE declaration
    scan_misc(Str, S1, Pos).




%%% [27] Misc ::= Comment | PI | S
%% Note:
%% - Neither of Comment and PI are returned in the resulting parsed
%%   structure.
%% - scan_misc/3 implements Misc* as that is how the rule is always used
scan_misc([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos) end,
      fun(S1) -> {[], S1} end,
      S);
scan_misc("<!--" ++ T, S0, Pos) -> % Comment
    ?bump_col(4),
    {_, T1, S1} = scan_comment(T, S, Pos, _Parents = [], _Lang = []),
    scan_misc(T1,S1,Pos);
scan_misc("<?" ++ T, S0, Pos) -> % PI
    ?dbg("prolog(\"<?\")~n", []),
    ?bump_col(2),
    {_PI, T1, S1} = scan_pi(T, S, Pos),
    scan_misc(T1,S1,Pos);
scan_misc(T=[H|_T], S, Pos) when ?whitespace(H) ->
    ?dbg("prolog(whitespace)~n", []),
    {_,T1,S1}=strip(T,S),
    scan_misc(T1,S1,Pos);
scan_misc(T,S,_Pos) ->
    {T,S}.


%% Deletes the rules table and clears the reference to it, unless the
%% state says that the rules are to be kept.
cleanup(S=#xmerl_scanner{keep_rules = false,
                         rules = Rules}) ->
    ets:delete(Rules),
    S#xmerl_scanner{rules = undefined};
cleanup(S) ->
    S.

%%% Prolog and Document Type Declaration XML 1.0 Section 2.8
%% [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S?
%%                  '?>'
%% [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
%%
%% Scans the mandatory version attribute and then the remainder of the
%% XML declaration.
scan_xml_decl(T, S) ->
    %% VersionInfo [24] is mandatory
    {_, Stripped, AfterStrip} = mandatory_strip(T, S),
    {AfterName, AtEq} =
        case Stripped of
            "version" ++ Rest ->
                %% advance the column past the 7 characters of "version"
                {Rest, AfterStrip#xmerl_scanner{
                         col = AfterStrip#xmerl_scanner.col + 7}};
            _ ->
                ?fatal(expected_version_attribute, AfterStrip)
        end,
    {AfterEq, AtValue} = scan_eq(AfterName, AtEq),
    {Vsn, AfterVsn, AtRest} = scan_xml_vsn(AfterEq, AtValue),
    VersionAttr = #xmlAttribute{name = version,
                                parents = [{xml, _XMLPos = 1}],
                                value = Vsn},
    scan_xml_decl(AfterVsn, AtRest, #xmlDecl{attributes = [VersionAttr]}).

%% After the version attribute: either the declaration ends, or
%% whitespace followed by further attributes is required.
scan_xml_decl([], S=#xmerl_scanner{continuation_fun = F}, Decl) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_xml_decl(MoreBytes, S1, Decl) end,
      fun(S1) -> {[], [], S1} end,
      S);
scan_xml_decl("?>" ++ T, S0, Decl) ->
    ?bump_col(2),
    return_xml_decl(T,S,Decl);
scan_xml_decl(T, S=#xmerl_scanner{}, Decl) when ?whitespace(hd(T)) ->
    {_, Stripped, AfterStrip} = mandatory_strip(T, S),
    scan_xml_decl2(Stripped, AfterStrip, Decl);
scan_xml_decl(_T, S=#xmerl_scanner{}, _Decl) ->
    ?fatal(preformat([expected,one,'of:'],['?>',whitespace_character],","),S).

%% Second stage of the XML declaration: end of declaration, an encoding
%% declaration, or (delegated to scan_xml_decl3/3) a standalone
%% declaration.
scan_xml_decl2("?>" ++ T, S0,Decl) ->
    ?bump_col(2),
    return_xml_decl(T,S,Decl);
scan_xml_decl2("encoding" ++ T, S0 = #xmerl_scanner{event_fun = Event},
               Decl0 = #xmlDecl{attributes = Attrs}) ->
    %% [80] EncodingDecl
    ?bump_col(8),
    {AfterEq, AtValue} = scan_eq(T, S),
    {EncName, AfterName, AtRest} = scan_enc_name(AfterEq, AtValue),
    %% the encoding name is stored in lower case
    LowEncName = xmerl_lib:to_lower(EncName),
    EncAttr = #xmlAttribute{name = encoding,
                            parents = [{xml, _XMLPos = 1}],
                            value = LowEncName},
    Decl = Decl0#xmlDecl{encoding = LowEncName,
                         attributes = [EncAttr|Attrs]},
    %% the event carries the position at which the attribute started
    Notified = #xmerl_scanner{} =
        Event(#xmerl_event{event = ended,
                           line = S0#xmerl_scanner.line,
                           col = S0#xmerl_scanner.col,
                           data = EncAttr}, AtRest),
    case AfterName of
        "?>" ++ _ ->
            scan_xml_decl3(AfterName, Notified, Decl);
        _ ->
            %% anything else must be separated by whitespace
            {_, Stripped, AfterStrip} = mandatory_strip(AfterName, Notified),
            scan_xml_decl3(Stripped, AfterStrip, Decl)
    end;
scan_xml_decl2(T="standalone" ++ _T,S,Decl) ->
    scan_xml_decl3(T,S,Decl);
scan_xml_decl2(_BadString,S,_Decl) ->
    ?fatal(preformat([expected,one,'of:'],['?>',standalone,encoding],","),S).

%% Third stage of the XML declaration: end of declaration or a
%% standalone declaration followed by the end of the declaration.
scan_xml_decl3("?>" ++ T, S0,Decl) ->
    ?bump_col(2),
    return_xml_decl(T,S,Decl);
scan_xml_decl3("standalone" ++ T,S0 = #xmerl_scanner{event_fun = Event},
               Decl0 = #xmlDecl{attributes = Attrs}) ->
    %% [32] SDDecl
    ?bump_col(10),
    {AfterEq, AtValue} = scan_eq(T, S),
    {StValue, AfterValue, AtRest} = scan_standalone_value(AfterEq, AtValue),
    StAttr = #xmlAttribute{name = standalone,
                           parents = [{xml, _XMLPos = 1}],
                           value = StValue},
    Decl = Decl0#xmlDecl{standalone = StValue,
                         attributes = [StAttr|Attrs]},
    Notified = #xmerl_scanner{} =
        Event(#xmerl_event{event = ended,
                           line = S0#xmerl_scanner.line,
                           col = S0#xmerl_scanner.col,
                           data = StAttr}, AtRest),
    {_, Stripped, AfterStrip} = strip(AfterValue, Notified),
    AfterEnd = scan_mandatory("?>",Stripped,2,AfterStrip,expected_xml_decl_endtag),
    return_xml_decl(AfterEnd,
                    AfterStrip#xmerl_scanner{col=AfterStrip#xmerl_scanner.col+2},
                    Decl).


%% Finishes an XML declaration: restores attribute order, emits the
%% 'ended' event for the declaration and returns {Decl, Rest, State}.
%% Note: ?strip1 (defined in an included header) is what introduces T1
%% and S1 here.
return_xml_decl(T,S=#xmerl_scanner{event_fun = Event},
                Decl0 = #xmlDecl{attributes = Attrs}) ->
    ?strip1,
    %% attributes were accumulated in reverse order
    Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)},
    Notified = #xmerl_scanner{} =
        Event(#xmerl_event{event = ended,
                           line = S#xmerl_scanner.line,
                           col = S#xmerl_scanner.col,
                           data = Decl}, S1),
    {Decl, T1, Notified}.


%% Scans the quoted value of the standalone attribute. Only 'yes' is
%% recorded in the scanner state.
scan_standalone_value("'yes'" ++T,S0)->
    ?bump_col(5),
    {'yes',T,S#xmerl_scanner{standalone=yes}};
scan_standalone_value("\"yes\"" ++T,S0)->
    ?bump_col(5),
    {'yes',T,S#xmerl_scanner{standalone=yes}};
scan_standalone_value("'no'" ++T,S0) ->
    ?bump_col(4),
    {'no',T,S};
scan_standalone_value("\"no\"" ++T,S0) ->
    ?bump_col(4),
    {'no',T,S}.

%%%
%%% Text declaration XML 1.0 section 4.3.1
%%% [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
%%
%% Note: ?strip5 (defined in an included header) is what introduces T5
%% and S5 here.
scan_text_decl(T,S=#xmerl_scanner{event_fun = Event}) ->
    {#xmlDecl{attributes=Attrs}=Decl0,T1,S1} = scan_optional_version(T,S),
    T2 =
        case T1 of
            "encoding" ++ AfterKeyword -> AfterKeyword;
            _ ->
                ?fatal(expected_encoding_attribute,S1)
        end,
    %% advance the column past the 8 characters of "encoding"
    S2 = S1#xmerl_scanner{col = S1#xmerl_scanner.col + 8},
    {T3, S3} = scan_eq(T2, S2),
    {EncName, T4, S4} = scan_enc_name(T3, S3),
    %% the encoding name is stored in lower case
    LowEncName=xmerl_lib:to_lower(EncName),
    ?strip5,
    EncAttr = #xmlAttribute{name = encoding,
                            parents = [{xml,1}],
                            value = LowEncName},
    Decl = Decl0#xmlDecl{encoding = LowEncName,
                         attributes = [EncAttr|Attrs]},
    S6=#xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                             line = S5#xmerl_scanner.line,
                                             col = S5#xmerl_scanner.col,
                                             data = EncAttr}, S5),
    scan_text_decl(T5,S6,Decl).
%% scan_text_decl/3
%% Expects the closing "?>" of a text declaration, reports the finished
%% declaration as an `ended' event and returns {Decl, Rest, Scanner}.
scan_text_decl("?>"++T,S0 = #xmerl_scanner{hook_fun = _Hook,
                                           event_fun = Event},
               Decl0 = #xmlDecl{attributes = Attrs}) ->
    ?bump_col(2),
    ?strip1,
    Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)},
    S2 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = S0#xmerl_scanner.line,
                                               col = S0#xmerl_scanner.col,
                                               data = Decl}, S1),
    {Decl, T1, S2};
scan_text_decl([H|_T],S,_) ->
    ?fatal({unexpected_character_in_text_declaration,H},S).

%% scan_optional_version/2
%% Scans an optional VersionInfo. Returns an #xmlDecl{} holding either the
%% version attribute or no attributes, plus the remaining input and state.
scan_optional_version("version"++T,S0) ->
    ?bump_col(7),
    ?strip1,
    {T2, S2} = scan_eq(T1, S1),
    {Vsn, T3, S3} = scan_xml_vsn(T2, S2),
    {_,T4,S4} = mandatory_strip(T3,S3),
    Attr = #xmlAttribute{name = version,parents = [{xml,1}],value = Vsn},
    {#xmlDecl{attributes=[Attr]},T4,S4};
scan_optional_version(T,S) ->
    {#xmlDecl{attributes=[]},T,S}.



%%%%%%% [81] EncName
%% scan_enc_name/2
%% Expects the opening quote (" or ') of an encoding name and scans the
%% name up to the matching closing quote.
%% The previous guard (H >= $"; H =< $') was an OR of two open ranges and
%% therefore accepted every character as delimiter; only the two quote
%% characters are accepted now, anything else is a fatal error.
scan_enc_name([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1) end,
      fun(S1) -> ?fatal(expected_encoding_name, S1) end,
      S);
scan_enc_name([H|T], S0) when H =:= $"; H =:= $' ->
    ?bump_col(1),
    scan_enc_name(T, S, H, []);
scan_enc_name([H|_T], S) ->
    ?fatal({error,{unexpected_character_in_Enc_Name,H}},S).


%% scan_enc_name/4
%% Scans the first character of the encoding name, which must be a
%% letter [A-Za-z]; the remainder is handled by scan_enc_name2/4.
scan_enc_name([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1, Delim, Acc) end,
      fun(S1) -> ?fatal(expected_encoding_name, S1) end,
      S);
scan_enc_name([H|T], S0, Delim, Acc)
  when H >= $a, H =< $z; H >= $A, H =< $Z ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H|Acc]);
scan_enc_name([H|_T],S,_Delim,_Acc) ->
    ?fatal({error,{unexpected_character_in_Enc_Name,H}},S).
%% scan_enc_name2/4
%% Scans the remaining characters of an encoding name ([A-Za-z0-9._-])
%% up to the closing delimiter and returns {Name, Rest, Scanner}.
%% An illegal character is reported with the same fatal error that
%% scan_enc_name/4 uses for an illegal first character (previously this
%% crashed with function_clause).
scan_enc_name2([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_enc_name2(MoreBytes, S1, Delim, Acc) end,
      fun(S1) -> ?fatal(expected_encoding_name, S1) end,
      S);
scan_enc_name2([H|T], S0, H, Acc) ->
    %% Closing delimiter: the name is complete.
    ?bump_col(1),
    {lists:reverse(Acc), T, S};
scan_enc_name2([H|T], S0, Delim, Acc)
  when H >= $a, H =< $z;
       H >= $A, H =< $Z;
       H >= $0, H =< $9;
       H == $.; H == $_; H == $- ->
    ?bump_col(1),
    scan_enc_name2(T, S, Delim, [H|Acc]);
scan_enc_name2([H|_T], S, _Delim, _Acc) ->
    ?fatal({error,{unexpected_character_in_Enc_Name,H}},S).


%%%%%%% [26] VersionNum
%%% VersionNum ::= ([a-zA-Z0-9_.:] | '-')+
%% scan_xml_vsn/2
%% Expects the opening quote of a version number and scans up to the
%% matching closing quote (see xml_vsn/4).
scan_xml_vsn([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_xml_vsn(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_xml_vsn([H|T], S) when H==$"; H==$'->
    xml_vsn(T, S#xmerl_scanner{col = S#xmerl_scanner.col+1}, H, []).
%% xml_vsn/4
%% Accumulates version number characters until the closing delimiter is
%% found; returns {Version, Rest, Scanner}. Characters outside
%% [a-zA-Z0-9_.:-] are a fatal error.
xml_vsn([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> xml_vsn(MoreBytes, S1, Delim, Acc) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
xml_vsn([H|T], S=#xmerl_scanner{col = C}, H, Acc) ->
    {lists:reverse(Acc), T, S#xmerl_scanner{col = C+1}};
xml_vsn([H|T], S=#xmerl_scanner{col = C},Delim, Acc) when H >= $a, H =< $z ->
    xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]);
xml_vsn([H|T], S=#xmerl_scanner{col = C},Delim, Acc) when H >= $A, H =< $Z ->
    xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]);
xml_vsn([H|T], S=#xmerl_scanner{col = C},Delim, Acc) when H >= $0, H =< $9 ->
    xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]);
xml_vsn([H|T], S=#xmerl_scanner{col = C}, Delim, Acc) ->
    case lists:member(H, "_.:-") of
        true ->
            xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]);
        false ->
            ?fatal({invalid_vsn_char, H}, S)
    end.

%%%%%%% [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'

%% scan_pi/3
%% Scans a processing instruction starting at its target name.
%% Targets beginning with [xX][mM][lL] are reserved and are delegated to
%% scan_wellknown_pi/3; every other target is scanned as an ordinary name.
%% The column is only advanced past "xml" when those three characters are
%% actually consumed here. Previously it was also advanced for targets that
%% merely start with x/X, although the whole target was still handed to
%% scan_name/2.
scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Pos) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Pos) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_pi([H1,H2,H3 | T], S0, Pos)
  when (H1 == $x orelse H1 == $X),
       (H2 == $m orelse H2 == $M),
       (H3 == $l orelse H3 == $L) ->
    %% names beginning with [xX][mM][lL] are reserved for future use.
    ?bump_col(3),
    scan_wellknown_pi(T,S,Pos);
scan_pi(Str, S=#xmerl_scanner{line = L, col = C}, Pos) ->
    {Target, _NamespaceInfo, T1, S1} = scan_name(Str, S),
    scan_pi(T1, S1, Target, L, C, Pos,[]).
%%% More info on xml-stylesheet can be found at:
%%% "Associating Style Sheets with XML documents", Version 1.0,
%%% W3C Recommendation 29 June 1999 (http://www.w3.org/TR/xml-stylesheet/)
%% scan_wellknown_pi/3
%% Handles the reserved "xml..." processing instruction targets; only
%% "xml-stylesheet" is accepted. The input starts right after "xml".
%% Only the 11 characters of "-stylesheet" are consumed here. The callers
%% visible in this file have already advanced the column for "<?" and
%% "xml", so bumping by 16 counted those five characters twice.
scan_wellknown_pi("-stylesheet"++T, S0=#xmerl_scanner{line=L,col=C},Pos) ->
    ?dbg("prolog(\"<?xml-stylesheet\")~n", []),
    ?bump_col(11),
    scan_pi(T, S, "xml-stylesheet",L,C,Pos,[]);
scan_wellknown_pi(Str,S,_Pos) ->
    ?fatal({invalid_target_name, lists:sublist(Str, 1, 10)}, S).



%% scan_pi/7
%% Scans what follows the PI target: either the immediate end "?>" or
%% whitespace followed by the PI content (scan_pi2/7). L and C are the
%% position reported in the `ended' event. Returns {HookResult, Rest,
%% Scanner}.
scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Target,L, C, Pos, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Target, L, C, Pos, Acc) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_pi("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
                                       event_fun = Event},
        Target, L, C, Pos, Acc) ->
    ?bump_col(2),
    PI = #xmlPI{name = Target,
                pos = Pos,
                value = lists:reverse(Acc)},
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = L,
                                               col = C,
                                               data = PI}, S),
    {Ret, S2} = Hook(PI, S1),
    {Ret, T, S2};
scan_pi([H|T], S, Target, L, C, Pos, Acc) when ?whitespace(H) ->
    ?strip1,
    scan_pi2(T1, S1, Target, L, C, Pos, Acc);
scan_pi([H|_T],S,_Target, _L, _C, _Pos, _Acc) ->
    ?fatal({expected_whitespace_OR_end_of_PI,{char,H}}, S).
%% scan_pi2/7
%% Accumulates the content of a processing instruction (in reverse, in
%% Acc) until "?>" is found, then builds the #xmlPI{}, reports an `ended'
%% event at position L/C and passes the PI to the hook fun.
%% Returns {HookResult, Rest, Scanner}.
scan_pi2([], S=#xmerl_scanner{continuation_fun = F}, Target,L, C, Pos, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_pi2(MoreBytes, S1, Target, L, C, Pos, Acc) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_pi2("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
                                        event_fun = Event},
         Target, L, C, Pos, Acc) ->
    ?bump_col(2),
    PI = #xmlPI{name = Target,
                pos = Pos,
                value = lists:reverse(Acc)},
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = L,
                                               col = C,
                                               data = PI}, S),
    {Ret, S2} = Hook(PI, S1),
    {Ret, T, S2};
scan_pi2(Str, S0, Target, L, C, Pos, Acc) ->
    ?bump_col(1),
    %% wfc_legal_char/2 (defined elsewhere) yields the next character and
    %% the remaining input.
    {Ch,T} = wfc_legal_char(Str,S),
    scan_pi2(T, S, Target, L, C, Pos, [Ch|Acc]).



%% [28] doctypedecl ::=
%%   '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
%% scan_doctype/2
%% Scans the mandatory whitespace and the document type name, stores the
%% name in the scanner state and continues with scan_doctype1/2.
scan_doctype([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_doctype(T, S) ->
    {_,T1,S1} = mandatory_strip(T,S),
    {DTName, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
    ?strip3,
    scan_doctype1(T3, S3#xmerl_scanner{doctype_name = DTName}).
%% [75] ExternalID ::= 'SYSTEM' S SystemLiteral
%%                   | 'PUBLIC' S PubidLiteral S SystemLiteral
%% scan_doctype1/2
%% Scans the optional external identifier of a DOCTYPE declaration and
%% passes it on to scan_doctype2/3 as {public, PubId, SystemLiteral},
%% {system, SystemLiteral} or undefined.
scan_doctype1([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype1(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_doctype1("PUBLIC" ++ T, S0) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {PIDL, T2, S2} = scan_pubid_literal(T1, S1),
    {_,T3,S3} = mandatory_strip(T2,S2),
    {SL, T4, S4} = scan_system_literal(T3, S3),
    ?strip5,
    scan_doctype2(T5, S5, {public, PIDL, SL});
scan_doctype1("SYSTEM" ++ T, S0) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {SL, T2, S2} = scan_system_literal(T1, S1),
    ?strip3,
    scan_doctype2(T3, S3, {system, SL});
scan_doctype1(T, S) ->
    scan_doctype2(T, S, undefined).


%% scan_doctype2/3
%% After the (optional) external identifier: "[" starts the internal
%% subset (scan_doctype3/3), ">" ends the declaration. On ">" the external
%% DTD is fetched and check_decl/1 is run on the resulting scanner state.
%% Returns {Rest, Scanner}.
scan_doctype2([], S=#xmerl_scanner{continuation_fun = F},DTD) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype2(MoreBytes, S1, DTD) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_doctype2("[" ++ T, S0, DTD) ->
    ?bump_col(1),
    ?strip1,
    scan_doctype3(T1, S1, DTD);
scan_doctype2(">" ++ T, S0, DTD) ->
    ?bump_col(1),
    ?strip1,
    S2 = fetch_DTD(DTD, S1),
    check_decl(S2),
    {T1, S2};
scan_doctype2(_T,S,_DTD) ->
    ?fatal(expected_end_of_DOCTYPE_declaration, S).
%% [28a] DeclSep   ::= PEReference | S
%% [28b] intSubset ::= (markupdecl | DeclSep)*
%% scan_doctype3/3
%% Scans the internal subset of a DOCTYPE declaration up to "]" and the
%% closing ">". Parameter entity references are expanded: external ones
%% are fetched and parsed as DTDs, internal replacement text is pushed
%% back in front of the remaining input. Returns {Rest, Scanner}.
scan_doctype3([], S=#xmerl_scanner{continuation_fun = F},DTD) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_doctype3(MoreBytes, S1,DTD) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_doctype3("%" ++ T, S0, DTD) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ?strip2,
    case expand_pe_reference(PERefName, S2,as_PE) of
        {system, _} = Name ->
            S3 = fetch_DTD(Name, S2),
            check_decl(S3),
            scan_doctype3(T2, S3, DTD);
        {public, _} = Name ->
            S3 = fetch_DTD(Name, S2),
            check_decl(S3),
            scan_doctype3(T2, S3, DTD);
        {public, _, _} = Name ->
            S3 = fetch_DTD(Name, S2),
            check_decl(S3),
            scan_doctype3(T2, S3, DTD);
        ExpRef when is_list(ExpRef) -> % Space added, see Section 4.4.8
            {_,T3,S3} = strip(ExpRef++T2,S2),
            scan_doctype3(T3,S3,DTD)
    end;
scan_doctype3("]" ++ T, S0, DTD) ->
    ?bump_col(1),
    ?strip1,
    %% End of the internal subset: now process the external DTD (if any).
    S2 = fetch_DTD(DTD, S1),
    check_decl(S2),
    T2 = scan_mandatory(">",T1,1,S2,expected_doctype_end_tag),
%%    ">" ++ T2 = T1,
    {T2, S2};
scan_doctype3(T, S, DTD) ->
    {_, T1, S1} = scan_markup_decl(T, S),
    scan_doctype3(T1, S1, DTD).



%% fetch_DTD/2
%% Fetches and parses the DTD identified by DTDSpec and returns the
%% resulting scanner state. With DTDSpec undefined, a URI given in the
%% doctype_DTD field is used instead (once: the field is then set to
%% option_provided); without such a URI the state is returned unchanged.
fetch_DTD(undefined, S=#xmerl_scanner{doctype_DTD=URI}) when is_list(URI)->
    %% allow to specify DTD name when it isn't available in xml stream
    fetch_DTD({system,URI},S#xmerl_scanner{doctype_DTD=option_provided});
fetch_DTD(undefined, S) ->
    S;
% fetch_DTD(_,S=#xmerl_scanner{validation=false}) ->
%     S;
fetch_DTD(DTDSpec, S)->
    case fetch_and_parse(DTDSpec,S,[{text_decl,true},
                                    {environment,{external,subset}}]) of
        NewS when is_record(NewS,xmerl_scanner) ->
            NewS;
        {_Res,_Tail,_Sx} -> % Continue with old scanner data, result in Rules
            S
    end.

%% fetch_and_parse/3
%% Calls the fetch fun for ExtSpec and, when data is returned, parses it
%% with a new option list built from Options0 followed by the settings of
%% the scanner state returned by the fetch fun (Rules and XMLBase are
%% taken from the incoming state S).
%% Returns whatever the fetch/parse step produced; when that is a scanner
%% record, text_decl is reset to false and the environment of S restored.
%% Any other reply from the fetch fun is a fatal error.
fetch_and_parse(ExtSpec,S=#xmerl_scanner{fetch_fun=Fetch,
                                         rules=Rules,
                                         xmlbase = XMLBase},
                Options0) ->
    RetS =
        case Fetch(ExtSpec, S) of
            {ok, NewS} ->
                %% For backward compatibility only. This will be removed later!!
                NewS;
            {ok, not_fetched,NewS} ->
                NewS;
            {ok, DataRet, NewS = #xmerl_scanner{
                            fetch_path=FetchPath,
                            user_state = UState,
                            event_fun = Event,
                            hook_fun = Hook,
                            fetch_fun = Fetch1,
                            close_fun = Close1,
                            continuation_fun = Cont,
                            acc_fun = Acc,
                            rules_read_fun = Read,
                            rules_write_fun = Write,
                            validation = Valid,
                            quiet = Quiet,
                            encoding = Charset
                           }} ->
                EvS = event_state(NewS),
                HoS = hook_state(NewS),
                FeS = fetch_state(NewS),
                CoS = cont_state(NewS),
                %% Caller supplied options come first in the list.
                Options = Options0++[{fetch_path,FetchPath},
                                     {user_state, UState},
                                     {rules, Rules},
                                     {event_fun, Event, EvS},
                                     {hook_fun, Hook, HoS},
                                     {fetch_fun, Fetch1, FeS},
                                     {close_fun, Close1},
                                     {continuation_fun, Cont, CoS},
                                     {rules, Read, Write, ""},
                                     {acc_fun, Acc},
                                     {validation,Valid},
                                     {quiet,Quiet},
                                     {encoding,Charset}],

                case DataRet of
                    {file, F} ->
                        int_file_decl(F, Options,Charset);
                    {string, String} ->
                        int_string_decl(String, Options,XMLBase,file_name_unknown);
                    _ ->
                        %% other scheme
                        {DataRet,[],NewS}
                end;
            Error ->
                ?fatal({error_fetching_DTD, {ExtSpec, Error}}, S)
        end,
    case RetS of
        #xmerl_scanner{} ->
            RetS#xmerl_scanner{text_decl=false,
                               environment=S#xmerl_scanner.environment};
        _ -> RetS
    end.


%% fetch_not_parse/2
%% Calls the fetch fun for ExtSpec and returns {Data, Scanner} without
%% parsing: file contents or string data as a list, an {http,URI} tuple
%% as is. The filename field of the returned state is set to the location.
%% NOTE(review): any other DataRet is matched directly against
%% {String,LocationName}, so it has to be a 2-tuple or this crashes with
%% badmatch - confirm against the fetch fun contract.
fetch_not_parse(ExtSpec,S=#xmerl_scanner{fetch_fun=Fetch}) ->
    case Fetch(ExtSpec,S) of
        {ok, not_fetched,_NewS} ->
            ?fatal({error_fetching_external_source,ExtSpec},S);
        {ok, DataRet, NewS} ->
            {String,LocationName} =
                case DataRet of
                    {file,F} ->
                        {get_file(F,S),F};
                    {string,Str} ->
                        {binary_to_list(Str),file_name_unknown};
                    {http,URI} ->
                        {{http,URI},URI};
                    _ -> DataRet
                end,
            {String, NewS#xmerl_scanner{filename=LocationName}};
        _ ->
            ?fatal({error_fetching_external_resource,ExtSpec},S)
    end.

%% get_file/2
%% Reads file F and returns its contents as a list of bytes; a read
%% failure is a fatal scanner error.
get_file(F,S) ->
%     io:format("get_file F=~p~n",[F]),
    case file:read_file(F) of
        {ok,Bin} ->
            binary_to_list(Bin);
        Err ->
            ?fatal({error_reading_file,F,Err},S)
    end.
%% check_decl/1
%% Now it is necessary to check that all referenced types are declared,
%% since it is legal to reference some xml types before they are
%% declared. The checks only run when validation is set to dtd.
check_decl(#xmerl_scanner{validation=V}) when V =/= dtd ->
    ok;
check_decl(#xmerl_scanner{rules=Tab} = S) ->
    check_notations(Tab,S),
    check_elements(Tab,S), %% check also attribute defs for element
    check_entities(Tab,S).

%% check_notations/2
%% Fatal error if the rules table still holds a notation that was
%% referenced but never declared.
check_notations(Tab,S) ->
    case ets:match(Tab,{{notation,'$1'},undeclared}) of
        [[]] -> ok;
        [] -> ok;
        [L] when is_list(L) ->
            ?fatal({error_missing_declaration_in_DTD,hd(L)},S);
        Err ->
            ?fatal({error_missing_declaration_in_DTD,Err},S)
    end.

%% check_elements/2
%% Walks all element definitions in the rules table, ten at a time, and
%% checks the attribute definitions of each one.
%% A failing ets:match/3 call is reported as a fatal scanner error. The
%% previous implementation used `catch', whose {'EXIT',Reason} result
%% matched the {_,_} pattern meant for a match chunk, so the error branch
%% could never be reached.
check_elements(Tab,S) ->
    try ets:match(Tab,{{elem_def,'_'},'$2'},10) of
        Result ->
            check_elements_chunk(Result,S)
    catch
        error:Reason ->
            ?fatal({error_missing_declaration_in_DTD,{'EXIT',Reason}},S)
    end.

%% Checks one chunk returned by ets:match/1,3 and continues with the next
%% chunk until the end of the table is reached.
check_elements_chunk('$end_of_table',_S) ->
    ok;
check_elements_chunk({Match,Cont},S) ->
    lists:foreach(fun(X)->check_elements2(X,S) end, Match),
    case Cont of
        '$end_of_table' ->
            ok;
        _ ->
            check_elements_chunk(ets:match(Cont),S)
    end.

% it is not an error to declare attributes for an element that is not
% declared.
check_elements2([#xmlElement{attributes=Attrs}],S) ->
    check_attributes(Attrs,S);
check_elements2(_,_) ->
    ok.

%% check_attributes/2
%% Validity checks on a list of attribute definitions: at most one
%% attribute of type ID per element, plus the per-type checks done by the
%% vc_* functions for ID, enumeration and ENTITY/ENTITIES attributes.
check_attributes([{N1,'ID',_,_,_}=Attr|Rest],S) ->
    case lists:keysearch('ID',2,Rest) of
        {value,Att2} ->
            ?fatal({error_more_than_one_ID_def,N1,element(1,Att2)},S);
        _ ->
            ok
    end,
    vc_ID_Attribute_Default(Attr,S),
    check_attributes(Rest,S);
check_attributes([{_,{enumeration,_},_,_,_}=Attr|T],S) ->
    vc_Enumeration(Attr,S),
    check_attributes(T,S);
check_attributes([{_,Ent,_,_,_}=Attr|T],S)
  when Ent=='ENTITY';Ent=='ENTITIES' ->
    vc_Entity_Name(Attr,S),
    check_attributes(T,S);
check_attributes([_|T],S) ->
    check_attributes(T,S);
check_attributes([],_S) ->
    ok.
%% check_entities/2
%% With dtd validation: fatal error if the rules table still contains an
%% entity marked as undeclared. Without dtd validation nothing is checked.
check_entities(Tab, #xmerl_scanner{validation = dtd} = Scanner) ->
    Undeclared = ets:match(Tab, {{entity,'$1'},undeclared}),
    case Undeclared of
        [] ->
            ok;
        [[]] ->
            ok;
        [[First|_]] ->
            ?fatal({error_missing_declaration_in_DTD,First}, Scanner);
        Other ->
            ?fatal({error_missing_declaration_in_DTD,Other}, Scanner)
    end;
check_entities(_Tab, _Scanner) ->
    ok.


%% check_decl2/1: checks that all referenced ID attributes are declared
check_decl2(#xmerl_scanner{rules = Rules} = Scanner) ->
    check_referenced_ids(Rules, Scanner).


%% check_referenced_ids/2
%% Fatal error if the rules table still contains an id marked as
%% undeclared.
check_referenced_ids(Tab, Scanner) ->
    Undeclared = ets:match(Tab, {{id,'$1'},undeclared}),
    case Undeclared of
        [] ->
            ok;
        [[]] ->
            ok;
        [[First|_]] ->
            ?fatal({error_missing_declaration_in_DTD,First}, Scanner);
        Other ->
            ?fatal({error_missing_declaration_in_DTD,Other}, Scanner)
    end.

%%%%%%% [30] extSubSet ::= TextDecl? extSubsetDecl

%% scan_ext_subset/2
%% Scans the declarations of an external subset until the input is
%% exhausted; returns {[], Scanner} once the continuation reports the end.
scan_ext_subset([], #xmerl_scanner{continuation_fun = Continue} = Scanner) ->
    ?dbg("cont()...~n", []),
    OnMore = fun(MoreBytes, NextS) -> scan_ext_subset(MoreBytes, NextS) end,
    OnEnd = fun(EndS) -> {[], EndS} end,
    Continue(OnMore, OnEnd, Scanner);
scan_ext_subset("%" ++ T, S0) ->
    %% DeclSep [28a]: WFC: PE Between Declarations.
    %% The replacement text of a parameter entity reference in a
    %% DeclSep must match the production extSubsetDecl.
    ?bump_col(1),
    {Rest, Next} = scan_decl_sep(T, S),
    scan_ext_subset(Rest, Next);
scan_ext_subset("<![" ++ T, S0) ->
    %% Conditional section; ?strip1 binds T1/S1.
    ?bump_col(3),
    ?strip1,
    {_, Rest, Next} = scan_conditional_sect(T1, S1),
    scan_ext_subset(Rest, Next);
scan_ext_subset([H|_] = Str, Scanner) when ?whitespace(H) ->
    {_, Rest, Next} = strip(Str, Scanner),
    scan_ext_subset(Rest, Next);
scan_ext_subset(Str, Scanner) ->
    {_, Rest, Next} = scan_markup_decl(Str, Scanner),
    scan_ext_subset(Rest, Next).
%%%%%%% [28a] DeclSep ::= PEReference | S
%% scan_decl_sep/2
%% Scans a parameter entity reference used as declaration separator and
%% processes its replacement text as external subset declarations.
%% A tuple returned by expand_pe_reference/3 identifies an external entity
%% ({system,URI} or {public,URI}); its text is fetched without parsing and
%% the scanner state returned by the fetch is discarded.
%% Returns {Rest, Scanner}, where Rest is the input after the reference.
scan_decl_sep(T, S) ->
    {PERefName, Rest, RefS} = scan_pe_reference(T, S),
    Replacement =
        case expand_pe_reference(PERefName, RefS, as_PE) of
            ExternalId when is_tuple(ExternalId) ->
                {Fetched, _FetchS} = fetch_not_parse(ExternalId, RefS),
                Fetched;
            Text ->
                Text
        end,
    {_, Stripped, StripS} = strip(Replacement, RefS),
    {_, SubsetS} = scan_ext_subset(Stripped, StripS),
    {Rest, SubsetS}.
%%%%%%% [61] ConditionalSect ::= includeSect | ignoreSect

%% scan_conditional_sect/2
%% Scans a conditional section after "<![" and optional whitespace.
%% A parameter entity reference in place of the keyword is expanded and
%% the scan is retried on the replacement text.
%% NOTE(review): scan_mandatory/5 is handed S (the state before ?strip1)
%% rather than S1 - confirm that this is intended.
scan_conditional_sect([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_conditional_sect(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_conditional_sect("IGNORE" ++ T, S0) ->
    ?bump_col(6),
    ?strip1,
    T2 = scan_mandatory("[",T1,1,S,expected_IGNORE_bracket),
%    "[" ++ T2 = T1,
    {_,T3,S3} = strip(T2,S1),
    scan_ignore(T3,S3);
scan_conditional_sect("INCLUDE" ++ T, S0) ->
    ?bump_col(7),
    ?strip1,
    T2 = scan_mandatory("[",T1,1,S,expected_INCLUDE_bracket),
%    "[" ++ T2 = T1,
    {_,T3,S3} = strip(T2,S1),
    scan_include(T3, S3);
scan_conditional_sect("%"++T,S0) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_conditional_sect(T2,S2).


%%%% [63] ignoreSect ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>'
%%%% [64] ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>' Ignore)*
%%%% [65] Ignore ::= Char* - (Char* ('<![' | ']]>') Char*)
%% scan_ignore/2
%% Skips the contents of an IGNORE section; returns {[], Rest, Scanner}.
scan_ignore(Str,S) ->
    scan_ignore(Str,S,0).

%% scan_ignore/3
%% Level counts nested "<![" openings; the section ends at the "]]>"
%% seen at level 0.
scan_ignore([], S=#xmerl_scanner{continuation_fun = F},Level) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_ignore(MoreBytes, S1,Level) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_ignore("<![" ++ T, S0,Level) ->
    %% nested conditional section. Topmost condition is ignore, though
    ?bump_col(3),
    scan_ignore(T, S,Level+1);
scan_ignore("]]>" ++ T, S0,0) ->
    ?bump_col(3),
    {[], T, S};
scan_ignore("]]>" ++ T, S0,Level) ->
    ?bump_col(3),
    scan_ignore(T, S,Level-1);
scan_ignore([_H|T],S0,Level) ->
    ?bump_col(1),
    scan_ignore(T,S,Level).


%%%%%%% [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>'
%% scan_include/2
%% Scans the declarations of an INCLUDE section up to "]]>"; returns
%% {[], Rest, Scanner}.
scan_include([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_include(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_include("]]>" ++ T, S0) ->
    ?bump_col(3),
    {[], T, S};
scan_include("%" ++ T, S0) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_include(T2, S2);
scan_include("<![" ++ T, S0) ->
    ?bump_col(3),
    ?strip1,
    {_, T2, S2} = scan_conditional_sect(T1, S1),
    ?strip3,
    scan_include(T3,S3);
scan_include(T, S) ->
    {_, T1, S1} = scan_markup_decl(T, S),
    scan_include(T1, S1).


%%%%%%% [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl |
%%%%%%%                     NotationDecl | PI |Comment
%%%%%%% [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'

%% Validity constraint: Unique Type Declaration: No element type may be
%% declared more than once.
%%
%% scan_markup_decl/2
%% Scans one markup declaration (comment, PI, ELEMENT, ENTITY, NOTATION or
%% ATTLIST). Element and attribute-list declarations are stored through
%% the rules read/write/delete funs of the scanner state. Anything else is
%% a fatal error.
scan_markup_decl([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_markup_decl(MoreBytes, S1) end,
      fun(S1) -> {[], [], S1} end,
      S);
scan_markup_decl("<!--" ++ T, S0) ->
    ?bump_col(4),
    scan_comment(T, S);
scan_markup_decl("<?" ++ T, S0) ->
    ?bump_col(2),
    {_PI, T1, S1} = scan_pi(T, S,_Pos=markup),
    strip(T1, S1);
scan_markup_decl("<!ELEMENT" ++ T,
                 #xmerl_scanner{rules_read_fun = Read,
                                rules_write_fun = Write,
                                rules_delete_fun = Delete} = S0) ->
    ?bump_col(9),
    {_,T1,S1} = mandatory_strip(T,S),
    {Ename, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
    Element =
        case Read(elem_def, Ename, S2) of
            El = #xmlElement{elementdef=Decl} when Decl =/= undeclared ->
                %% Already declared: an error only when validating.
                case S2#xmerl_scanner.validation of
                    dtd ->
                        ?fatal({already_defined, Ename}, S2);
                    _ ->
                        Delete(elem_def,Ename,S2),
                        El
                end;
            El = #xmlElement{} ->
                Delete(elem_def,Ename,S2),
                El;
            undefined ->
                #xmlElement{}
        end,
    {_,T3,S3} = mandatory_strip(T2,S2),
    {Edef, T4, S4} = scan_contentspec(T3, S3),
    ?strip5,
    {">" ++ T6,S6} = scan_element_completion(T5,S5),
    %% The column is advanced by one for the closing ">".
    S7 = Write(elem_def, Ename,
               Element#xmlElement{name = Ename,
                                  content = Edef,
                                  elementdef=S6#xmerl_scanner.environment},
               S6#xmerl_scanner{col=S6#xmerl_scanner.col+1}),
    strip(T6,S7);
scan_markup_decl("<!ENTITY" ++ T, S0) ->
    %% <!ENTITY [%] entity.name NDATA notation.name>
    %% <!ENTITY [%] entity.name "replacement text">
    %% <!ENTITY [%] entity.name SYSTEM "system.identifier">
    %% <!ENTITY [%] entity.name PUBLIC public.identifier "system.identifier">
    ?bump_col(8),
    {_,T1,S1} = mandatory_strip(T,S),
    {T2, S2} = scan_entity(T1, S1),
    strip(T2,S2);
scan_markup_decl("<!NOTATION" ++ T, S0) ->
    %% <!NOTATION notation.name "public.identifier" "helper.application">
    ?bump_col(10),
    {_,T1,S1} = mandatory_strip(T,S),
    {T2, S2} = scan_notation_decl(T1, S1),
    strip(T2,S2);
scan_markup_decl("<!ATTLIST" ++ T,
                 #xmerl_scanner{rules_read_fun = Read,
                                rules_write_fun = Write,
                                rules_delete_fun= Delete} = S0) ->
    %% <!ATTLIST Ename ( AttrName Type Value )*>
    ?bump_col(9),
    {_,T1,S1} = mandatory_strip(T,S),
    {Ename, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
%    ?strip3,
    {Attributes, T4, S4} = scan_attdef(T2, S2),
    {EDEF,MergedAttrs} =
        case Read(elem_def, Ename, S4) of
            undefined -> %% this may happen when the ELEMENT is declared in
                         %% the external DTD but the ATTLIST in the
                         %% internal DTD.
                {#xmlElement{},update_attributes(Attributes,[])};
            Edef = #xmlElement{attributes = OldAttrs} ->
                Delete(elem_def,Ename,S4),
                %% the slot in rules table must be empty so that the
                %% later write has the assumed effect. Read maybe
                %% should empty the table slot.
                {Edef,update_attributes(Attributes, OldAttrs)}
        end,
    NewEdef = EDEF#xmlElement{name=Ename,attributes = MergedAttrs},
    S5 = Write(elem_def, Ename, NewEdef, S4),
    T5 = T4,
    strip(T5,S5);
scan_markup_decl(_Str,S) ->
    ?fatal(expected_markup,S).

%% scan_element_completion/2
%% Delegates to scan_markup_completion_gt/2 (defined elsewhere); the
%% ELEMENT clause above expects the result to start with ">".
scan_element_completion(T,S) ->
    scan_markup_completion_gt(T,S).

%% update_attributes/2
%% Merges new attribute definitions into the old ones. A new definition
%% whose name is already present is dropped; the others are appended after
%% the old definitions, in their original order.
update_attributes(NewAttrs, OldAttrs) ->
    update_attributes1(NewAttrs,lists:reverse(OldAttrs)).

%% The second argument is kept reversed while accumulating.
update_attributes1([A = {Name,_Type,_DefaultV,_DefaultD,_Env}|Attrs],
                   OldAttrs) ->
    case lists:keymember(Name, 1, OldAttrs) of
        true ->
            update_attributes1(Attrs, OldAttrs);
        false ->
            update_attributes1(Attrs, [A|OldAttrs])
    end;
update_attributes1([],Acc) ->
    lists:reverse(Acc).


%%%%%%% [53] AttDef

%% scan_attdef/2
%% Scans the attribute definitions of an ATTLIST declaration, starting
%% with an empty accumulator (see scan_attdef/3).
scan_attdef([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_attdef(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_attdef(T, S) ->
    scan_attdef(T, S, _AttrAcc = []).
%% scan_attdef/3
%% Scans attribute definitions until the closing ">" and returns
%% {Attributes, Rest, Scanner} with the attributes in document order.
%% A parameter entity reference is rejected while the scanner environment
%% is prolog and otherwise expanded in place. Any other input must start
%% with whitespace and is handled by scan_attdef2/3.
scan_attdef([], S=#xmerl_scanner{continuation_fun = F}, Attrs) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_attdef(MoreBytes, S1, Attrs) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_attdef(">" ++ T, S0, Attrs) ->
    ?bump_col(1),
    {lists:reverse(Attrs), T, S};
scan_attdef("%" ++ _T, S=#xmerl_scanner{environment=prolog}, _Attrs) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_attdef("%" ++ T, S0, Attrs) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_attdef(T2, S2, Attrs);
scan_attdef(T,S,Attrs) ->
    {_,T1,S1} = mandatory_strip(T,S),
    scan_attdef2(T1,S1,Attrs).

%% scan_attdef2/3
%% Scans one attribute definition (name, type, default declaration) after
%% the leading whitespace has been consumed, then loops for the next one.
%% Each definition is stored as
%% {Name, Type, DefaultValue, DefaultDecl, Environment}.
scan_attdef2(">" ++ T, S0, Attrs) ->
    ?bump_col(1),
    {lists:reverse(Attrs), T, S};
scan_attdef2("%" ++ _T, S=#xmerl_scanner{environment=prolog}, _Attrs) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_attdef2("%" ++ T, S0, Attrs) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_attdef2(T2, S2, Attrs);
scan_attdef2(T, S, Attrs) ->
    {AttName, _NamespaceInfo, T1, S1} = scan_name(T, S),
    {_,T2,S2} = mandatory_strip(T1,S1),
    {AttType, T3, S3} = scan_att_type(T2, S2),
    {_,T4,S4} = mandatory_strip(T3,S3),
    {{DefaultDecl,DefaultValue}, T5, S5} = scan_default_decl(T4, S4, AttType),
    ?strip6,
    %% The environment recorded is that of the state at entry (S).
    Attr = {AttName, AttType,DefaultValue,DefaultDecl,
            S#xmerl_scanner.environment},
    scan_attdef2(T6, S6, [Attr|Attrs]).
%% scan_att_type/2
%% Scans the type of an attribute definition and returns
%% {Type, Rest, Scanner}. Type is an atom for string and tokenized types,
%% or whatever scan_notation_type/3 / scan_enumeration/3 return for
%% enumerated types. Longer keywords are matched before their prefixes
%% (IDREFS, IDREF, ID and NMTOKENS, NMTOKEN).
%%% [54] StringType
scan_att_type([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_att_type(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_att_type("CDATA" ++ T, S0) ->
    ?bump_col(5),
    {'CDATA', T, S};
%%% [55] TokenizedType
scan_att_type("IDREFS" ++ T, S0) ->
    ?bump_col(6),
    {'IDREFS', T, S};
scan_att_type("IDREF" ++ T, S0) ->
    ?bump_col(5),
    {'IDREF', T, S};
scan_att_type("ID" ++ T, S0) ->
    ?bump_col(2),
    {'ID', T, S};
scan_att_type("ENTITY" ++ T, S0) ->
    ?bump_col(6),
    {'ENTITY', T, S};
scan_att_type("ENTITIES" ++ T, S0) ->
    ?bump_col(8),
    {'ENTITIES', T, S};
scan_att_type("NMTOKENS" ++ T, S0) ->
    ?bump_col(8),
    {'NMTOKENS', T, S};
scan_att_type("NMTOKEN" ++ T, S0) ->
    ?bump_col(7),
    {'NMTOKEN', T, S};
%%% [57] EnumeratedType
scan_att_type("NOTATION" ++ T, S0) ->
    ?bump_col(8),
    {_,T1,S1} = mandatory_strip(T,S),
    T2 = scan_mandatory("(",T1,1,S1,expected_parenthesis_after_NOTATION),
%    "(" ++ T2 = T1,
    %% S2 is bound here so that ?strip3 finds the T2/S2 pair it uses.
    S2 = S1,
    ?strip3,
    {Name, _NamespaceInfo, T4, S4} = scan_name(T3, S3),
    notation_exists(Name, S4),
    ?strip5,
    scan_notation_type(T5, S5, [Name]);
scan_att_type("(" ++ T, S0) ->
    ?bump_col(1),
    ?strip1,
    {NmToken, _NamespaceInfo, T2, S2} = scan_nmtoken(T1, S1),
    ?strip3,
    scan_enumeration(T3, S3, [NmToken]);
scan_att_type("%" ++ _T, S=#xmerl_scanner{environment=prolog}) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_att_type("%" ++ T, S0) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    %% The expanded reference itself is returned as the type.
    ExpRef = expand_pe_reference(PERefName, S1,in_literal),
    {ExpRef,T1,S1}.
%%% [58] NotationType

%% scan_notation_type/3
%% Scans the remaining "| Name" alternatives of a NOTATION type up to the
%% closing ")". Returns {{notation, Names}, Rest, Scanner} with the names
%% in document order.
scan_notation_type([], S=#xmerl_scanner{continuation_fun = F}, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_notation_type(MoreBytes, S1, Acc) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_notation_type(")" ++ T, S0, Acc) ->
    ?bump_col(1),
    {{notation, lists:reverse(Acc)}, T, S};
scan_notation_type("|" ++ T, S0, Acc) ->
    ?bump_col(1),
    ?strip1,
    {Name, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
    notation_exists(Name, S2),
    ?strip3,
    scan_notation_type(T3, S3, [Name | Acc]).

%%% Validity constraint for NotationType:
%%% The used notation names must be declared in the DTD, but they may
%%% be declared later.
%% notation_exists/2
%% Records a notation that has not been seen yet as `undeclared' through
%% the rules write fun. The callers in this chunk ignore the return value.
notation_exists(Name, #xmerl_scanner{rules_read_fun = Read,
                                     rules_write_fun = Write } = S) ->
    case Read(notation, Name, S) of
        undefined ->
            %% this is legal, since the referenced NOTATION
            %% may be declared later in internal or external
            %% subset.
            Write(notation,Name,undeclared,S);
        _Value ->
            ok
    end.

%%% [59] Enumeration

%% scan_enumeration/3
%% Scans the remaining "| Nmtoken" alternatives of an enumeration up to
%% the closing ")". Returns {{enumeration, Tokens}, Rest, Scanner} with
%% the tokens in document order.
scan_enumeration([], S=#xmerl_scanner{continuation_fun = F}, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_enumeration(MoreBytes, S1, Acc) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_enumeration(")" ++ T, S0, Acc) ->
    ?bump_col(1),
    {{enumeration, lists:reverse(Acc)}, T, S};
scan_enumeration("|" ++ T, S0, Acc) ->
    ?bump_col(1),
    ?strip1,
    {NmToken, _NamespaceInfo, T2, S2} = scan_nmtoken(T1, S1),
    ?strip3,
    scan_enumeration(T3, S3, [NmToken|Acc]).
%%%%%%% [60] DefaultDecl

%% scan_default_decl/3
%% Scans the default declaration of an attribute definition and returns
%% {{Decl, Value}, Rest, Scanner}, where Decl is '#REQUIRED', '#IMPLIED',
%% '#FIXED' or no_decl and Value is no_value when no value is given.
scan_default_decl([], S=#xmerl_scanner{continuation_fun = F}, Type) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_default_decl(MoreBytes, S1, Type) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_default_decl("#REQUIRED" ++ T, S0, _Type) ->
    ?bump_col(9),
    {{'#REQUIRED',no_value}, T, S};
scan_default_decl("#IMPLIED" ++ T, S0, _Type) ->
    ?bump_col(8),
    {{'#IMPLIED',no_value}, T, S};
scan_default_decl("#FIXED" ++ T, S0, Type) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {Value,T2,S2,_} = default_value(T1, S1, Type),
    {{'#FIXED',Value},T2,S2};
scan_default_decl(Str, S, Type) ->
    {Value,T1,S1,_} = default_value(Str, S, Type),
    {{no_decl,Value},T1,S1}.


%% There is room here to validate against Type, but we don't do it at
%% the moment.
default_value(T, S, Type) ->
    {_Val, _T1, _S1,_} = scan_att_value(T, S, Type).


%%%%%%% [70] EntityDecl

%% scan_entity/2
%% Scans an entity declaration after "<!ENTITY" and whitespace. A leading
%% "%" marks a parameter entity, anything else a general entity. The
%% definition is stored through the rules write fun and {Rest, Scanner}
%% is returned.
scan_entity([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_entity(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_entity("%" ++ T, #xmerl_scanner{rules_write_fun = Write} = S0) ->
    %% parameter entity
    ?bump_col(1),
    {_,T1,S1} = mandatory_strip(T,S),
    {PEName, _NamespaceInfo, T2, S2} = scan_name_no_colons(T1, S1),
    {_,T3,S3} = mandatory_strip(T2,S2),
    {PEDef, T4, S4} = scan_pe_def(T3, S3, PEName),
    ?strip5,
    {">" ++ T6,S6} = scan_entity_completion(T5,S5),
    S7 = Write(parameter_entity, PEName, PEDef, S6),
    {T6, S7};
scan_entity(T, #xmerl_scanner{rules_write_fun = Write,
                              rules_read_fun = Read,
                              rules_delete_fun = Delete} = S) ->
    %% generic entity
    {EName, _NamespaceInfo, T1, S1} = scan_name_no_colons(T, S),
    {_,T2,S2} = mandatory_strip(T1,S1),
    {EDef, EntType, T3, S3} = scan_entity_def(T2, S2, EName),
    check_entity_recursion(EName,S3),
    ?strip4,
    {">" ++ T5,S5} = scan_entity_completion(T4,S4),
    %% An entity that was only referenced so far is removed before the
    %% real definition is written.
    case Read(entity,EName,S5) of
        undeclared -> Delete(entity,EName,S5);
        _ -> ok
    end,
    S6 = Write(entity, EName, {S5#xmerl_scanner.environment,EntType,EDef}, S5),
    {T5, S6}.

%% scan_entity_completion/2
%% Delegates to scan_markup_completion_gt/2 (defined elsewhere); the
%% callers above expect the result to start with ">".
scan_entity_completion(T,S) ->
    scan_markup_completion_gt(T,S).

%%%%%%% [73] EntityDef

%% scan_entity_def/3
%% Scans the definition of a general entity and returns
%% {Definition, internal | external, Rest, Scanner}. A quoted value is an
%% internal entity. Otherwise an external identifier is scanned: with an
%% NDATA declaration the entity is unparsed and {ExtID, NData} is
%% returned, without one the external entity is fetched and parsed.
scan_entity_def([], S=#xmerl_scanner{continuation_fun = F}, EName) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_entity_def(MoreBytes, S1, EName) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_entity_def("'" ++ T, S0, EName) ->
    ?bump_col(1),
    {EVal,T1,S1}=scan_entity_value(T, S, $', EName,general),
    {EVal,internal,T1,S1};
scan_entity_def("\"" ++ T, S0, EName) ->
    ?bump_col(1),
    {EVal,T1,S1}=scan_entity_value(T, S, $", EName,general),
    {EVal,internal,T1,S1};
%% external general entity, parsed or unparsed.
scan_entity_def(Str, S, EName) ->
    {ExtID, T1, S1} = scan_external_id(Str, S),
    {NData, T2, S2} = scan_ndata_decl(T1, S1),
    case NData of
        {ndata,_} ->
            %% if NDATA exists it is an unparsed ENTITY
            {{ExtID,NData},external,T2,S2};
        _ ->
            case fetch_and_parse(ExtID,S2,
                                 [{text_decl,true},
                                  {environment,{external,{entity,EName}}}]) of
                {{_USret,Entity},_Tail,_Sx} ->
                    {Entity, external,T2, S2};
                {Entity,_Tail,Sx} ->
                    %% Entity references found while parsing the external
                    %% entity are appended to those already recorded.
                    OldRef=S2#xmerl_scanner.entity_references,
                    NewRef=Sx#xmerl_scanner.entity_references,
                    {Entity,external,T2,
                     S2#xmerl_scanner{entity_references=OldRef++NewRef}};
                {error,enoent} -> % this bad entity is declared,
                                  % but it may not be referenced,
                                  % then it would not be an
                                  % error.
                    {{error,enoent},external,T2,S2}
            end
    end.


%% scan_ndata_decl/2
%% Scans an optional NDATA declaration. When ">" follows directly there is
%% none and {[], Str, Scanner} is returned; otherwise whitespace is
%% mandatory and scan_ndata_decl2/2 takes over.
scan_ndata_decl([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_ndata_decl(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_ndata_decl(Str = ">"++_T, S) ->
    {[], Str, S};
scan_ndata_decl(T, S) ->
    {_,T1,S1} = mandatory_strip(T,S),
    scan_ndata_decl2(T1,S1).
%% scan_ndata_decl2(Str, S) -> {NData, Rest, S1}
%%
%% Second half of scan_ndata_decl/2, entered after the mandatory
%% whitespace. NData is [] when ">" follows, or {ndata, Name} for an
%% "NDATA Name" declaration. A notation that is not yet known is
%% recorded as `undeclared` through the rules write fun, and the state
%% returned by that fun is the one handed back to the caller.
scan_ndata_decl2(Str = ">"++_T,S) ->
    {[], Str, S};
scan_ndata_decl2("NDATA" ++ T,S0 = #xmerl_scanner{rules_read_fun = Read,
                                                  rules_write_fun = Write}) ->
    ?bump_col(5),
    {_,T1,S1} = mandatory_strip(T,S),
    {Name, _NamespaceInfo, T2, S2} = scan_name(T1, S1),
    case Read(notation, Name, S2) of
        undefined -> %% this is legal, since the referenced NOTATION
                     %% may be declared later in internal or external
                     %% subset.
            %% Thread the state returned by the write fun; dropping it
            %% loses the update for rules funs that keep their data in
            %% the scanner state.
            S3 = Write(notation,Name,undeclared,S2),
            {{ndata,Name},T2,S3};
        _Value ->
            {{ndata, Name}, T2, S2}
    end.

%%%%%%% [39] element

%% scan_element(Str, S, Pos) -> {Result, Rest, S1}
%%
%% Entry point: scans one element starting at its name, using the
%% scanner's `space` setting as whitespace default, no language, no
%% parents and an empty namespace record.
scan_element(T, S, Pos) ->
    scan_element(T, S, Pos, S#xmerl_scanner.space,
                 _Lang = [], _Parents = [], #xmlNamespace{}).

%% Scans the element name, remembers the line/column where the element
%% started and continues with the attribute list.
%% NOTE(review): ?strip2 is defined elsewhere and presumably binds T2
%% and S2 from T1 and S1 - confirm.
scan_element(T, S=#xmerl_scanner{line=L,col=C},
             Pos, SpaceDefault,Lang, Parents, NS) ->
    {Name, NamespaceInfo, T1, S1} = scan_name(T, S),
    vc_Element_valid(Name,S),
    ?strip2,
    scan_element(T2, S2, Pos, Name, L, C, _Attrs = [],
                 Lang, Parents, NamespaceInfo, NS,
                 SpaceDefault).


%% scan_element/12 scans the attributes and the end of the start tag.
%% Attrs is accumulated in reverse order. The clauses handle, in
%% order: a buffer ending in "/", an empty buffer, an empty-element
%% tag "/>", a buffer ending in ">", a start tag followed by content,
%% and finally one attribute.
scan_element("/", S=#xmerl_scanner{continuation_fun = F},
             Pos, Name, StartL, StartC, Attrs, Lang, Parents,
             NSI, NS, SpaceDefault) ->
    %% Cannot tell yet whether this is "/>"; ask for more input.
    ?dbg("trailing / detected~n", []),
    F(fun(MoreBytes, S1) -> scan_element("/" ++ MoreBytes, S1,
                                         Pos, Name, StartL, StartC, Attrs,
                                         Lang,Parents,NSI,NS,SpaceDefault) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_element([], S=#xmerl_scanner{continuation_fun = F},
             Pos, Name, StartL, StartC, Attrs, Lang, Parents,
             NSI, NS, SpaceDefault) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_element(MoreBytes, S1,
                                         Pos, Name, StartL, StartC, Attrs,
                                         Lang,Parents,NSI,NS,SpaceDefault) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_element("/>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook,
                                            event_fun = Event,
                                            line = L, col = C,
                                            xmlbase_cache=XMLBase}, Pos,
             Name, _StartL, _StartC, Attrs0, Lang, Parents, NSI,
             Namespace, _SpaceDefault) ->
    ?bump_col(2),
    Attrs = lists:reverse(Attrs0),
    E=processed_whole_element(S, Pos, Name, Attrs, Lang, Parents,NSI,Namespace),

    wfc_unique_att_spec(Attrs,S),
    %% Pass the column-bumped state S (not S0) to the event fun, as the
    %% ">" clause below does; otherwise the bump for "/>" is lost.
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = L,
                                               col = C,
                                               data = E}, S),
    {Ret, S2} = Hook(E, S1),
    %% Restore the xml:base that was in effect before this element.
    S2b=S2#xmerl_scanner{xmlbase=XMLBase},
    {Ret, T, S2b};
scan_element(">", S=#xmerl_scanner{continuation_fun = F},
             Pos, Name, StartL, StartC, Attrs, Lang, Parents,
             NSI, NS, SpaceDefault) ->
    ?dbg("trailing > detected~n", []),
    F(fun(MoreBytes, S1) -> scan_element(">" ++ MoreBytes, S1,
                                         Pos, Name, StartL, StartC, Attrs,
                                         Lang,Parents,NSI,NS,SpaceDefault) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_element(">" ++ T, S0 = #xmerl_scanner{event_fun = Event,
                                           hook_fun = Hook,
                                           line = L, col = C,
                                           xmlbase_cache=XMLBase,
                                           space = SpaceOption},
             Pos, Name, StartL, StartC, Attrs0, Lang, Parents,
             NSI, Namespace, SpaceDefault) ->
    ?bump_col(1),
    Attrs = lists:reverse(Attrs0),
    wfc_unique_att_spec(Attrs,S),
    %% xml:space="default" selects the scanner option, "preserve"
    %% forces preserve, anything else keeps the inherited default.
    XMLSpace = case lists:keysearch('xml:space', #xmlAttribute.name, Attrs) of
                   false -> SpaceDefault;
                   {value, #xmlAttribute{value="default"}} -> SpaceOption;
                   {value, #xmlAttribute{value="preserve"}} -> preserve;
                   _ -> SpaceDefault
               end,

    E0=processed_whole_element(S,Pos,Name,Attrs,Lang,Parents,NSI,Namespace),
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = started,
                                               line = StartL,
                                               col = StartC,
                                               data = E0}, S),

    {Content, T1, S2} = scan_content(T, S1, Name, Attrs, XMLSpace,
                                     E0#xmlElement.language,
                                     [{Name, Pos}|Parents], Namespace),

    Element=E0#xmlElement{content=Content,
                          xmlbase=E0#xmlElement.xmlbase},
    S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = L,
                                               col = C,
                                               data = Element}, S2),
    {Ret, S4} = Hook(Element, S3),
    S4b=S4#xmerl_scanner{xmlbase=XMLBase},
    {Ret, T1, S4b};
scan_element(T, S, Pos, Name, StartL, StartC, Attrs, Lang, Parents,
             NSI, NS, SpaceDefault) ->
    %% One attribute: Name Eq AttValue.
    {AttName, NamespaceInfo, T1, S1} = scan_name(T, S),
    {T2, S2} = scan_eq(T1, S1),
    {AttType,_DefaultDecl} = get_att_type(S2,AttName,Name),
    {AttValue, T3, S3,IsNorm} = scan_att_value(T2, S2, AttType),
%%    check_default_value(S3,DefaultDecl,AttValue),
    NewNS = check_namespace(AttName, NamespaceInfo, AttValue, NS),
    %% NOTE(review): hd(T3) raises badarg if the buffer ends right
    %% after the attribute value; the continuation fun is not
    %% consulted here.
    wfc_whitespace_betw_attrs(hd(T3),S3),
    %% NOTE(review): ?strip4 presumably binds T4 and S4 - confirm.
    ?strip4,
    %% Attribute positions are 1-based and follow document order.
    AttrPos = case Attrs of
                  [] ->
                      1;
                  [#xmlAttribute{pos = P}|_] ->
                      P+1
              end,
    Attr = #xmlAttribute{name = AttName,
                         pos = AttrPos,
                         language = Lang,
                         namespace = NamespaceInfo,
                         value = AttValue,
                         normalized = IsNorm},
    XMLBase=if
                AttName=='xml:base' ->
                    resolve_relative_uri(AttValue,S4#xmerl_scanner.xmlbase);
                true ->
                    S4#xmerl_scanner.xmlbase
            end,

    #xmerl_scanner{event_fun = Event,
                   line = Line,
                   col = Col} = S4,
    %% xmlbase_cache keeps the base from the state this clause was
    %% entered with; the "/>" and ">" clauses write it back.
    S5 = Event(#xmerl_event{event = ended,
                            line = Line,
                            col = Col,
                            data = Attr},
               S4#xmerl_scanner{xmlbase=XMLBase,
                                xmlbase_cache=S#xmerl_scanner.xmlbase}),
    scan_element(T4, S5, Pos, Name, StartL, StartC, [Attr|Attrs],
                 Lang, Parents, NSI, NewNS, SpaceDefault).

%% get_att_type(S, AttName, ElemName) -> {AttType, DefaultDecl}
%%
%% Looks up the declared type of attribute AttName of element ElemName
%% through the rules read fun. Unknown elements and undeclared
%% attributes yield {'CDATA', no_value}.
get_att_type(S=#xmerl_scanner{rules_read_fun=Read},AttName,ElemName) ->
    case Read(elem_def,ElemName,S) of
        #xmlElement{attributes = Attrs} ->
            case lists:keysearch(AttName,1,Attrs) of
                {value,{_,AttType,_,DefaultDecl,_}} ->
                    {AttType,DefaultDecl};
                _ -> {'CDATA',no_value} %% undefined attribute shall be treated as CDATA
            end;
        _ -> {'CDATA',no_value}
    end.

%% resolve_relative_uri(NewBase, CurrentBase) -> string()
%%
%% Resolves an xml:base value against the base currently in effect.
%% An absolute path ("/...") replaces the path of CurrentBase when
%% that parses as a URI with scheme, host and port; if CurrentBase
%% does not parse, NewBase is returned unchanged. Any other NewBase
%% is joined onto CurrentBase as a path.
%% NOTE(review): only the {error,_} and 5-tuple results of
%% xmerl_uri:parse/1 are handled; any other result shape would raise
%% case_clause here - confirm which shapes parse/1 can return.
resolve_relative_uri(NewBase="/"++_,CurrentBase) ->
    case xmerl_uri:parse(CurrentBase) of
        {error,_Reason} ->
            NewBase;
        {Scheme,Host,Port,_Path,_Query} ->
            %% The "://" separator between scheme and authority is
            %% required for the result to be a URI again.
            atom_to_list(Scheme)++"://"++Host++":"++integer_to_list(Port)++NewBase
    end;
resolve_relative_uri(NewBase,CurrentBase) ->
    filename:join(CurrentBase,NewBase).
%% processed_whole_element(S, Pos, Name, Attrs, Lang, Parents, NSI, Namespace)
%%     -> #xmlElement{}
%%
%% Builds the element record once all attributes of the start tag are
%% known. The record has no content yet.
processed_whole_element(S=#xmerl_scanner{xmlbase = XMLBase},
                        Pos, Name, Attrs, Lang, Parents, NSI, Namespace) ->
    Language = check_language(Attrs, Lang),

    {ExpName, ExpAttrs} =
        case S#xmerl_scanner.namespace_conformant of
            true ->
                %% expand attribute names. We need to do this after having
                %% scanned all attributes of the element, since (as far as
                %% I can tell), XML Names only specifies that namespace attrs
                %% are valid within the whole scope of the element in which
                %% they are declared, which should also mean that even if they
                %% are declared after some other attributes, the namespace
                %% should apply to those attributes as well.
                %% Note that the default URI does not apply to attribute names.
                AttrNamespace = Namespace#xmlNamespace{default = []},
                Expand =
                    fun(A) ->
                            Expanded = expanded_name(A#xmlAttribute.name,
                                                     A#xmlAttribute.namespace,
                                                     AttrNamespace, S),
                            A#xmlAttribute{expanded_name = Expanded}
                    end,
                Expanded = lists:map(Expand, Attrs),
                {expanded_name(Name, NSI, Namespace, S), Expanded};
            false ->
                {Name, Attrs}
        end,

    #xmlElement{name = Name,
                xmlbase = XMLBase,
                pos = Pos,
                parents = Parents,
                attributes = ExpAttrs,
                language = Language,
                expanded_name = ExpName,
                nsinfo = NSI,
                namespace = Namespace}.


%% check_language(Attrs, Lang) -> Language
%%
%% The value of the first xml:lang attribute in Attrs, or the
%% inherited Lang when there is none.
check_language(Attrs, Lang) ->
    case lists:keysearch('xml:lang', #xmlAttribute.name, Attrs) of
        {value, #xmlAttribute{value = Found}} ->
            Found;
        false ->
            Lang
    end.


%% check_namespace(AttName, NamespaceInfo, Value, NS) -> NS1
%%
%% Updates the namespace record for a namespace declaration: `xmlns`
%% sets the default namespace, `xmlns:Prefix` adds or replaces the
%% node for Prefix. Every other attribute leaves NS untouched.
check_namespace(xmlns, _, Value, NS) ->
    Default = list_to_atom(Value),
    NS#xmlNamespace{default = Default};
check_namespace(_, {"xmlns", Prefix}, Value,
                NS = #xmlNamespace{nodes = Nodes}) ->
    Node = {Prefix, list_to_atom(Value)},
    NS#xmlNamespace{nodes = keyreplaceadd(Prefix, 1, Nodes, Node)};
check_namespace(_, _, _, NS) ->
    NS.
%% expanded_name(Name, NamespaceInfo, Namespace, S) -> ExpandedName
%%
%% Without namespace info the name is returned as is, or as
%% {DefaultURI, Name} when a default namespace is set. A name with
%% namespace info {Prefix, Local} becomes {URI, LocalAtom}; an
%% undeclared prefix is a fatal error. "xmlns" prefixes are returned
%% as {"xmlns", Local}.
expanded_name(Name, [], #xmlNamespace{default = []}, _S) ->
    Name;
expanded_name(Name, [], #xmlNamespace{default = URI}, _S) ->
    {URI, Name};
expanded_name(_Name, {"xmlns", Local}, _NS, _S) -> % CHECK THIS /JB
    {"xmlns",Local};
expanded_name(_Name, {Prefix, Local}, #xmlNamespace{nodes = Ns}, S) ->
    case lists:keysearch(Prefix, 1, Ns) of
        {value, {_, URI}} ->
            {URI, list_to_atom(Local)};
        false ->
            %% A namespace constraint of XML Names is that the prefix
            %% must be declared
            ?fatal({namespace_prefix_not_declared, Prefix}, S)
    end.




%% keyreplaceadd(K, Pos, List, Obj) -> List1
%%
%% Replaces the first tuple whose element Pos compares equal to K with
%% Obj, or appends Obj when no tuple matches.
keyreplaceadd(K, Pos, [H|T], Obj) when K == element(Pos, H) ->
    [Obj|T];
keyreplaceadd(K, Pos, [H|T], Obj) ->
    [H|keyreplaceadd(K, Pos, T, Obj)];
keyreplaceadd(_K, _Pos, [], Obj) ->
    [Obj].

%%%%%%% [10] AttValue
%% normalize the attribute value according to XML 1.0 section 3.3.3

%% scan_att_value(Str, S, AttType) -> {Value, Rest, S1, IsNormalized}
%%
%% Scans a quoted attribute value. A parameter-entity reference in
%% front of the value is fatal in the prolog environment; otherwise it
%% is expanded, prepended to the input and scanning starts over. For
%% types other than 'CDATA' normalize/3 is applied right after the
%% opening quote.
scan_att_value([], S=#xmerl_scanner{continuation_fun = F},AT) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_att_value(MoreBytes, S1, AT) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_att_value("%"++_T,S=#xmerl_scanner{environment=prolog},_AttType) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_att_value("%"++T,S0=#xmerl_scanner{rules_read_fun=Read,
                                        rules_write_fun=Write,
                                        rules_delete_fun=Delete},AttType) ->
    ?bump_col(1),
    {Name,T1,S1} = scan_pe_reference(T,S),
    {ExpandedRef,S2} =
        case expand_pe_reference(Name,S1,in_literal) of
            Tuple when is_tuple(Tuple) ->
                %% {system,URI} or {public,URI}
                %% Included in literal, just get external file.
                {ExpRef,Sx}=fetch_not_parse(Tuple,S1),
                {EntV,_,_S2} = scan_entity_value(ExpRef, Sx, no_delim,
                                                 Name,parameter),
                %% should do an update Write(parameter_entity) so next
                %% expand_pe_reference is faster
                %% (the underscore-prefixed _S2/_S3 are real bindings
                %% that are read below)
                Delete(parameter_entity,Name,_S2),
                _S3 = Write(parameter_entity,Name,EntV,_S2),
                EntV2 = Read(parameter_entity,Name,_S3),
                {EntV2,_S3};
            ExpRef ->
                {ExpRef,S1}
        end,
    {_,T2,S3} = strip(ExpandedRef ++ T1,S2),
    scan_att_value(T2,S3,AttType);
scan_att_value([H|T], S0,'CDATA'=AT) when H == $"; H == $' ->
    ?bump_col(1),
    scan_att_chars(T, S, H, [],[], AT,false);
scan_att_value([H|T], S0,AttType) when H == $"; H == $' ->
    ?bump_col(1),
    {T1,S1,IsNorm} = normalize(T,S,false),
    scan_att_chars(T1, S1, H, [],[], AttType,IsNorm).

%% scan_att_chars(Str, S, Delim, Acc, TmpAcc, AttType, IsNorm)
%%     -> {Value, Rest, S1, IsNorm1}
%%
%% Accumulates the characters of an attribute value until the closing
%% quote Delim. Acc holds the whole value in reverse; TmpAcc holds the
%% current whitespace-separated token in reverse and is handed to
%% check_att_default_val/4 whenever a token ends (not for 'CDATA'
%% whitespace, where it is simply reset).
scan_att_chars([],S=#xmerl_scanner{continuation_fun=F},H,Acc,TmpAcc,AT,IsNorm)->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) ->
              scan_att_chars(MoreBytes, S1, H, Acc,TmpAcc,AT,IsNorm)
      end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_att_chars([H|T], S0, H, Acc, TmpAcc,AttType,IsNorm) -> % End quote
    ?bump_col(1),
    check_att_default_val(S#xmerl_scanner.validation,TmpAcc,AttType,S),
    {Acc2,S2,IsNorm2} =
        if
            AttType == 'CDATA' -> {Acc,S,IsNorm};
            true ->
                normalize(Acc,S,IsNorm)
        end,
    {lists:reverse(Acc2), T, S2,IsNorm2};
scan_att_chars("&" ++ T, S0, Delim, Acc, TmpAcc,AT,IsNorm) -> % Reference
    ?bump_col(1),
    {ExpRef, T1, S1} = scan_reference(T, S),
    case markup_delimeter(ExpRef) of
        true ->
            %% Escaped markup delimiters are kept as a nested string
            %% in the accumulator instead of being pushed back onto
            %% the input.
            scan_att_chars(T1,S1,Delim,[ExpRef|Acc],[ExpRef|TmpAcc],AT,IsNorm);
        _ ->
            %% Any other replacement text is pushed back onto the
            %% input and scanned again.
            scan_att_chars(string_to_char_set(S#xmerl_scanner.encoding,ExpRef)
                           ++ T1, S1, Delim, Acc,TmpAcc, AT,IsNorm)
    end;
scan_att_chars("<" ++ _T, S0, _Delim, _Acc,_, _,_) -> % Tags not allowed here
    ?fatal(unexpected_char, S0);
scan_att_chars([H|T], S0, Delim, Acc, _TmpAcc,'CDATA',IsNorm)
  when ?whitespace(H) ->
    ?bump_col(1),
    scan_att_chars(T, S, Delim, [$\s|Acc], [],'CDATA',IsNorm);
scan_att_chars([H|T], S0, Delim, Acc, TmpAcc,AT,IsNorm)
  when ?whitespace(H) ->
    ?bump_col(1),
    {T1,S1,IsNorm2} = normalize(T,S,IsNorm),
    check_att_default_val(S#xmerl_scanner.validation,TmpAcc,AT,S1),
    scan_att_chars(T1, S1, Delim, [$\s|Acc],[], AT,IsNorm2);
scan_att_chars(Str, S0, Delim, Acc, TmpAcc,AT,IsNorm) ->
    ?bump_col(1),
    {Ch,T} = to_ucs(S#xmerl_scanner.encoding,Str),
    valid_Char(S#xmerl_scanner.validation,AT,Ch,S),
    scan_att_chars(T, S, Delim, [Ch|Acc], [Ch|TmpAcc],AT,IsNorm).

%% markup_delimeter(Str) -> boolean()
%%
%% True for the one-character strings that stand for an escaped markup
%% delimiter or a "%".
markup_delimeter("&")-> true;
markup_delimeter("\"") -> true;
markup_delimeter("\'") -> true;
markup_delimeter("<") -> true;
markup_delimeter(">") -> true;
markup_delimeter("%") -> true;
markup_delimeter(_) -> false.

%% check_att_default_val(Validation, RevName, AttType, S)
%%
%% Only acts when validation is `dtd` and a non-empty token was
%% collected: the reversed token is put in order and passed to
%% check_att_default_val/3. Otherwise returns ok.
check_att_default_val(dtd,[],_Ent,_S) ->
    ok;
check_att_default_val(dtd,RevName,Ent,S) ->
    check_att_default_val(lists:reverse(RevName),Ent,S);
check_att_default_val(_,_,_,_) ->
    ok.

%% check_att_default_val(Name, AttType, S)
%%
%% Per attribute type:
%%   ENTITY/ENTITIES - first character must be a letter; the name is
%%                     recorded as an `undeclared` entity.
%%   IDREF/IDREFS    - first character must be a letter; the name is
%%                     recorded as an `undeclared` id.
%%   ID              - must be a Name; a duplicate ID is fatal; a
%%                     placeholder left by an earlier IDREF is deleted
%%                     before the id is stored.
%% Other types return ok.
check_att_default_val(Name,Ent,S=#xmerl_scanner{rules_write_fun=Write})
  when Ent == 'ENTITY'; Ent == 'ENTITIES' ->
    case xmerl_lib:is_letter(hd(Name)) of
        true -> ok;
        _ -> ?fatal({illegal_first_character,Ent,Name},S)
    end,
    SName = list_to_atom(Name),
    Write(entity,SName,undeclared,S);
check_att_default_val(Name,IDR,S=#xmerl_scanner{rules_write_fun=Write})
  when IDR == 'IDREF'; IDR == 'IDREFS' ->
    case xmerl_lib:is_letter(hd(Name)) of
        true -> ok;
        _ -> ?fatal({illegal_first_character,IDR,Name},S)
    end,
    SName = list_to_atom(Name),
    Write(id,SName,undeclared,S);
check_att_default_val(Name,'ID',S=#xmerl_scanner{rules_write_fun=Write,
                                                 rules_read_fun=Read,
                                                 rules_delete_fun=Delete}) ->
    case xmerl_lib:is_name(Name) of
        false ->
            ?fatal({'ID_names_must_be_Name_production',Name},S);
        _ ->
            ok
    end,
    SName = if
                is_list(Name) -> list_to_atom(Name);
                true -> Name
            end,
    case Read(id,SName,S) of
        undeclared -> %% was referenced in IDREF/IDREFS before defined
            Delete(id,SName,S);
        SName -> ?fatal({values_must_be_unique,'ID',SName},S);
        undefined -> ok
    end,
    Write(id,SName,SName,S);
check_att_default_val(_,_,_) ->
    ok.

%% valid_Char(Validation, AttType, Char, S) -> ok
%%
%% NMTOKEN/NMTOKENS values are checked by vc_Valid_Char/3 when
%% validating against a DTD; every other character, given either as
%% [C] or as C, must satisfy xmerl_lib:is_char/1 or the scan fails.
valid_Char(dtd,AT,C,S) when AT=='NMTOKEN';AT=='NMTOKENS' ->
    vc_Valid_Char(AT,C,S);
valid_Char(_,_,[C],S) ->
    case xmerl_lib:is_char(C) of
        true ->
            ok;
        false ->
            ?fatal({unexpected_char,C}, S)
    end;
valid_Char(_,_,C,S) ->
    case xmerl_lib:is_char(C) of
        true ->
            ok;
        false ->
            ?fatal({unexpected_char,C}, S)
    end.



%%%%%%% [43] content

%% scan_content(Str, S, Name, Attrs, Space, Lang, Parents, NS)
%%     -> {Content, Rest, S1}
%%
%% Scans the content of element Name up to and including its end tag.
%% Child positions start at 1.
scan_content(T, S, Name, Attrs, Space, Lang, Parents, NS) ->
    scan_content(T, S, _Pos = 1, Name, Attrs, Space,
                 Lang, Parents, NS, _Acc = [],_MarkupDel=[]).

%% scan_content/11. Acc collects the content in reverse. The last
%% argument is the markup delimiter produced by the previous reference
%% expansion ([] when there is none); the markup clauses only match
%% when it is [], so an escaped "<" or "&" reaches the text clause at
%% the end and is scanned as character data.
scan_content("<", S= #xmerl_scanner{continuation_fun = F},
             Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,_) ->
    ?dbg("trailing < detected~n", []),
    F(fun(MoreBytes, S1) -> scan_content("<" ++ MoreBytes, S1,
                                         Pos, Name, Attrs,
                                         Space, Lang, Parents, NS, Acc,[]) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
%% Inside an external or internal parsed entity, end of input simply
%% ends the content.
scan_content([], S=#xmerl_scanner{environment={external,{entity,_}}},
             _Pos, _Name, _Attrs, _Space, _Lang, _Parents, _NS, Acc,_) ->
    {lists:reverse(Acc),[],S};
scan_content([], S=#xmerl_scanner{environment=internal_parsed_entity},
             _Pos, _Name, _Attrs, _Space, _Lang, _Parents, _NS, Acc,_) ->
    {lists:reverse(Acc),[],S};
scan_content([], S=#xmerl_scanner{continuation_fun = F},
             Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,_) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_content(MoreBytes, S1,
                                         Pos, Name, Attrs,
                                         Space, Lang, Parents, NS, Acc,[]) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_content("</" ++ T, S0, _Pos, Name, _Attrs, _Space, _Lang,
             _Parents, _NS, Acc,[]) ->
    %% End tag: its name must match the element being scanned.
    ?bump_col(2),
    {ETagName, _NamespaceInfo, T1, S1} = scan_name(T, S),
    if ETagName == Name ->
            ok;
       true ->
            ?fatal({endtag_does_not_match, {was,ETagName,should_have_been, Name}}, S)
    end,
    %% NOTE(review): ?strip2 presumably binds T2 and S2 - confirm.
    ?strip2,
    case T2 of
        ">" ++ T3 ->
            {lists:reverse(Acc), T3, S2};
        _ ->
            ?fatal({error,{unexpected_end_of_STag}},S)
    end;
scan_content([$&|_T]=Str,
             #xmerl_scanner{environment={external,{entity,EName}}} = S0,
             Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,_) ->
    {_EntV,T1,S1}=scan_entity_value(Str,S0 ,[],EName,general),
    %% This is a problem. All referenced entities in the external
    %% entity must be checked for recursion, thus parse the content
    %% but skip the result.
    scan_content(T1,S1,Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]);
scan_content("&"++T,
             #xmerl_scanner{environment=internal_parsed_entity} = S,
             Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,_) ->
    %% The reference is scanned and its expansion discarded.
    {_, T1, S1} = scan_reference(T, S),
    scan_content(T1,S1,Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]);
scan_content("&" ++ T, S0, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]) ->
    ?bump_col(1),
    {ExpRef, T1, S1} = scan_reference(T, S),
    case markup_delimeter(ExpRef) of
        %% Push the delimiter back and flag it so it is scanned as text.
        true -> scan_content(ExpRef++T1,S1,Pos,Name,Attrs,Space,Lang,Parents,NS,Acc,ExpRef);
        _ ->
            scan_content(string_to_char_set(S1#xmerl_scanner.encoding,ExpRef)++T1,S1,Pos,Name,Attrs,Space,Lang,Parents,NS,Acc,[])
    end;
scan_content("<!--" ++ T, S, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]) ->
    %% The comment is not added to Acc but still takes up a position.
    {_, T1, S1} = scan_comment(T, S, Pos, Parents, Lang),
    scan_content(T1, S1, Pos+1, Name, Attrs, Space, Lang, Parents, NS, Acc,[]);
scan_content("<" ++ T, S0, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]) ->
    ?bump_col(1),
    {Markup, T1, S1} =
        scan_content_markup(T, S, Pos, Name, Attrs, Space, Lang, Parents, NS),
    AccF = S1#xmerl_scanner.acc_fun,
    %% The accumulator fun may return an explicit next position;
    %% otherwise the position advances by one.
    {NewAcc, NewPos, NewS} = case AccF(Markup, Acc, S1) of
                                 {Acc2, S2} ->
                                     {Acc2, Pos+1, S2};
                                 {Acc2, Pos2, S2} ->
                                     {Acc2, Pos2, S2}
                             end,
    scan_content(T1, NewS, NewPos, Name, Attrs, Space, Lang,
                 Parents, NS, NewAcc,[]);
scan_content([_H|T], S= #xmerl_scanner{environment={external,{entity,_}}},
             Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,_) ->
    %% Guess we have to scan the content to find any internal entity
    %% references.
    scan_content(T,S,Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]);
scan_content(T, S=#xmerl_scanner{acc_fun = F,
                                 event_fun = Event,
                                 hook_fun=Hook,
                                 line = _L},
             Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,MarkupDel) ->
    %% Character data: emit a started event, scan the text, run the
    %% hook fun on the text node, emit an ended event carrying the
    %% hook's result and hand that result to the accumulator fun.
    Text0 = #xmlText{pos = Pos,
                     parents = Parents},
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = started,
                                               line = S#xmerl_scanner.line,
                                               data = Text0}, S),
    {Data, T1, S2} = scan_char_data(T, S1, Space,MarkupDel),
    Text = Text0#xmlText{value = Data},
    {Ret,S2b} = Hook(Text,S2),
    S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended,
                                               line = S2b#xmerl_scanner.line,
                                               data = Ret}, S2b),
    {NewAcc, NewPos, NewS} = case F(Ret, Acc, S3) of
                                 {Acc4, S4} ->
                                     {Acc4, Pos+1, S4};
                                 {Acc4, Pos4, S4} ->
                                     {Acc4, Pos4, S4}
                             end,
    scan_content(T1, NewS, NewPos, Name, Attrs, Space, Lang,
                 Parents, NS, NewAcc,[]).


%% scan_content_markup(Str, S, Pos, Name, Attrs, Space, Lang, Parents, NS)
%%     -> {Markup, Rest, S1}
%%
%% Called after a "<" in content: dispatches to the CDATA section,
%% processing instruction or element scanner.
scan_content_markup([], S=#xmerl_scanner{continuation_fun = F},
                    Pos, Name, Attrs, Space, Lang, Parents, NS) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_content_markup(
                              MoreBytes,S1,Pos,Name,
                              Attrs,Space,Lang,Parents,NS) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_content_markup("![CDATA[" ++ T, S0, Pos, _Name, _Attrs,
                    _Space, _Lang, Parents, _NS) ->
    ?bump_col(8),
    scan_cdata(T, S, Pos, Parents);
scan_content_markup("?"++T,S0,Pos,_Name,_Attrs,_Space,_Lang,_Parents,_NS) ->
    ?bump_col(1),
    scan_pi(T, S, Pos);
scan_content_markup(T, S, Pos, _Name, _Attrs, Space, Lang, Parents, NS) ->
    scan_element(T, S, Pos, Space, Lang, Parents, NS).

%% scan_char_data(Str, S, Space, MUD) -> {Data, Rest, S1}
%%
%% Scans character data with an empty accumulator. MUD is the pending
%% escaped markup delimiter passed down from scan_content/11.
scan_char_data(T, S, Space,MUD) ->
    scan_char_data(T, S, Space,MUD, _Acc = []).
%%%%%%% [14] CharData

%% scan_char_data(Str, S, Space, MUD, Acc) -> {Data, Rest, S1}
%%
%% Accumulates character data (Acc, reversed) until markup starts
%% ("<" or "&") or the input ends. MUD is "&" or "<" when the first
%% character is an escaped delimiter that must be taken as data; it is
%% cleared once that character has been consumed. Inside a parsed
%% entity the end of input ends the data; otherwise the continuation
%% fun is asked for more. A literal "]]>" in content is a fatal error.
scan_char_data([], S=#xmerl_scanner{environment={external,{entity,_}}},
               _Space,_MUD, Acc) ->

    {lists:reverse(Acc), [], S};
scan_char_data([], S=#xmerl_scanner{environment=internal_parsed_entity},
               _Space, _MUD,Acc) ->

    {lists:reverse(Acc), [], S};
scan_char_data([], S=#xmerl_scanner{continuation_fun = F}, Space, MUD, Acc) ->
    %% MUD is read in the resume fun, so it is not underscore-prefixed.
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_char_data(MoreBytes,S1,Space,MUD,Acc) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_char_data([$&|T], S,Space,"&",Acc) ->
    %% Escaped ampersand: plain data.
    scan_char_data(T, S, Space,[], [$&|Acc]);
scan_char_data(T=[$&|_], S,_Space,_MUD,Acc) ->
    %% A reference starts here; the caller handles it.
    {lists:reverse(Acc), T, S};
scan_char_data("]]>" ++ _T, S, _Space,_MUD, _Acc) ->
    %% See Section 2.4: Especially:
    %% "The right angle bracket (>) MAY be represented using the string "&gt;",
    %% and MUST, for compatibility, be escaped using either "&gt;" or a
    %% character reference when it appears in the string "]]>" in content, when
    %% that string is not marking the end of a CDATA section.
    ?fatal(unexpected_cdata_end, S);
scan_char_data([$<|T],S,Space,"<", Acc) ->
    %% Escaped "<": plain data.
    scan_char_data(T, S, Space,[], [$<|Acc]);
scan_char_data(T = [$<|_], S, _Space,_MUD,Acc) ->
    %% Markup starts here; the caller handles it.
    {lists:reverse(Acc), T, S};
scan_char_data(T = [H|R], S, Space,MUD, Acc) when ?whitespace(H) ->
    if
        MUD =:= [], Acc =:= [], H =:= $\n, Space =:= preserve ->
            %% Fast path, taken only for a newline at the very start
            %% of the data when whitespace is preserved.
            case fast_accumulate_whitespace(R, S, T) of
                {done, Reply} ->
                    Reply;
                {NewAcc, T1, S1} ->
                    scan_char_data(T1, S1, Space, MUD, NewAcc)
            end;
        true ->
            {NewAcc, T1, S1} = accumulate_whitespace(T, S, Space, Acc),
            scan_char_data(T1, S1, Space,MUD,NewAcc)
    end;
scan_char_data([H1,H2|_T],S,_Space,_MUD,_Acc) when ?non_character(H1,H2) ->
    ?fatal({error,{not_allowed_to_use_Unicode_noncharacters}},S);
%% (A second "]]>" clause used to follow here; it could never match
%% because the "]]>" clause above is tried first.)
scan_char_data(Str,S0,Space,MUD,Acc) ->
    ?bump_col(1),
    {Ch,T} = wfc_legal_char(Str,S),
    scan_char_data(T,S,Space,MUD,[Ch|Acc]).



%%%%%%% [18]-[21] CDATA

%% scan_cdata(Str, S, Pos, Parents) -> {#xmlText{type = cdata}, Rest, S1}
%%
%% Scans a CDATA section; Str starts right after "<![CDATA[".
scan_cdata(Str, S, Pos, Parents) ->
    scan_cdata(Str, S, Pos, Parents, _Acc = []).


%% Characters are collected in reverse in Acc until "]]>" is found.
%% Every character must satisfy xmerl_lib:is_char/1.
scan_cdata([], S=#xmerl_scanner{continuation_fun = F}, Pos, Parents, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_cdata(MoreBytes, S1, Pos, Parents, Acc) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_cdata("]]>" ++ T, S0, Pos, Parents, Acc) ->
    ?bump_col(3),
    {#xmlText{pos = Pos,
              parents = Parents,
              value = lists:reverse(Acc),
              type = cdata}, T, S};
scan_cdata(Str, S0, Pos, Parents, Acc) ->
    {Ch,T} = to_ucs(S0#xmerl_scanner.encoding,Str),
    case xmerl_lib:is_char(Ch) of
        true ->
            ?bump_col(1),
            scan_cdata(T, S, Pos, Parents, [Ch|Acc]);
        false ->
            ?fatal({unexpected_char,Ch}, S0)
    end.
%%%%%%% [67] Reference
%% returns a three tuple {Result,RestBuf,State}

%% scan_reference(Str, S) -> {Result, Rest, S1}
%%
%% Str starts right after the "&". "#x..." is a hexadecimal and
%% "#..." a decimal character reference ([66] CharRef); anything else
%% is an entity reference. An empty reference ("&#;" or "&#x;") is
%% fatal with invalid_char_ref. When the buffer ends inside the
%% "#"/"#x" prefix or right after it, the continuation fun is asked
%% for more input instead of failing.
scan_reference([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_reference(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_reference("#", S=#xmerl_scanner{continuation_fun = F}) ->
    %% Cannot tell a decimal from a hexadecimal reference yet.
    ?dbg("trailing # detected~n", []),
    F(fun(MoreBytes, S1) -> scan_reference("#" ++ MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_reference("#x;" ++ _T, S0) ->
    %% [66] CharRef without digits
    ?bump_col(2),
    ?fatal(invalid_char_ref, S);
scan_reference("#x" ++ T, S0) ->
    %% [66] CharRef
    %% "#x" is two characters, so the column advances by two. An empty
    %% T is handled by the continuation clause of scan_char_ref_hex/3.
    ?bump_col(2),
    {[Ch], T2, S2} = scan_char_ref_hex(T, S, 0),
    {to_char_set(S2#xmerl_scanner.encoding,Ch),T2,S2};
scan_reference("#;" ++ _T, S0) ->
    %% [66] CharRef without digits
    ?bump_col(1),
    ?fatal(invalid_char_ref, S);
scan_reference("#" ++ T, S0) ->
    %% [66] CharRef
    %% An empty T is handled by the continuation clause of
    %% scan_char_ref_dec/3.
    ?bump_col(1),
    scan_char_ref_dec(T, S, []);
scan_reference(T, S) ->
    %% Any exit raised while scanning the entity reference is reported
    %% as error_scanning_entity_ref.
    case catch scan_entity_ref(T, S) of
        {'EXIT', _} ->
            ?fatal(error_scanning_entity_ref,S);
        Other ->
            Other
    end.


%% Chapter 4.4.2: ... the replacement text of entities used to escape
%% markup delimiters (the entities amp, lt, gt, apos, quot) is always treated
%% as data. (The string "AT&amp;T;" expands to "AT&T;" and the remaining
%% ampersand is not recognized as an entity-reference delimiter.)"
%%
%% How to achieve this? My current approach is to insert the *strings* "&",
%% "<", ">", "'", and "\"" instead of the characters. The processor will
%% ignore them when performing multiple expansions. This means, for now, that
%% the character data output by the processor is (1-2 levels) deep.
%% At some suitable point, we should flatten these, so that application-level
%% processors should not have to be aware of this detail.
%% scan_entity_ref(Str, S) -> {Replacement, Rest, S1}
%%
%% Scans an entity reference; Str starts right after the "&". The
%% five predefined entities yield their one-character string, any
%% other name is expanded through expand_reference/2 and must be
%% terminated by ";".
scan_entity_ref([], S=#xmerl_scanner{continuation_fun = Cont}) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_entity_ref(MoreBytes, S1) end,
    Fail = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, Fail, S);
scan_entity_ref([$a,$m,$p,$;|Rest], S0) ->
    ?bump_col(4),
    {"&", Rest, S};
scan_entity_ref([$l,$t,$;|Rest], S0) ->
    ?bump_col(3),
    {"<", Rest, S};
scan_entity_ref([$g,$t,$;|Rest], S0) ->
    ?bump_col(3),
    {">", Rest, S};
scan_entity_ref([$a,$p,$o,$s,$;|Rest], S0) ->
    ?bump_col(5),
    {"'", Rest, S};
scan_entity_ref([$q,$u,$o,$t,$;|Rest], S0) ->
    ?bump_col(5),
    {"\"", Rest, S};
scan_entity_ref(Str, S) ->
    {Name, _NamespaceInfo, AfterName, S1} = scan_name(Str, S),
    Rest = scan_mandatory(";",AfterName,1,S1,expected_entity_reference_semicolon),
    {expand_reference(Name, S1), Rest, S1}.


%%%%%%% [69] PEReference

%% scan_pe_reference(Str, S) -> {Name, Rest, S1}
%%
%% Scans the name of a parameter-entity reference and its mandatory
%% ";"; the column is advanced by one for the semicolon.
scan_pe_reference(Str, S) ->
    {Name, _NamespaceInfo, AfterName, S1} = scan_name(Str, S),
    Rest = scan_mandatory(";",AfterName,1,S1,expected_parsed_entity_reference_semicolon),
    NextCol = S1#xmerl_scanner.col + 1,
    {Name, Rest, S1#xmerl_scanner{col = NextCol}}.

%% expand_pe_reference(Name, S, WS) -> Tuple | Replacement
%%
%% Reads the parameter entity Name through the rules read fun. An
%% unknown entity or a stored {error,_} is fatal. A tuple is returned
%% unchanged. Any other value is returned as is when WS is in_literal
%% and padded with one space on each side otherwise.
expand_pe_reference(Name, #xmerl_scanner{rules_read_fun = Read} = S,WS) ->
    case Read(parameter_entity, Name, S) of
        undefined ->
            ?fatal({unknown_parameter_entity, Name}, S); % WFC or VC failure
        {error,_Reason} = Err ->
            ?fatal(Err,S);
        Tuple when is_tuple(Tuple) ->
            Tuple;
        Result when WS == in_literal ->
            Result;
        Result ->
            " "++Result++" "
    end.

% Currently unused
%
% expand_external_pe_reference(Name, #xmerl_scanner{rules_read_fun = Read} = S) ->
%     case Read(parameter_entity, Name, S) of
%	undefined ->
%	    ?fatal({unknown_parameter_entity, Name}, S);
%	Result ->
%	    fetch_DTD(Result,S)
%     end.
%%%%%%% [68] EntityReference

%% expand_reference(Name, S) -> Replacement
%%
%% While scanning inside an external or internal parsed entity the
%% reference is not expanded: the entity name is returned as a string.
%% Otherwise the entity is read through the rules read fun; unknown
%% entities and entities whose external target was not found are
%% fatal, and the stored value is returned after the well-formedness
%% checks have run.
expand_reference(Name, #xmerl_scanner{environment={external,{entity,_}}}) ->
    atom_to_list(Name);
expand_reference(Name, #xmerl_scanner{environment=internal_parsed_entity}) ->
    atom_to_list(Name);
expand_reference(Name, #xmerl_scanner{rules_read_fun = Read} = S) ->
    case Read(entity, Name, S) of
        undefined ->
            ?fatal({unknown_entity_ref, Name}, S);
        {_,external,{error,enoent}} ->
            ?fatal({error,{entity_target_not_found,{error,enoent},Name}},S);
        {DefEnv,EntType,Value} ->
            wfc_Entity_Declared(DefEnv,S,Name),
            %% The check runs on the value converted to the scanner's
            %% encoding; the stored value itself is what is returned.
            Converted = string_to_char_set(S#xmerl_scanner.encoding,Value),
            wfc_Internal_parsed_entity(EntType,Converted,S),
            Value
    end.


%%%%%%% [66] CharRef

%% scan_char_ref_dec(Str, S, Acc) -> {[Char], Rest, S1}
%%
%% Collects decimal digits (reversed in Acc) up to the terminating
%% ";" and checks the resulting character with wfc_legal_char/2.
scan_char_ref_dec([], S=#xmerl_scanner{continuation_fun = Cont}, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_char_ref_dec(MoreBytes, S1, Acc) end,
    Fail = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, Fail, S);
scan_char_ref_dec([Digit|Rest], S0, Acc) when $0 =< Digit, Digit =< $9 ->
    ?bump_col(1),
    scan_char_ref_dec(Rest, S, [Digit|Acc]);
scan_char_ref_dec([$;|Rest], S0, Acc) ->
    ?bump_col(1),
    Code = list_to_integer(lists:reverse(Acc)),
    {Ch,_} = wfc_legal_char(Code,S),
    {[Ch], Rest, S}.
%% changed return value from [[Ref]]


%% scan_char_ref_hex(Str, S, Acc) -> {[Char], Rest, S1}
%%
%% Folds hexadecimal digits (either case) into the integer Acc, four
%% bits per digit, up to the terminating ";", then checks the
%% resulting character with wfc_legal_char/2.
scan_char_ref_hex([], S=#xmerl_scanner{continuation_fun = Cont}, Acc) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_char_ref_hex(MoreBytes, S1, Acc) end,
    Fail = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, Fail, S);
scan_char_ref_hex([H|Rest], S0, Acc)
  when H >= $0, H =< $9; H >= $a, H =< $f; H >= $A, H =< $F ->
    ?bump_col(1),
    Digit = list_to_integer([H], 16),
    scan_char_ref_hex(Rest, S, (Acc bsl 4) bor Digit);
scan_char_ref_hex([$;|Rest], S0, Acc) ->
    ?bump_col(1),
    {Ch,_} = wfc_legal_char(Acc,S),
    {[Ch], Rest, S}. %% changed return value from [[Acc]]



%%%%%%% [25] Eq
%%% Eq ::= S? '=' S?
%% scan_eq(Str, S) -> {Rest, S1}
%%
%% Skips optional whitespace, requires "=", skips optional whitespace
%% again. A missing "=" is fatal with assignment_expected.
%% NOTE(review): ?strip1 and ?strip3 are defined elsewhere; the code
%% relies on them binding T1/S1 and T3/S3 - confirm.
scan_eq(T, S) ->
    ?strip1,
    case T1 of
        "=" ++ T2 ->
            NextCol = S1#xmerl_scanner.col + 1,
            S2 = S1#xmerl_scanner{col = NextCol},
            ?strip3,
            {T3, S3};
        _ ->
            ?fatal(assignment_expected,S)
    end.


%% scan_name/2
%%
%% We perform some checks here to make sure that the names conform to
%% the "Namespaces in XML" specification. This is an option.
%%
%% Qualified Name:
%% [6]      QName ::= (Prefix ':')? LocalPart
%% [7]      Prefix ::= NCName
%% [8]      LocalPart ::= NCName
%% [4]      NCName ::= (Letter | '_') (NCNameChar)*
%% [5]      NCNameChar ::= Letter | Digit | '.' | '-' | '_'
%%                         | CombiningChar | Extender


%% The effect of XML Names (namespace) conformance is that:
%% - All element types and attribute names contain either zero or one colon
%% - No entity names, PI targets, or notation names contain any colons.
%%
%% scan_name_no_colons/2 will ensure that the name contains no colons iff
%% the scanner has been told to be namespace conformant. Otherwise, it will
%% behave exactly like scan_name/2.
%%
%% scan_name_no_colons(Str, S) -> {Name, NamespaceInfo, Rest, S1}
%%
%% When the scanner is namespace conformant the name is scanned with
%% namespace_conformant temporarily set to no_colons, and the setting
%% is restored in the returned state. Otherwise identical to
%% scan_name/2.
scan_name_no_colons(Str, S = #xmerl_scanner{namespace_conformant = true}) ->
    Strict = S#xmerl_scanner{namespace_conformant = no_colons},
    {Target, NSI, Rest, S1} = scan_name(Str, Strict),
    {Target, NSI, Rest, S1#xmerl_scanner{namespace_conformant = true}};
scan_name_no_colons(Str, S = #xmerl_scanner{namespace_conformant = false}) ->
    scan_name(Str, S).



%% [5] Name ::= (Letter | '_' | ':') (NameChar)*
%%
%% scan_name(Str, S) -> {Name, NamespaceInfo, Rest, S1}
%%
%% Checks the first character of a name and hands the rest to
%% scan_nmtoken/4. A leading ":" is only accepted when the scanner is
%% not namespace conformant. A "%" is fatal in the prolog environment
%% and is expanded as a parameter-entity reference in an external
%% environment.
scan_name([], S=#xmerl_scanner{continuation_fun = Cont}) ->
    ?dbg("cont()...~n", []),
    Resume = fun(MoreBytes, S1) -> scan_name(MoreBytes, S1) end,
    Fail = fun(S1) -> ?fatal(unexpected_end, S1) end,
    Cont(Resume, Fail, S);
scan_name(Str = [$:|T], S0 = #xmerl_scanner{namespace_conformant = NSC}) ->
    case NSC of
        false ->
            ?bump_col(1),
            scan_nmtoken(T, S, [$:], NSC);
        _ ->
            %% In order to conform with the "Namespaces in XML" spec,
            %% we cannot allow names to begin with ":"
            %% (this covers both `true` and `no_colons`)
            ?fatal({invalid_NCName, lists:sublist(Str, 1, 6)}, S0)
    end;
scan_name([$_|T], S0 = #xmerl_scanner{namespace_conformant = NSC}) ->
    ?bump_col(1),
    scan_nmtoken(T, S, [$_], NSC);
scan_name([$%|_T],S=#xmerl_scanner{environment=prolog}) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_name([$%|T],S0=#xmerl_scanner{environment={external,_}}) ->
    %% parameter entity that expands to a name
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    Expansion = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(Expansion ++ T1,S1),
    scan_name(T2,S2);
scan_name(Str, S0 = #xmerl_scanner{namespace_conformant = NSC}) ->
    {Ch,T} = to_ucs(S0#xmerl_scanner.encoding,Str),
    case xmerl_lib:is_letter(Ch) of
        false ->
            ?fatal({invalid_name, lists:sublist(Str, 1, 6)}, S0);
        true ->
            ?bump_col(1),
            scan_nmtoken(T, S, [Ch], NSC)
    end;
scan_name(Str, S) ->
    ?fatal({invalid_name, Str}, S).






%% scan_nmtoken(Str, S, Acc, NSC): continue a name whose first
%% character is already in Acc; no prefix has been seen yet and the
%% local part starts out equal to Acc.
scan_nmtoken(Str, S, Acc, NSC) ->
    IsLatin1 = isLatin1(hd(Acc),true),
    scan_nmtoken(Str, S, Acc, _Prefix = [], _Local = Acc, NSC, IsLatin1).
%% scan_nmtoken/2
%% [7] NmToken ::= (NameChar)+
%%
%% scan_nmtoken(Str, S) -> {Token, NamespaceInfo, Rest, S1}
%%
%% Scans a name token. The first character must be a name character
%% (checked with xmerl_lib:is_namechar/1); the rest is scanned by
%% scan_nmtoken/7 with namespace conformance switched off. In an
%% external environment a leading "%" is expanded as a
%% parameter-entity reference first.
scan_nmtoken([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_nmtoken(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_nmtoken("%"++T, S0=#xmerl_scanner{environment={external,_}}) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_nmtoken(T2,S2);
scan_nmtoken(Str, S) ->
    {Ch,T} = to_ucs(S#xmerl_scanner.encoding,Str),
    case xmerl_lib:is_namechar(Ch) of
        true ->
            scan_nmtoken(T, S#xmerl_scanner{col = S#xmerl_scanner.col+1},
                         _Acc = [Ch], _Prefix = [], _Local = [Ch],
                         _NamespaceConformant = false,isLatin1(Ch,true));
        false ->
            ?fatal({invalid_nmtoken, lists:sublist(Str, 1, 6)}, S)
    end.


%% scan_nmtoken(Str, S, Acc, Prefix, Local, NSC, IsLatin1)
%%     -> {Name, NamespaceInfo, Rest, S1}
%%
%% Acc is the whole name in reverse, Prefix the part before a colon
%% (in order, [] until a colon is seen) and Local the part after it in
%% reverse. IsLatin1 stays true while every character is =< 255. The
%% name is returned as an atom when IsLatin1 is true and as a string
%% otherwise. When the input ends and the continuation fun gives up,
%% the name scanned so far is returned as an atom with an empty rest.
scan_nmtoken([], S=#xmerl_scanner{continuation_fun = F},
             Acc, Prefix, Local, NSC,IsLatin1) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_nmtoken(MoreBytes,S1,Acc,Prefix,Local,NSC,IsLatin1) end,
      fun(S1) -> {list_to_atom(lists:reverse(Acc)),
                  namespace_info(Prefix, Local),[],S1} end,
      S);
%% whitespace marks the end of a name
scan_nmtoken(Str = [H|_], S, Acc, Prefix, Local, _NSC,true) when ?whitespace(H) ->
    %% we don't strip here because the occurrence of whitespace may be an error
    %% e.g. <!ELEMENT spec (front, body, back ?)>
    NmString = lists:reverse(Acc),
    {list_to_atom(NmString), namespace_info(Prefix, Local), Str, S};
scan_nmtoken(Str = [$:|_], S, Acc, [], _Local, no_colons,_IsLatin1) ->
    ?fatal({invalid_NCName,
            lists:sublist(lists:reverse(Acc) ++ Str, 1, 6)}, S);
scan_nmtoken([$:|T], S0, Acc, [], Local, NSC, IsLatin1) ->
    %% First colon: what was scanned so far becomes the prefix.
    ?bump_col(1),
    scan_nmtoken(T, S, [$:|Acc], lists:reverse(Local), [], NSC,IsLatin1);
scan_nmtoken(Str = [$:|_T], S, Acc, _Prefix, _Local, _NSC = true,_IsLatin1) ->
    %% non-empty Prefix means that we've encountered a ":" already.
    %% Conformity with "Namespaces in XML" requires
    %% at most one colon in a name
    ?fatal({invalid_NCName,
            lists:sublist(lists:reverse(Acc) ++ Str, 1, 6)}, S);

%% non-namechar also marks the end of a name
scan_nmtoken(Str, S0, Acc, Prefix, Local, NSC,IsLatin1) ->
    %% NOTE(review): the column is bumped before the character is
    %% classified, and the bumped state is also returned in the two
    %% branches that leave Str unconsumed - callers may depend on
    %% this, confirm before changing.
    ?bump_col(1),
    {Ch,T} = to_ucs(S#xmerl_scanner.encoding,Str),
    case {xmerl_lib:is_namechar(Ch),IsLatin1} of
        {true,_} ->
            scan_nmtoken(T, S, [Ch|Acc], Prefix, [Ch|Local], NSC,isLatin1(Ch,IsLatin1));
        {_,true} ->
            NmStr = lists:reverse(Acc),
            {list_to_atom(NmStr), namespace_info(Prefix, Local), Str, S};
        _ ->
            %% Name contains a character above 255: return it as a
            %% string instead of an atom.
            {lists:reverse(Acc), namespace_info(Prefix, Local), Str, S}
    end.

%% namespace_info(Prefix, RevLocal) -> [] | {Prefix, Local}
%% [] when the name has no prefix.
namespace_info([], _) ->
    [];
namespace_info(Prefix, Local) ->
    {Prefix, lists:reverse(Local)}.

%% isLatin1(Ch, SoFar) -> boolean()
%% Stays false once false; becomes false for a character above 255.
isLatin1(_Ch,false) ->
    false;
isLatin1(Ch,_) when Ch > 255 ->
    false;
isLatin1(_,_) ->
    true.

%%%%%%% [11] SystemLiteral

%% scan_system_literal(Str, S) -> {Literal, Rest, S1}
%%
%% Scans a system literal enclosed in single or double quotes. Str
%% must start with the opening quote.
scan_system_literal([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_system_literal(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_system_literal("\"" ++ T, S) ->
    scan_system_literal(T, S, $", []);
scan_system_literal("'" ++ T, S) ->
    scan_system_literal(T, S, $', []).


%% Collects characters (reversed in Acc) until the closing Delimiter.
%% A "#" inside the literal is reported as a fatal error.
scan_system_literal([], S=#xmerl_scanner{continuation_fun = F},
                    Delimiter, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_system_literal(MoreBytes,S1,Delimiter,Acc) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_system_literal([H|T], S, H, Acc) ->
    {lists:reverse(Acc), T, S#xmerl_scanner{col = S#xmerl_scanner.col+1}};
scan_system_literal("#"++_R, S, _H, _Acc) ->
    %% actually not a fatal error
    ?fatal(fragment_identifier_in_system_literal,S);
scan_system_literal(Str, S, Delimiter, Acc) ->
    {Ch,T} = to_ucs(S#xmerl_scanner.encoding,Str),
    scan_system_literal(T, S#xmerl_scanner{col = S#xmerl_scanner.col+1},
                        Delimiter, [Ch|Acc]).
%%%%%%% [12] PubidLiteral

%% scan_pubid_literal/2
%% Expects an opening single or double quote and hands over to
%% scan_pubid_literal/4 with that quote as the closing delimiter.
%% Returns {Literal, Rest, S}.
scan_pubid_literal([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_pubid_literal(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_pubid_literal([Quote|T], S) when Quote == $"; Quote == $' ->
    Bumped = S#xmerl_scanner{col = S#xmerl_scanner.col+1},
    scan_pubid_literal(T, Bumped, Quote, []);
scan_pubid_literal([Other|_T], S) ->
    ?fatal({invalid_pubid_char, Other}, S).


%% scan_pubid_literal/4
%% Collects characters up to the closing Delimiter. Runs of whitespace
%% are collapsed into a single space; any character that is not a
%% PubidChar is a fatal error.
scan_pubid_literal([], S=#xmerl_scanner{continuation_fun = F},
                   Delimiter, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_pubid_literal(MoreBytes,S1,Delimiter,Acc) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_pubid_literal([Delimiter|T], S, Delimiter, Acc) ->
    Literal = lists:reverse(Acc),
    {Literal, T, S#xmerl_scanner{col = S#xmerl_scanner.col+1}};
scan_pubid_literal(Str = [WS|_], S, Delimiter, Acc) when ?whitespace(WS) ->
    %% Whitespace in public identifiers has to be normalized before
    %% identifiers are compared, so it is done while scanning.
    {_, Rest, S1} = pub_id_strip(Str, S),
    scan_pubid_literal(Rest, S1, Delimiter, [16#20|Acc]);
scan_pubid_literal([Ch|T], S, Delimiter, Acc) ->
    case is_pubid_char(Ch) of
        false ->
            ?fatal({invalid_pubid_char, Ch}, S);
        true ->
            Bumped = S#xmerl_scanner{col = S#xmerl_scanner.col+1},
            scan_pubid_literal(T, Bumped, Delimiter, [Ch|Acc])
    end.

%% True for the non-whitespace characters allowed in a public identifier.
%% Whitespace is deliberately not matched here although it is allowed:
%% scan_pubid_literal/4 normalizes it before this predicate is consulted.
is_pubid_char(X) when X >= $a, X =< $z;
                      X >= $A, X =< $Z;
                      X >= $0, X =< $9 ->
    true;
is_pubid_char(X) ->
    lists:member(X, "-'()+,./:=?;!*#@$_%").
%%%%%%% [46] contentspec

%% scan_contentspec/2
%% Scans the content specification of an element declaration.
%% Returns {empty | any | ContentModel, Rest, S}.
scan_contentspec([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_contentspec(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_contentspec("EMPTY" ++ T, S0) ->
    ?bump_col(5),
    {empty, T, S};
scan_contentspec("ANY" ++ T, S0) ->
    ?bump_col(3),
    {any, T, S};
scan_contentspec("%" ++ _T, S=#xmerl_scanner{environment=prolog}) ->
    %% Parameter-entity references are rejected here while scanning the
    %% prolog (internal subset).
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_contentspec("%" ++ T, S0) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    ExpRef = expand_pe_reference(PERefName, S1,as_PE),
    {_,T2,S2} = strip(ExpRef ++ T1,S1),
    scan_contentspec(T2, S2);
scan_contentspec("(" ++ T, S0) ->
    ?bump_col(1),
    ?strip1,
    scan_elem_content(T1, S1);
scan_contentspec(_Str,S) ->
    ?fatal(unexpected_character,S).


%%%%%%% [47] children
%%%%%%% [51] Mixed

%% scan_elem_content/2
%% Entry point used right after the opening "(" of a content model.
scan_elem_content(T, S) ->
    scan_elem_content(T, S, _Context = children, _Mode = unknown, _Acc = []).

%% scan_elem_content/5
%% Context - children | mixed | not_mixed (not_mixed: nested parenthesis)
%% Mode    - unknown | seq | choice, fixed by the first "," or "|" seen
%% Acc     - content particles collected so far, reversed
%% Returns {ContentModel, Rest, S} when the closing ")" is reached.
scan_elem_content([], S=#xmerl_scanner{continuation_fun = F},
                  Context, Mode, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes,S1) -> scan_elem_content(MoreBytes,S1,Context,Mode,Acc) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_elem_content(")" ++ T, S0, Context, Mode0, Acc0) ->
    ?bump_col(1),
    {Mode, Acc} = case {Mode0, Acc0} of
                      {unknown, [_X]} ->
                          %% a single particle without separator is a
                          %% one-element sequence
                          {seq, Acc0};
                      {M, _L} when M == seq; M == choice ->
                          {Mode0, lists:reverse(Acc0)};
                      _ ->
                          %% Mode still unknown with zero or several
                          %% particles, e.g. "()" or "((a)(b))": malformed
                          %% content model. Report it as a scanner error
                          %% rather than failing with case_clause.
                          ?fatal({error,{invalid_content_model}}, S)
                  end,
    {Occurrence, T1, S1} = scan_occurrence(T, S),
    vc_No_Duplicate_Types(S,Context,Acc),
    case {Occurrence, Context,Acc} of
        {once, mixed,['#PCDATA']} -> ok; % It is not ok when there are
                                         % more names than '#PCDATA'
                                         % and no '*'.
        {'*', mixed,_} -> ok;
        {Other, mixed,_} ->
            ?fatal({illegal_for_mixed_content, Other}, S1);
        _ ->
            ok
    end,
    ?strip2,
    {format_elem_content({Occurrence, {Mode, Acc}}), T2, S2};
scan_elem_content("#PCDATA" ++ _T, S, not_mixed, _Mode, _Acc) ->
    ?fatal({error,{extra_set_of_parenthesis}},S);
scan_elem_content("#PCDATA" ++ _T, S, _Cont, Mode, Acc)
  when Mode==choice;Mode==seq;Acc/=[] ->
    %% #PCDATA is only accepted as the very first particle
    ?fatal({error,{invalid_format_of_mixed_content}},S);
scan_elem_content("#PCDATA" ++ T, S0, _Context, Mode, Acc) ->
    ?bump_col(7),
    ?strip1,
    scan_elem_content(T1, S1, mixed, Mode, ['#PCDATA'|Acc]);
scan_elem_content("," ++ _T, S, _Context, choice, _Acc) ->
    ?fatal({mixing_comma_and_vertical_bar_in_content_model},S);
scan_elem_content("," ++ T, S0, Context, _Mode, Acc) ->
    ?bump_col(1),
    ?strip1,
    scan_elem_content2(T1, S1, Context, seq, Acc);
scan_elem_content("|" ++ _T, S, _Context, seq, _Acc) ->
    ?fatal({mixing_comma_and_vertical_bar_in_content_model},S);
scan_elem_content("|" ++ T, S0, Context, _Mode, Acc) ->
    ?bump_col(1),
    ?strip1,
    scan_elem_content2(T1, S1, Context, choice, Acc);
scan_elem_content(T, S, Context, Mode, Acc) ->
    scan_elem_content2(T, S, Context, Mode, Acc).
%% scan_elem_content2/5
%% Scans one content particle (a nested group, a parameter-entity
%% reference or an element name) and continues with scan_elem_content/5.
scan_elem_content2("(" ++ _T, S, mixed, _Mode, _Acc) ->
    ?fatal({error,
            {element_names_must_not_be_parenthesized_in_mixed_content}},S);
scan_elem_content2("(" ++ T, S0, Context, Mode, Acc) ->
    %% Nested group: scan it as its own content model, then add the
    %% result to the enclosing accumulator.
    ?bump_col(1),
    ?strip1,
    {Inner, T2, S2} = scan_elem_content(T1, S1, not_mixed, unknown, []),
    scan_elem_content(T2, S2, Context, Mode, [Inner|Acc]);
scan_elem_content2("%" ++ _T,S=#xmerl_scanner{environment=prolog},_Context,_Mode,_Acc) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
scan_elem_content2("%" ++ T, S0, Context, Mode, Acc) ->
    ?bump_col(1),
    {RefName, AfterRef, SRef} = scan_pe_reference(T, S),
    Expansion = expand_pe_reference(RefName, SRef,as_PE),
    {_,Stripped,SStripped}=strip(Expansion++AfterRef,SRef),
    scan_elem_content(Stripped, SStripped, Context, Mode, Acc);
scan_elem_content2(T, S, Context, Mode, Acc) ->
    {Name, _NameStr, T1, S1} = scan_name(T, S),
    {Occurrence, T2, S2} = scan_occurrence(T1, S1),
    %% Inside mixed content a name may not carry an occurrence indicator.
    case Context of
        mixed when Occurrence =/= once ->
            ?fatal({illegal_for_mixed_content, Occurrence}, S1);
        _ ->
            ok
    end,
    ?strip3,
    mandatory_delimeter_wfc(T3,S3),
    Particle = format_elem_content({Occurrence, Name}),
    scan_elem_content(T3, S3, Context, Mode, [Particle | Acc]).


%% A particle occurring exactly once is represented by the particle
%% itself; any other occurrence keeps the {Occurrence, What} wrapper.
format_elem_content({once, What}) -> What;
format_elem_content(Other) -> Other.


%% scan_occurrence/2
%% Reads an optional occurrence indicator.
%% Returns {'?' | '+' | '*' | once, Rest, S}.
scan_occurrence([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_occurrence(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_occurrence("?" ++ T, S0) ->
    ?bump_col(1),
    {'?', T, S};
scan_occurrence("+" ++ T, S0) ->
    ?bump_col(1),
    {'+', T, S};
scan_occurrence("*" ++ T, S0) ->
    ?bump_col(1),
    {'*', T, S};
scan_occurrence(T, S) ->
    %% no indicator: nothing is consumed
    {once, T , S}.

%%% Tests of Validity Constraints


%% first part of VC: Name Token
%% Fatal error unless C is a name character.
vc_Valid_Char(_AT,C,S) ->
    case xmerl_lib:is_namechar(C) of
        true -> ok;
        _ -> ?fatal({error,{validity_constraint_Name_Token,C}},S)
    end.
%% VC: ID Attribute Default
%% Only checked when validating against a DTD: an attribute of type 'ID'
%% must have the default '#IMPLIED' or '#REQUIRED'.
vc_ID_Attribute_Default(_,#xmerl_scanner{validation=Valid})
  when Valid /= dtd ->
    ok;
vc_ID_Attribute_Default({_,'ID',_,Def,_},_S)
  when Def=='#IMPLIED';Def=='#REQUIRED' ->
    ok;
vc_ID_Attribute_Default({_,'ID',_,Def,_},S) ->
    ?fatal({error,{validity_constraint_error_ID_Attribute_Default,Def}},S).

%% VC: Enumeration
%% A string default value must be one of the names in the enumeration.
vc_Enumeration({_Name,{_,NameList},DefaultVal,_,_},S)
  when is_list(DefaultVal) ->
    case lists:member(list_to_atom(DefaultVal),NameList) of
        true ->
            ok;
        _ ->
            ?fatal({error,{vc_enumeration,list_to_atom(DefaultVal),NameList}},S)
    end;
vc_Enumeration({_Name,{_,_NameList},_DefaultVal,_,_},_S) ->
    ok.

%% VC: Entity Name
%% A string default value of an 'ENTITY' attribute (or every name in the
%% default value of an 'ENTITIES' attribute) must be stored by the rules
%% read function as an external entity with an ndata part.
vc_Entity_Name({_Name,'ENTITY',DefaultVal,_,_},S) when is_list(DefaultVal) ->
    Read = S#xmerl_scanner.rules_read_fun,
    case Read(entity,list_to_atom(DefaultVal),S) of
        {_,external,{_,{ndata,_}}} ->
            ok;
        _ -> ?fatal({error,{vc_Entity_Name,list_to_atom(DefaultVal)}},S)
    end;
vc_Entity_Name({_Name,'ENTITY',_,_,_},_S) ->
    ok;
vc_Entity_Name({_,'ENTITIES',DefaultVal,_,_},S) when is_list(DefaultVal) ->
    Read = S#xmerl_scanner.rules_read_fun,
    %% Names in an ENTITIES value are separated by whitespace; skip it so
    %% that scan_name/2 always starts on the first character of a name.
    SkipWS = fun(Str) ->
                     lists:dropwhile(fun(C) when ?whitespace(C) -> true;
                                        (_) -> false
                                     end, Str)
             end,
    NameListFun = fun([],Acc,_St,_Fun) ->
                          lists:reverse(Acc);
                     (Str,Acc,St,Fun) ->
                          %% scan_name/2 returns {Name, NsInfo, Rest, State}:
                          %% the remaining text is the third element and
                          %% the scanner state the fourth.
                          {N,_,Str2,St2} = scan_name(Str,St),
                          Fun(SkipWS(Str2),[N|Acc],St2,Fun)
                  end,
    NameList = NameListFun(SkipWS(DefaultVal),[],S,NameListFun),
    VcFun =
        fun(X) ->
                case Read(entity,X,S) of
                    {_,external,{_,{ndata,_}}} ->
                        ok;
                    _ -> ?fatal({error,{vc_Entity_Name,X}},S)
                end
        end,
    lists:foreach(VcFun,NameList);
vc_Entity_Name({_,'ENTITIES',_,_,_},_S) ->
    ok.

%% VC: No Duplicate Types
%% When validating against a DTD, the same name must not appear twice in
%% a mixed-content declaration.
vc_No_Duplicate_Types(#xmerl_scanner{validation=dtd} = S,mixed,Acc) ->
    CheckDupl =
        fun([H|T],F) ->
                case lists:member(H,T) of
                    true ->
                        ?fatal({no_duplicate_types_allowed,H},S);
                    _ -> F(T,F)
                end;
           ([],_) -> ok
        end,
    CheckDupl(Acc,CheckDupl);
vc_No_Duplicate_Types(_,_,_) ->
    ok.
%%% Tests of Well-Formedness Constraints


%% After a name in a content model the next character must be ",", "|",
%% ")" or "%" (a parameter reference is accepted). Anything else is a
%% fatal error.
mandatory_delimeter_wfc([C|_T],_S) when C == $,; C == $|; C == $); C == $% ->
    ok;
mandatory_delimeter_wfc(T,S) ->
    ?fatal({comma_or_vertical_bar_mandatory_between_names_in_content_model,T},S).


%% Fatal error when two attributes in the list carry the same name.
wfc_unique_att_spec([],_S) ->
    ok;
wfc_unique_att_spec([#xmlAttribute{name=AttName}|Others],S) ->
    case lists:keymember(AttName,#xmlAttribute.name,Others) of
        true ->
            ?fatal({error,{unique_att_spec_required,AttName}},S);
        _ ->
            wfc_unique_att_spec(Others,S)
    end.

%% wfc_legal_char/2
%% Given a list, decodes its first character according to the scanner's
%% encoding and returns {Ch, Rest}; given a single character returns
%% {Ch, []}. Fatal error unless xmerl_lib:is_char/1 accepts the character.
wfc_legal_char(Chars,S) when is_list(Chars)->
    {Decoded,Remaining} = to_ucs(S#xmerl_scanner.encoding,Chars),
    case xmerl_lib:is_char(Decoded) of
        true ->
            {Decoded,Remaining};
        _ ->
            ?fatal({error,{wfc_Legal_Character,Decoded}},S)
    end;
wfc_legal_char(Ch,S) ->
    case xmerl_lib:is_char(Ch) of
        true ->
            {Ch,[]};
        _ ->
            ?fatal({error,{wfc_Legal_Character,Ch}},S)
    end.


%% The character following an attribute value must be whitespace, "/"
%% or ">".
wfc_whitespace_betw_attrs(WS,_S) when ?whitespace(WS) ->
    ok;
wfc_whitespace_betw_attrs(C,_S) when C == $/; C == $> ->
    ok;
wfc_whitespace_betw_attrs(_,S) ->
    ?fatal({whitespace_required_between_attributes},S).

%% WFC: Entity Declared
%% Referring to an entity from an external environment is a fatal error
%% in a standalone document; every other combination is accepted.
wfc_Entity_Declared({external,_},S=#xmerl_scanner{standalone=yes},Name) ->
    ?fatal({reference_to_externally_defed_entity_standalone_doc,Name},S);
wfc_Entity_Declared(_Env,_S,_) ->
    ok.

%% For an internal entity, scans the replacement text with scan_content/8
%% to check that it matches production content; the result of that scan
%% is returned. Other entity kinds yield ok.
wfc_Internal_parsed_entity(internal,Value,S) ->
    EntityState = S#xmerl_scanner{environment=internal_parsed_entity},
    scan_content(Value,EntityState,
                 _Name=[],[],S#xmerl_scanner.space,_Lang=[],_Prnt=[],
                 #xmlNamespace{});
wfc_Internal_parsed_entity(_,_,_) ->
    ok.
%% VC: Element Valid
%% Skipped while scanning an internal parsed entity. When validating
%% against a DTD the element must have a declaration: a missing entry or
%% one marked undeclared is a fatal error.
vc_Element_valid(_Name,#xmerl_scanner{environment=internal_parsed_entity}) ->
    ok;
vc_Element_valid(Name,S=#xmerl_scanner{rules_read_fun=Read,
                                       validation=dtd}) ->
    Missing = {error,{error_missing_element_declaration_in_DTD,Name}},
    case Read(elem_def,Name,S) of
        #xmlElement{elementdef=undeclared} ->
            ?fatal(Missing,S);
        undefined ->
            ?fatal(Missing,S);
        _ ->
            ok
    end;
vc_Element_valid(_,_) ->
    ok.

%%%%%%% [74] PEDef


%% scan_pe_def/3
%% A parameter-entity definition is either a quoted entity value or an
%% external identifier.
scan_pe_def([], S=#xmerl_scanner{continuation_fun = F}, PEName) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_pe_def(MoreBytes, S1, PEName) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_pe_def([Quote|T], S0, PEName) when Quote == $'; Quote == $" ->
    ?bump_col(1),
    scan_entity_value(T, S, Quote, PEName,parameter);
scan_pe_def(Str, S, _PEName) ->
    scan_external_id(Str, S).


%%%%%%% [82] NotationDecl

%% scan_notation_decl/2
%% Scans "Name <whitespace> Definition <optional whitespace> >" and
%% stores the definition through the rules write function.
%% Returns {Rest, S}.
scan_notation_decl(T, #xmerl_scanner{rules_write_fun = Write,
                                     rules_read_fun=Read,
                                     rules_delete_fun=Delete} = S) ->
    {Name, _NameStr, T1, S1} = scan_name_no_colons(T, S),
    {_,T2,S2} = mandatory_strip(T1,S1),
    {Def, T3, S3} = scan_notation_decl1(T2, S2),
    ?strip4,
    T5 = scan_mandatory(">",T4,1,S4,expected_end_tag_notation_declaration),
    %% An entry stored as 'undeclared' is removed first so that the write
    %% below stores the real definition.
    case Read(notation,Name,S) of
        undeclared -> Delete(notation,Name,S4);
        _ -> ok
    end,
    S5 = Write(notation, Name, Def, S4),
    {T5, S5}.
%% scan_notation_decl1/2
%% Scans the definition part of a notation declaration.
%% Returns {{system, SL} | {public, PIDL} | {public, PIDL, SL}, Rest, S}.
scan_notation_decl1([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_notation_decl1(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_notation_decl1("SYSTEM" ++ T, S0) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {SL, T2, S2} = scan_system_literal(T1, S1),
    {{system, SL}, T2, S2};
scan_notation_decl1("PUBLIC" ++ T, S0) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {PIDL, T2, S2} = scan_pubid_literal(T1, S1),
    ?strip3,
    case T3 of
        ">" ++ _ ->
            %% Public identifier only. The ">" is left in the input for
            %% the caller; only the column is advanced here.
            {{public, PIDL}, T3,
             S3#xmerl_scanner{col = S3#xmerl_scanner.col+1}};
        _ ->
            {SL, T4, S4} = scan_system_literal(T3, S3),
            {{public, PIDL, SL}, T4, S4}
    end;
scan_notation_decl1(_Str, S) ->
    %% Neither SYSTEM nor PUBLIC: malformed declaration. Report it as a
    %% scanner error instead of failing with function_clause.
    ?fatal(expected_system_or_public_identifier, S).

%%%%%%% [75] ExternalID

%% scan_external_id/2
%% Returns {{system, SL} | {public, PIDL, SL}, Rest, S}.
scan_external_id([], S=#xmerl_scanner{continuation_fun = F}) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_external_id(MoreBytes, S1) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_external_id("SYSTEM" ++ T, S0) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {SL, T2, S2} = scan_system_literal(T1, S1),
    {{system, SL}, T2, S2};
scan_external_id("PUBLIC" ++ T, S0) ->
    ?bump_col(6),
    {_,T1,S1} = mandatory_strip(T,S),
    {PIDL, T2, S2} = scan_pubid_literal(T1, S1),
    {_,T3,S3} = mandatory_strip(T2,S2),
    {SL, T4, S4} = scan_system_literal(T3, S3),
    {{public, PIDL, SL}, T4, S4};
scan_external_id(_Str, S) ->
    %% scan_pe_def/3 passes any unquoted input here, so arbitrary text can
    %% arrive; report it as a scanner error instead of function_clause.
    ?fatal(expected_system_or_public_identifier, S).


%%%%%%% [9] EntityValue

%% Note that we have two different scan functions for EntityValue
%% They differ in that this one checks for recursive calls to the same
%% parameter entity.

%% scan_entity_value/5
%% Starts scanning with an empty accumulator and an empty PE nesting stack.
scan_entity_value(Str, S, Delim, Name, Namespace) ->
    scan_entity_value(Str, S, Delim, _Acc = [], Name, Namespace,[]).
%% scan_entity_value/7
%% Scans an entity value up to the closing Delim (or to the end of the
%% text when Delim is no_delim).
%%   Acc       - collected characters/strings, reversed
%%   PEName    - name of the entity being defined (used to detect
%%               recursion and in error terms)
%%   Namespace - 'parameter' for parameter entities; the markup
%%               delimiter clauses below only apply in that case
%%   PENesting - stack of open markup delimiters, maintained with
%%               pe_push/3 and pe_pop/3 for the PE nesting constraint
%% Returns {Value, Rest, S}. In some nesting-violation cases Value is an
%% {error, _} tuple instead of a string.
%%
%% End of input inside an external entity: fine when no delimiter is open
scan_entity_value([], S=#xmerl_scanner{environment={external,{entity,_}}},
                  _Delim,Acc,_,_,[]) ->
    {lists:flatten(lists:reverse(Acc)), [], S};
%% ... but a nesting violation when validating and delimiters are open
scan_entity_value([], S=#xmerl_scanner{environment={external,{entity,_}},
                                       validation=dtd},
                  _Delim,_Acc,PEName,_,_) ->
    {{error,{failed_VC_Proper_Declaration_PE_Nesting,1,PEName}},[],S};
%% End of an "included in literal" text (no_delim): same two cases
scan_entity_value([],S,
                  no_delim,Acc,_,_,[]) ->
    {lists:flatten(lists:reverse(Acc)),[],S};
scan_entity_value([],S=#xmerl_scanner{validation=dtd},
                  no_delim,_Acc,PEName,_,_PENesting) ->
    {{error,{failed_VC_Proper_Declaration_PE_Nesting,2,PEName}},[],S};
%% Otherwise the buffer is exhausted: ask the continuation for more input
scan_entity_value([], S=#xmerl_scanner{continuation_fun = F},
                  Delim, Acc, PEName,Namespace,PENesting) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) ->
              scan_entity_value(MoreBytes,S1,
                                Delim,Acc,PEName,Namespace,PENesting)
      end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
%% Closing delimiter while markup delimiters are still open (validating)
scan_entity_value([Delim|T], S=#xmerl_scanner{validation=dtd},
                  Delim,_Acc,PEName,_NS,PENesting) when length(PENesting) /= 0 ->
    {{error,{failed_VC_Proper_Declaration_PE_Nesting,3,PEName}},T,S};
%% Closing delimiter: the value is complete
scan_entity_value([Delim|T], S0,
                  Delim, Acc, _PEName,_NS,_PENesting) ->
    ?bump_col(1),
    {lists:flatten(lists:reverse(Acc)), T, S};
scan_entity_value("%" ++ _T,S=#xmerl_scanner{environment=prolog},_,_,_,_,_) ->
    ?fatal({error,{wfc_PEs_In_Internal_Subset}},S);
% %% This is a PEdecl in an external entity
% scan_entity_value([$%,WS|T], S0, Delim, Acc, PEName,Namespace,PENesting)
%   when ?whitespace(WS) ->
%     ?bump_col(2),
%     scan_entity_value(T, S, Delim, [WS,$%|Acc], PEName,Namespace,PENesting);
%% Parameter-entity reference inside the value
scan_entity_value("%" ++ T, S0, Delim, Acc, PEName,Namespace,PENesting) ->
    ?bump_col(1),
    {PERefName, T1, S1} = scan_pe_reference(T, S),
    if PERefName == PEName,Namespace==parameter ->
            %% a parameter entity referring to itself
            ?fatal({illegal_recursion_in_PE, PEName}, S1);
       true ->
            {ExpandedRef,S2} =
                case expand_pe_reference(PERefName, S1, in_literal) of
                    %% Strictly the PE reference should be expanded as_PE,
                    %% with whitespace handled explicitly in this case.
                    Tuple when is_tuple(Tuple) ->
                        %% {system,URI} or {public,URI}
                        %% Included in literal.
                        {ExpRef,Sx}=fetch_not_parse(Tuple,S1),
                        {EntV,_,_S2} =
                            scan_entity_value(ExpRef, Sx, no_delim,[],
                                              PERefName,parameter,[]),
                        %% should do an update Write(parameter_entity)
                        %% so next expand_pe_reference is faster
                        {EntV,_S2};
                    ExpRef ->
                        {ExpRef,S1}
                end,
            %% single or double quotes are not treated as delimiters
            %% in passages "included in literal"
            S3 = S2#xmerl_scanner{col=S2#xmerl_scanner.col+1},
            {Acc2,_,S4} = scan_entity_value(ExpandedRef,S3,no_delim,Acc,
                                            PEName,Namespace,[]),
%           {_,T2,S5} = strip(" "++T1,S4),
            %% Line and column are restored to their values from before
            %% the replacement text was scanned. Acc2 comes back in
            %% forward order and is reversed to serve as accumulator again.
            scan_entity_value(T1,S4#xmerl_scanner{line=S3#xmerl_scanner.line,
                                                  col=S3#xmerl_scanner.col},
                              Delim,lists:reverse(Acc2),
                              PEName,Namespace,PENesting)
%           scan_entity_value(T1,S4,Delim,lists:reverse(Acc2),
%                             PEName,Namespace,PENesting)
    end;
scan_entity_value("&" ++ T, S0, Delim, Acc, PEName,Namespace,PENesting) ->
    %% This is either a character entity or a general entity (internal
    %% or external) reference. An internal general entity shall not be
    %% expanded in an entity def XML1.0 section 4.5.
    ?bump_col(1),
    case T of
        "#"++_T ->
            {ExpRef, T1, S1} = scan_reference(T, S),
            Tok = pe_nesting_token(ExpRef++T1,Namespace,S1#xmerl_scanner.validation),
            case markup_delimeter(ExpRef) of
                true ->
                    %% the expansion is kept as data and not rescanned
                    scan_entity_value(T1, S1, Delim, [ExpRef|Acc], PEName,
                                      Namespace,pe_push(Tok,PENesting,S1));
                _ ->
                    %% the expansion is converted back to the document
                    %% encoding and pushed in front of the input
                    ExpRef2 = string_to_char_set(S#xmerl_scanner.encoding,ExpRef),
                    scan_entity_value(ExpRef2 ++ T1, S1, Delim, Acc, PEName,
                                      Namespace,pe_push(Tok,PENesting,S1))
            end;
        _ -> %% General Entity is bypassed, though must check for
             %% recursion: save referenced name now and check for
             %% recursive reference after the whole entity definition is
             %% completed.
            {Name, _NamespaceInfo, T1, S1} = scan_name(T,S),
            T2=scan_mandatory(";",T1,1,S1,expected_entity_reference_semicolon),
            S2=save_refed_entity_name(Name,PEName,S1),
            scan_entity_value(T2,S2,Delim,[";",atom_to_list(Name),"&"|Acc],PEName,Namespace,PENesting)
    end;
%% The following clauses are for the PE Nesting validity constraint.
%% Longer delimiters are matched before their shorter prefixes.
%% Start delimiter for ConditionalSection
scan_entity_value("<!["++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
    ?bump_col(3),
    scan_entity_value(T,S,Delim,["<!["|Acc],PEName,NS,
                      pe_push("<![",PENesting,S));
%% Start delimiter for ConditionalSection (2)
scan_entity_value("["++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
    ?bump_col(1),
    scan_entity_value(T,S,Delim,["["|Acc],PEName,NS,
                      pe_push("[",PENesting,S));
%% Start delimiter for comment
scan_entity_value("<!--"++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
    ?bump_col(4),
    scan_entity_value(T,S,Delim,["<!--"|Acc],PEName,NS,
                      pe_push("<!--",PENesting,S));
%% Start delimiter for ElementDecl, AttListDecl,EntityDecl,NotationDecl
scan_entity_value("<!"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(2),
    scan_entity_value(T,S,Delim,["<!"|Acc],PEName,NS,
                      pe_push("<!",PENesting,S));
%% Start delimiter for PI
scan_entity_value("<?"++T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(2),
    scan_entity_value(T,S,Delim,["<?"|Acc],PEName,NS,
                      pe_push("<?",PENesting,S));
%% Start delimiter for elements that matches the proper stop delimiter
%% for a markupdecl
scan_entity_value("</"++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
    ?bump_col(2),
    scan_entity_value(T,S,Delim,["</"|Acc],PEName,NS,
                      pe_push("</",PENesting,S));
scan_entity_value("<"++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
    ?bump_col(1),
    scan_entity_value(T,S,Delim,["<"|Acc],PEName,NS,
                      pe_push("<",PENesting,S));
%% Delimiter for contentspecs
scan_entity_value("("++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)->
    ?bump_col(1),
    scan_entity_value(T,S,Delim,["("|Acc],PEName,NS,
                      pe_push("(",PENesting,S));
%% Stop delimiter for ElementDecl, AttListDecl,EntityDecl,NotationDecl
scan_entity_value(">"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(1),
    scan_entity_value(T,S,Delim,[">"|Acc],PEName,NS,
                      pe_pop(">",PENesting,S));
%% Stop delimiter for PI
scan_entity_value("?>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(2),
    scan_entity_value(T,S,Delim,["?>"|Acc],PEName,NS,
                      pe_pop("?>",PENesting,S));
%% Stop delimiter for comment
scan_entity_value("-->"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(3),
    scan_entity_value(T,S,Delim,["-->"|Acc],PEName,NS,
                      pe_pop("-->",PENesting,S));
%% Stop delimiter for ConditionalSection
scan_entity_value("]]>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(3),
    scan_entity_value(T,S,Delim,["]]>"|Acc],PEName,NS,
                      pe_pop("]]>",PENesting,S));
%% Stop delimiter added to match a content start delimiter included
scan_entity_value("/>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(2),
    scan_entity_value(T,S,Delim,["/>"|Acc],PEName,NS,
                      pe_pop("/>",PENesting,S));
scan_entity_value(")"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) ->
    ?bump_col(1),
    scan_entity_value(T,S,Delim,[")"|Acc],PEName,NS,
                      pe_pop(")",PENesting,S));
%% Newline: advance the line counter (the column is left untouched here)
scan_entity_value("\n"++T, S, Delim, Acc, PEName,Namespace,PENesting) ->
    scan_entity_value(T, S#xmerl_scanner{line=S#xmerl_scanner.line+1},
                      Delim, ["\n"|Acc], PEName,Namespace,PENesting);
%% Any other character: decode it and require a legal XML character
scan_entity_value(Str, S0, Delim, Acc, PEName,Namespace,PENesting) ->
    {Ch,T} = to_ucs(S0#xmerl_scanner.encoding,Str),
    case xmerl_lib:is_char(Ch) of
        true ->
            ?bump_col(1),
            scan_entity_value(T, S, Delim, [Ch|Acc], PEName,Namespace,PENesting);
        false ->
            ?fatal({unexpected_char,Ch}, S0)
    end.
%% save_refed_entity_name/3
%% Records that the definition of entity PEName refers to general entity
%% Name, so that recursion can be checked once the definition is
%% complete. References to the predefined entities are not recorded.
%% Returns the (possibly updated) scanner state.
save_refed_entity_name(Name,PEName,S) ->
    case predefined_entity(Name) of
        true ->
            S;
        _ ->
            save_refed_entity_name1(Name,PEName,S)
    end.

%% Adds Name to the list of references kept for PEName in the scanner's
%% entity_references field, creating the entry if needed and avoiding
%% duplicates.
save_refed_entity_name1(Name,PEName,
                        S=#xmerl_scanner{entity_references=ERefs}) ->
    case lists:keysearch(PEName,1,ERefs) of
        {value,{_,Refs}} ->
            NewRefs =
                case lists:member(Name,Refs) of
                    true ->Refs;
                    _ -> [Name|Refs]
                end,
            S#xmerl_scanner{entity_references=lists:keyreplace(PEName,1,ERefs,
                                                               {PEName,NewRefs})
                           };
        _ ->
            S#xmerl_scanner{entity_references=[{PEName,[Name]}|ERefs]}
    end.



%% pe_push/3
%% Pushes a markup delimiter on the PE nesting stack. Start delimiters
%% are always pushed; stop delimiters only when validating against a DTD.
%% Any other token (e.g. false from pe_nesting_token/3) leaves the stack
%% unchanged.
pe_push(Tok,Stack,_S) when Tok=="<!";Tok=="<?";Tok=="<!--";Tok=="<![";
                           Tok=="[";Tok=="<";Tok=="</";Tok=="(" ->
    [Tok|Stack];
pe_push(Tok,Stack,#xmerl_scanner{validation=dtd})
  when Tok==")";Tok==">";Tok=="?>";Tok=="]]>";Tok=="-->";Tok=="/>"->
    [Tok|Stack];
pe_push(_,Stack,_S) ->
    Stack.

%% pe_pop/3
%% Pops the start delimiter(s) matching a stop delimiter. A mismatch is a
%% fatal error when validating against a DTD and ignored otherwise.
pe_pop(">",["<!"|Rest],_S) -> Rest;
pe_pop("?>",["<?"|Rest],_S) -> Rest;
pe_pop("-->",["<!--"|Rest],_S) -> Rest;
pe_pop("]]>",["[","<!["|Rest],_S) -> Rest;
pe_pop("/>",["<"|Rest],_S) -> Rest;
pe_pop(">",["<"|Rest],_S) -> Rest;
pe_pop(">",["</"|Rest],_S) -> Rest;
pe_pop(")",["("|Rest],_S) -> Rest;
pe_pop(Token,_Stack,S=#xmerl_scanner{validation=dtd}) ->
    ?fatal({error,{failed_VC_Proper_Declaration_PE_Nesting,5,Token}},S);
pe_pop(_,Rest,_) ->
    Rest.

%% pe_nesting_token/3
%% Returns the markup delimiter the text starts with, or false. Only
%% parameter entities under DTD validation yield a token.
%% The longer start delimiters "<!--" and "<![" must be tried before
%% their common prefix "<!": with "<!" first those two clauses could
%% never match and comments/conditional sections were reported as "<!".
pe_nesting_token("<!--"++_T,parameter,dtd) -> "<!--";
pe_nesting_token("<!["++_T,parameter,dtd) -> "<![";
pe_nesting_token("<!"++_T,parameter,dtd) -> "<!";
pe_nesting_token("<?"++_T,parameter,dtd) -> "<?";
pe_nesting_token("["++_T,parameter,dtd) -> "[";
pe_nesting_token("("++_T,parameter,dtd) -> "(";
pe_nesting_token(">"++_T,parameter,dtd) -> ">";
pe_nesting_token("?>"++_T,parameter,dtd) -> "?>";
pe_nesting_token("-->"++_T,parameter,dtd) -> "-->";
pe_nesting_token("]]>"++_T,parameter,dtd) -> "]]>";
pe_nesting_token(")"++_T,parameter,dtd) -> ")";
pe_nesting_token("/>"++_T,parameter,dtd) -> "/>";
pe_nesting_token(_,_,_) -> false.
%% True for the five predefined entity names.
predefined_entity(amp) -> true;
predefined_entity(lt) -> true;
predefined_entity(gt) -> true;
predefined_entity(apos) -> true;
predefined_entity(quot) -> true;
predefined_entity(_) -> false.

%% check_entity_recursion/2
%% Builds a digraph from the recorded entity references and raises a
%% fatal error if it contains a cycle. Returns ok otherwise.
check_entity_recursion(EName,
                       S=#xmerl_scanner{entity_references=EntityRefList}) ->
    Set = sofs:family(EntityRefList),
    case catch sofs:family_to_digraph(Set, [acyclic]) of
        {'EXIT',{cyclic,_}} ->
            ?fatal({illegal_recursion_in_Entity, EName}, S);
        DG ->
            %% the graph was only needed for the cycle check
            digraph:delete(DG),
            ok
    end.




%%%%%%% [15] Comment
%% scan_comment/2
%% Scans a comment whose opening delimiter has already been consumed.
scan_comment(Str, S) ->
    scan_comment(Str, S, _Pos = undefined, _Parents = [], _Lang = []).

%% scan_comment/5
%% Emits the 'started' event for the comment and scans its text.
scan_comment(Str,S=#xmerl_scanner{col=C,event_fun=Event}, Pos, Parents, Lang) ->
    Comment = #xmlComment{pos = Pos,
                          parents = Parents,
                          language = Lang,
                          value = undefined},
    S1 = #xmerl_scanner{} = Event(#xmerl_event{event = started,
                                               line = S#xmerl_scanner.line,
                                               col = C,
                                               pos = Pos,
                                               data = Comment}, S),

    scan_comment1(Str, S1, Pos, Comment, _Acc = []).

%% scan_comment1/5
%% Accumulates comment text up to "-->", normalizing line ends to "\n".
%% Returns {Ret, Rest, S} where Ret is the first element of the hook
%% function's result.
scan_comment1([], S=#xmerl_scanner{continuation_fun = F},
              Pos, Comment, Acc) ->
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> scan_comment1(MoreBytes, S1, Pos, Comment, Acc) end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_comment1("-->" ++ T, S0 = #xmerl_scanner{col = C,
                                              event_fun = Event,
                                              hook_fun = Hook},
              _Pos, Comment, Acc) ->
    ?bump_col(3),
    Comment1 = Comment#xmlComment{value = lists:reverse(Acc)},
    S1=#xmerl_scanner{}=Event(#xmerl_event{event = ended,
                                           line=S#xmerl_scanner.line,
                                           col = C,
                                           data = Comment1}, S),
    {Ret, S2} = Hook(Comment1, S1),
    {_,T3,S3}=strip(T,S2),
    {Ret,T3,S3};
scan_comment1("--", S=#xmerl_scanner{continuation_fun = F},
              Pos, Comment, Acc) ->
    %% The buffer ends right after "--": whether this is the terminator
    %% "-->" or an illegal "--" cannot be decided yet. Ask for more input
    %% and rescan with the two dashes put back in front; the clause below
    %% would otherwise call hd/1 on an empty list.
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) ->
              scan_comment1("--" ++ MoreBytes, S1, Pos, Comment, Acc)
      end,
      fun(S1) -> ?fatal(unexpected_end, S1) end,
      S);
scan_comment1("--"++T,S,_Pos,_Comment,_Acc) ->
    %% "--" is not allowed inside a comment
    ?fatal({invalid_comment,"--"++[hd(T)]}, S);
scan_comment1("\n" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) ->
    scan_comment1(T, S#xmerl_scanner{line=L+1,col=1},Pos, Cmt, "\n" ++ Acc);
scan_comment1("\r\n" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) ->
    %% CR followed by LF is read as a single LF
    scan_comment1(T, S#xmerl_scanner{line=L+1,col=1}, Pos, Cmt, "\n" ++ Acc);
scan_comment1("\r" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) ->
    %% CR not followed by LF is read as a LF
    scan_comment1(T, S#xmerl_scanner{line=L+1,col=1}, Pos, Cmt, "\n" ++ Acc);
scan_comment1(Str, S=#xmerl_scanner{col = C}, Pos, Cmt, Acc) ->
    {Ch,T} = wfc_legal_char(Str,S),
    scan_comment1(T, S#xmerl_scanner{col=C+1}, Pos, Cmt, [Ch|Acc]).

%%%%%%%

%% scan_markup_completion_gt/2
%% Expects ">" (left unconsumed) at the head of the input, expanding any
%% parameter-entity references found before it. Returns {Rest, S}.
scan_markup_completion_gt([$>|_R]=T,S) ->
    {T,S};
scan_markup_completion_gt([$%|T],S0) ->
    ?bump_col(1),
    {Name,T1,S1} = scan_pe_reference(T,S),
    ExpandedRef = expand_pe_reference(Name,S1,as_PE),
    {_,T2,S2} = strip(ExpandedRef++T1,S1),
    scan_markup_completion_gt(T2,S2);
scan_markup_completion_gt(T,S) ->
    ?fatal({error,{malformed_syntax_entity_completion,T}},S).


%% scan_mandatory/5
%% Requires T to start with Pattern and returns T with its first N
%% characters removed; otherwise raises the fatal error ErrorMsg.
scan_mandatory(Pattern,T,N,S,ErrorMsg) ->
    case lists:prefix(Pattern,T) of
        true ->
            lists:nthtail(N,T);
        _ ->
            ?fatal(ErrorMsg,S)
    end.


%% strip/2
%% Skips leading whitespace, updating line and column.
%% Returns {[], Rest, S}.
strip(Str,S) ->
    strip(Str,S,all).

%% strip/3
%% Lim is 'all' or 'no_tab'; with no_tab a tab character is a fatal error.
strip([], S=#xmerl_scanner{continuation_fun = F},Lim) ->
    ?dbg("cont()... stripping whitespace~n", []),
    %% Resume with the same limit, so that a no_tab restriction also
    %% applies to input delivered by the continuation.
    F(fun(MoreBytes, S1) -> strip(MoreBytes, S1, Lim) end,
      fun(S1) -> {[], [], S1} end,
      S);
strip("\s" ++ T, S=#xmerl_scanner{col = C},Lim) ->
    strip(T, S#xmerl_scanner{col = C+1},Lim);
strip("\t" ++ _T, S ,no_tab) ->
    ?fatal({error,{no_tab_allowed}},S);
strip("\t" ++ T, S=#xmerl_scanner{col = C},Lim) ->
    strip(T, S#xmerl_scanner{col = expand_tab(C)},Lim);
strip("\n" ++ T, S=#xmerl_scanner{line = L},Lim) ->
    strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim);
strip("\r\n" ++ T, S=#xmerl_scanner{line = L},Lim) ->
    %% CR followed by LF is read as a single LF
    strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim);
strip("\r" ++ T, S=#xmerl_scanner{line = L},Lim) ->
    %% CR not followed by LF is read as a LF
    strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim);
strip(Str, S,_Lim) ->
    {[], Str, S}.
%% mandatory_strip/2
%% Requires whitespace at the head of the input and strips it. A "%" that
%% starts a parameter-entity reference is accepted as well (it expands
%% with whitespace on each side) and is left in the input; "%" followed
%% by whitespace is a PE declaration, not a reference, and is rejected.
%% Returns {[], Rest, S}.
mandatory_strip([],S) ->
    ?fatal({error,{whitespace_was_expected}},S);
mandatory_strip([WS|_]=T,S) when ?whitespace(WS) ->
    strip(T,S,all);
mandatory_strip([$%,WS|_],S) when ?whitespace(WS) ->
    ?fatal({error,{whitespace_was_expected}},S);
mandatory_strip([$%|_]=T,S) ->
    {[],T,S};
mandatory_strip(_T,S) ->
    ?fatal({error,{whitespace_was_expected}},S).

%% strip but don't accept tab
pub_id_strip(Str, S) ->
    strip(Str,S,no_tab).


%% normalize/3
%% Skips leading whitespace, including whitespace written as a
%% reference. Returns {Rest, S, IsNorm}, where IsNorm becomes true as
%% soon as stripping changed the input or the state.
normalize("&"++T,S,IsNorm) ->
    case scan_reference(T, S) of
        {ExpRef, T1, S1} when ?whitespace(hd(ExpRef)) ->
            %% the reference expands to whitespace: put it back in the
            %% document encoding and normalize again
            Encoded = string_to_char_set(S#xmerl_scanner.encoding,ExpRef),
            normalize(Encoded++T1,S1,IsNorm);
        _ ->
            {"&"++T,S,IsNorm}
    end;
normalize(T,S,IsNorm) ->
    case strip(T,S) of
        {_,T,S} ->
            %% nothing was stripped
            {T,S,IsNorm};
        {_,T1,S1} ->
            {T1,S1,true}
    end.


%% Optimization:
%% - avoid building list of spaces or tabs;
%% - avoid reverse;
%% - compact two common indentation patterns.
%% Note: only to be called when a \n was found.
%% The third argument is the text to fall back on with the general
%% accumulate_whitespace/3.
fast_accumulate_whitespace([$\s|T], S, _) ->
    fast_acc_spaces(T, S, 1);
fast_accumulate_whitespace([$\t|T], S, _) ->
    fast_acc_tabs(T, S, 1);
fast_accumulate_whitespace([$<|_]=R, S, _T) ->
    %% a tag follows directly: use the shared term stored as third
    %% element of common_data
    #xmerl_scanner{common_data = CD, line = Line} = S,
    {done, {element(3, CD), R, S#xmerl_scanner{col = 1, line = Line + 1}}};
fast_accumulate_whitespace(_, S, T) ->
    accumulate_whitespace(T, S, []).

%% Counts a run of spaces; N is the number seen so far.
fast_acc_spaces([$\s|T], S, N) ->
    fast_acc_spaces(T, S, N + 1);
fast_acc_spaces(T, S, N) ->
    fast_acc_end(T, S, N, N, $\s, 1).

%% Counts a run of tabs; N is the number seen so far.
fast_acc_tabs([$\t|T], S, N) ->
    fast_acc_tabs(T, S, N + 1);
fast_acc_tabs(T, S, N) ->
    fast_acc_end(T, S, N, N * 8 + 1, $\t, 2).
%% fast_acc_end/6
%% Finishes the fast path after N characters C (space or tab) have been
%% counted. If the next character is "<" and element N of element CD_I of
%% common_data exists, that term is returned as {done, {Ws, T, S}};
%% any failure in that attempt falls back to the general
%% accumulate_whitespace/5 with the counted characters as accumulator.
fast_acc_end(T, S, N, Col, C, CD_I) ->
    CD = S#xmerl_scanner.common_data,
    Line = S#xmerl_scanner.line + 1,
    try
        $< = hd(T),
        Shared = element(N, element(CD_I, CD)),
        {done,{Shared, T, S#xmerl_scanner{col = Col, line = Line}}}
    catch _:_ ->
            %% deliberate catch-all: not followed by "<", or no
            %% precomputed term for this indentation
            accumulate_whitespace(T, S, Line, Col, lists:duplicate(N, C)++"\n")
    end.


%%% @spec accumulate_whitespace(T::string(),S::global_state(),
%%%                             atom(),Acc::string()) -> {Acc, T1, S1}
%%%
%%% @doc Function to accumulate and normalize whitespace.
accumulate_whitespace(T, S, preserve, Acc) ->
    accumulate_whitespace(T, S, Acc);
accumulate_whitespace(T, S, normalize, Acc) ->
    %% the whitespace itself is dropped and replaced by a single space
    {_WsAcc, Rest, SRest} = accumulate_whitespace(T, S, []),
    {[$\s|Acc], Rest, SRest}.

%% accumulate_whitespace/3
%% Starts from the line and column stored in the scanner state.
accumulate_whitespace(T, S, Acc) ->
    accumulate_whitespace(T, S, S#xmerl_scanner.line, S#xmerl_scanner.col, Acc).

%% accumulate_whitespace/5
%% Pushes leading whitespace onto Acc (line ends as "\n") while tracking
%% line and column; these are written back into the state at the end.
%% Returns {Acc, Rest, S}.
accumulate_whitespace([], S0, Line, Col, Acc) ->
    #xmerl_scanner{continuation_fun = F} = S0,
    S = S0#xmerl_scanner{line = Line, col = Col},
    ?dbg("cont()...~n", []),
    F(fun(MoreBytes, S1) -> accumulate_whitespace(MoreBytes, S1, Acc) end,
      fun(S1) -> {Acc, [], S1} end,
      S);
accumulate_whitespace([$\s|T], S, Line, Col, Acc) ->
    accumulate_whitespace(T, S, Line, Col+1, [$\s|Acc]);
accumulate_whitespace([$\t|T], S, Line, Col, Acc) ->
    accumulate_whitespace(T, S, Line, expand_tab(Col), [$\t|Acc]);
accumulate_whitespace([$\n|T], S, Line, _Col, Acc) ->
    accumulate_whitespace(T, S, Line+1, 1, [$\n|Acc]);
accumulate_whitespace([$\r,$\n|T], S, Line, _Col, Acc) ->
    %% CR followed by LF is read as a single LF
    accumulate_whitespace(T, S, Line+1, 1, [$\n|Acc]);
accumulate_whitespace([$\r|T], S, Line, _Col, Acc) ->
    %% CR not followed by LF is read as a LF
    accumulate_whitespace(T, S, Line+1, 1, [$\n|Acc]);
accumulate_whitespace(Str, S, Line, Col, Acc) ->
    {Acc, Str, S#xmerl_scanner{line = Line, col = Col}}.

%% Column reached after a tab at column Col: the next column of the form
%% 8*K + 1 that is greater than Col.
expand_tab(Col) ->
    Col + 8 - ((Col-1) rem 8).
%% validation_mode(Validation)
%% Validation = off | dtd | schema | true | false
%% true and false are obsolete; they map to dtd and off respectively.
validation_mode(true) ->
    dtd;
validation_mode(false) ->
    off;
validation_mode(Other) ->
    Other.


%% schemaLocations/2
%% Uses the scanner's schemaLocation setting when it is a list starting
%% with a pair; otherwise the locations are read from the element.
schemaLocations(_El,#xmerl_scanner{schemaLocation=[{_,_}|_]=SL}) ->
    {ok,SL};
schemaLocations(El,#xmerl_scanner{}) ->
    schemaLocations(El).

%% schemaLocations/1
%% Finds the element's first schemaLocation attribute and splits its
%% value at spaces into {Namespace, Location} pairs.
%% Returns {ok, Pairs} or {error, Reason}.
schemaLocations(#xmlElement{attributes=Atts,xmlbase=_Base}) ->
    NotSchemaLocation =
        fun(#xmlAttribute{name=schemaLocation}) -> false;
           (#xmlAttribute{namespace={_,"schemaLocation"}}) -> false;
           (_) -> true
        end,
    %% pairs up consecutive tokens; only called with an even number
    Pair = fun([],Pairs,_Self) ->
                   lists:reverse(Pairs);
              ([SLNS,SLLoc|Rest],Pairs,Self) ->
                   Self(Rest,[{SLNS,SLLoc}|Pairs],Self)
           end,
    case lists:dropwhile(NotSchemaLocation,Atts) of
        [] ->
            {error,{missing_schemaLocation}};
        [#xmlAttribute{value=Paths}|_] ->
            case string:tokens(Paths," ") of
                [] ->
                    {error,{missing_schemaLocation}};
                Tokens when length(Tokens) rem 2 =:= 0 ->
                    {ok,Pair(Tokens,[],Pair)};
                _ ->
                    {error,{schemaLocation_attribute,namespace_location_not_in_pair}}
            end
    end.

%% Options derived from the scanner state: its xmlbase is passed on as
%% xsdbase.
inherit_options(S) ->
    %%io:format("xsdbase: ~p~n",[S#xmerl_scanner.xmlbase]),
    [{xsdbase,S#xmerl_scanner.xmlbase}].

%% Unwraps a schema validation result: the validated element is
%% returned with the state, an error becomes a fatal scanner error.
handle_schema_result({XSDRes=#xmlElement{},_},S5) ->
    {XSDRes,S5};
handle_schema_result({error,Reason},S5) ->
    ?fatal({failed_schema_validation,Reason},S5).

%%% Helper functions

%% Terminates scanning by exiting with the reason and the current file,
%% line and column.
fatal(Reason, S) ->
    File = {file,S#xmerl_scanner.filename},
    Line = {line,S#xmerl_scanner.line},
    Col = {col,S#xmerl_scanner.col},
    exit({fatal, {Reason, File, Line, Col}}).

%% preformat formats tokens in L1 and L2, L2 separated by Sep into a
%% list
preformat(L1,L2,Sep) ->
    %% "~s ~s ... ~s" for L1
    HeadFormat = lists:flatten(lists:duplicate(length(L1)-1,"~s ")++"~s"),
    %% " ~s<Sep> ~s<Sep> ... ~s" for L2
    TailFormat = lists:flatten(lists:duplicate(length(L2)-1,
                                               " ~s"++Sep)++" ~s"),
    lists:flatten(io_lib:format(HeadFormat++TailFormat,L1++L2)).
%% rules_write(Context, Name, Value, S) -> S
%% Stores Value under the key {Context, Name} in the rules table unless an
%% entry with that key already exists; an existing entry is never
%% overwritten. ets:insert_new/2 does the check and the insert in a single
%% atomic operation.
%%
%% NOTE(review): the original author flagged a bug here concerning DTDs
%% with many <!ATTLIST ..> declarations, where attributes reportedly do not
%% get saved in the rules. This is presumably a consequence of keeping only
%% the first value written for a key -- confirm.
rules_write(Context, Name, Value, #xmerl_scanner{rules = T} = S) ->
    ets:insert_new(T, {{Context, Name}, Value}),
    S.


%% rules_read(Context, Name, S) -> Value | undefined
%% Returns the value stored under {Context, Name} in the rules table, or
%% undefined when there is no such entry.
rules_read(Context, Name, #xmerl_scanner{rules = T}) ->
    case ets:lookup(T, {Context, Name}) of
        [] ->
            undefined;
        [{_, V}] ->
            V
    end.

%% rules_delete(Context, Name, S) -> true
%% Removes the entry stored under {Context, Name} from the rules table.
rules_delete(Context, Name, #xmerl_scanner{rules = T}) ->
    ets:delete(T, {Context, Name}).

%% to_ucs(Encoding, Chars) -> {Char, Rest} | {{error, Reason}, Rest}
%% Takes one character off the front of Chars. With the encoding "utf-8" or
%% undefined the input is decoded as UTF-8; with any other encoding the
%% first element is returned as it is.
to_ucs("utf-8", Chars) ->
    utf8_2_ucs(Chars);
to_ucs(undefined, Chars) ->
    utf8_2_ucs(Chars);
to_ucs(_, [C | Rest]) ->
    {C, Rest}.

%% utf8_2_ucs(Bytes) -> {Char, Rest} | {{error, {bad_character, X}}, Rest}
%% Decodes the UTF-8 sequence of one to four bytes at the head of Bytes.
%% A sequence whose value is below the minimum for its length (an overlong
%% encoding) is rejected with the decoded value as X. A leading byte of
%% 16#80 or above that starts no well-formed sequence is rejected with that
%% byte as X, and only that single byte is consumed. An empty list matches
%% no clause.
utf8_2_ucs([A, B, C, D | Rest]) when A band 16#f8 =:= 16#f0,
                                     B band 16#c0 =:= 16#80,
                                     C band 16#c0 =:= 16#80,
                                     D band 16#c0 =:= 16#80 ->
    %% 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
    case ((D band 16#3f) bor ((C band 16#3f) bsl 6) bor
          ((B band 16#3f) bsl 12) bor ((A band 16#07) bsl 18)) of
        Ch when Ch >= 16#10000 ->
            {Ch, Rest};
        Ch ->
            {{error, {bad_character, Ch}}, Rest}
    end;
utf8_2_ucs([A, B, C | Rest]) when A band 16#f0 =:= 16#e0,
                                  B band 16#c0 =:= 16#80,
                                  C band 16#c0 =:= 16#80 ->
    %% 1110vvvv 10vvvvvv 10vvvvvv
    case ((C band 16#3f) bor ((B band 16#3f) bsl 6) bor
          ((A band 16#0f) bsl 12)) of
        Ch when Ch >= 16#800 ->
            {Ch, Rest};
        Ch ->
            {{error, {bad_character, Ch}}, Rest}
    end;
utf8_2_ucs([A, B | Rest]) when A band 16#e0 =:= 16#c0,
                               B band 16#c0 =:= 16#80 ->
    %% 110vvvvv 10vvvvvv
    case ((B band 16#3f) bor ((A band 16#1f) bsl 6)) of
        Ch when Ch >= 16#80 ->
            {Ch, Rest};
        Ch ->
            {{error, {bad_character, Ch}}, Rest}
    end;
utf8_2_ucs([A | Rest]) when A < 16#80 ->
    %% 0vvvvvvv
    {A, Rest};
utf8_2_ucs([A | Rest]) ->
    {{error, {bad_character, A}}, Rest}.

%% to_char_set(Encoding, Ch) -> [Byte]
%% Encodes the single character Ch for the given encoding: as UTF-8 when the
%% encoding is "utf-8" or undefined, otherwise as the one-element list [Ch]
%% (this includes "iso-10646-utf-1").
to_char_set("utf-8", Ch) ->
    ucs_2_utf8(Ch);
to_char_set(undefined, Ch) ->
    ucs_2_utf8(Ch);
to_char_set(_, Ch) ->
    [Ch].
%% ucs_2_utf8(Ch) -> [Byte]
%% Encodes the code point Ch as a list of one to four UTF-8 bytes. Values
%% of 16#200000 and above match no clause.
ucs_2_utf8(Ch) when Ch < 128 ->
    %% 0vvvvvvv
    [Ch];
ucs_2_utf8(Ch) when Ch < 16#0800 ->
    %% 110vvvvv 10vvvvvv
    [(Ch bsr 6) bor 16#c0,
     utf8_tail_byte(Ch, 0)];
ucs_2_utf8(Ch) when Ch < 16#10000 ->
    %% 1110vvvv 10vvvvvv 10vvvvvv
    [(Ch bsr 12) bor 16#e0,
     utf8_tail_byte(Ch, 6),
     utf8_tail_byte(Ch, 0)];
ucs_2_utf8(Ch) when Ch < 16#200000 ->
    %% 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
    [(Ch bsr 18) bor 16#f0,
     utf8_tail_byte(Ch, 12),
     utf8_tail_byte(Ch, 6),
     utf8_tail_byte(Ch, 0)].

%% Continuation byte (10vvvvvv) carrying the six bits of Ch that start at
%% bit position Shift.
utf8_tail_byte(Ch, Shift) ->
    ((Ch bsr Shift) band 16#3f) bor 16#80.


%% string_to_char_set(Encoding, Str) -> [Byte]
%% Encodes every character of Str as UTF-8 when the encoding is "utf-8" or
%% undefined; for any other encoding Str is returned untouched.
string_to_char_set("utf-8", Str) ->
    lists:append([ucs_2_utf8(Char) || Char <- Str]);
string_to_char_set(undefined, Str) ->
    lists:append([ucs_2_utf8(Char) || Char <- Str]);
string_to_char_set(_, Str) ->
    Str.

%% Disabled memory diagnostics, kept for reference.
%%
%% diagnose(Line) ->
%%     Mem=erlang:memory(),
%%     {OldTot,OldLine} = get_total(),
%%     NewTot =
%%         case {lists:keysearch(total,1,Mem),OldTot*1.1} of
%%             {{_,{_,Tot}},Tot110} when Tot > Tot110 ->
%%                 io:format("From ~p to ~p, total memory: ~p (~p)~n",[OldLine,Line,Tot,OldTot]),
%%                 Tot;
%%             {{_,{_,Tot}},_} ->
%%                 Tot
%%         end,
%%     put_total({NewTot,Line}).

%% get_total() ->
%%     case get(xmerl_mem) of
%%         undefined ->
%%             put(xmerl_mem,{0,0}),
%%             {0,0};
%%         M -> M
%%     end.

%% put_total(M) ->
%%     put(xmerl_mem,M).