diff options
author | Erlang/OTP <[email protected]> | 2009-11-20 14:54:40 +0000 |
---|---|---|
committer | Erlang/OTP <[email protected]> | 2009-11-20 14:54:40 +0000 |
commit | 84adefa331c4159d432d22840663c38f155cd4c1 (patch) | |
tree | bff9a9c66adda4df2106dfd0e5c053ab182a12bd /lib/xmerl/src | |
download | otp-84adefa331c4159d432d22840663c38f155cd4c1.tar.gz otp-84adefa331c4159d432d22840663c38f155cd4c1.tar.bz2 otp-84adefa331c4159d432d22840663c38f155cd4c1.zip |
The R13B03 release.OTP_R13B03
Diffstat (limited to 'lib/xmerl/src')
41 files changed, 24681 insertions, 0 deletions
diff --git a/lib/xmerl/src/Makefile b/lib/xmerl/src/Makefile new file mode 100644 index 0000000000..7009b50f6c --- /dev/null +++ b/lib/xmerl/src/Makefile @@ -0,0 +1,251 @@ +# +# %CopyrightBegin% +# +# Copyright Ericsson AB 2003-2009. All Rights Reserved. +# +# The contents of this file are subject to the Erlang Public License, +# Version 1.1, (the "License"); you may not use this file except in +# compliance with the License. You should have received a copy of the +# Erlang Public License along with this software. If not, it can be +# retrieved online at http://www.erlang.org/. +# +# Software distributed under the License is distributed on an "AS IS" +# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +# the License for the specific language governing rights and limitations +# under the License. +# +# %CopyrightEnd% +# +# +include $(ERL_TOP)/make/target.mk +include $(ERL_TOP)/make/$(TARGET)/otp.mk + +# ---------------------------------------------------- +# Application version +# ---------------------------------------------------- +include ../vsn.mk +VSN=$(XMERL_VSN) + +# ---------------------------------------------------- +# Release directory specification +# ---------------------------------------------------- +RELSYSDIR = $(RELEASE_PATH)/lib/xmerl-$(VSN) + +# ---------------------------------------------------- +# Documentation directory specification +# ---------------------------------------------------- + +DOCDIR = ../doc + + +### XMERL use EDOC for documentation, to regenerate update paths as needed! +XMERL_APP = .. + +EDOC_APP = ../../edoc + +SYNTAX_TOOLS_APP = ../../syntax_tools + + + +# ---------------------------------------------------- +# Target Specs +# ---------------------------------------------------- + +EDOC_MODULES = \ + xmerl_scan \ + xmerl \ + xmerl_xs \ + xmerl_eventp \ + xmerl_xpath \ + xmerl_xsd + + + +MODULES = $(EDOC_MODULES) \ + xmerl_b64Bin \ + xmerl_b64Bin_scan \ + xmerl_html \ + xmerl_lib \ + xmerl_otpsgml \ + xmerl_regexp \ + xmerl_sgml \ + xmerl_simple \ + xmerl_text \ + xmerl_ucs \ + xmerl_uri \ + xmerl_validate \ + xmerl_xlate \ + xmerl_xml \ + xmerl_xpath_lib \ + xmerl_xpath_parse \ + xmerl_xpath_pred \ + xmerl_xpath_scan \ + xmerl_xsd_type \ + xmerl_sax_parser \ + xmerl_sax_parser_list \ + xmerl_sax_parser_latin1 \ + xmerl_sax_parser_utf8 \ + xmerl_sax_parser_utf16be \ + xmerl_sax_parser_utf16le \ + xmerl_sax_simple_dom \ + xmerl_sax_old_dom + + + +HRL_FILES = \ + ../include/xmerl.hrl \ + ../include/xmerl_xlink.hrl \ + ../include/xmerl_xpath.hrl \ + ../include/xmerl_xsd.hrl + +INCLUDE_DIR = ../include + +INTERNAL_HRL_FILES = \ + xmerl_internal.hrl \ + xmerl_sax_old_dom.hrl \ + xmerl_sax_parser.hrl + +ERL_FILES= $(MODULES:%=%.erl) + +APP_FILE= xmerl.app +APP_SRC= $(APP_FILE).src +APP_TARGET= $(EBIN)/$(APP_FILE) + +APPUP_FILE= xmerl.appup +APPUP_SRC= $(APPUP_FILE).src +APPUP_TARGET= $(EBIN)/$(APPUP_FILE) + +DOC_TARGET_FILES = $(EDOC_MODULES:%=$(DOCDIR)/%.html) +TARGET_FILES = $(MODULES:%=$(EBIN)/%.$(EMULATOR)) $(APP_TARGET) $(APPUP_TARGET) + +# ---------------------------------------------------- +# FLAGS +# ---------------------------------------------------- +ERL_COMPILE_FLAGS += \ + -I ../include \ + +warn_unused_vars + +# +bin_opt_info + + +# ---------------------------------------------------- +# Targets +# ---------------------------------------------------- +debug opt: $(TARGET_FILES) + +docs: +#docs: $(DOC_TARGET_FILES) + + +clean: + $(RM) $(TARGET_FILES) + $(RM) xmerl_xpath_parse.erl + $(RM) xmerl_b64Bin.erl + $(RM) xmerl_sax_parser_list.erl + $(RM) xmerl_sax_parser_latin1.erl + $(RM) xmerl_sax_parser_utf8.erl + $(RM) xmerl_sax_parser_utf16be.erl + $(RM) xmerl_sax_parser_utf16le.erl + $(RM) core *~ + +info: + @echo "MODULES: $(MODULES)" + @echo "EBIN: $(EBIN)" + @echo "EMULATOR: $(EMULATOR)" + @echo "APP_TARGET: $(APP_TARGET)" + @echo "TARGET_FILES: $(TARGET_FILES)" + @echo "DOC_TARGET_FILES: $(DOC_TARGET_FILES)" + @echo "DOCDIR/%html: $(DOCDIR)/%.html" + +realclean: clean + $(RM) $(DOC_TARGET_FILES) + +# ---------------------------------------------------- +# Special Build Targets +# ---------------------------------------------------- +EDOC_PATHS = \ + -pa $(EDOC_APP)/ebin -pa $(XMERL_APP)/ebin -pa $(SYNTAX_TOOLS_APP)/ebin + +$(APP_TARGET): $(APP_SRC) ../vsn.mk + sed -e 's;%VSN%;$(VSN);' $< > $@ + +$(APPUP_TARGET): $(APPUP_SRC) ../vsn.mk + sed -e 's;%VSN%;$(VSN);' $< > $@ + +xmerl_xpath_parse.erl: xmerl_xpath_parse.yrl + $(ERLC) -o $(ESRC) $< + +xmerl_b64Bin.erl: xmerl_b64Bin.yrl + $(ERLC) -o $(ESRC) $< + +xmerl_sax_parser_list.erl: xmerl_sax_parser_list.erlsrc xmerl_sax_parser_base.erlsrc + cat xmerl_sax_parser_list.erlsrc xmerl_sax_parser_base.erlsrc >$@ + +xmerl_sax_parser_latin1.erl: xmerl_sax_parser_latin1.erlsrc xmerl_sax_parser_base.erlsrc + cat xmerl_sax_parser_latin1.erlsrc xmerl_sax_parser_base.erlsrc >$@ + +xmerl_sax_parser_utf8.erl: xmerl_sax_parser_utf8.erlsrc xmerl_sax_parser_base.erlsrc + cat xmerl_sax_parser_utf8.erlsrc xmerl_sax_parser_base.erlsrc >$@ + +xmerl_sax_parser_utf16be.erl: xmerl_sax_parser_utf16be.erlsrc xmerl_sax_parser_base.erlsrc + cat xmerl_sax_parser_utf16be.erlsrc xmerl_sax_parser_base.erlsrc >$@ + +xmerl_sax_parser_utf16le.erl: xmerl_sax_parser_utf16le.erlsrc xmerl_sax_parser_base.erlsrc + cat xmerl_sax_parser_utf16le.erlsrc xmerl_sax_parser_base.erlsrc >$@ + +$(EBIN)/%.beam: %.erl + $(ERLC) $(ERL_COMPILE_FLAGS) -o $(EBIN) $< + +$(DOCDIR)/%.html: %.erl + $(ERL) -noshell $(EDOC_PATHS) \ + -run edoc_run file '"$<"' '[{dir,"$(DOCDIR)"}]' -s erlang halt + + +#$(DOCDIR)/%.html: %.erl +# $(ERL) $(EDOC_PATHS) -s edoc file $< ['{dir,"$(DOCDIR)"}'] + +# ---------------------------------------------------- +# Release Target +# ---------------------------------------------------- +#ifeq "${MA_TOP}" "" +#MA_TOP = ../../.. +#endif +#include $(MA_TOP)/config/make.release +include $(ERL_TOP)/make/otp_release_targets.mk + + +release_spec: opt + $(INSTALL_DIR) $(RELSYSDIR)/ebin + $(INSTALL_DATA) $(TARGET_FILES) $(RELSYSDIR)/ebin + $(INSTALL_DIR) $(RELSYSDIR)/src + $(INSTALL_DATA) $(ERL_FILES) $(INTERNAL_HRL_FILES) $(APP_SRC) $(APPUP_SRC) $(RELSYSDIR)/src + $(INSTALL_DATA) xmerl_xpath_parse.yrl $(RELSYSDIR)/src + $(INSTALL_DATA) xmerl_b64Bin.yrl $(RELSYSDIR)/src + $(INSTALL_DIR) $(RELSYSDIR)/include + $(INSTALL_DATA) $(HRL_FILES) $(RELSYSDIR)/include + +release_docs_spec: + + +release_tests_spec: + + +#------------------------------------------------------------ +# .hrl dependencies +$(EBIN)/xmerl.beam:../include/xmerl.hrl +$(EBIN)/xmerl_lib.beam:../include/xmerl.hrl +$(EBIN)/xmerl_scan.beam:../include/xmerl.hrl +$(EBIN)/xmerl_xlate.beam:../include/xmerl.hrl +$(EBIN)/xmerl_xml.beam:../include/xmerl.hrl +$(EBIN)/xmerl_html.beam:../include/xmerl.hrl +$(EBIN)/xmerl_text.beam:../include/xmerl.hrl +$(EBIN)/xmerl_eventp.beam:../include/xmerl.hrl +$(EBIN)/xmerl_simple.beam:../include/xmerl.hrl +$(EBIN)/xmerl_xpath.beam:../include/xmerl.hrl +$(EBIN)/xmerl_xpath_lib.beam:../include/xmerl.hrl +$(EBIN)/xmerl_xpath_pred.beam:../include/xmerl.hrl +$(EBIN)/xmerl_xpath_scan.beam:../include/xmerl.hrl +#$(EBIN)/xmerl_xsd.beam:../include/xmerl_xsd.hrl +#$(EBIN)/xmerl_xsd.beam:../include/xmerl.hrl + + diff --git a/lib/xmerl/src/xmerl.app.src b/lib/xmerl/src/xmerl.app.src new file mode 100644 index 0000000000..b471447bbd --- /dev/null +++ b/lib/xmerl/src/xmerl.app.src @@ -0,0 +1,43 @@ +{application,xmerl, + [{description,"XML parser"}, + {vsn,"%VSN%"}, + {modules,[ + xmerl, + xmerl_b64Bin, + xmerl_b64Bin_scan, + xmerl_eventp, + xmerl_html, + xmerl_lib, + xmerl_otpsgml, + xmerl_regexp, + xmerl_sax_parser, + xmerl_sax_parser_list, + xmerl_sax_parser_latin1, + xmerl_sax_parser_utf8, + xmerl_sax_parser_utf16be, + xmerl_sax_parser_utf16le, + xmerl_sax_simple_dom, + xmerl_sax_old_dom, + xmerl_scan, + xmerl_sgml, + xmerl_simple, + xmerl_text, + xmerl_ucs, + xmerl_uri, + xmerl_validate, + xmerl_xlate, + xmerl_xml, + xmerl_xpath, + xmerl_xpath_lib, + xmerl_xpath_parse, + xmerl_xpath_pred, + xmerl_xpath_scan, + xmerl_xs, + xmerl_xsd, + xmerl_xsd_type + ]}, + + {registered, []}, + {env, []}, + {applications, [kernel, stdlib]} + ]}. diff --git a/lib/xmerl/src/xmerl.appup.src b/lib/xmerl/src/xmerl.appup.src new file mode 100644 index 0000000000..0d8aa4eb04 --- /dev/null +++ b/lib/xmerl/src/xmerl.appup.src @@ -0,0 +1,14 @@ +{"%VSN%", + [ + {"1.1.11", + [ + ] + } + ], + [ + {"1.1.11", + [ + ] + } + ] +}. diff --git a/lib/xmerl/src/xmerl.erl b/lib/xmerl/src/xmerl.erl new file mode 100644 index 0000000000..cf78f7bdf7 --- /dev/null +++ b/lib/xmerl/src/xmerl.erl @@ -0,0 +1,320 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% +%% Description : Functions to export simple and complete XML forms +%% + +%% @doc Functions for exporting XML data to an external format. +%% + +-module(xmerl). + +%-compile(export_all). + +-export([export/2, + export/3, + export_content/2, + export_element/2, + export_element/3, + export_simple/2, + export_simple/3, + export_simple_element/2, + export_simple_content/2, + callbacks/1]). + +-include("xmerl.hrl"). + + +%% @spec export(Content, Callback) -> ExportedFormat +%% @equiv export(Data, Callback, []) + +export(Content, Callback) -> + export(Content, Callback, []). + +%% @spec export(Content, Callback, RootAttributes) -> ExportedFormat +%% Content = [Element] +%% Callback = atom() +%% RootAttributes = [XmlAttributes] +%% @doc Exports normal, well-formed XML content, using the specified +%% callback-module. +%% <p><code>Element</code> is any of:</p> +%% <ul> +%% <li><code>#xmlText{}</code></li> +%% <li><code>#xmlElement{}</code></li> +%% <li><code>#xmlPI{}</code></li> +%% <li><code>#xmlComment{}</code></li> +%% <li><code>#xmlDecl{}</code></li> +%% </ul> +%% <p>(See <tt>xmerl.hrl</tt> for the record definitions.) +%% Text in <code>#xmlText{}</code> elements can be deep lists of +%% characters and/or binaries.</p> +%% +%% <p><code>RootAttributes</code> is a list of +%% <code>#xmlAttribute{}</code> attributes for the <code>#root#</code> +%% element, which implicitly becomes the parent of the given +%% <code>Content</code>. The tag-handler function for +%% <code>#root#</code> is thus called with the complete exported data of +%% <code>Content</code>. Root attributes can be used to specify +%% e.g. encoding or other metadata of an XML or HTML document.</p> +%% +%% <p>The <code>Callback</code> module should contain hook functions for +%% all tags present in the data structure. A hook function must have the +%% following format:</p> +%% <pre> Tag(Data, Attributes, Parents, E)</pre> +%% <p>where <code>E</code> is the corresponding <code>#xmlElement{}</code>, +%% <code>Data</code> is the already-exported contents of <code>E</code> +%% and <code>Attributes</code> is the list of +%% <code>#xmlAttribute{}</code> records of <code>E</code>. Finally, +%% <code>Parents</code> is the list of parent nodes of <code>E</code>, +%% on the form <code>[{ParentTag::atom(), +%% ParentPosition::integer()}]</code>.</p> +%% +%% <p>The hook function should return either the data to be exported, or +%% a tuple <code>{'#xml-alias#', NewTag::atom()}</code>, or a tuple +%% <code>{'#xml-redefine#', Content}</code>, where <code>Content</code> +%% is a content list (which can be on simple-form; see +%% <code>export_simple/2</code> for details).</p> +%% +%% <p>A callback module can inherit definitions from other callback +%% modules, through the required function <code>'#xml-interitance#() -> +%% [ModuleName::atom()]</code>.</p> +%% +%% @see export/2 +%% @see export_simple/3 + +export(Content, Callback, RootAttributes) when is_atom(Callback) -> + export1(Content, callbacks(Callback), RootAttributes); +export(Content, Callbacks, RootAttrs) when is_list(Callbacks) -> + export1(Content, Callbacks, RootAttrs). + +%% @spec export_simple(Content, Callback) -> ExportedFormat +%% @equiv export_simple(Content, Callback, []) + +export_simple(Content, Callback) -> + export_simple(Content, Callback, []). + +%% @spec export_simple(Content, Callback, RootAttributes) -> ExportedFormat +%% Content = [Element] +%% Callback = atom() +%% RootAttributes = [XmlAttributes] +%% @doc Exports "simple-form" XML content, using the specified +%% callback-module. +%% <p><code>Element</code> is any of:</p> +%% <ul> +%% <li><code>{Tag, Attributes, Content}</code></li> +%% <li><code>{Tag, Content}</code></li> +%% <li><code>Tag</code></li> +%% <li><code>IOString</code></li> +%% <li><code>#xmlText{}</code></li> +%% <li><code>#xmlElement{}</code></li> +%% <li><code>#xmlPI{}</code></li> +%% <li><code>#xmlComment{}</code></li> +%% <li><code>#xmlDecl{}</code></li> +%% </ul> +%% <p>where</p> +%% <ul> +%% <li><code>Tag = atom()</code></li> +%% <li><code>Attributes = [{Name, Value}]</code></li> +%% <li><code>Name = atom()</code></li> +%% <li><code>Value = IOString | atom() | integer()</code></li> +%% </ul> +%% <p>Normal-form XML elements can thus be included in the simple-form +%% representation. Note that content lists must be flat. An +%% <code>IOString</code> is a (possibly deep) list of characters and/or +%% binaries.</p> +%% +%% <p><code>RootAttributes</code> is a list of:</p> +%% <ul> +%% <li><code>XmlAttributes = #xmlAttribute{}</code></li> +%%</ul> +%% +%% <p>See <code>export/3</code> for details on the callback module and +%% the root attributes. The XML-data is always converted to normal form +%% before being passed to the callback module.</p> +%% +%% @see export/3 +%% @see export_simple/2 + +export_simple(Content, Callback, RootAttrs) when is_atom(Callback) -> + export_simple1(Content, callbacks(Callback), RootAttrs); +export_simple(Content, Callbacks, RootAttrs) when is_list(Callbacks) -> + export_simple1(Content, Callbacks, RootAttrs). + +export_simple1(Content, Callback, RootAttrs) -> + export1(xmerl_lib:expand_content(Content), Callback, RootAttrs). + +%% This exports proper XML content in root context. + +export1(Content, Callbacks, RootAttrs) when is_list(Content) -> + Result = export_content(Content, Callbacks), + Attrs = xmerl_lib:expand_attributes(RootAttrs, 1, [{'#root#',1}]), + Root = #xmlElement{name = '#root#', + pos = 1, + parents = [], + attributes = Attrs}, + Args = [Result, Root#xmlElement.attributes, [], Root], + tagdef('#root#',1,[],Args,Callbacks). + +%% @doc Exports simple XML content directly, without further context. + +export_simple_content(Content, Callback) when is_atom(Callback) -> + export_content(xmerl_lib:expand_content(Content), + callbacks(Callback)); +export_simple_content(Content, Callbacks) when is_list(Callbacks) -> + export_content(xmerl_lib:expand_content(Content), Callbacks). + + +%% @spec export_content(Content, Callbacks) -> term() +%% Content = [Element] +%% Callback = [atom()] +%% @doc Exports normal XML content directly, without further context. +export_content([#xmlText{value = Text} | Es], Callbacks) -> + [apply_text_cb(Callbacks, Text) | export_content(Es, Callbacks)]; +export_content([#xmlPI{} | Es], Callbacks) -> + export_content(Es, Callbacks); +export_content([#xmlComment{} | Es], Callbacks) -> + export_content(Es, Callbacks); +export_content([#xmlDecl{} | Es], Callbacks) -> + export_content(Es, Callbacks); +export_content([E | Es], Callbacks) -> + [export_element(E, Callbacks) | export_content(Es, Callbacks)]; +export_content([], _Callbacks) -> + []. + +%% @doc Exports a simple XML element directly, without further context. + +export_simple_element(Content, Callback) when is_atom(Callback) -> + export_element(xmerl_lib:expand_element(Content), + callbacks(Callback)); +export_simple_element(Content, Callbacks) when is_list(Callbacks) -> + export_element(xmerl_lib:expand_element(Content), Callbacks). + +%% @doc Exports a normal XML element directly, without further context. + +%% This is the usual DOM style parsing. + +export_element(E, CB) when is_atom(CB) -> + export_element(E, callbacks(CB)); +export_element(#xmlText{value = Text}, CBs) -> + apply_text_cb(CBs, Text); +export_element(E = #xmlElement{name = Tag, + pos = Pos, + attributes = Attributes, + parents = Parents, + content = Content}, CBs) -> + Data = export_content(Content, CBs), + Args = [Data, Attributes, Parents, E], + tagdef(Tag,Pos,Parents,Args,CBs); +export_element(#xmlPI{}, _CBs) -> + []; +export_element(#xmlComment{}, _CBs) -> + []; +export_element(#xmlDecl{}, _CBs) -> + []. + + +%% @spec export_element(E,CallbackModule,CallbackState) -> ExportedFormat +%% @doc For on-the-fly exporting during parsing (SAX style) of the XML +%% document. +export_element(E, CallbackModule, CallbackState) when is_atom(CallbackModule) -> + export_element(E, callbacks(CallbackModule), CallbackState); +export_element(#xmlText{value = Text},CallbackModule,_CallbackState) -> +%% apply_cb(CallbackModule, '#text#', '#text#', [Text,CallbackState]); + apply_text_cb(CallbackModule,Text); +export_element(E=#xmlElement{name = Tag, + pos = Pos, + parents = Parents, + attributes = Attributes, + content = Content},Callbacks,CBstate) -> + Args = [Content, Attributes,CBstate,E], + tagdef(Tag,Pos,Parents,Args,Callbacks); +export_element(#xmlPI{}, _CallbackModule, CallbackState) -> + CallbackState; +export_element(#xmlComment{},_CallbackModule, CallbackState) -> + CallbackState; +export_element(#xmlDecl{},_CallbackModule, CallbackState) -> + CallbackState. + +%% A thing returned with #xml-redefine is assumed to be a content list +%% The data may be on "simple" format. + +tagdef(Tag,Pos,Parents,Args,CBs) -> + case apply_tag_cb(CBs, Tag, Args) of + {'#xml-alias#', NewTag} -> + tagdef(NewTag,Pos,Parents,Args,CBs); + {'#xml-redefine#', Data} -> + export_content(xmerl_lib:expand_content(Data, Pos, Parents), + CBs); + Other -> + Other + end. + +%% @spec callbacks(Module) -> Result +%% Module = atom() +%% Result = [atom()] +%% @doc Find the list of inherited callback modules for a given module. + +callbacks(Module) -> + Result = check_inheritance(Module, []), +%%% io:format("callbacks = ~p~n", [lists:reverse(Result)]), + lists:reverse(Result). + +callbacks([M|Mods], Visited) -> + case lists:member(M, Visited) of + false -> + NewVisited = check_inheritance(M, Visited), + callbacks(Mods, NewVisited); + true -> + exit({cyclic_inheritance, {M, hd(Visited)}}) + end; +callbacks([], Visited) -> + Visited. + +check_inheritance(M, Visited) -> +%%% io:format("calling ~p:'#xml-inheritance#'()~n", [M]), + case M:'#xml-inheritance#'() of + [] -> + [M|Visited]; + Mods -> + callbacks(Mods, [M|Visited]) + end. + +apply_text_cb(Ms, Text) -> + apply_cb(Ms, '#text#', '#text#', [Text]). + +apply_tag_cb(Ms, F, Args) -> + apply_cb(Ms, F, '#element#', Args). + +apply_cb(Ms, F, Df, Args) -> + apply_cb(Ms, F, Df, Args, Ms). + +apply_cb([M|Ms], F, Df, Args, Ms0) -> + case catch apply(M, F, Args) of + {'EXIT', {undef,[{M,F,_}|_]}} -> + apply_cb(Ms, F, Df, Args, Ms0); + {'EXIT', Reason} -> + exit(Reason); + Res -> + Res + end; +apply_cb([], Df, Df, Args, _Ms0) -> + exit({unknown_tag, {Df, Args}}); +apply_cb([], F, Df, Args, Ms0) -> + apply_cb(Ms0, Df, Df, [F|Args]). diff --git a/lib/xmerl/src/xmerl_b64Bin.yrl b/lib/xmerl/src/xmerl_b64Bin.yrl new file mode 100644 index 0000000000..2d94ea30e2 --- /dev/null +++ b/lib/xmerl/src/xmerl_b64Bin.yrl @@ -0,0 +1,68 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2006-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% The base64Binary grammar is as follows: +%% Base64Binary ::= ((B64S B64S B64S B64S)* +%% ((B64S B64S B64S B64) | +%% (B64S B64S B16S '=') | +%% (B64S B04S '=' #x20? '=')))? +%% +%% B64S ::= B64 #x20? +%% +%% B16S ::= B16 #x20? +%% +%% B04S ::= B04 #x20? +%% +%% B04 ::= [AQgw] +%% (B16 ::= [AEIMQUYcgkosw048]) +%% (B64 ::= [A-Za-z0-9+/]) + +%% Changes +%% B16x ::= [EIMUYckos048] +%% B64x ::= [B-DF-HJ-LN-PR-TV-XZabd-fh-jl-np-rt-vx-z1-35-79+/] + +%% B16 ::= B04 | B16x +%% B64 ::= B04 | B16x | B64x +%% Handle whitespace in scanner + +Nonterminals base64Binary base64Binary2 b64 b16. + +Terminals '=' b04 b16x b64x. + +Rootsymbol base64Binary. + +Endsymbol '$end'. + + + + +base64Binary -> base64Binary2. +base64Binary -> '$empty'. + +base64Binary2 -> b64 b64 b64 b64 base64Binary2. +base64Binary2 -> b64 b64 b64 b64. +base64Binary2 -> b64 b64 b16 '='. +base64Binary2 -> b64 b04 '=' '='. + +b64 -> b04. +b64 -> b16x. +b64 -> b64x. + +b16 -> b04. +b16 -> b16x. diff --git a/lib/xmerl/src/xmerl_b64Bin_scan.erl b/lib/xmerl/src/xmerl_b64Bin_scan.erl new file mode 100644 index 0000000000..6f40680547 --- /dev/null +++ b/lib/xmerl/src/xmerl_b64Bin_scan.erl @@ -0,0 +1,59 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2006-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +-module(xmerl_b64Bin_scan). + +-export([scan/1]). + +-define(L, 1). + +scan(Str) -> + scan(Str,[]). + +scan([],Acc) -> + lists:reverse([{'$end', ?L, '$end'}|Acc]); +scan(Str,Acc) -> + case scan_token(Str) of + {Token,Rest} -> + scan(Rest,[Token|Acc]) + end. + +scan_token([$ ,H|T]) -> + scan_token([H|T]); +scan_token([H|T]) when H==$A;H==$Q;H==$g;H==$w -> + {{b04,?L,H},T}; +scan_token([H|T]) + when H==$E;H==$I;H==$M;H==$U;H==$Y;H==$c;H==$k;H==$o;H==$s;H==$0; + H==$4;H==$8 -> + {{b16x,?L,H},T}; +scan_token([H|T]) + when H==$B;H==$C;H==$D;H==$F;H==$G;H==$H;H==$J;H==$K;H==$L;H==$N; + H==$O;H==$P;H==$R;H==$S;H==$T;H==$V;H==$W;H==$X;H==$Z -> + {{b64x,?L,H},T}; +scan_token([H|T]) + when H==$a;H==$b;H==$d;H==$e;H==$f;H==$h;H==$i;H==$j;H==$l;H==$m;H==$n;H==$p; + H==$q;H==$r;H==$t;H==$u;H==$v;H==$x;H==$y;H==$z -> + {{b64x,?L,H},T}; +scan_token([H|T]) + when H==$1;H==$2;H==$3;H==$5;H==$6;H==$7;H==$9;H==$+;H==$/ -> + {{b64x,?L,H},T}; +scan_token("="++T) -> + {{'=',?L,"="},T}; +scan_token([H|_T]) -> + exit({error,{base64Binary_scan_illegal_char,H}}). diff --git a/lib/xmerl/src/xmerl_dtd.erl b/lib/xmerl/src/xmerl_dtd.erl new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/lib/xmerl/src/xmerl_dtd.erl diff --git a/lib/xmerl/src/xmerl_eventp.erl b/lib/xmerl/src/xmerl_eventp.erl new file mode 100644 index 0000000000..ad5c3cbc47 --- /dev/null +++ b/lib/xmerl/src/xmerl_eventp.erl @@ -0,0 +1,366 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%%% Description : Simple event-based processor (front-end to xmerl_scan) + +%% @doc Simple event-based front-ends to xmerl_scan for processing +%% of XML documents in streams and for parsing in SAX style. +%% Each contain more elaborate settings of xmerl_scan that makes usage of +%% the customization functions. +%% +-module(xmerl_eventp). +-vsn('0.19'). +-date('03-09-17'). + +-export([stream/2,stream_sax/4, file_sax/4, string_sax/4]). + +% -export([cont/3, rules_read/3,rules_write/4,fetch/2,close/1]). + +-include("xmerl.hrl"). +-include("xmerl_internal.hrl"). +-include_lib("kernel/include/file.hrl"). + +%% @spec stream(Fname::string(), Options::option_list()) -> xmlElement() +%% +%% @doc Parse file containing an XML document as a stream, DOM style. +%% Wrapper for a call to the XML parser <code>xmerl_scan</code> with a +%% <code>continuation_fun</code> for handling streams of XML data. +%% Note that the <code>continuation_fun</code>, <code>acc_fun</code>, +%% <code>fetch_fun</code>, <code>rules</code> and <code>close_fun</code> +%% options cannot be user defined using this parser. +stream(Fname, Options) -> + AccF = fun(X, Acc, S) -> acc(X,Acc,S) end, + case file:open(Fname, [read, raw, binary]) of + {ok, Fd} -> + B0 = list_to_binary([]), + ContS = [{B0, Fname, Fd}], + Opts=scanner_options(Options, + [{continuation_fun, fun cont/3, ContS}, + {acc_fun, AccF}, + {fetch_fun, fun fetch/2}, + {rules,fun rules_read/3,fun rules_write/4,""}, + {close_fun, fun close/1}]), + xmerl_scan:string([], Opts); + Error -> + Error + end. + + +%% @spec stream_sax(Fname,CallBackModule,UserState,Options) -> xmlElement() +%% Fname = string() +%% CallBackModule = atom() +%% Options = option_list() +%% +%% @doc Parse file containing an XML document as a stream, SAX style. +%% Wrapper for a call to the XML parser <code>xmerl_scan</code> with a +%% <code>continuation_fun</code> for handling streams of XML data. +%% Note that the <code>continuation_fun</code>, <code>acc_fun</code>, +%% <code>fetch_fun</code>, <code>rules</code>, <code>hook_fun</code>, +%% <code>close_fun</code> and <code>user_state</code> options cannot be user +%% defined using this parser. +stream_sax(Fname, CallBack, UserState,Options) -> + US={xmerl:callbacks(CallBack), UserState}, + AccF = fun(X, Acc, S) -> acc(X,Acc,S) end, + HookF= + fun(ParsedEntity, S) -> + {CBs,Arg}=xmerl_scan:user_state(S), +% io:format("stream_sax Arg=~p~n",[Arg]), + case ParsedEntity of + #xmlComment{} -> % Toss away comments... + {[],S}; + _ -> % Use callback module for the rest +% io:format("stream_sax ParsedEntity=~p~n",[ParsedEntity]), + case xmerl:export_element(ParsedEntity,CBs,Arg) of + {error,Reason} -> + throw({error,Reason}); + Resp -> +% io:format("stream_sax Resp=~p~n",[Resp]), + {Resp,xmerl_scan:user_state({CBs,Resp},S)} + end + end + end, + case file:open(Fname, [read, raw, binary]) of + {ok, Fd} -> + B0 = list_to_binary([]), + ContS = [{B0, Fname, Fd}], + Opts=scanner_options(Options, + [{acc_fun, AccF}, + {close_fun, fun close/1}, + {continuation_fun, fun cont/3, ContS}, + {fetch_fun, fun fetch/2}, + {hook_fun,HookF}, + {rules,fun rules_read/3,fun rules_write/4,""}, + {user_state,US}]), + xmerl_scan:string([], Opts); + Error -> + Error + end. + + +%% @spec file_sax(Fname::string(), CallBackModule::atom(), UserState, +%% Options::option_list()) -> NewUserState +%% +%% @doc Parse file containing an XML document, SAX style. +%% Wrapper for a call to the XML parser <code>xmerl_scan</code> with a +%% <code>hook_fun</code> for using xmerl export functionality directly after +%% an entity is parsed. +file_sax(Fname,CallBack, UserState, Options) -> + US={xmerl:callbacks(CallBack), UserState}, + AccF=fun(X,Acc,S) -> {[X|Acc], S} end, + HookF= + fun(ParsedEntity, S) -> + {CBs,Arg}=xmerl_scan:user_state(S), + case ParsedEntity of + #xmlComment{} -> % Toss away comments... + {[],S}; + _ -> % Use callback module for the rest + case xmerl:export_element(ParsedEntity,CBs,Arg) of + {error,Reason} -> + throw({error,Reason}); + Resp -> + {Resp,xmerl_scan:user_state({CBs,Resp},S)} + end + end + end, + + Opts=scanner_options(Options,[{acc_fun, AccF}, + {hook_fun,HookF}, + {user_state,US}]), + xmerl_scan:file(Fname,Opts). + + +%% @spec string_sax(String::list(), CallBackModule::atom(), UserState, +%% Options::option_list()) -> +%% xmlElement() +%% +%% @doc Parse file containing an XML document, SAX style. +%% Wrapper for a call to the XML parser <code>xmerl_scan</code> with a +%% <code>hook_fun</code> for using xmerl export functionality directly after +%% an entity is parsed. +string_sax(String,CallBack, UserState, Options) -> + US={xmerl:callbacks(CallBack), UserState}, + AccF=fun(X,Acc,S) -> {[X|Acc], S} end, + HookF= + fun(ParsedEntity, S) -> + {CBs,Arg}=xmerl_scan:user_state(S), + case ParsedEntity of + #xmlComment{} -> % Toss away comments... + {[],S}; + _ -> % Use callback module for the rest + case xmerl:export_element(ParsedEntity,CBs,Arg) of + {error,Reason} -> + throw({error,Reason}); + Resp -> + {Resp,xmerl_scan:user_state({CBs,Resp},S)} + end + end + end, + + Opts=scanner_options(Options,[{acc_fun, AccF}, + {hook_fun,HookF}, + {user_state,US}]), + xmerl_scan:string(String,Opts). + + + +%%% ---------------------------------------------------------------------------- +%%% Streaming support functions + +%%% Continuation callback function for xmerl_scan +cont(F, Exception, S) -> + case xmerl_scan:cont_state(S) of + [{_Fname, eof}|_] -> + Exception(S); + [{Sofar, Fname, Fd}|T] -> + cont2(F, Exception, Sofar, Fd, Fname, T, S) + end. + + +cont2(F, Exception, Sofar, Fd, Fname, T, S) -> + case catch read_chunk(Fd, Fname, Sofar) of + {ok, Bin} -> + find_good_split(list_to_binary([Sofar,Bin]), + F,Exception,Fd,Fname,T,S); + eof -> + file:close(Fd), + NewS = xmerl_scan:cont_state([{Fname, eof}|T], S), + F(binary_to_list(Sofar), NewS); + Error -> + exit(Error) + end. + +read_chunk(Fd, _Fname, _Sofar) -> + file:read(Fd, 8192). + +-ifndef(no_bitsyntax). + +find_good_split(Bin, F, Exception, Fd, Fname, T, S) -> + find_good_split(size(Bin)-1, Bin, F, Exception, Fd, Fname, T, S). + +find_good_split(0, B, F, Exception, Fd, Fname, T, S) -> + cont2(F, Exception, B, Fd, Fname, T, S); +find_good_split(Size, B, F, Exception, Fd, Fname, T, S) -> + case B of + <<_Bytes:Size/binary, H/integer, Tail/binary>> when ?whitespace(H) -> + {SubB,_} = split_binary(B, Size+1), + NewS = xmerl_scan:cont_state([{Tail, Fname, Fd}|T], S), + F(binary_to_list(SubB), NewS); + _ -> + find_good_split(Size-1, B, F, Exception, Fd, Fname, T, S) + end. + +-else. + +find_good_split(Bin, F, Exception, Fd, Fname, T, S) -> + find_good_split(size(Bin), Bin, F, Exception, Fd, Fname, T, S). + +find_good_split(0, B, F, Exception, Fd, Fname, T, S) -> + cont2(F, Exception, B, Fd, Fname, T, S); +find_good_split(Size, B, F, Exception, Fd, Fname, T, S) -> + case binary_to_list(B, Size, Size) of + [H] when ?whitespace(H) -> + {SubB,Tail} = split_binary(B, Size), + NewS = xmerl_scan:cont_state([{Tail, Fname, Fd}|T], S), + F(binary_to_list(SubB), NewS); + _ -> + find_good_split(Size-1, B, F, Exception, Fd, Fname, T, S) + end. + +-endif. + + + +%%% Accumulator callback function for xmerl_scan +acc(X = #xmlText{value = Text}, Acc, S) -> + case detect_nul_text(Text) of + ok-> + {[X#xmlText{value = lists:flatten(Text)}|Acc], S}; + nok-> + {Acc,S} + end; +acc(X, Acc, S) -> + {[X|Acc], S}. + +%%% don't acc xmlText when text contains only " " , "\n" and "\t". +detect_nul_text([H|T]) when H==10; H==32; H==9-> + detect_nul_text(T); +detect_nul_text([]) -> + nok; +detect_nul_text(_)-> + ok. + + + +%%% Fetch callback function for xmerl_scan +fetch({system, URI}, S) -> + fetch_URI(URI, S); +fetch({public, _PublicID, URI}, S) -> + fetch_URI(URI, S). + +fetch_URI(URI, S) -> + %% assume URI is a filename + Split = filename:split(URI), + Filename = lists:last(Split), + Fullname = + case Split of + ["/", _|_] -> + %% absolute path name + URI; + ["file:",Name]-> + %% file:/dtd_name + filename:join(S#xmerl_scanner.xmlbase, Name); + _ -> + filename:join(S#xmerl_scanner.xmlbase, URI) + end, + File = path_locate(S#xmerl_scanner.fetch_path, Filename, Fullname), + ?dbg("fetch(~p) -> {file, ~p}.~n", [URI, File]), + case file:open(File, [read, raw, binary]) of + {ok, Fd} -> + ContS=xmerl_scan:cont_state(S), + NewS=xmerl_scan:cont_state([{list_to_binary([]),File,Fd}|ContS],S), + {ok, {string, []}, NewS}; + _Error -> + ?dbg("ERROR fetch(~p) -> ~p~n", [URI, _Error]), + {ok, not_fetched, S} + end. + +path_locate([Dir|Dirs], FN, FullName) -> + F = filename:join(Dir, FN), + case file:read_file_info(F) of + {ok, #file_info{type = regular}} -> + F; + _ -> + path_locate(Dirs, FN, FullName) + end; +path_locate([], _FN, FullName) -> + FullName. + +%%% Close callback function for xmerl_scan +close(S) -> + ContS = xmerl_scan:cont_state(S), + case ContS of + [{_Fname, eof}|T] -> + xmerl_scan:cont_state(T, S); + [{_Sofar, _Fname, Fd}|T] -> + file:close(Fd), + xmerl_scan:cont_state(T, S) + end. + + +%%% Rules callback functions for xmerl_scan +rules_write(Context, Name, Value, #xmerl_scanner{rules = undefined}=S) -> + Tab = ets:new(rules, [set, public]), + rules_write(Context, Name, Value, S#xmerl_scanner{rules = Tab}); +rules_write(Context, Name, Value, #xmerl_scanner{rules = T} = S) -> + ets:insert(T, {{Context, Name}, Value}), + S. + +rules_read(_Context, _Name, #xmerl_scanner{rules = undefined}) -> + undefined; +rules_read(Context, Name, #xmerl_scanner{rules = T}) -> + case ets:lookup(T, {Context, Name}) of + [] -> + undefined; + [{_K, V}] -> + V + end. + + + +%%% ---------------------------------------------------------------------------- +%%% Generic helper functions + +scanner_options([H|T], Opts) -> + case catch keyreplace(H, 1, Opts) of + false -> + scanner_options(T, [H|Opts]); + NewOpts -> + scanner_options(T, NewOpts) + end; +scanner_options([], Opts) -> + Opts. + +keyreplace(X, Pos, [H|T]) when element(Pos, X) == element(Pos, H) -> + [X|T]; +keyreplace(X, Pos, [H|T]) -> + [H|keyreplace(X, Pos, T)]; +keyreplace(_, _Pos, []) -> + throw(false). + + diff --git a/lib/xmerl/src/xmerl_html.erl b/lib/xmerl/src/xmerl_html.erl new file mode 100644 index 0000000000..38fdeb2205 --- /dev/null +++ b/lib/xmerl/src/xmerl_html.erl @@ -0,0 +1,122 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%%% Description : Callback module for exporting XHTML to HTML. + +-module(xmerl_html). + +-export(['#xml-inheritance#'/0]). + +%% Note: we assume XHTML data, so all tags are lowercase! + +-export(['#root#'/4, + '#element#'/5, + '#text#'/1, + p/4]). + +-import(xmerl_lib, [start_tag/2, end_tag/1, is_empty_data/1, + find_attribute/2, export_text/1]). + +-include("xmerl.hrl"). + + +'#xml-inheritance#'() -> []. + + +%% The '#text#' function is called for every text segment. + +'#text#'(Text) -> + export_text(Text). + + +%% The '#root#' tag is called when the entire structure has been +%% exported. It does not appear in the structure itself. + +'#root#'(Data, Attrs, [], _E) -> + Ver = case find_attribute(version, Attrs) of + {value, V} -> + V; + false -> +%%% "-//W3C//DTD HTML 4.01//EN" % strict + "-//W3C//DTD HTML 4.01 Transitional//EN" % loose + end, + URI = case find_attribute(uri, Attrs) of + {value, U} -> + [" \"", U, "\""]; + false -> +%%% " \"http://www.w3.org/TR/html4/strict.dtd\"" +%%% " \"http://www.w3.org/TR/html4/loose.dtd\"" + "" + end, + ["<!DOCTYPE HTML PUBLIC \"", Ver, "\"", URI, ">\n", Data]. + + +%% HTML does not have the <Tag/> empty-element form of XML/XHTML. +markup(Tag, Attrs, Data) -> + [start_tag(Tag, Attrs), Data, end_tag(Tag)]. + +%% Some HTML elements must not have an end tag at all. +markup_noend(Tag, Attrs, Data) -> + [start_tag(Tag, Attrs), Data]. + + +%% For some element types, the end tag is forbidden. (In all other +%% cases, we always generate the end tag, to make sure that the scope of +%% a markup is not extended by mistake.) + +'#element#'(Tag, Data, Attrs, _Parents, _E) -> + case forbid_end(Tag) of + false -> + markup(Tag, Attrs, Data); + true -> + markup_noend(Tag, Attrs, Data) + end. + + +%% HTML tags with special handling + +p(Data, Attrs, _Parents, _E) -> + %% In general, we cannot drop the end tag for paragraph elements; + %% that is only allowed if we know that it is immediately followed + %% by some other block-level tag. + case is_empty_data(Data) of + true -> + %% Paragraph elements should never be completely empty. + markup(p, Attrs, "\s"); + false -> + markup(p, Attrs, Data) + end. + + +%% Utility functions + +forbid_end(area) -> true; +forbid_end(base) -> true; +forbid_end(basefont) -> true; +forbid_end(br) -> true; +forbid_end(col) -> true; +forbid_end(frame) -> true; +forbid_end(hr) -> true; +forbid_end(img) -> true; +forbid_end(input) -> true; +forbid_end(isindex) -> true; +forbid_end(link) -> true; +forbid_end(meta) -> true; +forbid_end(param) -> true; +forbid_end(_) -> false. diff --git a/lib/xmerl/src/xmerl_internal.hrl b/lib/xmerl/src/xmerl_internal.hrl new file mode 100644 index 0000000000..85df757138 --- /dev/null +++ b/lib/xmerl/src/xmerl_internal.hrl @@ -0,0 +1,46 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% +%% Contributor(s): +%% <[email protected]>: suggested #xmlDocument{} +%% +%%-------------------------------------------------------------------- +%% File : xmerl_internal.hrl +%% Description : Internal definitions +%% +%% Created : 11 Sep 2009 +%%-------------------------------------------------------------------- + +%%-------------------------------------------------------------------- +%% Macros +%%-------------------------------------------------------------------- +% -define(debug,1). + +-ifdef(debug). +-define(dbg(Fmt, Args), ok=io:format("~p: " ++ Fmt, [?LINE|Args])). +-define(DBG, ok=io:format("<<~p:~p>>~n", [?MODULE, ?LINE])). +-else. +-define(dbg(Fmt, Args), no_debug). +-define(DBG, no_debug). +-endif. + + +%%-------------------------------------------------------------------- +%% Records +%%-------------------------------------------------------------------- + diff --git a/lib/xmerl/src/xmerl_lib.erl b/lib/xmerl/src/xmerl_lib.erl new file mode 100644 index 0000000000..7b76a76a33 --- /dev/null +++ b/lib/xmerl/src/xmerl_lib.erl @@ -0,0 +1,1065 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%%% Description : Utility module for handling XML trees. +%%%---------------------------------------------------------------------- + +-module(xmerl_lib). + +-export([normalize_content/1, normalize_content/3, expand_content/1, + expand_content/3, normalize_element/1, normalize_element/3, + expand_element/1, expand_element/3, expand_attributes/1, + expand_attributes/3, export_text/1, flatten_text/1, + export_attribute/1, markup/2, markup/3, simplify_element/1, + simplify_content/1, start_tag/1, start_tag/2, end_tag/1, + empty_tag/1, empty_tag/2,is_empty_data/1, find_attribute/2, + remove_whitespace/1,to_lower/1]). + +-export([is_letter/1,is_namechar/1,is_ncname/1, + detect_charset/1,detect_charset/2,is_name/1,is_char/1]). + + +-export([mapxml/2, foldxml/3, mapfoldxml/3]). + +%% exports for XSD +-export([is_facet/1,is_builtin_simple_type/1,is_xsd_string/1]). + +-include("xmerl.hrl"). +-include("xmerl_xsd.hrl"). + + +%% Escape special characters `<' and `&', flattening the text. +%% Also escapes `>', just for symmetry. + +export_text(T) -> + export_text(T, []). + +export_text([$< | T], Cont) -> + "<" ++ export_text(T, Cont); +export_text([$> | T], Cont) -> + ">" ++ export_text(T, Cont); +export_text([$& | T], Cont) -> + "&" ++ export_text(T, Cont); +export_text([C | T], Cont) when is_integer(C) -> + [C | export_text(T, Cont)]; +export_text([T | T1], Cont) -> + export_text(T, [T1 | Cont]); +export_text([], [T | Cont]) -> + export_text(T, Cont); +export_text([], []) -> + []; +export_text(Bin, Cont) -> + export_text(binary_to_list(Bin), Cont). + + +%% Only flatten text. + +flatten_text(T) -> + flatten_text(T, []). + +flatten_text([C | T], Cont) when is_integer(C) -> + [C | flatten_text(T, Cont)]; +flatten_text([T | T1], Cont) -> + flatten_text(T, [T1 | Cont]); +flatten_text([], [T | Cont]) -> + flatten_text(T, Cont); +flatten_text([], []) -> + []; +flatten_text(Bin, Cont) -> + flatten_text(binary_to_list(Bin), Cont). + +%% Convert attribute value to a flat string, escaping characters `"', +%% `<' and `&'. (Note that single-quote characters are not escaped; the +%% markup-generating functions (`start_tag', `end_tag', ...) always use +%% `"' to delimit the attribute values.) + +export_attribute(I) when is_integer(I) -> + integer_to_list(I); +export_attribute(A) when is_atom(A) -> + export_attribute(atom_to_list(A), []); +export_attribute(S) -> + export_attribute(S, []). + +export_attribute([$< | T], Cont) -> + "<" ++ export_attribute(T, Cont); +export_attribute([$& | T], Cont) -> + "&" ++ export_attribute(T, Cont); +export_attribute([$" | T], Cont) -> + """ ++ export_attribute(T, Cont); +export_attribute([C | T], Cont) when is_integer(C) -> + [C | export_attribute(T, Cont)]; +export_attribute([T | T1], Cont) -> + export_attribute(T, [T1 | Cont]); +export_attribute([], [T | Cont]) -> + export_attribute(T, Cont); +export_attribute([], []) -> + []; +export_attribute(Bin, Cont) -> + export_attribute(binary_to_list(Bin), Cont). + + +%% SimpleContent: [SimpleElement] +%% SimpleElement: #xml...{} | String | {atom(), [Attr], SimpleContent} +%% | {atom(), SimpleContent} | atom() +%% Attr: {atom(), Value} | #xmlAttribute{} +%% Value: atom() | integer() | String +%% String: [char() | binary() | String] +%% +%% Because strings can be deep, we do not allow content lists to also be +%% deep; otherwise, traversal of the simple representation becomes too +%% complicated and expensive. Simple content lists are thus flat lists +%% of simple elements. + +%% TODO: namespace-qualified tags in simple-form? /RC + +%% 'normalize' is like 'expand', but also turns all text elements into +%% flat strings. + +normalize_element(Element) -> + normalize_element(Element, 1, []). + +normalize_element(Element, Pos, Parents) -> + expand_element(Element, Pos, Parents, true). + +%% 'expand' expands simple-form elements to normal XML elements. +%% All attribute values (also in #xmlAttribute records) become flat +%% strings, so that string comparisons can be made. Text elements are +%% not flattened. + +expand_element(Element) -> + expand_element(Element, 1, []). + +expand_element(Element, Pos, Parents) -> + expand_element(Element, Pos, Parents, false). + +expand_element(E = #xmlElement{}, Pos, Parents, Norm) -> + Content = expand_content(E#xmlElement.content, 1, Parents, Norm), + Attrs = expand_attributes(E#xmlElement.attributes, 1, []), + E#xmlElement{pos = Pos, + parents = Parents, + attributes = Attrs, + content = Content}; +expand_element(E = #xmlText{}, Pos, Parents, Norm) -> + E#xmlText{pos = Pos, + parents = Parents, + value = expand_text(E#xmlText.value, Norm)}; +expand_element(E = #xmlPI{}, Pos, _Parents, Norm) -> + E#xmlPI{pos = Pos, + value = expand_text(E#xmlPI.value, Norm)}; +expand_element(E = #xmlComment{}, Pos, Parents, Norm) -> + E#xmlComment{pos = Pos, + parents = Parents, + value = expand_text(E#xmlComment.value, Norm)}; +expand_element(E = #xmlDecl{}, _Pos, _Parents, _Norm) -> + Attrs = expand_attributes(E#xmlDecl.attributes, 1, []), + E#xmlDecl{attributes = Attrs}; +expand_element({Tag, Attrs, Content}, Pos, Parents, Norm) when is_atom(Tag) -> + NewParents = [{Tag, Pos} | Parents], + #xmlElement{name = Tag, + pos = Pos, + parents = Parents, + attributes = expand_attributes(Attrs, 1, NewParents), + content = expand_content(Content, 1, NewParents, Norm)}; +expand_element({Tag, Content}, Pos, Parents, Norm) when is_atom(Tag) -> + NewParents = [{Tag, Pos} | Parents], + #xmlElement{name = Tag, + pos = Pos, + parents = Parents, + attributes = [], + content = expand_content(Content, 1, NewParents, Norm)}; +expand_element(Tag, Pos, Parents, _Norm) when is_atom(Tag) -> + #xmlElement{name = Tag, + pos = Pos, + parents = Parents, + attributes = [], + content = []}; +expand_element(String, Pos, Parents, Norm) when is_list(String) -> + #xmlText{pos = Pos, + parents = Parents, + value = expand_text(String, Norm)}. + +expand_text(S, false) -> S; +expand_text(S, true) -> flatten_text(S). + +%% Content must be a flat list of elements. + +normalize_content(Content) -> + normalize_content(Content, 1, []). + +normalize_content(Content, Pos, Parents) -> + expand_content(Content, Pos, Parents, true). + +expand_content(Content) -> + expand_content(Content, 1, []). + +expand_content(Content, Pos, Parents) -> + expand_content(Content, Pos, Parents, false). + +expand_content([{H} | T], Pos, Parents, Norm) -> + expand_content(H ++ T, Pos, Parents, Norm); +expand_content([{F,S}|T], Pos, Parents, Norm) when is_function(F) -> + case F(S) of + done -> expand_content(T, Pos, Parents, Norm); + {C,S2} -> expand_content([{F,S2},C|T], Pos, Parents, Norm) + end; +expand_content([H | T], Pos, Parents, Norm) -> + [expand_element(H, Pos, Parents, Norm) + | expand_content(T, Pos+1, Parents, Norm)]; +expand_content([], _Pos, _Parents, _Norm) -> + []. + +expand_attributes(Attrs) -> + expand_attributes(Attrs, 1, []). + +%% Expanding always turns all attribute values into flat strings. + +expand_attributes([H = #xmlAttribute{} | T], Pos, Parents) -> + [H#xmlAttribute{pos = Pos, + value = expand_value(H#xmlAttribute.value)} + | expand_attributes(T, Pos+1, Parents)]; +expand_attributes([{P,S}|T], Pos, Parents) when is_function(P) -> + case P(S) of + done -> + expand_attributes(T, Pos, Parents); + {A,S2} -> + expand_attributes([{P,S2},A|T], Pos, Parents) + end; +expand_attributes([{K, V} | T], Pos, Parents) -> + [#xmlAttribute{name = K, + pos = Pos, + parents = Parents, + value = expand_value(V)} + | expand_attributes(T, Pos+1, Parents)]; +expand_attributes([], _Pos, _Parents) -> + []. + +expand_value(S) when is_atom(S) -> + atom_to_list(S); +expand_value(S) when is_integer(S) -> + integer_to_list(S); +expand_value(S) -> + flatten_text(S). + +%% We want simplification to yield a normal form, so we always generate +%% three-tuples for elements. PI, Comment and Decl elements are +%% discarded from content lists. Attribute values become flat +%% strings. Text elements are not flattened. + +simplify_element(#xmlElement{expanded_name = [], name = Tag, + attributes = Attrs, content = Content}) -> + {Tag, simplify_attributes(Attrs), simplify_content(Content)}; +simplify_element(#xmlElement{expanded_name = Name, + attributes = Attrs, content = Content}) -> + {Name, simplify_attributes(Attrs), simplify_content(Content)}; +simplify_element(#xmlText{value = Text}) -> + Text; +simplify_element({Tag, Attrs, Content}) when is_atom(Tag) -> + {Tag, simplify_attributes(Attrs), simplify_content(Content)}; +simplify_element({Tag, Content}) when is_atom(Tag) -> + {Tag, [], simplify_content(Content)}; +simplify_element(Tag) when is_atom(Tag) -> + {Tag, [], []}; +simplify_element(Text) when is_list(Text) -> + Text. + +simplify_content([#xmlPI{} | T]) -> + simplify_content(T); +simplify_content([#xmlComment{} | T]) -> + simplify_content(T); +simplify_content([#xmlDecl{} | T]) -> + simplify_content(T); +simplify_content([H | T]) -> + [simplify_element(H) | simplify_content(T)]; +simplify_content([]) -> + []. + +simplify_attributes([#xmlAttribute{name = K, value = V} | T]) + when is_atom(K) -> + [{K, expand_value(V)} | simplify_attributes(T)]; +simplify_attributes([H = {K, _} | T]) when is_atom(K) -> + [H | simplify_attributes(T)]; +simplify_attributes([]) -> + []. + +%% Looking up an attribute value + +find_attribute(Name, Attrs) -> + case lists:keysearch(Name, #xmlAttribute.name, Attrs) of + {value, #xmlAttribute{value = V}} -> + {value, V}; + false -> + false + end. + + +markup(Tag, Data) -> + markup(Tag, [], Data). + +markup(Tag, Attrs, []) -> + empty_tag(Tag, Attrs); +markup(Tag, Attrs, Data) -> + [start_tag(Tag, Attrs), Data, end_tag(Tag)]. + +start_tag(TagStr) -> + start_tag(TagStr, []). + +start_tag(Tag, Attrs) when is_atom(Tag) -> + start_tag(atom_to_list(Tag), Attrs); +start_tag(TagStr, []) -> + ["<", TagStr, ">"]; +start_tag(TagStr, Attrs) -> + ["<", TagStr, attributes(Attrs), ">"]. + +empty_tag(Tag) -> + empty_tag(Tag, []). + +empty_tag(Tag, Attrs) when is_atom(Tag) -> + empty_tag(atom_to_list(Tag), Attrs); +empty_tag(TagStr, []) -> + ["<", TagStr, "/>"]; +empty_tag(TagStr, Attrs) -> + ["<", TagStr, attributes(Attrs), "/>"]. + +end_tag(Tag) when is_atom(Tag) -> + end_tag(atom_to_list(Tag)); +end_tag(TagStr) -> + ["</", TagStr, ">"]. + +attributes(Attrs) -> + [attr_string(A) || A <- Attrs]. + +attr_string(#xmlAttribute{name = K, value = V}) -> + [" ", atom_to_list(K), "=\"", export_attribute(V), "\""]. + +is_empty_data([]) -> + true; +is_empty_data([X | Xs]) -> + case is_empty_data(X) of + false -> + false; + true -> + is_empty_data(Xs) + end; +is_empty_data(_) -> + false. + + +%% Removing normalised whitespace-only text segments. + +remove_whitespace([#xmlText{value = " "} | Data]) -> + remove_whitespace(Data); +remove_whitespace([E = #xmlElement{content = Content} | Data]) -> + [E#xmlElement{content = remove_whitespace(Content)} + | remove_whitespace(Data)]; +remove_whitespace([Other | Data]) -> + [Other | remove_whitespace(Data)]; +remove_whitespace([]) -> + []. + + +%%% ---------------------------------------------------------------------------- +%%% funs traversing the xmerl tree left-right and top-down + +%% mapxml +%% Fun is fun(Old#xmlElement) -> New#xmlElement +mapxml(Fun, #xmlElement{}= E) -> + C1 = Fun(E), + C2 = mapxml(Fun,lists:flatten(C1#xmlElement.content)), + C1#xmlElement{content=C2}; +mapxml(Fun, List) when is_list(List) -> + AFun = fun(E) -> mapxml(Fun, E) end, + lists:map(AFun, List); +mapxml(Fun, E) -> + Fun(E). + + +%% foldxml +%% Fun is fun(#xmlElement, OldAccu) -> NewAccu +foldxml(Fun, Accu0, #xmlElement{content=C}=E) -> + Accu1 = Fun(E, Accu0), + foldxml(Fun, Accu1, C); +foldxml(Fun, Accu, List) when is_list(List) -> + AFun = fun(E,A) -> foldxml(Fun, A, E) end, + lists:foldl(AFun, Accu, List); +foldxml(Fun, Accu, E) -> + Fun(E, Accu). + + +%% mapfoldxml +%% Fun is fun(Old#xmlElement, OldAccu) -> {New#xmlElement, NewAccu} +mapfoldxml(Fun, Accu0, #xmlElement{}=E) -> + {C1,Accu1} = Fun(E, Accu0), + {C2,Accu2} = mapfoldxml(Fun, Accu1, lists:flatten(C1#xmlElement.content)), + {C1#xmlElement{content=C2},Accu2}; +mapfoldxml(Fun, Accu, List) when is_list(List) -> + AFun = fun(E,A) -> mapfoldxml(Fun, A, E) end, + lists:mapfoldl(AFun, Accu, List); +mapfoldxml(Fun, Accu, E) -> + Fun(E,Accu). + + +%%% @spec detect_charset(T::list()) -> charset_info() +%%% @equiv detect_charset(undefined,T) +detect_charset(Content) -> + detect_charset(undefined,Content). + +%%% FIXME! Whatabout aliases etc? Shouldn't transforming with ucs be optional? +%%% @spec detect_charset(ExtCharset::atom(),T::list()) -> charset_info() +%%% @doc Automatically decides character set used in XML document. +%%% charset_info() is +%%% <table> +%%% <tr><td><code>{auto,'iso-10646-utf-1',Content} |</code></td></tr> +%%% <tr><td><code>{external,'iso-10646-utf-1',Content} |</code></td></tr> +%%% <tr><td><code>{undefined,undefined,Content} |</code></td></tr> +%%% <tr><td><code>{external,ExtCharset,Content}</code></td></tr> +%%% </table> +%%% ExtCharset is any externally declared character set (e.g. in HTTP +%%% Content-Type header) and Content is an XML Document. +%%% +detect_charset(ExtCharset,Content) when is_list(ExtCharset) -> + %% FIXME! Don't allow both atom and list for character set names + detect_charset(list_to_atom(ExtCharset),Content); +detect_charset(ExtCharset,Content) -> + case autodetect(ExtCharset,Content) of + {auto,Content1} -> + {auto,'iso-10646-utf-1',Content1}; + {external,Content1} -> + {external,'iso-10646-utf-1',Content1}; + {undefined,_} -> + {undefined,undefined,Content}; + {ExtCharset, Content} -> + {external,ExtCharset,Content} + end. + +%%------------------------------------------------------------------------------ +%% Auto detect what kind of character set we are dealing with and transform +%% to Erlang integer Unicode format if found. +%% Appendix F, Page 56-57, XML 1.0 W3C Recommendation 6 October 2000 +%% (http://www.w3.org/TR/REC-xml) +%% 00 00 00 3C ( "<" in UCS-4 big-endian) +%% 3C 00 00 00 ( "<" in UCS-4 little-endian) +%% FE FF (UTF-16 - big-endian Mark) +%% FF FE (UTF-16 - little-endian Mark) +%% 00 3C 00 3F ( "<?" in UTF-16 big-endian) +%% 3C 00 3F 00 ( "<?" in UTF-16 big-endian) +%% 3C 3F (7-bit,8-bit or mixed width encoding) +%% 4C 6F A7 94 (EBCDIC) - Not Implemented!!!! + +%% Check byte-order mark and transform to Unicode, Erlang integer +%%% --- With byte-order mark +autodetect(undefined,[0,0,16#fe,16#ff | Input]) -> + {auto, xmerl_ucs:from_ucs4be(Input)}; +autodetect('iso-10646-utf-1',[0,0,16#fe,16#ff | Input]) -> + {external, xmerl_ucs:from_ucs4be(Input)}; +autodetect(undefined,[16#ff,16#fe,0,0 | Input]) -> + {auto, xmerl_ucs:from_ucs4le(Input)}; +autodetect('iso-10646-utf-1',[16#ff,16#fe,0,0 | Input]) -> + {external, xmerl_ucs:from_ucs4le(Input)}; + +autodetect(undefined,[16#fe,16#ff | Input]) -> + {auto, xmerl_ucs:from_utf16be(Input)}; +autodetect('utf-16be',[16#fe,16#ff | Input]) -> + {external, xmerl_ucs:from_utf16be(Input)}; +autodetect(undefined,[16#ff,16#fe | Input]) -> + {auto, xmerl_ucs:from_utf16le(Input)}; +autodetect('utf-16le',[16#ff,16#fe | Input]) -> + {external, xmerl_ucs:from_utf16le(Input)}; + +autodetect(undefined,[16#ef,16#bb,16#bf | Input]) -> + {auto, xmerl_ucs:from_utf8(Input)}; +autodetect('utf-8',[16#ef,16#bb,16#bf | Input]) -> + {external, xmerl_ucs:from_utf8(Input)}; +autodetect('utf-8',[16#ff,16#fe | Input]) -> + {external, xmerl_ucs:from_utf16le(Input)}; +autodetect('utf-8',[16#fe,16#ff | Input]) -> + {external, xmerl_ucs:from_utf16be(Input)}; + +%%% --- Without byte-order mark +autodetect(undefined,[0,0,0,16#3c|Input]) -> + {auto, xmerl_ucs:from_ucs4be([0,0,0,16#3c|Input])}; +autodetect('iso-10646-utf-1',[0,0,0,16#3c|Input]) -> + {external, xmerl_ucs:from_ucs4be([0,0,0,16#3c|Input])}; +autodetect(undefined,[16#3c,0,0,0|Input]) -> + {auto, xmerl_ucs:from_ucs4le([16#3c,0,0,0|Input])}; +autodetect('iso-10646-utf-1',[16#3c,0,0,0|Input]) -> + {external, xmerl_ucs:from_ucs4le([16#3c,0,0,0|Input])}; + +autodetect(undefined,[0,16#3c,0,16#3f | Input]) -> + {auto, xmerl_ucs:from_utf16be([0,16#3c,0,16#3f|Input])}; +autodetect('utf-16be',[0,16#3c,0,16#3f | Input]) -> + {external, xmerl_ucs:from_utf16be([0,16#3c,0,16#3f|Input])}; +autodetect(undefined,[16#3c,0,16#3f,0 | Input]) -> + {auto, xmerl_ucs:from_utf16le([16#3c,0,16#3f,0|Input])}; +autodetect('utf-16le',[16#3c,0,16#3f,0 | Input]) -> + {external, xmerl_ucs:from_utf16le([16#3c,0,16#3f,0|Input])}; + +autodetect(ExtCharset,Content) -> + {ExtCharset, Content}. + + +is_ncname(A) when is_atom(A) -> + is_ncname(atom_to_list(A)); +is_ncname([$_|T]) -> + is_name1(T); +is_ncname([H|T]) -> + case is_letter(H) of + true -> + is_name1(T); + _ -> false + end. + +is_name(A) when is_atom(A) -> + is_name(atom_to_list(A)); +is_name([$_|T]) -> + is_name1(T); +is_name([$:|T]) -> + is_name1(T); +is_name([H|T]) -> + case is_letter(H) of + true -> + is_name1(T); + _ -> false + end. + +is_name1([]) -> + true; +is_name1([H|T]) -> + case is_namechar(H) of + true -> + is_name1(T); + _ -> false + end. + + + +% ======= +%%% UNICODE character definitions + +%%%%%%%% [2] Char + +is_char(16#09) -> true; +is_char(16#0A) -> true; +is_char(16#0D) -> true; +is_char(X) when X >= 16#20, X =< 16#D7FF -> true; +is_char(X) when X >= 16#E000, X =< 16#FFFD -> true; +is_char(X) when X >= 16#10000, X =< 16#10FFFF -> true; +is_char(_) -> false. + +%% 0 - not classified, +%% 1 - base_char or ideographic, +%% 2 - combining_char or digit or extender, +%% 3 - $. or $- or $_ or $: +-define(SMALL, {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,0,2,2,2,2,2,2,2,2,2,2,3,0, + 0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,0,0,0,0,3,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,2,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1}). + +%% [4] NameChar +is_namechar(X) -> + try element(X, ?SMALL) > 0 + catch _:_ -> + case is_letter(X) of + true -> true; + false -> + case is_digit(X) of + true -> true; + false -> + case is_combining_char(X) of + true -> true; + false -> + is_extender(X) + end + end + end + end. + +%% [84] Letter +is_letter(X) -> + try element(X, ?SMALL) =:= 1 + catch _:_ -> + case is_base_char(X) of + false -> + is_ideographic(X); + true -> + true + end + end. + +%% [85] BaseChar +is_base_char(X) when X >= 16#0041, X =< 16#005A -> true; +is_base_char(X) when X >= 16#0061, X =< 16#007A -> true; +is_base_char(X) when X >= 16#00C0, X =< 16#00D6 -> true; +is_base_char(X) when X >= 16#00D8, X =< 16#00F6 -> true; +is_base_char(X) when X >= 16#00F8, X =< 16#00FF -> true; +is_base_char(X) when X >= 16#0100, X =< 16#0131 -> true; +is_base_char(X) when X >= 16#0134, X =< 16#013E -> true; +is_base_char(X) when X >= 16#0141, X =< 16#0148 -> true; +is_base_char(X) when X >= 16#014A, X =< 16#017E -> true; +is_base_char(X) when X >= 16#0180, X =< 16#01C3 -> true; +is_base_char(X) when X >= 16#01CD, X =< 16#01F0 -> true; +is_base_char(X) when X >= 16#01F4, X =< 16#01F5 -> true; +is_base_char(X) when X >= 16#01FA, X =< 16#0217 -> true; +is_base_char(X) when X >= 16#0250, X =< 16#02A8 -> true; +is_base_char(X) when X >= 16#02BB, X =< 16#02C1 -> true; +is_base_char(16#0386) -> true; +is_base_char(X) when X >= 16#0388, X =< 16#038A -> true; +is_base_char(16#038C) -> true; +is_base_char(X) when X >= 16#038E, X =< 16#03A1 -> true; +is_base_char(X) when X >= 16#03A3, X =< 16#03CE -> true; +is_base_char(X) when X >= 16#03D0, X =< 16#03D6 -> true; +is_base_char(16#03DA) -> true; +is_base_char(16#03DC) -> true; +is_base_char(16#03DE) -> true; +is_base_char(16#03E0) -> true; +is_base_char(X) when X >= 16#03E2, X =< 16#03F3 -> true; +is_base_char(X) when X >= 16#0401, X =< 16#040C -> true; +is_base_char(X) when X >= 16#040E, X =< 16#044F -> true; +is_base_char(X) when X >= 16#0451, X =< 16#045C -> true; +is_base_char(X) when X >= 16#045E, X =< 16#0481 -> true; +is_base_char(X) when X >= 16#0490, X =< 16#04C4 -> true; +is_base_char(X) when X >= 16#04C7, X =< 16#04C8 -> true; +is_base_char(X) when X >= 16#04CB, X =< 16#04CC -> true; +is_base_char(X) when X >= 16#04D0, X =< 16#04EB -> true; +is_base_char(X) when X >= 16#04EE, X =< 16#04F5 -> true; +is_base_char(X) when X >= 16#04F8, X =< 16#04F9 -> true; +is_base_char(X) when X >= 16#0531, X =< 16#0556 -> true; +is_base_char(16#0559) -> true; +is_base_char(X) when X >= 16#0561, X =< 16#0586 -> true; +is_base_char(X) when X >= 16#05D0, X =< 16#05EA -> true; +is_base_char(X) when X >= 16#05F0, X =< 16#05F2 -> true; +is_base_char(X) when X >= 16#0621, X =< 16#063A -> true; +is_base_char(X) when X >= 16#0641, X =< 16#064A -> true; +is_base_char(X) when X >= 16#0671, X =< 16#06B7 -> true; +is_base_char(X) when X >= 16#06BA, X =< 16#06BE -> true; +is_base_char(X) when X >= 16#06C0, X =< 16#06CE -> true; +is_base_char(X) when X >= 16#06D0, X =< 16#06D3 -> true; +is_base_char(16#06D5) -> true; +is_base_char(X) when X >= 16#06E5, X =< 16#06E6 -> true; +is_base_char(X) when X >= 16#0905, X =< 16#0939 -> true; +is_base_char(16#093D) -> true; +is_base_char(X) when X >= 16#0958, X =< 16#0961 -> true; +is_base_char(X) when X >= 16#0985, X =< 16#098C -> true; +is_base_char(X) when X >= 16#098F, X =< 16#0990 -> true; +is_base_char(X) when X >= 16#0993, X =< 16#09A8 -> true; +is_base_char(X) when X >= 16#09AA, X =< 16#09B0 -> true; +is_base_char(16#09B2) -> true; +is_base_char(X) when X >= 16#09B6, X =< 16#09B9 -> true; +is_base_char(X) when X >= 16#09DC, X =< 16#09DD -> true; +is_base_char(X) when X >= 16#09DF, X =< 16#09E1 -> true; +is_base_char(X) when X >= 16#09F0, X =< 16#09F1 -> true; +is_base_char(X) when X >= 16#0A05, X =< 16#0A0A -> true; +is_base_char(X) when X >= 16#0A0F, X =< 16#0A10 -> true; +is_base_char(X) when X >= 16#0A13, X =< 16#0A28 -> true; +is_base_char(X) when X >= 16#0A2A, X =< 16#0A30 -> true; +is_base_char(X) when X >= 16#0A32, X =< 16#0A33 -> true; +is_base_char(X) when X >= 16#0A35, X =< 16#0A36 -> true; +is_base_char(X) when X >= 16#0A38, X =< 16#0A39 -> true; +is_base_char(X) when X >= 16#0A59, X =< 16#0A5C -> true; +is_base_char(16#0A5E) -> true; +is_base_char(X) when X >= 16#0A72, X =< 16#0A74 -> true; +is_base_char(X) when X >= 16#0A85, X =< 16#0A8B -> true; +is_base_char(16#0A8D) -> true; +is_base_char(X) when X >= 16#0A8F, X =< 16#0A91 -> true; +is_base_char(X) when X >= 16#0A93, X =< 16#0AA8 -> true; +is_base_char(X) when X >= 16#0AAA, X =< 16#0AB0 -> true; +is_base_char(X) when X >= 16#0AB2, X =< 16#0AB3 -> true; +is_base_char(X) when X >= 16#0AB5, X =< 16#0AB9 -> true; +is_base_char(16#0ABD) -> true; +is_base_char(16#0AE0) -> true; +is_base_char(X) when X >= 16#0B05, X =< 16#0B0C -> true; +is_base_char(X) when X >= 16#0B0F, X =< 16#0B10 -> true; +is_base_char(X) when X >= 16#0B13, X =< 16#0B28 -> true; +is_base_char(X) when X >= 16#0B2A, X =< 16#0B30 -> true; +is_base_char(X) when X >= 16#0B32, X =< 16#0B33 -> true; +is_base_char(X) when X >= 16#0B36, X =< 16#0B39 -> true; +is_base_char(16#0B3D) -> true; +is_base_char(X) when X >= 16#0B5C, X =< 16#0B5D -> true; +is_base_char(X) when X >= 16#0B5F, X =< 16#0B61 -> true; +is_base_char(X) when X >= 16#0B85, X =< 16#0B8A -> true; +is_base_char(X) when X >= 16#0B8E, X =< 16#0B90 -> true; +is_base_char(X) when X >= 16#0B92, X =< 16#0B95 -> true; +is_base_char(X) when X >= 16#0B99, X =< 16#0B9A -> true; +is_base_char(16#0B9C) -> true; +is_base_char(X) when X >= 16#0B9E, X =< 16#0B9F -> true; +is_base_char(X) when X >= 16#0BA3, X =< 16#0BA4 -> true; +is_base_char(X) when X >= 16#0BA8, X =< 16#0BAA -> true; +is_base_char(X) when X >= 16#0BAE, X =< 16#0BB5 -> true; +is_base_char(X) when X >= 16#0BB7, X =< 16#0BB9 -> true; +is_base_char(X) when X >= 16#0C05, X =< 16#0C0C -> true; +is_base_char(X) when X >= 16#0C0E, X =< 16#0C10 -> true; +is_base_char(X) when X >= 16#0C12, X =< 16#0C28 -> true; +is_base_char(X) when X >= 16#0C2A, X =< 16#0C33 -> true; +is_base_char(X) when X >= 16#0C35, X =< 16#0C39 -> true; +is_base_char(X) when X >= 16#0C60, X =< 16#0C61 -> true; +is_base_char(X) when X >= 16#0C85, X =< 16#0C8C -> true; +is_base_char(X) when X >= 16#0C8E, X =< 16#0C90 -> true; +is_base_char(X) when X >= 16#0C92, X =< 16#0CA8 -> true; +is_base_char(X) when X >= 16#0CAA, X =< 16#0CB3 -> true; +is_base_char(X) when X >= 16#0CB5, X =< 16#0CB9 -> true; +is_base_char(16#0CDE) -> true; +is_base_char(X) when X >= 16#0CE0, X =< 16#0CE1 -> true; +is_base_char(X) when X >= 16#0D05, X =< 16#0D0C -> true; +is_base_char(X) when X >= 16#0D0E, X =< 16#0D10 -> true; +is_base_char(X) when X >= 16#0D12, X =< 16#0D28 -> true; +is_base_char(X) when X >= 16#0D2A, X =< 16#0D39 -> true; +is_base_char(X) when X >= 16#0D60, X =< 16#0D61 -> true; +is_base_char(X) when X >= 16#0E01, X =< 16#0E2E -> true; +is_base_char(16#0E30) -> true; +is_base_char(X) when X >= 16#0E32, X =< 16#0E33 -> true; +is_base_char(X) when X >= 16#0E40, X =< 16#0E45 -> true; +is_base_char(X) when X >= 16#0E81, X =< 16#0E82 -> true; +is_base_char(16#0E84) -> true; +is_base_char(X) when X >= 16#0E87, X =< 16#0E88 -> true; +is_base_char(16#0E8A) -> true; +is_base_char(16#0E8D) -> true; +is_base_char(X) when X >= 16#0E94, X =< 16#0E97 -> true; +is_base_char(X) when X >= 16#0E99, X =< 16#0E9F -> true; +is_base_char(X) when X >= 16#0EA1, X =< 16#0EA3 -> true; +is_base_char(16#0EA5) -> true; +is_base_char(16#0EA7) -> true; +is_base_char(X) when X >= 16#0EAA, X =< 16#0EAB -> true; +is_base_char(X) when X >= 16#0EAD, X =< 16#0EAE -> true; +is_base_char(16#0EB0) -> true; +is_base_char(X) when X >= 16#0EB2, X =< 16#0EB3 -> true; +is_base_char(16#0EBD) -> true; +is_base_char(X) when X >= 16#0EC0, X =< 16#0EC4 -> true; +is_base_char(X) when X >= 16#0F40, X =< 16#0F47 -> true; +is_base_char(X) when X >= 16#0F49, X =< 16#0F69 -> true; +is_base_char(X) when X >= 16#10A0, X =< 16#10C5 -> true; +is_base_char(X) when X >= 16#10D0, X =< 16#10F6 -> true; +is_base_char(16#1100) -> true; +is_base_char(X) when X >= 16#1102, X =< 16#1103 -> true; +is_base_char(X) when X >= 16#1105, X =< 16#1107 -> true; +is_base_char(16#1109) -> true; +is_base_char(X) when X >= 16#110B, X =< 16#110C -> true; +is_base_char(X) when X >= 16#110E, X =< 16#1112 -> true; +is_base_char(16#113C) -> true; +is_base_char(16#113E) -> true; +is_base_char(16#1140) -> true; +is_base_char(16#114C) -> true; +is_base_char(16#114E) -> true; +is_base_char(16#1150) -> true; +is_base_char(X) when X >= 16#1154, X =< 16#1155 -> true; +is_base_char(16#1159) -> true; +is_base_char(X) when X >= 16#115F, X =< 16#1161 -> true; +is_base_char(16#1163) -> true; +is_base_char(16#1165) -> true; +is_base_char(16#1167) -> true; +is_base_char(16#1169) -> true; +is_base_char(X) when X >= 16#116D, X =< 16#116E -> true; +is_base_char(X) when X >= 16#1172, X =< 16#1173 -> true; +is_base_char(16#1175) -> true; +is_base_char(16#119E) -> true; +is_base_char(16#11A8) -> true; +is_base_char(16#11AB) -> true; +is_base_char(X) when X >= 16#11AE, X =< 16#11AF -> true; +is_base_char(X) when X >= 16#11B7, X =< 16#11B8 -> true; +is_base_char(16#11BA) -> true; +is_base_char(X) when X >= 16#11BC, X =< 16#11C2 -> true; +is_base_char(16#11EB) -> true; +is_base_char(16#11F0) -> true; +is_base_char(16#11F9) -> true; +is_base_char(X) when X >= 16#1E00, X =< 16#1E9B -> true; +is_base_char(X) when X >= 16#1EA0, X =< 16#1EF9 -> true; +is_base_char(X) when X >= 16#1F00, X =< 16#1F15 -> true; +is_base_char(X) when X >= 16#1F18, X =< 16#1F1D -> true; +is_base_char(X) when X >= 16#1F20, X =< 16#1F45 -> true; +is_base_char(X) when X >= 16#1F48, X =< 16#1F4D -> true; +is_base_char(X) when X >= 16#1F50, X =< 16#1F57 -> true; +is_base_char(16#1F59) -> true; +is_base_char(16#1F5B) -> true; +is_base_char(16#1F5D) -> true; +is_base_char(X) when X >= 16#1F5F, X =< 16#1F7D -> true; +is_base_char(X) when X >= 16#1F80, X =< 16#1FB4 -> true; +is_base_char(X) when X >= 16#1FB6, X =< 16#1FBC -> true; +is_base_char(16#1FBE) -> true; +is_base_char(X) when X >= 16#1FC2, X =< 16#1FC4 -> true; +is_base_char(X) when X >= 16#1FC6, X =< 16#1FCC -> true; +is_base_char(X) when X >= 16#1FD0, X =< 16#1FD3 -> true; +is_base_char(X) when X >= 16#1FD6, X =< 16#1FDB -> true; +is_base_char(X) when X >= 16#1FE0, X =< 16#1FEC -> true; +is_base_char(X) when X >= 16#1FF2, X =< 16#1FF4 -> true; +is_base_char(X) when X >= 16#1FF6, X =< 16#1FFC -> true; +is_base_char(16#2126) -> true; +is_base_char(X) when X >= 16#212A, X =< 16#212B -> true; +is_base_char(16#212E) -> true; +is_base_char(X) when X >= 16#2180, X =< 16#2182 -> true; +is_base_char(X) when X >= 16#3041, X =< 16#3094 -> true; +is_base_char(X) when X >= 16#30A1, X =< 16#30FA -> true; +is_base_char(X) when X >= 16#3105, X =< 16#312C -> true; +is_base_char(X) when X >= 16#ac00, X =< 16#d7a3 -> true; +is_base_char(_) -> + false. + +%% [86] Ideographic +is_ideographic(X) when X >= 16#4e00, X =< 16#9fa5 -> true; +is_ideographic(16#3007) -> true; +is_ideographic(X) when X >= 16#3021, X =< 16#3029 -> true; +is_ideographic(_) -> + false. + +%% [87] CombiningChar +is_combining_char(X) when X >= 16#0300, X =< 16#0345 -> true; +is_combining_char(X) when X >= 16#0360, X =< 16#0361 -> true; +is_combining_char(X) when X >= 16#0483, X =< 16#0486 -> true; +is_combining_char(X) when X >= 16#0591, X =< 16#05a1 -> true; +is_combining_char(X) when X >= 16#05a3, X =< 16#05b9 -> true; +is_combining_char(X) when X >= 16#05bb, X =< 16#05bd -> true; +is_combining_char(16#05bf) -> true; +is_combining_char(X) when X >= 16#05c1, X =< 16#05c2 -> true; +is_combining_char(16#05c4) -> true; +is_combining_char(X) when X >= 16#064b, X =< 16#0652 -> true; +is_combining_char(16#0670) -> true; +is_combining_char(X) when X >= 16#06d6, X =< 16#06dc -> true; +is_combining_char(X) when X >= 16#06dd, X =< 16#06df -> true; +is_combining_char(X) when X >= 16#06e0, X =< 16#06e4 -> true; +is_combining_char(X) when X >= 16#06e7, X =< 16#06e8 -> true; +is_combining_char(X) when X >= 16#06ea, X =< 16#06ed -> true; +is_combining_char(X) when X >= 16#0901, X =< 16#0903 -> true; +is_combining_char(16#093c) -> true; +is_combining_char(X) when X >= 16#093e, X =< 16#094c -> true; +is_combining_char(16#094d) -> true; +is_combining_char(X) when X >= 16#0951, X =< 16#0954 -> true; +is_combining_char(X) when X >= 16#0962, X =< 16#0963 -> true; +is_combining_char(X) when X >= 16#0981, X =< 16#0983 -> true; +is_combining_char(16#09bc) -> true; +is_combining_char(16#09be) -> true; +is_combining_char(16#09bf) -> true; +is_combining_char(X) when X >= 16#09c0, X =< 16#09c4 -> true; +is_combining_char(X) when X >= 16#09c7, X =< 16#09c8 -> true; +is_combining_char(X) when X >= 16#09cb, X =< 16#09cd -> true; +is_combining_char(16#09d7) -> true; +is_combining_char(X) when X >= 16#09e2, X =< 16#09e3 -> true; +is_combining_char(16#0a02) -> true; +is_combining_char(16#0a3c) -> true; +is_combining_char(16#0a3e) -> true; +is_combining_char(16#0a3f) -> true; +is_combining_char(X) when X >= 16#0a40, X =< 16#0a42 -> true; +is_combining_char(X) when X >= 16#0a47, X =< 16#0a48 -> true; +is_combining_char(X) when X >= 16#0a4b, X =< 16#0a4d -> true; +is_combining_char(X) when X >= 16#0a70, X =< 16#0a71 -> true; +is_combining_char(X) when X >= 16#0a81, X =< 16#0a83 -> true; +is_combining_char(16#0abc) -> true; +is_combining_char(X) when X >= 16#0abe, X =< 16#0ac5 -> true; +is_combining_char(X) when X >= 16#0ac7, X =< 16#0ac9 -> true; +is_combining_char(X) when X >= 16#0acb, X =< 16#0acd -> true; +is_combining_char(X) when X >= 16#0b01, X =< 16#0b03 -> true; +is_combining_char(16#0b3c) -> true; +is_combining_char(X) when X >= 16#0b3e, X =< 16#0b43 -> true; +is_combining_char(X) when X >= 16#0b47, X =< 16#0b48 -> true; +is_combining_char(X) when X >= 16#0b4b, X =< 16#0b4d -> true; +is_combining_char(X) when X >= 16#0b56, X =< 16#0b57 -> true; +is_combining_char(X) when X >= 16#0b82, X =< 16#0b83 -> true; +is_combining_char(X) when X >= 16#0bbe, X =< 16#0bc2 -> true; +is_combining_char(X) when X >= 16#0bc6, X =< 16#0bc8 -> true; +is_combining_char(X) when X >= 16#0bca, X =< 16#0bcd -> true; +is_combining_char(16#0bd7) -> true; +is_combining_char(X) when X >= 16#0c01, X =< 16#0c03 -> true; +is_combining_char(X) when X >= 16#0c3e, X =< 16#0c44 -> true; +is_combining_char(X) when X >= 16#0c46, X =< 16#0c48 -> true; +is_combining_char(X) when X >= 16#0c4a, X =< 16#0c4d -> true; +is_combining_char(X) when X >= 16#0c55, X =< 16#0c56 -> true; +is_combining_char(X) when X >= 16#0c82, X =< 16#0c83 -> true; +is_combining_char(X) when X >= 16#0cbe, X =< 16#0cc4 -> true; +is_combining_char(X) when X >= 16#0cc6, X =< 16#0cc8 -> true; +is_combining_char(X) when X >= 16#0cca, X =< 16#0ccd -> true; +is_combining_char(X) when X >= 16#0cd5, X =< 16#0cd6 -> true; +is_combining_char(X) when X >= 16#0d02, X =< 16#0d03 -> true; +is_combining_char(X) when X >= 16#0d3e, X =< 16#0d43 -> true; +is_combining_char(X) when X >= 16#0d46, X =< 16#0d48 -> true; +is_combining_char(X) when X >= 16#0d4a, X =< 16#0d4d -> true; +is_combining_char(16#0d57) -> true; +is_combining_char(16#0e31) -> true; +is_combining_char(X) when X >= 16#0e34, X =< 16#0e3a -> true; +is_combining_char(X) when X >= 16#0e47, X =< 16#0e4e -> true; +is_combining_char(16#0eb1) -> true; +is_combining_char(X) when X >= 16#0eb4, X =< 16#0eb9 -> true; +is_combining_char(X) when X >= 16#0ebb, X =< 16#0ebc -> true; +is_combining_char(X) when X >= 16#0ec8, X =< 16#0ecd -> true; +is_combining_char(X) when X >= 16#0f18, X =< 16#0f19 -> true; +is_combining_char(16#0f35) -> true; +is_combining_char(16#0f37) -> true; +is_combining_char(16#0f39) -> true; +is_combining_char(16#0f3e) -> true; +is_combining_char(16#0f3f) -> true; +is_combining_char(X) when X >= 16#0f71, X =< 16#0f84 -> true; +is_combining_char(X) when X >= 16#0f86, X =< 16#0f8b -> true; +is_combining_char(X) when X >= 16#0f90, X =< 16#0f95 -> true; +is_combining_char(16#0f97) -> true; +is_combining_char(X) when X >= 16#0f99, X =< 16#0fad -> true; +is_combining_char(X) when X >= 16#0fb1, X =< 16#0fb7 -> true; +is_combining_char(16#0fb9) -> true; +is_combining_char(X) when X >= 16#20d0, X =< 16#20dc -> true; +is_combining_char(16#20e1) -> true; +is_combining_char(X) when X >= 16#302a, X =< 16#302f -> true; +is_combining_char(16#3099) -> true; +is_combining_char(16#309a) -> true; +is_combining_char(_) -> false. + +%% [88] Digit +is_digit(X) when X >= 16#0030, X =< 16#0039 -> true; +is_digit(X) when X >= 16#0660, X =< 16#0669 -> true; +is_digit(X) when X >= 16#06F0, X =< 16#06F9 -> true; +is_digit(X) when X >= 16#0966, X =< 16#096f -> true; +is_digit(X) when X >= 16#09e6, X =< 16#09ef -> true; +is_digit(X) when X >= 16#0a66, X =< 16#0a6f -> true; +is_digit(X) when X >= 16#0ae6, X =< 16#0aef -> true; +is_digit(X) when X >= 16#0b66, X =< 16#0b6f -> true; +is_digit(X) when X >= 16#0be7, X =< 16#0bef -> true; +is_digit(X) when X >= 16#0c66, X =< 16#0c6f -> true; +is_digit(X) when X >= 16#0ce6, X =< 16#0cef -> true; +is_digit(X) when X >= 16#0d66, X =< 16#0d6f -> true; +is_digit(X) when X >= 16#0e50, X =< 16#0e59 -> true; +is_digit(X) when X >= 16#0ed0, X =< 16#0ed9 -> true; +is_digit(X) when X >= 16#0f20, X =< 16#0f29 -> true; +is_digit(_) -> false. + +%% [89] Extender +is_extender(16#00b7) -> true; +is_extender(16#02d0) -> true; +is_extender(16#02d1) -> true; +is_extender(16#0387) -> true; +is_extender(16#0640) -> true; +is_extender(16#0e46) -> true; +is_extender(16#0ec6) -> true; +is_extender(16#3005) -> true; +is_extender(X) when X >= 16#3031, X =< 16#3035 -> true; +is_extender(X) when X >= 16#309d, X =< 16#309e -> true; +is_extender(X) when X >= 16#30fc, X =< 16#30fe -> true; +is_extender(_) -> false. + +to_lower(Str) -> + to_lower(Str, []). +to_lower([C|Cs], Acc) when C >= $A, C =< $Z -> + to_lower(Cs, [C+($a-$A)| Acc]); +to_lower([C|Cs], Acc) -> + to_lower(Cs, [C| Acc]); +to_lower([], Acc) -> + lists:reverse(Acc). + +%%% XSD helpers + +is_facet(length) -> true; +is_facet(minLength) -> true; +is_facet(maxLength) -> true; +is_facet(pattern) -> true; +is_facet(enumeration) -> true; +is_facet(whiteSpace) -> true; +is_facet(maxInclusive) -> true; +is_facet(maxExclusive) -> true; +is_facet(minInclusive) -> true; +is_facet(minExclusive) -> true; +is_facet(totalDigits) -> true; +is_facet(fractionDigits) -> true; +is_facet(_) -> false. + + +is_builtin_simple_type({Type,_,?XSD_NAMESPACE}) when is_atom(Type) -> + is_builtin_simple_type(atom_to_list(Type)); +is_builtin_simple_type({Type,_,?XSD_NAMESPACE}) -> + is_builtin_simple_type(Type); +is_builtin_simple_type({_,_,_}) -> + false; +is_builtin_simple_type("string") -> true; +is_builtin_simple_type("normalizedString") -> true; +is_builtin_simple_type("token") -> true; +is_builtin_simple_type("base64Binary") -> true; +is_builtin_simple_type("hexBinary") -> true; +is_builtin_simple_type("integer") -> true; +is_builtin_simple_type("positiveInteger") -> true; +is_builtin_simple_type("negativeInteger") -> true; +is_builtin_simple_type("nonNegativeInteger") -> true; +is_builtin_simple_type("nonPositiveInteger") -> true; +is_builtin_simple_type("long") -> true; +is_builtin_simple_type("unsignedLong") -> true; +is_builtin_simple_type("int") -> true; +is_builtin_simple_type("unsignedInt") -> true; +is_builtin_simple_type("short") -> true; +is_builtin_simple_type("unsignedShort") -> true; +is_builtin_simple_type("decimal") -> true; +is_builtin_simple_type("float") -> true; +is_builtin_simple_type("double") -> true; +is_builtin_simple_type("boolean") -> true; +is_builtin_simple_type("duration") -> true; +is_builtin_simple_type("dateTime") -> true; +is_builtin_simple_type("date") -> true; +is_builtin_simple_type("time") -> true; +is_builtin_simple_type("gYear") -> true; +is_builtin_simple_type("gYearMonth") -> true; +is_builtin_simple_type("gMonth") -> true; +is_builtin_simple_type("gMonthDay") -> true; +is_builtin_simple_type("gDay") -> true; +is_builtin_simple_type("Name") -> true; +is_builtin_simple_type("QName") -> true; +is_builtin_simple_type("NCName") -> true; +is_builtin_simple_type("anyURI") -> true; +is_builtin_simple_type("language") -> true; +is_builtin_simple_type("ID") -> true; +is_builtin_simple_type("IDREF") -> true; +is_builtin_simple_type("IDREFS") -> true; +is_builtin_simple_type("ENTITY") -> true; +is_builtin_simple_type("ENTITIES") ->true; +is_builtin_simple_type("NOTATION") -> true; +is_builtin_simple_type("NMTOKEN") -> true; +is_builtin_simple_type("NMTOKENS") -> true; +is_builtin_simple_type("byte") -> true; +is_builtin_simple_type("unsignedByte") -> true; +is_builtin_simple_type(_) -> false. + +is_xsd_string({Type,_,?XSD_NAMESPACE}) when is_atom(Type) -> + is_xsd_string(Type); +is_xsd_string({Type,_,?XSD_NAMESPACE}) -> + is_xsd_string(Type); +is_xsd_string({_,_,_}) -> + false; +is_xsd_string(Atom) when is_atom(Atom) -> + is_xsd_string(atom_to_list(Atom)); +is_xsd_string("string") -> + true; +is_xsd_string("normalizedString") -> + true; +is_xsd_string("token") -> + true; +is_xsd_string("language") -> + true; +is_xsd_string("Name") -> + true; +is_xsd_string("NMTOKEN") -> + true; +is_xsd_string("NMTOKENS") -> + true; +is_xsd_string("NCName") -> + true; +is_xsd_string("ID") -> + true; +is_xsd_string("IDREF") -> + true; +is_xsd_string("IDREFS") -> + true; +is_xsd_string("ENTITY") -> + true; +is_xsd_string("ENTITIES") -> + true; +is_xsd_string(_) -> + false. diff --git a/lib/xmerl/src/xmerl_otpsgml.erl b/lib/xmerl/src/xmerl_otpsgml.erl new file mode 100644 index 0000000000..38688e788f --- /dev/null +++ b/lib/xmerl/src/xmerl_otpsgml.erl @@ -0,0 +1,162 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2004-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%%% Description : Callback module for exporting XHTML to OTP-SGML. + +-module(xmerl_otpsgml). + +-export(['#xml-inheritance#'/0]). + +%% Note: we assume XML data, so all tags are lowercase! + +-export(['#root#'/4, + '#element#'/5, + '#text#'/1, + p/4]). + +-import(xmerl_lib, [markup/3, start_tag/2, is_empty_data/1, + export_text/1]). + +-include("xmerl.hrl"). + + +'#xml-inheritance#'() -> [xmerl_sgml]. + + +%% The '#text#' function is called for every text segment. + +'#text#'(Text) -> + export_text(Text). + + +%% The '#root#' tag is called when the entire structure has been +%% exported. It does not appear in the structure itself. + +'#root#'(Data, _Attrs, [], _E) -> + ["<!doctype erlref PUBLIC \"-//Stork//DTD erlref//EN\">\n",Data]. + + +%% Note that SGML does not have the <Tag/> empty-element form. +%% Furthermore, for some element types, the end tag is forbidden. (In +%% all other cases, we always generate the end tag, to make sure that +%% the scope of a markup is not extended by mistake.) + +'#element#'(Tag, Data, Attrs, _Parents, _E) -> +% io:format("parents:\n~p\n",[_Parents]), + case convert_tag(Tag,Attrs) of + {false,NewTag,NewAttrs} -> + markup(NewTag, NewAttrs, Data); + {true,NewTag,NewAttrs} -> + [start_tag(NewTag, NewAttrs), Data] + end. + + +%% HTML tags with special handling + +p(Data, Attrs, _Parents, _E) -> + %% In general, we cannot drop the end tag for paragraph elements; + %% that is only allowed if we know that it is immediately followed + %% by some other block-level tag. + case is_empty_data(Data) of + true -> + %% Paragraph elements should never be completely empty. + markup(p, Attrs, "\s"); + false -> + markup(p, Attrs, Data) + end. + + +%% Utility functions + +convert_tag(code,Attrs) -> convert_tag(c,Attrs); +convert_tag(strong,Attrs) -> convert_tag(em,Attrs); +convert_tag(b,Attrs) -> convert_tag(em,Attrs); +convert_tag(underline,Attrs) -> convert_tag(em,Attrs); % what is underline in sgml??? +convert_tag(dl,Attrs) -> convert_tag(taglist,Attrs); +convert_tag(dt,Attrs) -> convert_tag(tag,Attrs); +convert_tag(dd,Attrs) -> convert_tag(item,Attrs); +convert_tag(ul,Attrs) -> convert_tag(list,Attrs); +convert_tag(li,Attrs) -> convert_tag(item,Attrs); +convert_tag(tt,Attrs) -> convert_tag(c,Attrs); +%convert_tag(a, Attrs) -> convert_tag(seealso,convert_seealso_attrs(Attrs)); +convert_tag(a, Attrs) -> convert_tag(convert_aref(Attrs),convert_aref_attrs(convert_aref(Attrs),Attrs)); +convert_tag(Tag,Attrs) -> {forbid_end(Tag),Tag,Attrs}. + +convert_aref([#xmlAttribute{name = href, value = V}|_Rest]) -> + %% search if it is a html link, thus make it a 'url' ref otherwise + %% a 'seealso'. + case html_content(V) of + true -> + url; + _ -> + seealso + end; +convert_aref([#xmlAttribute{name = K}|Rest]) -> + io:format("Warning: ignoring attribute \'~p\' for tag \'a\'\n",[K]), + convert_aref(Rest). +convert_aref_attrs(url,Attrs) -> + Attrs; +convert_aref_attrs(SA,[#xmlAttribute{name = href, value = V}=A|Rest]) -> + [A#xmlAttribute{name=marker,value=V}|convert_aref_attrs(SA,Rest)]; +convert_aref_attrs(_,[])-> + []. +html_content([]) -> + false; +html_content([$.|Rest]) -> + case Rest of + "htm"++_EmaNfeR -> + true; + _ -> html_content(Rest) + end; +html_content([_H|T]) -> + html_content(T). + +% convert_seealso_attrs([#xmlAttribute{name = href, value = V} = A|Rest]) -> +% [A#xmlAttribute{name=marker,value=normalize_web_ref(V)}|convert_seealso_attrs(Rest)]; +% convert_seealso_attrs([#xmlAttribute{name = K}|Rest]) -> +% io:format("Warning: ignoring attribute \'~p\' for tag \'a\'\n",[K]), +% convert_seealso_attrs(Rest); +% convert_seealso_attrs([]) -> +% []. + +% normalize_web_ref(RefName) -> +% normalize_web_ref1(lists:reverse(RefName)). + +% normalize_web_ref1("lmth."++EmaNfeR) -> +% lists:reverse(EmaNfeR); +% normalize_web_ref1("mth"++EmaNfeR) -> +% lists:reverse(EmaNfeR); +% normalize_web_ref1(RefName) -> +% RefName. + +forbid_end(area) -> true; +forbid_end(base) -> true; +forbid_end(basefont) -> true; +forbid_end(br) -> true; +forbid_end(col) -> true; +forbid_end(frame) -> true; +forbid_end(hr) -> true; +forbid_end(img) -> true; +forbid_end(input) -> true; +forbid_end(isindex) -> true; +forbid_end(link) -> true; +forbid_end(marker) -> true; +forbid_end(meta) -> true; +forbid_end(param) -> true; +forbid_end(_) -> false. diff --git a/lib/xmerl/src/xmerl_regexp.erl b/lib/xmerl/src/xmerl_regexp.erl new file mode 100644 index 0000000000..0c53e6f34a --- /dev/null +++ b/lib/xmerl/src/xmerl_regexp.erl @@ -0,0 +1,1437 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2006-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% +-module(xmerl_regexp). + +%% This module provides a basic set of regular expression functions +%% for strings. The functions provided are taken from AWK. +%% +%% Note that we interpret the syntax tree of a regular expression +%% directly instead of converting it to an NFA and then interpreting +%% that. This method seems to go significantly faster. + +-export([sh_to_awk/1,parse/1,format_error/1,match/2,first_match/2,matches/2]). +-export([sub/3,gsub/3,split/2,sub_match/2,sub_first_match/2]). + +-export([make_nfa/1,make_dfa/1,make_dfa/2,compile/1]). + +-import(string, [substr/2,substr/3]). +-import(lists, [reverse/1,reverse/2,last/1,duplicate/2,seq/2]). +-import(lists, [member/2,keysearch/3,keysort/2,map/2,foldl/3]). +-import(ordsets, [is_element/2,add_element/2,union/2,subtract/2]). + +%%-compile([export_all]). + +-export([setup/1,compile_proc/2]). + +setup(RE0) -> + RE = setup(RE0, [$^]), + Pid = spawn(?MODULE,compile_proc,[self(),RE]), + receive + {ok,Result} -> + Result + after 2000 -> + exit(Pid,force), + parse(RE) + end. + %% compile(RE). +%%RE. +compile_proc(From,RE) -> + Res = compile(RE), + From ! {ok,Res}. + + +setup([$\\,$d|S],Acc) -> setup(S,"]9-0[" ++Acc); +setup([$\\,$D|S],Acc) -> setup(S,"]9-0^[" ++Acc); +setup([$\\,$s|S],Acc) -> setup(S,"]s\\t\\n\\r\\[" ++Acc); +setup([$\\,$S|S],Acc) -> setup(S,"]\\s\\t\\n\\r^[" ++Acc); +setup([$\\,$i|S],Acc) -> setup(S,"]z-aZ-A_:[" ++Acc); %% Only Latin-1 now +setup([$\\,$I|S],Acc) -> setup(S,"]z-aZ-A_:^[" ++Acc); +setup([$\\,$c|S],Acc) -> setup(S,"]9-0z-aZ-A_:."++[183]++"-[" ++Acc); +setup([$\\,$C|S],Acc) -> setup(S,"]9-0z-aZ-A_:."++[183]++"-^[" ++Acc); +%% fixme setup([$\\,$w|S]) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup([$\\,$W|S]) -> {{comp_class,"\s\t\n\r"},S}; +%% Letter, Any +%% fixme setup(["\\p{L}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{L}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Letter, Uppercase +%% fixme setup(["\\p{Lu}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Lu}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Letter, Lowercase +%% fixme setup(["\\p{Ll}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Ll}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Letter, Titlecase +%% fixme setup(["\\p{Lt}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Lt}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Letter, Modifier +%% fixme setup(["\\p{Lm}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Lm}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Letter, Other +%% fixme setup(["\\p{Lo}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Lo}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Mark, Any +%% fixme setup(["\\p{M}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{M}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Mark, Nonspacing +%% fixme setup(["\\p{Mn}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Mn}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Mark, Spacing Combining +%% fixme setup(["\\p{Mc}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Mc}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Mark, Enclosing +%% fixme setup(["\\p{Me}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Me}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Number, Any +%% fixme setup(["\\p{N}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{N}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Number, Decimal Digit +%% fixme setup(["\\p{Nd}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Nd}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Number, Letter +%% fixme setup(["\\p{Nl}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Nl}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Number, Other +%% fixme setup(["\\p{No}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{No}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Punctuation, Any +%% fixme setup(["\\p{P}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{P}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Punctuation, Connector +%% fixme setup(["\\p{Pc}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Pc}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Punctuation, Dash +%% fixme setup(["\\p{Pd}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Pd}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Punctuation, Open +%% fixme setup(["\\p{Ps}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Ps}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Punctuation, Close +%% fixme setup(["\\p{Pe}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Pe}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Punctuation, Initial quote (may behave like Ps or Pe, depending on usage) +%% fixme setup(["\\p{Pi}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Pi}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Punctuation, Final quote (may behave like Ps or Pe, depending on usage) +%% fixme setup(["\\p{Pf}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Pf}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Punctuation, Other +%% fixme setup(["\\p{Po}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Po}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Symbol, Any +%% fixme setup(["\\p{S}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{S}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Symbol, Math +%% fixme setup(["\\p{Sm}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Sm}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Symbol, Currency +%% fixme setup(["\\p{Sc}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Sc}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Symbol, Modifier +%% fixme setup(["\\p{Sk}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Sk}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Symbol, Other +%% fixme setup(["\\p{So}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{So}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Separator, Any +%% fixme setup(["\\p{Z}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Z}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Separator, Space +%% fixme setup(["\\p{Zs}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Zs}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Separator, Line +%% fixme setup(["\\p{Zl}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Zl}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Separator, Paragraph +%% fixme setup(["\\p{Zp}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Zp}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Other, Any +%% fixme setup(["\\p{C}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{C}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Other, Control +%% fixme setup(["\\p{Cc}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Cc}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Other, Format +%% fixme setup(["\\p{Cf}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Cf}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Other, Surrogate not supported by schema recommendation +%% fixme setup(["\\p{Cs}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Cs}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Other, Private Use +%% fixme setup(["\\p{Co}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Co}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Other, Not assigned (no characters in the file have this property) +%% fixme setup(["\\p{Cn}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Cn}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +setup([A|S], Acc) -> setup(S, [A|Acc]); +setup([],Acc) -> reverse([$$|Acc]). + +%% sh_to_awk(ShellRegExp) +%% Convert a sh style regexp into a full AWK one. The main difficulty is +%% getting character sets right as the conventions are different. + +sh_to_awk(Sh) -> "^(" ++ sh_to_awk_1(Sh). %Fix the beginning + +sh_to_awk_1([$*|Sh]) -> %This matches any string + ".*" ++ sh_to_awk_1(Sh); +sh_to_awk_1([$?|Sh]) -> %This matches any character + [$.|sh_to_awk_1(Sh)]; +sh_to_awk_1([$[,$^,$]|Sh]) -> %This takes careful handling + "\\^" ++ sh_to_awk_1(Sh); +%% Must move '^' to end. +sh_to_awk_1("[^" ++ Sh) -> [$[|sh_to_awk_2(Sh, true)]; +sh_to_awk_1("[!" ++ Sh) -> "[^" ++ sh_to_awk_2(Sh, false); +sh_to_awk_1([$[|Sh]) -> [$[|sh_to_awk_2(Sh, false)]; +sh_to_awk_1([C|Sh]) -> + %% Unspecialise everything else which is not an escape character. + case sh_special_char(C) of + true -> [$\\,C|sh_to_awk_1(Sh)]; + false -> [C|sh_to_awk_1(Sh)] + end; +sh_to_awk_1([]) -> ")$". %Fix the end + +sh_to_awk_2([$]|Sh], UpArrow) -> [$]|sh_to_awk_3(Sh, UpArrow)]; +sh_to_awk_2(Sh, UpArrow) -> sh_to_awk_3(Sh, UpArrow). + +sh_to_awk_3([$]|Sh], true) -> "^]" ++ sh_to_awk_1(Sh); +sh_to_awk_3([$]|Sh], false) -> [$]|sh_to_awk_1(Sh)]; +sh_to_awk_3([C|Sh], UpArrow) -> [C|sh_to_awk_3(Sh, UpArrow)]; +sh_to_awk_3([], true) -> [$^|sh_to_awk_1([])]; +sh_to_awk_3([], false) -> sh_to_awk_1([]). + +%% -type sh_special_char(char()) -> bool(). +%% Test if a character is a special character. + +sh_special_char($|) -> true; +sh_special_char($*) -> true; +sh_special_char($+) -> true; +sh_special_char($?) -> true; +sh_special_char($() -> true; +sh_special_char($)) -> true; +sh_special_char($\\) -> true; +sh_special_char($^) -> true; +sh_special_char($$) -> true; +sh_special_char($.) -> true; +sh_special_char($[) -> true; +sh_special_char($]) -> true; +sh_special_char($") -> true; +sh_special_char(_C) -> false. + +%% parse(RegExp) -> {ok,RE} | {error,E}. +%% Parse the regexp described in the string RegExp. + +parse(S) -> + case catch reg(S, 0) of + {R,Sc,[]} -> {ok,{regexp,{R,Sc}}}; + {_R,_Sc,[C|_]} -> {error,{illegal,[C]}}; + {error,E} -> {error,E} + end. + +%% format_error(Error) -> String. + +format_error({interval_range,What}) -> + ["illegal interval range",io_lib:write_string(What)]; +format_error({illegal,What}) -> ["illegal character `",What,"'"]; +format_error({unterminated,What}) -> ["unterminated `",What,"'"]; +format_error({posix_cc,What}) -> + ["illegal POSIX character class ",io_lib:write_string(What)]; +format_error({char_class,What}) -> + ["illegal character class ",io_lib:write_string(What)]. + +%% match(String, RegExp) -> {match,Start,Length} | nomatch | {error,E}. +%% Find the longest match of RegExp in String. + +match(S, RegExp) when is_list(RegExp) -> + case parse(RegExp) of + {ok,RE} -> match(S, RE); + {error,E} -> {error,E} + end; +match(S, {regexp,RE}) -> + case match_re(RE, S, 1, 0, -1) of + {Start,Len} when Len >= 0 -> + {match,Start,Len}; + {_Start,_Len} -> nomatch + end; +match(S, {comp_regexp,RE}) -> + case match_comp(RE, S, 1, 0, -1) of + {Start,Len} when Len >= 0 -> + {match,Start,Len}; + {_Start,_Len} -> nomatch + end. + +match_re(RE, [_|Cs]=S0, P0, Mst, Mlen) -> + case re_apply(S0, P0, RE) of + {match,P1,_S1,_Subs} -> + Len = P1-P0, + if Len > Mlen -> match_re(RE, Cs, P0+1, P0, Len); + true -> match_re(RE, Cs, P0+1, Mst, Mlen) + end; + nomatch -> match_re(RE, Cs, P0+1, Mst, Mlen); + never_match -> {Mst,Mlen} %No need to go on + end; +match_re(_RE, _S, _P, Mst, Mlen) -> {Mst,Mlen}. + +match_comp(RE, [_|Cs]=S0, P0, Mst, Mlen) -> + case comp_apply(S0, P0, RE) of + {match,P1,_S1} -> + Len = P1-P0, + if Len > Mlen -> match_comp(RE, Cs, P0+1, P0, Len); + true -> match_comp(RE, Cs, P0+1, Mst, Mlen) + end; + nomatch -> match_comp(RE, Cs, P0+1, Mst, Mlen) + end; +match_comp(_RE, _S, _P, Mst, Mlen) -> {Mst,Mlen}. + +%% match_re(RE, S0, Pos0, Mst, Mlen) -> +%% case first_match_re(RE, S0, Pos0) of +%% {St,Len,_} -> %Found a match +%% Pos1 = St + 1, %Where to start next match +%% S1 = lists:nthtail(Pos1-Pos0, S0), +%% if Len > Mlen -> match_re(RE, S1, Pos1, St, Len); +%% true -> match_re(RE, S1, Pos1, Mst, Mlen) +%% end; +%% nomatch -> {Mst,Mlen} +%% end. + +%% match_comp(RE, S0, Pos0, Mst, Mlen) -> +%% case first_match_comp(RE, S0, Pos0) of +%% {St,Len} -> %Found a match +%% Pos1 = St + 1, %Where to start next match +%% S1 = lists:nthtail(Pos1-Pos0, S0), +%% if Len > Mlen -> match_comp(RE, S1, Pos1, St, Len); +%% true -> match_comp(RE, S1, Pos1, Mst, Mlen) +%% end; +%% nomatch -> {Mst,Mlen} +%% end. + +%% first_match(String, RegExp) -> {match,Start,Length} | nomatch | {error,E}. +%% Find the first match of RegExp in String. + +first_match(S, RegExp) when is_list(RegExp) -> + case parse(RegExp) of + {ok,RE} -> first_match(S, RE); + {error,E} -> {error,E} + end; +first_match(S, {regexp,RE}) -> + case first_match_re(RE, S, 1) of + {Start,Len,_} -> {match,Start,Len}; + nomatch -> nomatch + end; +first_match(S, {comp_regexp,RE}) -> + case first_match_comp(RE, S, 1) of + {Start,Len} -> {match,Start,Len}; + nomatch -> nomatch + end. + +first_match_re(RE, S, St) when S /= [] -> + case re_apply(S, St, RE) of + {match,P,_Rest,Subs} -> {St,P-St,Subs}; + nomatch -> first_match_re(RE, tl(S), St+1); + never_match -> nomatch + end; +first_match_re(_RE, [], _St) -> nomatch. + +first_match_comp(RE, S, St) when S /= [] -> + case comp_apply(S, St, RE) of + {match,P,_Rest} -> {St,P-St}; + nomatch -> first_match_comp(RE, tl(S), St+1) + end; +first_match_comp(_RE, [], _St) -> nomatch. + +%% matches(String, RegExp) -> {match,[{Start,Length}]} | {error,E}. +%% Return the all the non-overlapping matches of RegExp in String. + +matches(S, RegExp) when is_list(RegExp) -> + case parse(RegExp) of + {ok,RE} -> matches(S, RE); + {error,E} -> {error,E} + end; +matches(S, {regexp,RE}) -> {match,matches_re(S, RE, 1)}; +matches(S, {comp_regexp,RE}) -> {match,matches_comp(S, RE, 1)}. + +matches_re([_|Cs]=S0, RE, P0) -> + case re_apply(S0, P0, RE) of + {match,P0,S1,_Subs} -> %0 length match + [{P0,0}|matches_re(tl(S1), RE, P0+1)]; + {match,P1,S1,_Subs} -> + [{P0,P1-P0}|matches_re(S1, RE, P1)]; + nomatch -> matches_re(Cs, RE, P0+1); + never_match -> [] + end; +matches_re([], _RE, _P) -> []. + +matches_comp([_|Cs]=S0, RE, P0) -> + case comp_apply(S0, P0, RE) of + {match,P0,S1} -> %0 length match + [{P0,0}|matches_comp(tl(S1), RE, P0+1)]; + {match,P1,S1} -> + [{P0,P1-P0}|matches_comp(S1, RE, P1)]; + nomatch -> matches_comp(Cs, RE, P0+1) + end; +matches_comp([], _RE, _P) -> []. + +%% sub(String, RegExp, Replace) -> {ok,RepString,RepCount} | {error,E}. +%% Substitute the first match of the regular expression RegExp with +%% the string Replace in String. Accept pre-parsed regular +%% expressions. + +sub(String, RegExp, Rep) when is_list(RegExp) -> + case parse(RegExp) of + {ok,RE} -> sub(String, RE, Rep); + {error,E} -> {error,E} + end; +sub(String, {regexp,RE}, Rep) -> + case sub_re(String, 1, RE, [], Rep) of + {yes,NewStr} -> {ok,NewStr,1}; + no -> {ok,String,0} + end; +sub(String, {comp_regexp,RE}, Rep) -> + case sub_comp(String, 1, RE, [], Rep) of + {yes,NewStr} -> {ok,NewStr,1}; + no -> {ok,String,0} + end. + +%% sub_re(String, Position, Regexp, Before, Replacement) -> +%% {NewString,Count}. +%% sub_comp(String, Position, Regexp, Before, Replacement) -> +%% {NewString,Count}. +%% Step forward over String until a match is found saving stepped over +%% chars in Before. Return reversed Before prepended to replacement +%% and rest of string. + +sub_re([C|Cs]=S0, P0, RE, Bef, Rep) -> + case re_apply(S0, P0, RE) of + {match,P0,_S1,_} -> %Ignore 0 length match + sub_re(Cs, P0+1, RE, [C|Bef], Rep); + {match,P1,Rest,_Gps} -> + {yes,reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), Rest))}; + nomatch -> sub_re(Cs, P0+1, RE, [C|Bef], Rep); + never_match -> no %No need to go on + end; +sub_re([], _P, _RE, _Bef, _Rep) -> no. + +sub_comp([C|Cs]=S0, P0, RE, Bef, Rep) -> + case comp_apply(S0, P0, RE) of + {match,P0,_S1} -> %Ignore 0 length match + sub_comp(Cs, P0+1, RE, [C|Bef], Rep); + {match,P1,Rest} -> + {yes,reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), Rest))}; + nomatch -> sub_comp(Cs, P0+1, RE, [C|Bef], Rep) + end; +sub_comp([], _P, _RE, _Bef, _Rep) -> no. + +sub_repl([$&|Rep], M, Rest) -> M ++ sub_repl(Rep, M, Rest); +sub_repl("\\&" ++ Rep, M, Rest) -> [$&|sub_repl(Rep, M, Rest)]; +sub_repl([C|Rep], M, Rest) -> [C|sub_repl(Rep, M, Rest)]; +sub_repl([], _M, Rest) -> Rest. + +%% gsub(String, RegExp, Replace) -> {ok,RepString,RepCount} | {error,E}. +%% Substitute every match of the regular expression RegExp with the +%% string New in String. Accept pre-parsed regular expressions. + +gsub(String, RegExp, Rep) when is_list(RegExp) -> + case parse(RegExp) of + {ok,RE} -> gsub(String, RE, Rep); + {error,E} -> {error,E} + end; +gsub(String, {regexp,RE}, Rep) -> + case gsub_re(String, 1, RE, [], Rep) of + {NewStr,N} -> {ok,NewStr,N}; + no -> {ok,String,0} %No substitutions + end; +gsub(String, {comp_regexp,RE}, Rep) -> + case gsub_comp(String, 1, RE, [], Rep) of + {NewStr,N} -> {ok,NewStr,N}; + no -> {ok,String,0} %No substitutions + end. + +%% gsub_re(String, Position, Regexp, Before, Replacement) -> +%% {NewString,Count}. +%% gsub_comp(String, Position, Regexp, Before, Replacement) -> +%% {NewString,Count}. +%% Step forward over String until a match is found saving stepped over +%% chars in Before. Call recursively to do rest of string after +%% match. Return reversed Before prepended to return from recursive +%% call. + +gsub_re([C|Cs]=S0, P0, RE, Bef, Rep) -> + case re_apply(S0, P0, RE) of + {match,P0,_S1,_} -> %Ignore 0 length match + gsub_re(Cs, P0+1, RE, [C|Bef], Rep); + {match,P1,S1,_Gps} -> + case gsub_re(S1, P1, RE, [], Rep) of + {NewStr,N0} -> %Substituitions + {reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), NewStr)), + N0+1}; + no -> %No substituitions. + {reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), S1)),1} + end; + %%No match so step forward saving C on Bef. + nomatch -> gsub_re(Cs, P0+1, RE, [C|Bef], Rep); + never_match -> no %No need to go on + end; +gsub_re([], _P, _RE, _Bef, _Rep) -> no. + +gsub_comp([C|Cs]=S0, P0, RE, Bef, Rep) -> + case comp_apply(S0, P0, RE) of + {match,P0,_S1} -> %Ignore 0 length match + gsub_comp(Cs, P0+1, RE, [C|Bef], Rep); + {match,P1,S1} -> + case gsub_comp(S1, P1, RE, [], Rep) of + {NewStr,N0} -> %Substituitions + {reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), NewStr)), + N0+1}; + no -> %No substituitions. + {reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), S1)),1} + end; + %%No match so step forward saving C on Bef. + nomatch -> gsub_comp(Cs, P0+1, RE, [C|Bef], Rep) + end; +gsub_comp([], _P, _RE, _Bef, _Rep) -> no. + +%% split(String, RegExp) -> {ok,[SubString]} | {error,E}. +%% Split a string into substrings where the RegExp describes the +%% field seperator. The RegExp " " is specially treated. + +split(String, " ") -> %This is really special + {ok,{regexp,RE}} = parse("[ \t]+"), + case split_apply_re(String, RE, true) of + [[]|Ss] -> {ok,Ss}; + Ss -> {ok,Ss} + end; +split(String, RegExp) when is_list(RegExp) -> + case parse(RegExp) of + {ok,{regexp,RE}} -> {ok,split_apply_re(String, RE, false)}; + {error,E} -> {error,E} + end; +split(String, {regexp,RE}) -> {ok,split_apply_re(String, RE, false)}; +split(String, {comp_regexp,RE}) -> {ok,split_apply_comp(String, RE, false)}. + +split_apply_re(S, RE, Trim) -> split_apply_re(S, 1, RE, Trim, []). + +split_apply_re([], _P, _RE, true, []) -> []; +split_apply_re([], _P, _RE, _T, Sub) -> [reverse(Sub)]; +split_apply_re([C|Cs]=S, P0, RE, T, Sub) -> + case re_apply(S, P0, RE) of + {match,P0,_S1,_} -> %Ignore 0 length match + split_apply_re(Cs, P0+1, RE, T, [C|Sub]); + {match,P1,S1,_} -> + [reverse(Sub)|split_apply_re(S1, P1, RE, T, [])]; + nomatch -> + split_apply_re(Cs, P0+1, RE, T, [C|Sub]); + never_match -> [reverse(Sub, S)] %No need to go on + end. + +split_apply_comp(S, RE, Trim) -> split_apply_comp(S, 1, RE, Trim, []). + +%%split_apply_comp([], _P, _RE, true, []) -> []; +split_apply_comp([], _P, _RE, _T, Sub) -> [reverse(Sub)]; +split_apply_comp([C|Cs]=S, P0, RE, T, Sub) -> + case comp_apply(S, P0, RE) of + {match,P0,_S1} -> %Ignore 0 length match + split_apply_comp(Cs, P0+1, RE, T, [C|Sub]); + {match,P1,S1} -> + [reverse(Sub)|split_apply_comp(S1, P1, RE, T, [])]; + nomatch -> + split_apply_comp(Cs, P0+1, RE, T, [C|Sub]) + end. + +%% sub_match(String, RegExp) -> +%% {match,Start,Length,SubExprs} | nomatch | {error,E}. +%% Find the longest match of RegExp in String. + +sub_match(S, RegExp) when is_list(RegExp) -> + case parse(RegExp) of + {ok,RE} -> sub_match(S, RE); + {error,E} -> {error,E} + end; +sub_match(S, {regexp,RE}) -> + case sub_match_re(RE, S, 1, 0, -1, none) of + {Start,Len,Subs} when Len >= 0 -> + {match,Start,Len,Subs}; + {_Start,_Len,_Subs} -> nomatch + end. + +sub_match_re(RE, S0, Pos0, Mst, Mlen, Msubs) -> + case first_match_re(RE, S0, Pos0) of + {St,Len,Subs} -> %Found a match + Pos1 = St + 1, %Where to start next match + S1 = lists:nthtail(Pos1-Pos0, S0), + if Len > Mlen -> sub_match_re(RE, S1, Pos1, St, Len, Subs); + true -> sub_match_re(RE, S1, Pos1, Mst, Mlen, Msubs) + end; + nomatch -> {Mst,Mlen,Msubs} + end. + +%% sub_first_match(String, RegExp) -> +%% {match,Start,Length,SubExprs} | nomatch | {error,E}. +%% Find the longest match of RegExp in String, return Start and Length +%% as well as tuple of sub-expression matches. + +sub_first_match(S, RegExp) when is_list(RegExp) -> + {ok,RE} = parse(RegExp), + sub_first_match(S, RE); +sub_first_match(S, {regexp,RE}) -> + case first_match_re(RE, S, 1) of + {St,Len,Subs} -> {match,St,Len,Subs}; + nomatch -> nomatch + end. + + +%% This is the regular expression grammar used. It is equivalent to the +%% one used in AWK, except that we allow ^ $ to be used anywhere and fail +%% in the matching. +%% +%% reg -> reg1 : '$1'. +%% reg1 -> reg1 "|" reg2 : {'or','$1','$2'}. +%% reg1 -> reg2 : '$1'. +%% reg2 -> reg2 reg3 : {concat,'$1','$2'}. +%% reg2 -> reg3 : '$1'. +%% reg3 -> reg3 "*" : {kclosure,'$1'}. +%% reg3 -> reg3 "+" : {pclosure,'$1'}. +%% reg3 -> reg3 "?" : {optional,'$1'}. +%% reg3 -> reg3 "{" [Min],[Max] "}" : {closure_range, Num, '$1'} see below +%% reg3 -> reg4 : '$1'. +%% reg4 -> "(" reg ")" : '$2'. +%% reg4 -> "\\" char : '$2'. +%% reg4 -> "^" : bos. +%% reg4 -> "$" : eos. +%% reg4 -> "." : char. +%% reg4 -> "[" class "]" : {char_class,char_class('$2')} +%% reg4 -> "[" "^" class "]" : {comp_class,char_class('$3')} +%% reg4 -> "\"" chars "\"" : char_string('$2') +%% reg4 -> char : '$1'. +%% reg4 -> empty : epsilon. +%% The grammar of the current regular expressions. The actual parser +%% is a recursive descent implementation of the grammar. + +reg(S, Sc) -> reg1(S, Sc). + +%% reg1 -> reg2 reg1' +%% reg1' -> "|" reg2 +%% reg1' -> empty + +reg1(S0, Sc0) -> + {L,Sc1,S1} = reg2(S0, Sc0), + reg1p(S1, L, Sc1). + +reg1p([$||S0], L, Sc0) -> + {R,Sc1,S1} = reg2(S0, Sc0), + reg1p(S1, {'or',L,R}, Sc1); +reg1p(S, L, Sc) -> {L,Sc,S}. + +%% reg2 -> reg3 reg2' +%% reg2' -> reg3 +%% reg2' -> empty + +reg2(S0, Sc0) -> + {L,Sc1,S1} = reg3(S0, Sc0), + reg2p(S1, L, Sc1). + +reg2p([C|S0], L, Sc0) when C /= $|, C /= $) -> + {R,Sc1,S1} = reg3([C|S0], Sc0), + %% reg2p(S1, {concat,L,R}, Sc1); + case is_integer(R) of + true -> + case L of + {literal,Lit} -> + reg2p(S1, {literal,Lit ++[R]}, Sc1); + {concat,S2,Char} when is_integer(Char) -> + reg2p(S1, {concat,S2,{literal,[Char,R]}}, Sc1); + {concat,S2,{literal,Lit}} -> + reg2p(S1, {concat,S2,{literal,Lit ++ [R]}}, Sc1); + Char when is_integer(Char) -> + reg2p(S1, {literal,[Char,R]}, Sc1); + _ -> + reg2p(S1, {concat,L,R}, Sc1) + end; + false -> + reg2p(S1, {concat,L,R}, Sc1) + end; +reg2p(S, L, Sc) -> {L,Sc,S}. + +%% reg3 -> reg4 reg3' +%% reg3' -> "*" reg3' +%% reg3' -> "+" reg3' +%% reg3' -> "?" reg3' +%% reg3' -> "{" [Min],[Max] "}" reg3' +%% reg3' -> empty + +reg3(S0, Sc0) -> + {L,Sc1,S1} = reg4(S0, Sc0), + reg3p(S1, L, Sc1). + +reg3p([$*|S], L, Sc) -> reg3p(S, {kclosure,L}, Sc); +reg3p([$+|S], L, Sc) -> reg3p(S, {pclosure,L}, Sc); +reg3p([$?|S], L, Sc) -> reg3p(S, {optional,L}, Sc); +reg3p([${|Cs0], L, Sc) -> % $} + case interval_range(Cs0) of + {none,none,_Cs1} -> parse_error({interval_range,[${|Cs0]}); + {N,M,[$}|Cs1]} -> reg3p(Cs1, {iclosure,L,N,M}, Sc); + {_N,_M,_Cs1} -> parse_error({unterminated,"{"}) + end; +reg3p(S, L, Sc) -> {L,Sc,S}. + +reg4([$(|S0], Sc0) -> + Sc1 = Sc0+1, + case reg(S0, Sc1) of + {R,Sc2,[$)|S1]} -> {{subexpr,Sc1,R},Sc2,S1}; + {_R,_Sc,_S} -> parse_error({unterminated,"("}) + end; +reg4([$^|S], Sc) -> {bos,Sc,S}; +reg4([$$|S], Sc) -> {eos,Sc,S}; +reg4([$.|S], Sc) -> {{comp_class,"\n"},Sc,S}; +reg4("[^" ++ S0, Sc) -> + case char_class(S0) of + {Cc,[$]|S1]} -> {{comp_class,Cc},Sc,S1}; + {_Cc,_S} -> parse_error({unterminated,"["}) + end; +reg4([$[|S0], Sc) -> + case char_class(S0) of + {Cc,[$]|S1]} -> {{char_class,Cc},Sc,S1}; + {_Cc,_S1} -> parse_error({unterminated,"["}) + end; +%reg4([$"|S0], Sc) -> +% case char_string(S0) of +% {St,[$"|S1]} -> {St,Sc,S1}; +% {St,S1} -> parse_error({unterminated,"\""}) +% end; +reg4([C0|S0], Sc) when + is_integer(C0), C0 /= $*, C0 /= $+, C0 /= $?, C0 /= $], C0 /= $), C0 /= $} -> + %% Handle \ quoted characters as well, at least those we see. + {C1,S1} = char(C0, S0), + {C1,Sc,S1}; +reg4(S=[$)|_], Sc) -> {epsilon,Sc,S}; +reg4([C|_S], _Sc) -> parse_error({illegal,[C]}); +reg4([], Sc) -> {epsilon,Sc,[]}. + +char($\\, [O1,O2,O3|S]) when + O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 -> + {(O1*8 + O2)*8 + O3 - 73*$0,S}; +char($\\, [C|S]) -> {escape_char(C),S}; +char($\\, []) -> parse_error({unterminated,"\\"}); +char(C, S) -> {C,S}. + +escape_char($n) -> $\n; %\n = LF +escape_char($r) -> $\r; %\r = CR +escape_char($t) -> $\t; %\t = TAB +escape_char($v) -> $\v; %\v = VT +escape_char($b) -> $\b; %\b = BS +escape_char($f) -> $\f; %\f = FF +escape_char($e) -> $\e; %\e = ESC +escape_char($s) -> $\s; %\s = SPACE +escape_char($d) -> $\d; %\d = DEL +escape_char(C) -> C. + +char_class([$]|S0]) -> + {Cc,S1} = char_class(S0, [$]]), + {pack_cc(Cc),S1}; +char_class(S0) -> + {Cc,S1} = char_class(S0, []), + {pack_cc(Cc),S1}. + +pack_cc(Cc0) -> + %% First sort the list. + Cc1 = lists:usort(fun ({Cf1,_}, {Cf2,_}) -> Cf1 < Cf2; + ({Cf1,_}, C) -> Cf1 < C; + (C, {Cf,_}) -> C < Cf; + (C1, C2) -> C1 =< C2 + end, Cc0), + pack_cc1(Cc1). + +pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1 >= Cf2, Cl1 =< Cl2 -> + pack_cc1([{Cf1,Cl2}|Cc]); +pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1 >= Cf2, Cl1 >= Cl2 -> + pack_cc1([{Cf1,Cl1}|Cc]); +pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1+1 == Cf2 -> + pack_cc1([{Cf1,Cl2}|Cc]); +pack_cc1([{Cf,Cl},C|Cc]) when Cl >= C -> pack_cc1([{Cf,Cl}|Cc]); +pack_cc1([{Cf,Cl},C|Cc]) when Cl+1 == C -> pack_cc1([{Cf,C}|Cc]); +pack_cc1([C,{Cf,Cl}|Cc]) when C == Cf-1 -> pack_cc1([{C,Cl}|Cc]); +pack_cc1([C1,C2|Cc]) when C1+1 == C2 -> pack_cc1([{C1,C2}|Cc]); +pack_cc1([C|Cc]) -> [C|pack_cc1(Cc)]; +pack_cc1([]) -> []. + +char_class("[:" ++ S0, Cc0) -> %Start of POSIX char class + case posix_cc(S0, Cc0) of + {Cc1,":]" ++ S1} -> char_class(S1, Cc1); + {_,_S1} -> parse_error({posix_cc,"[:" ++ S0}) + end; +char_class([C1|S0], Cc) when C1 /= $] -> + case char(C1, S0) of + {Cf,[$-,C2|S1]} when C2 /= $] -> + case char(C2, S1) of + {Cl,S2} when Cf < Cl -> char_class(S2, [{Cf,Cl}|Cc]); + {_Cl,_S2} -> parse_error({char_class,[C1|S0]}) + end; + {C,S1} -> char_class(S1, [C|Cc]) + end; +char_class(S, Cc) -> {Cc,S}. + +%% posix_cc(String, CharClass) -> {NewCharClass,RestString}. +%% Handle POSIX character classes, use Latin-1 character set. + +posix_cc("alnum" ++ S, Cc) -> + {[{$0,$9},{$A,$Z},{192,214},{216,223},{$a,$z},{224,246},{248,255}|Cc],S}; +posix_cc("alpha" ++ S, Cc) -> + {[{$A,$Z},{192,214},{216,223},{$a,$z},{224,246},{248,255}|Cc],S}; +posix_cc("blank" ++ S, Cc) -> {[$\s,$\t,160|Cc],S}; +posix_cc("cntrl" ++ S, Cc) -> {[{0,31},{127,159}|Cc],S}; +posix_cc("digit" ++ S, Cc) -> {[{$0,$9}|Cc],S}; +posix_cc("graph" ++ S, Cc) -> {[{33,126},{161,255}|Cc],S}; +posix_cc("lower" ++ S, Cc) -> {[{$a,$z},{224,246},{248,255}|Cc],S}; +posix_cc("print" ++ S, Cc) -> {[{32,126},{160,255}|Cc],S}; +posix_cc("punct" ++ S, Cc) -> {[{$!,$/},{$:,$?},{${,$~},{161,191}|Cc],S}; +posix_cc("space" ++ S, Cc) -> {[$\s,$\t,$\f,$\r,$\v,160|Cc],S}; +posix_cc("upper" ++ S, Cc) -> {[{$A,$Z},{192,214},{216,223}|Cc],S}; +posix_cc("xdigit" ++ S, Cc) -> {[{$a,$f},{$A,$F},{$0,$9}|Cc],S}; +posix_cc(S, _Cc) -> parse_error({posix_cc,"[:" ++ S}). + +interval_range(Cs0) -> + case number(Cs0) of + {none,Cs1} -> {none,none,Cs1}; + {N,[$,|Cs1]} -> + case number(Cs1) of + {none,Cs2} -> {N,any,Cs2}; + {M,Cs2} -> {N,M,Cs2} + end; + {N,Cs1} -> {N,none,Cs1} + end. + +number([C|Cs]) when C >= $0, C =< $9 -> + number(Cs, C - $0); +number(Cs) -> {none,Cs}. + +number([C|Cs], Acc) when C >= $0, C =< $9 -> + number(Cs, 10*Acc + (C - $0)); +number(Cs, Acc) -> {Acc,Cs}. + +parse_error(E) -> throw({error,E}). + +%char_string([C|S]) when C /= $" -> char_string(S, C); +%char_string(S) -> {epsilon,S}. + +%char_string([C|S0], L) when C /= $" -> +% char_string(S0, {concat,L,C}); +%char_string(S, L) -> {L,S}. + +%% re_apply(String, StartPos, RegExp) -> +%% {match,RestPos,Rest,SubExprs} | nomatch. +%% +%% Apply the (parse of the) regular expression RegExp to String. If +%% there is a match return the position of the remaining string and +%% the string if else return 'nomatch'. +%% +%% StartPos should be the real start position as it is used to decide +%% if we are at the beginning of the string. + +re_apply(S, St, {RE,Sc}) -> + Subs = erlang:make_tuple(Sc, none), %Make a sub-regexp table. + Res = re_apply(RE, [], S, St, Subs), + %% io:format("~p x ~p -> ~p\n", [RE,S,Res]), + Res. + +re_apply(epsilon, More, S, P, Subs) -> %This always matches + re_apply_more(More, S, P, Subs); +re_apply({'or',RE1,RE2}, More, S, P, Subs) -> + re_apply_or(re_apply(RE1, More, S, P, Subs), + re_apply(RE2, More, S, P, Subs)); +re_apply({concat,RE1,RE2}, More, S0, P, Subs) -> + re_apply(RE1, [RE2|More], S0, P, Subs); +re_apply({literal,[C|Lcs]}, More, [C|S], P, Subs) -> + re_apply_lit(Lcs, More, S, P+1, Subs); %Have matched first char +re_apply({kclosure,RE}, More, S0, P0, Subs0) -> + %% Greedy so try RE first, no difference here actually. + Loop = case re_apply(RE, [], S0, P0, Subs0) of + {match,P0,_S1,_Subs1} -> %0 length match, don't loop! + nomatch; + {match,P1,S1,Subs1} -> + re_apply_more([{kclosure,RE}|More], S1, P1, Subs1); + nomatch -> nomatch; + never_match -> never_match + end, + re_apply_or(Loop, re_apply_more(More, S0, P0, Subs0)); +re_apply({pclosure,RE}, More, S, P, Subs) -> + re_apply(RE, [{kclosure,RE}|More], S, P, Subs); +re_apply({optional,RE}, More, S, P, Subs) -> + %% Greedy so try RE first, no difference here actually. + re_apply_or(re_apply(RE, More, S, P, Subs), + re_apply_more(More, S, P, Subs)); +re_apply({iclosure,RE,N,M}, More, S, P, Subs) when N > 0 -> + re_apply(RE, [{iclosure,RE,N-1,M}|More], S, P, Subs); +re_apply({iclosure,RE,0,M}, More, S, P, Subs) -> + Exp = expand_opt(RE, M), + re_apply(Exp, More, S, P, Subs); +re_apply({subexpr,N,RE}, More, S, P, Subs) -> + re_apply(RE, [{endsub,N,P}|More], S, P, Subs); +re_apply({endsub,N,St}, More, S, P, Subs0) -> + Subs1 = setelement(N, Subs0, {St,P-St}), %Record sub-expr + re_apply_more(More, S, P, Subs1); +re_apply(bos, More, S, 1, Subs) -> re_apply_more(More, S, 1, Subs); +re_apply(bos, _More, _S, _, _) -> never_match; +re_apply(eos, More, [$\n], P, Subs) -> re_apply_more(More, [], P, Subs); +re_apply(eos, More, [], P, Subs) -> re_apply_more(More, [], P, Subs); +re_apply({char_class,Cc}, More, [C|S], P, Subs) -> + case in_char_class(C, Cc) of + true -> re_apply_more(More, S, P+1, Subs); + false -> nomatch + end; +re_apply({comp_class,Cc}, More, [C|S], P, Subs) -> + case in_char_class(C, Cc) of + true -> nomatch; + false -> re_apply_more(More, S, P+1, Subs) + end; +re_apply(C, More, [C|S], P, Subs) when is_integer(C) -> + re_apply_more(More, S, P+1, Subs); +re_apply(_RE, _More, _S, _P, _Subs) -> + %% io:format("~p : ~p\n", [_RE,_S]), + nomatch. + +%% re_apply_more([RegExp], String, Length, SubsExprs) -> +%% {match,RestPos,Rest,SubExprs} | nomatch. + +re_apply_more([RE|More], S, P, Subs) -> re_apply(RE, More, S, P, Subs); +re_apply_more([], S, P, Subs) -> {match,P,S,Subs}. + +%% re_apply_lit(Literal, More, String, Position, SubExprs) -> +%% {match,RestPos,Rest,SubExprs} | nomatch. +re_apply_lit([C|Lit], More, [C|Cs], P, Subs) -> + re_apply_lit(Lit, More, Cs, P+1, Subs); +re_apply_lit([], More, Cs, P, Subs) -> + re_apply_more(More, Cs, P, Subs); +re_apply_lit(_Lit, _More, _Cs, _P, _Subs) -> + nomatch. + +%% expand_iclosure(RE, N, M) -> RE. + +expand_iclosure(RE, 0, M) -> expand_opt(RE, M); +expand_iclosure(RE, N, M) -> + {concat,RE,expand_iclosure(RE, N-1, M)}. + +%% expand_opt(RegExp, Count) -> RE. +%% Handle all the cases. + +expand_opt(_RE, none) -> epsilon; +expand_opt(RE, any) -> {kclosure,RE}; +expand_opt(_RE, 0) -> epsilon; +expand_opt(RE, 1) -> {optional,RE}; +expand_opt(RE, N) -> + {optional,{concat,RE,expand_opt(RE, N-1)}}. + +%% find_prefix(PrefixStr, SourceStr) +%% if PrefixStr is a prefix of Str then return {ok,RemainingStr} +%% otherwise return false + +%% find_prefix([C|Prest], [C|Rest]) -> +%% find_prefix(Prest, Rest); +%% find_prefix([], Rest) -> {yes,Rest}; +%% find_prefix(_, _) -> no. + +%% in_char_class(Char, Class) -> bool(). + +in_char_class(C, [{C1,C2}|_Cc]) when C >= C1, C =< C2 -> true; +in_char_class(C, [C|_Cc]) -> true; +in_char_class(C, [_|Cc]) -> in_char_class(C, Cc); +in_char_class(_C, []) -> false. + +%% re_apply_or(Match1, Match2, SubExprs) -> +%% {match,RestPos,Rest,SubExprs} | nomatch. +%% If we want the best match then choose the longest match, else just +%% choose one by trying sequentially. + +re_apply_or(M1={match,P1,_,_},{match,P2,_,_}) when P1 >= P2 -> M1; +re_apply_or({match,_,_,_}, M2={match,_,_,_}) -> M2; +re_apply_or(never_match, R2) -> R2; +re_apply_or(R1, never_match) -> R1; +re_apply_or(nomatch, R2) -> R2; +re_apply_or(R1, nomatch) -> R1. + +%% Record definitions for the NFA, DFA and compiler. + +-record(nfa_state, {no,edges=[],accept=no}). +-record(dfa_state, {no,nfa=[],trans=[],accept=no}). + +-record(c_state, {no,trans=[],tmin=0,smin=none,tmax=0,smax=none, + accept=false,spec=[]}). + +%% We use standard methods, Thompson's construction and subset +%% construction, to create first an NFA and then a DFA from the +%% regexps. A non-standard feature is that we work with sets of +%% character ranges (crs) instead sets of characters. This is most +%% noticeable when constructing DFAs. The major benefit is that we can +%% handle characters from any set, not just limited ASCII or 8859, +%% even 16/32 bit unicode. +%% +%% The whole range of characters is 0-maxchar, where maxchar is a BIG +%% number. We don't make any assumptions about the size of maxchar, it +%% is just bigger than any character. +%% +%% Using character ranges makes describing many regexps very simple, +%% for example the regexp "." just becomes the range +%% [{0-9},{11-maxchar}]. + +%% make_nfa(RegExpActions) -> {ok,{NFA,StartState}} | {error,E}. +%% Build a complete nfa from a list of {RegExp,Action}. The NFA field +%% accept has values {yes,Action}|no. The NFA is a list of states. + +make_nfa(REAs0) -> + case parse_reas(REAs0) of + {ok,REAs1} -> + {NFA,Start} = build_combined_nfa(REAs1), + {ok,{NFA,Start}}; + {error,E} -> {error,E} + end. + +%% make_dfa(RegExpActions) -> {ok,{DFA,StartState}} | {error,E}. +%% make_dfa(RegExpActions, LowestState) -> {ok,{DFA,StartState}} | {error,E}. +%% Build a complete dfa from a list of {RegExp,Action}. The DFA field +%% accept has values {yes,Action}|no. If multiple Regexps can result +%% in same match string then RegExpActions list define priority. + +make_dfa(REAs) -> make_dfa(REAs, 0). + +make_dfa(REAs0, Low) -> + case parse_reas(REAs0) of + {ok,REAs1} -> + {NFA,Start0} = build_combined_nfa(REAs1), + {DFA0,Start1} = build_dfa(NFA, Start0), + {DFA,Start} = minimise_dfa(DFA0, Start1, Low), + {ok,{DFA,Start}}; + {error,E} -> {error,E} + end. + +parse_reas(REAs) -> parse_reas(REAs, []). + +parse_reas([{{regexp,{R,_Sc}},A}|REAs], S) -> %Already parsed + parse_reas(REAs, [{R,A}|S]); +parse_reas([{RegExp,A}|REAs], S) -> + case parse(RegExp) of + {ok,{regexp,{R,_Sc}}} -> parse_reas(REAs, [{R,A}|S]); + {error,E} -> {error,E} + end; +parse_reas([], Stack) -> {ok,reverse(Stack)}. + +%% build_combined_nfa(RegExpActionList) -> {NFA,StartState}. +%% Build the combined NFA using Thompson's construction straight out +%% of the book. Build the separate NFAs in the same order as the +%% rules so that the accepting have ascending states have ascending +%% state numbers. Start numbering the states from 1 as we put the +%% states in a tuple with the state number as the index. + +build_combined_nfa(REAs) -> + {NFA,Starts,Next} = build_nfa_list(REAs, [], [], 1), + F = #nfa_state{no=Next,edges=epsilon_trans(Starts),accept=no}, + {[F|NFA],Next}. + +build_nfa_list([{RE,Action}|REAs], NFA0, Starts, Next0) -> + {NFA1,Next1,Start} = build_nfa(RE, Next0, Action), + build_nfa_list(REAs, NFA1 ++ NFA0, [Start|Starts], Next1); +build_nfa_list([], NFA, Starts, Next) -> + {NFA,reverse(Starts),Next}. + +epsilon_trans(Sts) -> [ {epsilon,S} || S <- Sts ]. + +%% build_nfa(RegExp, NextState, Action) -> {NFA,NextFreeState,StartState}. +%% When building the NFA states for a ??? we don't build the end +%% state, just allocate a State for it and return this state +%% number. This allows us to avoid building unnecessary states for +%% concatenation which would then have to be removed by overwriting +%% an existing state. + +build_nfa(RE, Next, Action) -> + {NFA,N,E} = build_nfa(RE, Next+1, Next, []), + {[#nfa_state{no=E,accept={yes,Action}}|NFA],N,Next}. + +%% build_nfa(RegExp, NextState, StartState, NFA) -> {NFA,NextState,EndState}. +%% The NFA is a list of nfa_state is no predefined order. The state +%% number of the returned EndState is already allocated! + +build_nfa({'or',RE1,RE2}, N0, S, NFA0) -> + {NFA1,N1,E1} = build_nfa(RE1, N0+1, N0, NFA0), + {NFA2,N2,E2} = build_nfa(RE2, N1+1, N1, NFA1), + E = N2, + {[#nfa_state{no=S,edges=[{epsilon,N0},{epsilon,N1}]}, + #nfa_state{no=E1,edges=[{epsilon,E}]}, + #nfa_state{no=E2,edges=[{epsilon,E}]}|NFA2], + N2+1,E}; +build_nfa({literal,[]}, N, S, NFA) -> + {NFA,N,S}; +build_nfa({literal,[C|Cs]}, N0, S, NFA0) -> + {NFA1,N1,E1} = build_nfa(C, N0, S, NFA0), + build_nfa({literal,Cs}, N1, E1, NFA1); +build_nfa({concat,RE1,RE2}, N0, S, NFA0) -> + {NFA1,N1,E1} = build_nfa(RE1, N0, S, NFA0), + {NFA2,N2,E2} = build_nfa(RE2, N1, E1, NFA1), + {NFA2,N2,E2}; +build_nfa({kclosure,RE}, N0, S, NFA0) -> + {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), + E = N1, + {[#nfa_state{no=S,edges=[{epsilon,N0},{epsilon,E}]}, + #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1], + N1+1,E}; +build_nfa({pclosure,RE}, N0, S, NFA0) -> + {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), + E = N1, + {[#nfa_state{no=S,edges=[{epsilon,N0}]}, + #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1], + N1+1,E}; +build_nfa({optional,RE}, N0, S, NFA0) -> + {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), + E = N1, + {[#nfa_state{no=S,edges=[{epsilon,N0},{epsilon,E}]}, + #nfa_state{no=E1,edges=[{epsilon,E}]}|NFA1], + N1+1,E}; +build_nfa({iclosure,RE,I1,I2}, N, S, NFA) -> + Exp = expand_iclosure(RE, I1, I2), + build_nfa(Exp, N, S, NFA); +build_nfa({char_class,Cc}, N, S, NFA) -> + {[#nfa_state{no=S,edges=[{nfa_char_class(Cc),N}]}|NFA],N+1,N}; +build_nfa({comp_class,Cc}, N, S, NFA) -> + {[#nfa_state{no=S,edges=[{nfa_comp_class(Cc),N}]}|NFA],N+1,N}; +build_nfa(epsilon, N, S, NFA) -> + {NFA,N,S}; +build_nfa({group,RE}, N, S, NFA) -> %%% FIXME %%%%%%% + build_nfa(RE, N, S, NFA); +build_nfa({subexpr,_N,RE}, N, S, NFA) -> %%% FIXME %%%%%%% + build_nfa(RE, N, S, NFA); +build_nfa(bos, N, S, NFA) -> + {[#nfa_state{no=S,edges=[{[bos],N}]}|NFA],N+1,N}; +build_nfa(eos, N, S, NFA) -> + {[#nfa_state{no=S,edges=[{[eos],N}]}|NFA],N+1,N}; +%%{[#nfa_state{no=S,edges=[{[eos],N}]}|NFA],N+1,N}; +build_nfa(C, N, S, NFA) when is_integer(C) -> + {[#nfa_state{no=S,edges=[{[{C,C}],N}]}|NFA],N+1,N}. + +nfa_char_class(Cc) -> + Crs = lists:foldl(fun({C1,C2}, Set) -> add_element({C1,C2}, Set); + (C, Set) -> add_element({C,C}, Set) end, [], Cc), + %% io:fwrite("cc: ~p\n", [Crs]), + pack_crs(Crs). + +pack_crs([{C1,C2}=Cr,{C3,C4}|Crs]) when C1 =< C3, C2 >= C4 -> + %% C1 C2 + %% C3 C4 + pack_crs([Cr|Crs]); +pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 >= C3, C2 < C4 -> + %% C1 C2 + %% C3 C4 + pack_crs([{C1,C4}|Crs]); +pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 + 1 == C3 -> + %% C1 C2 + %% C3 C4 + pack_crs([{C1,C4}|Crs]); +pack_crs([Cr|Crs]) -> [Cr|pack_crs(Crs)]; +pack_crs([]) -> []. + +nfa_comp_class(Cc) -> + Crs = nfa_char_class(Cc), + %% io:fwrite("comp: ~p\n", [Crs]), + comp_crs(Crs, 0). + +comp_crs([{C1,C2}|Crs], Last) -> + [{Last,C1-1}|comp_crs(Crs, C2+1)]; +comp_crs([], Last) -> [{Last,maxchar}]. + +%% build_dfa(NFA, NfaStartState) -> {DFA,DfaStartState}. +%% Build a DFA from an NFA using "subset construction". The major +%% difference from the book is that we keep the marked and unmarked +%% DFA states in seperate lists. New DFA states are added to the +%% unmarked list and states are marked by moving them to the marked +%% list. We assume that the NFA accepting state numbers are in +%% ascending order for the rules and use ordsets to keep this order. + +build_dfa(NFA0, Start) -> + %% We want NFA as sorted tuple for fast access, assume lowest state 1. + NFA1 = list_to_tuple(keysort(#nfa_state.no, NFA0)), + D = #dfa_state{no=0,nfa=eclosure([Start], NFA1),accept=no}, + {build_dfa([D], 1, [], NFA1),0}. + +%% build_dfa([UnMarked], NextState, [Marked], NFA) -> DFA. +%% Traverse the unmarked states. Temporarily add the current unmarked +%% state to the marked list before calculating translation, this is +%% to avoid adding too many duplicate states. Add it properly to the +%% marked list afterwards with correct translations. + +build_dfa([U|Us0], N0, Ms, NFA) -> + {Ts,Us1,N1} = build_dfa(U#dfa_state.nfa, Us0, N0, [], [U|Ms], NFA), + M = U#dfa_state{trans=Ts,accept=accept(U#dfa_state.nfa, NFA)}, + build_dfa(Us1, N1, [M|Ms], NFA); +build_dfa([], _N, Ms, _NFA) -> Ms. + +%% build_dfa([NfaState], [Unmarked], NextState, [Transition], [Marked], NFA) -> +%% {Transitions,UnmarkedStates,NextState}. +%% Foreach NFA state set calculate the legal translations. N.B. must +%% search *BOTH* the unmarked and marked lists to check if DFA state +%% already exists. As the range of characters is potentially VERY +%% large we cannot explicitly test all characters. Instead we first +%% calculate the set of all disjoint character ranges which are +%% possible candidates to the set of NFA states. + +build_dfa(Set, Us, N, Ts, Ms, NFA) -> + %% List of all transition sets. + Crs0 = [Cr || S <- Set, + {Crs,_St} <- (element(S, NFA))#nfa_state.edges, + is_list(Crs), + Cr <- Crs ], + Crs1 = lists:usort(Crs0), %Must remove duplicates! + %% Build list of disjoint test ranges. + Test = disjoint_crs(Crs1), + %% io:fwrite("bd: ~p\n ~p\n ~p\n ~p\n", [Set,Crs0,Crs1,Test]), + build_dfa(Test, Set, Us, N, Ts, Ms, NFA). + +%% disjoint_crs([CharRange]) -> [CharRange]. +%% Take a sorted list of char ranges and make a sorted list of +%% disjoint char ranges. No new char range extends past an existing +%% char range. + +disjoint_crs([{_C1,C2}=Cr1,{C3,_C4}=Cr2|Crs]) when C2 < C3 -> + %% C1 C2 + %% C3 C4 + [Cr1|disjoint_crs([Cr2|Crs])]; +disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 == C3 -> + %% C1 C2 + %% C3 C4 + [{C1,C2}|disjoint_crs(add_element({C2+1,C4}, Crs))]; +disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 >= C3, C2 < C4 -> + %% C1 C2 + %% C3 C4 + [{C1,C3-1}|disjoint_crs(union([{C3,C2},{C2+1,C4}], Crs))]; +disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 == C4 -> + %% C1 C2 + %% C3 C4 + [{C1,C3-1}|disjoint_crs(add_element({C3,C4}, Crs))]; +disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 > C4 -> + %% C1 C2 + %% C3 C4 + [{C1,C3-1}|disjoint_crs(union([{C3,C4},{C4+1,C2}], Crs))]; +disjoint_crs([Cr|Crs]) -> [Cr|disjoint_crs(Crs)]; +disjoint_crs([]) -> []. + +build_dfa([Cr|Crs], Set, Us, N, Ts, Ms, NFA) -> + case eclosure(move(Set, Cr, NFA), NFA) of + S when S /= [] -> + case keysearch(S, #dfa_state.nfa, Us) of + {value,#dfa_state{no=T}} -> + build_dfa(Crs, Set, Us, N, [{Cr,T}|Ts], Ms, NFA); + false -> + case keysearch(S, #dfa_state.nfa, Ms) of + {value,#dfa_state{no=T}} -> + build_dfa(Crs, Set, Us, N, [{Cr,T}|Ts], Ms, NFA); + false -> + U = #dfa_state{no=N,nfa=S}, + build_dfa(Crs, Set, [U|Us], N+1, [{Cr,N}|Ts], Ms, NFA) + end + end; + [] -> + build_dfa(Crs, Set, Us, N, Ts, Ms, NFA) + end; +build_dfa([], _Set, Us, N, Ts, _Ms, _NFA) -> + {Ts,Us,N}. + +%% eclosure([State], NFA) -> [State]. +%% move([State], Char, NFA) -> [State]. +%% These are straight out of the book. As eclosure uses ordsets then +%% the generated state sets are in ascending order. + +eclosure(Sts, NFA) -> eclosure(Sts, NFA, []). + +eclosure([St|Sts], NFA, Ec) -> + #nfa_state{edges=Es} = element(St, NFA), + eclosure([ N || {epsilon,N} <- Es, + not is_element(N, Ec) ] ++ Sts, + NFA, add_element(St, Ec)); +eclosure([], _NFA, Ec) -> Ec. + +move(Sts, Cr, NFA) -> + [ St || N <- Sts, + {Crs,St} <- (element(N, NFA))#nfa_state.edges, + is_list(Crs), +%% begin +%% io:fwrite("move1: ~p\n", [{Sts,Cr,Crs,in_crs(Cr,Crs)}]), +%% true +%% end, + in_crs(Cr, Crs) ]. + +in_crs({C1,C2}, [{C3,C4}|_Crs]) when C1 >= C3, C2 =< C4 -> true; +in_crs(Cr, [Cr|_Crs]) -> true; %Catch bos and eos. +in_crs(Cr, [_|Crs]) -> in_crs(Cr, Crs); +in_crs(_Cr, []) -> false. + +%% accept([State], NFA) -> true | false. +%% Scan down the state list until we find an accepting state. + +accept([St|Sts], NFA) -> + case element(St, NFA) of + #nfa_state{accept={yes,A}} -> {yes,A}; + #nfa_state{accept=no} -> accept(Sts, NFA) + end; +accept([], _NFA) -> no. + +%% minimise_dfa(DFA, StartState, FirstState) -> {DFA,StartState}. +%% Minimise the DFA by removing equivalent states. We consider a +%% state if both the transitions and the their accept state is the +%% same. First repeatedly run throught the DFA state list removing +%% equivalent states and updating remaining transitions with +%% remaining equivalent state numbers. When no more reductions are +%% possible then pack the remaining state numbers to get consecutive +%% states. + +minimise_dfa(DFA0, Start, N) -> + case min_dfa(DFA0) of + {DFA1,[]} -> %No reduction! + {DFA2,Rs} = pack_dfa(DFA1, N), + {min_update(DFA2, Rs),min_new_state(Start, Rs)}; + {DFA1,Rs} -> + minimise_dfa(min_update(DFA1, Rs), min_new_state(Start, Rs), N) + end. + +min_dfa(DFA) -> min_dfa(DFA, [], []). + +min_dfa([D|DFA0], Rs0, MDFA) -> + {DFA1,Rs1} = min_delete(DFA0, D#dfa_state.trans, D#dfa_state.accept, + D#dfa_state.no, Rs0, []), + min_dfa(DFA1, Rs1, [D|MDFA]); +min_dfa([], Rs, MDFA) -> {MDFA,Rs}. + +min_delete([#dfa_state{no=N,trans=T,accept=A}|DFA], T, A, NewN, Rs, MDFA) -> + min_delete(DFA, T, A, NewN, [{N,NewN}|Rs], MDFA); +min_delete([D|DFA], T, A, NewN, Rs, MDFA) -> + min_delete(DFA, T, A, NewN, Rs, [D|MDFA]); +min_delete([], _T, _A, _NewN, Rs, MDFA) -> {MDFA,Rs}. + +min_update(DFA, Rs) -> + [ D#dfa_state{trans=min_update_trans(D#dfa_state.trans, Rs)} || D <- DFA ]. + +min_update_trans(Tr, Rs) -> + [ {C,min_new_state(S, Rs)} || {C,S} <- Tr ]. + +min_new_state(Old, [{Old,New}|_Reds]) -> New; +min_new_state(Old, [_R|Reds]) -> min_new_state(Old, Reds); +min_new_state(Old, []) -> Old. + +pack_dfa(DFA, N) -> pack_dfa(DFA, N, [], []). + +pack_dfa([D|DFA], NewN, Rs, PDFA) -> + pack_dfa(DFA, NewN+1, [{D#dfa_state.no,NewN}|Rs], + [D#dfa_state{no=NewN}|PDFA]); +pack_dfa([], _NewN, Rs, PDFA) -> {PDFA,Rs}. + +%% comp_apply(String, StartPos, DFAReg) -> {match,RestPos,Rest} | nomatch. +%% Apply the DFA of a regular expression to a string. If +%% there is a match return the position of the remaining string and +%% the string if else return 'nomatch'. +%% +%% StartPos should be the real start position as it is used to decide +%% if we are at the beginning of the string. + +comp_apply(Cs, P, {DFA,Start,_Fail}) -> + comp_apply(element(Start, DFA), Cs, P, DFA, nomatch). + +comp_apply(#c_state{spec=[]}=St, Cs, P, DFA, Accept) -> + comp_apply_tr(St, Cs, P, DFA, Accept); +comp_apply(#c_state{spec=Sp}=St, Cs, P, DFA, Accept) -> + comp_apply_sp(St, Cs, P, DFA, Accept, Sp). + +comp_apply_tr(#c_state{trans=none,accept=A}, Cs, P, _DFA, Accept) -> + %% End state. + accept_value(A, Cs, P, Accept); +comp_apply_tr(#c_state{trans=Tr,tmin=Tmin,smin=Smin,tmax=Tmax,smax=Smax,accept=A}, + [C|Cs]=Cs0, P, DFA, Accept) -> + %% Get the next state number to go to. + NextSt = if C =< Tmin -> Smin; %Below transition table + C >= Tmax -> Smax; %Above transition table + true -> %Otherwise use table + element(C - Tmin, Tr) + end, + comp_apply(element(NextSt, DFA), Cs, P+1, DFA, + accept_value(A, Cs0, P, Accept)); +comp_apply_tr(#c_state{trans=_Tr,accept=A}, [], P, _DFA, Accept) -> + accept_value(A, [], P, Accept). + +comp_apply_sp(_St, Cs, 1, DFA, Accept, [{bos,S}|_]) -> + comp_apply(element(S, DFA), Cs, 1, DFA, Accept); +comp_apply_sp(_St, [$\n], P, DFA, Accept, [{eos,S}|_]) -> + comp_apply(element(S, DFA), [], P, DFA, Accept); +comp_apply_sp(_St, [], P, DFA, Accept, [{eos,S}|_]) -> + comp_apply(element(S, DFA), [], P, DFA, Accept); +comp_apply_sp(St, Cs, P, DFA, Accept, [_|Sp]) -> + comp_apply_sp(St, Cs, P, DFA, Accept, Sp); +comp_apply_sp(St, Cs, P, DFA, Accept, []) -> + comp_apply_tr(St, Cs, P, DFA, Accept). + +accept_value(true, Cs, P, _Accept) -> {match,P,Cs}; +accept_value(false, _Cs, _P, Accept) -> Accept. + +%% compile(RegExp) -> {ok,RE} | {error,E}. +%% Parse the regexp described in the string RegExp. + +compile(RegExp) -> + case make_dfa([{RegExp,yes}], 2) of + {ok,{DFA0,Start}} -> + Fail = 1, + DFA1 = [#dfa_state{no=Fail,accept=no,trans=[]}|DFA0], + DFA = tuplelise_dfa(DFA1, 1), + {ok,{comp_regexp,{DFA,Start,Fail}}}; + {error,E} -> {error,E} + end. + +%% tuplelise_dfa(DFAstates, NoAcceptState) -> {{CompState},FirstState}. + +tuplelise_dfa(DFA0, NoAccept) -> + DFA1 = map(fun (#dfa_state{no=N,trans=Ts,accept=A}) -> + {Tr,Tmin,Smin,Tmax,Smax,Sp} = build_trans(Ts, NoAccept), + #c_state{no=N,trans=Tr,tmin=Tmin,smin=Smin, + tmax=Tmax,smax=Smax, + accept=fix_accept(A),spec=Sp} + end, DFA0), + list_to_tuple(keysort(#dfa_state.no, DFA1)). + +build_trans(Ts0, NoAccept) -> + %% Split transitions into character ranges and specials. + {Ts1,Sp1} = foldl(fun ({{_,_},_}=T, {Ts,Sp}) -> {[T|Ts],Sp}; + ({_,_}=T, {Ts,Sp}) -> {Ts,[T|Sp]} + end, {[],[]}, Ts0), + if Ts1 == [] -> + {none,none,none,none,none,Sp1}; + true -> + %% Have transitions, convert to tuple. + Ts2 = keysort(1, Ts1), + {Tmin,Smin,Ts3} = min_trans(Ts2, NoAccept), + %% io:fwrite("exptr: ~p\n", [{Ts3,Tmin}]), + {Trans,Tmax,Smax} = expand_trans(Ts3, Tmin, NoAccept), + {list_to_tuple(Trans),Tmin,Smin,Tmax,Smax,Sp1} + end. + +min_trans([{{0,C2},S}|Crs], _Def) -> {C2,S,Crs}; +min_trans([{{C1,_C2},_S}|_]=Crs, Def) -> {C1-1,Def,Crs}. + +expand_trans([{{C1,maxchar},S}], Last, Def) -> + Trs = duplicate(C1-(Last+1), Def), + {Trs,C1,S}; +expand_trans([{{C1,C2},S}], Last, Def) -> + Trs = duplicate(C1-(Last+1), Def) ++ duplicate(C2-C1+1, S), + {Trs,C2+1,Def}; +expand_trans([{{C1,C2},S}|Crs], Last, Def) -> + {Trs0,Tmax,Smax} = expand_trans(Crs, C2, Def), + Trs1 = duplicate(C1-(Last+1), Def) ++ duplicate(C2-C1+1, S) ++ Trs0, + {Trs1,Tmax,Smax}. + +fix_accept({yes,_}) -> true; +fix_accept(no) -> false. + diff --git a/lib/xmerl/src/xmerl_sax_old_dom.erl b/lib/xmerl/src/xmerl_sax_old_dom.erl new file mode 100644 index 0000000000..c357816a1e --- /dev/null +++ b/lib/xmerl/src/xmerl_sax_old_dom.erl @@ -0,0 +1,293 @@ +%%-*-erlang-*- +%%-------------------------------------------------------------------- +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%%---------------------------------------------------------------------- +%% File : xmerl_sax_old_dom.erl +%% Description : +%% +%% Created : 02 Oct 2008 +%%---------------------------------------------------------------------- +-module(xmerl_sax_old_dom). + +%%---------------------------------------------------------------------- +%% Include files +%%---------------------------------------------------------------------- +-include("xmerl_sax_old_dom.hrl"). + +%%---------------------------------------------------------------------- +%% External exports +%%---------------------------------------------------------------------- +-export([ + initial_state/0, + get_dom/1, + event/3 + ]). + +%%---------------------------------------------------------------------- +%% Internal exports +%%---------------------------------------------------------------------- +-export([ + ]). + +%%====================================================================== +%% Macros +%%====================================================================== +%%---------------------------------------------------------------------- +%% Error handling +%%---------------------------------------------------------------------- +-define(error(Reason), + throw({xmerl_sax_old_dom_error, Reason})). + +%%====================================================================== +%% Records +%%====================================================================== + +%%---------------------------------------------------------------------- +%% State record for the validator +%%---------------------------------------------------------------------- +-record(xmerl_sax_old_dom_state, { + tags=[], %% Tag stack + cno=[], %% Current node number + namespaces = [], %% NameSpace stack + dom=[] %% DOM structure + }). + +%%====================================================================== +%% External functions +%%====================================================================== +%%---------------------------------------------------------------------- +%% Function: initial_state() -> Result +%% Parameters: +%% Result: +%% Description: +%%---------------------------------------------------------------------- +initial_state() -> + #xmerl_sax_old_dom_state{}. + +%%---------------------------------------------------------------------- +%% Function: get_dom(State) -> Result +%% Parameters: +%% Result: +%% Description: +%%---------------------------------------------------------------------- +get_dom(#xmerl_sax_old_dom_state{dom=Dom}) -> + Dom. + +%%---------------------------------------------------------------------- +%% Function: event(Event, LineNo, State) -> Result +%% Parameters: +%% Result: +%% Description: +%%---------------------------------------------------------------------- +event(Event, _LineNo, State) -> + build_dom(Event, State). + + +%%====================================================================== +%% Internal functions +%%====================================================================== + +%%---------------------------------------------------------------------- +%% Function : build_dom(Event, State) -> Result +%% Parameters: Event = term() +%% State = #xmerl_sax_old_dom_state{} +%% Result : #xmerl_sax_old_dom_state{} | +%% Description: +%%---------------------------------------------------------------------- + +%% Document +%%---------------------------------------------------------------------- +build_dom(startDocument, State) -> + State#xmerl_sax_old_dom_state{dom=[startDocument]}; +build_dom(endDocument, + #xmerl_sax_old_dom_state{dom=[#xmlElement{content=C} = Current |D]} = State) -> + case D of + [startDocument] -> + State#xmerl_sax_old_dom_state{dom=[Current#xmlElement{ + content=lists:reverse(C) + }]}; + [#xmlDecl{} = Decl, startDocument] -> + State#xmerl_sax_old_dom_state{dom=[Decl, Current#xmlElement{ + content=lists:reverse(C) + }]}; + _ -> + io:format("~p\n", [D]), + ?error("we're not at end the document when endDocument event is encountered.") + end; + +%% Element +%%---------------------------------------------------------------------- +build_dom({startElement, Uri, LocalName, QName, Attributes}, + #xmerl_sax_old_dom_state{tags=T, cno=CN, namespaces=NS, dom=D} = State) -> + + A = parse_attributes(LocalName, Attributes), + {Num, NewCN} = + case CN of + [] -> + {1, [1]}; + [ N |CNs] -> + {N, [1, N+1 |CNs]} + end, + + NsInfo = + case QName of + {[], _} -> []; + QN -> QN + end, + NameAsAtom = convert_qname_to_atom(QName), + + State#xmerl_sax_old_dom_state{tags=[{NameAsAtom, Num} |T], + cno=NewCN, + dom=[#xmlElement{name=NameAsAtom, + expanded_name=NameAsAtom, + nsinfo=NsInfo, + namespace=#xmlNamespace{default=list_to_atom(Uri), + nodes=NS}, + pos=Num, + parents=T, + attributes=lists:reverse(A), + xmlbase="." + } | D]}; +build_dom({endElement, _Uri, LocalName, QName}, + #xmerl_sax_old_dom_state{tags=[_ |T], + cno=[_ |CN], + dom=[#xmlElement{name=CName, content=C} = Current, + #xmlElement{content=PC} = Parent | D]} = State) -> + case convert_qname_to_atom(QName) of + CName -> + State#xmerl_sax_old_dom_state{tags=T, + cno=CN, + dom=[Parent#xmlElement{ + content=[Current#xmlElement{ + content=lists:reverse(C) + } + |PC] + } | D]}; + _ -> + ?error("Got end of element: " ++ LocalName ++ " but expected: " ++ + Current#xmlElement.name) + end; + +%% Text +%%---------------------------------------------------------------------- +build_dom({characters, String}, + #xmerl_sax_old_dom_state{tags=T, + cno=[Num |CN], + dom=[#xmlElement{content=C} = Current| D]} = State) -> + State#xmerl_sax_old_dom_state{cno=[Num+1 |CN], + dom=[Current#xmlElement{content=[#xmlText{value=String, parents=T, pos=Num, type=text} + |C]} | D]}; +build_dom({ignorableWhitespace, String}, + #xmerl_sax_old_dom_state{tags=T, + cno=[Num |CN], + dom=[#xmlElement{content=C} = Current| D]} = State) -> + State#xmerl_sax_old_dom_state{cno=[Num+1 |CN], + dom=[Current#xmlElement{content=[#xmlText{value=String, + parents=T, pos=Num, + type=text} + |C]} | D]}; + +%% Comments +%%---------------------------------------------------------------------- +build_dom({comment, String}, + #xmerl_sax_old_dom_state{tags=T, + cno=[Num |CN], + dom=[#xmlElement{content=C} = Current| D]} = State) -> + State#xmerl_sax_old_dom_state{cno=[Num+1 |CN], + dom=[Current#xmlElement{content=[#xmlComment{parents=T, pos=Num, value=String}|C]} | D]}; + +%% NameSpaces +%%---------------------------------------------------------------------- +build_dom({startPrefixMapping, [], _Uri}, State) -> + State; +build_dom({startPrefixMapping, Prefix, Uri}, + #xmerl_sax_old_dom_state{namespaces=NS} = State) -> + State#xmerl_sax_old_dom_state{namespaces=[{Prefix, list_to_atom(Uri)} |NS]}; +build_dom({endPrefixMapping, Prefix}, + #xmerl_sax_old_dom_state{namespaces=[{Prefix, _} |NS]} = State) -> + State#xmerl_sax_old_dom_state{namespaces=NS}; + +%% Processing instructions +%%---------------------------------------------------------------------- +build_dom({processingInstruction,"xml", PiData}, + #xmerl_sax_old_dom_state{dom=D} = State) -> + {Vsn, PiData1} = find_and_remove_attribute("version", PiData, []), + {Enc, PiData2} = find_and_remove_attribute("encoding", PiData1, []), + {Standalone, PiData3} = find_and_remove_attribute("standalone", PiData2, yes), + State#xmerl_sax_old_dom_state{dom=[#xmlDecl{vsn=Vsn, encoding=Enc, standalone=Standalone, attributes=PiData3}| D]}; +build_dom({processingInstruction, PiTarget, PiData}, + #xmerl_sax_old_dom_state{cno=[Num |CN], + dom=[#xmlElement{content=C} = Current| D]} = State) -> + State#xmerl_sax_old_dom_state{cno=[Num+1 |CN], + dom=[Current#xmlElement{content=[#xmlPI{name=PiTarget,pos=Num, value=PiData} + |C]} | D]}; +%% Default +%%---------------------------------------------------------------------- +build_dom(_E, State) -> + State. + + +%%---------------------------------------------------------------------- +%% Function : parse_attributes(ElName, Attributes) -> Result +%% Parameters: +%% Result : +%% Description: +%%---------------------------------------------------------------------- +parse_attributes(ElName, Attributes) -> + parse_attributes(ElName, Attributes, 1, []). + +parse_attributes(_, [], _, Acc) -> + Acc; +parse_attributes(ElName, [{_Uri, Prefix, LocalName, AttrValue} |As], N, Acc) -> + Name = convert_qname_to_atom({Prefix,LocalName}), + NsInfo = + case Prefix of + [] -> []; + P -> {P,LocalName} + end, + parse_attributes(ElName, As, N+1, [#xmlAttribute{name=Name, + pos=N, + nsinfo=NsInfo, + value=AttrValue, + normalized=false} |Acc]). + +%%---------------------------------------------------------------------- +%% Function : convert_qname_to_atom(QName) -> Result +%% Parameters: +%% Result : +%% Description: +%%---------------------------------------------------------------------- +convert_qname_to_atom({[], N}) -> + list_to_atom(N); +convert_qname_to_atom({P,N}) -> + list_to_atom(P ++ ":" ++ N). + +%%---------------------------------------------------------------------- +%% Function : find_and_remove_attribute(Key, Data, Default) -> Result +%% Parameters: +%% Result : +%% Description: +%%---------------------------------------------------------------------- +find_and_remove_attribute(Key, Data, Default) -> + case lists:keysearch(Key, 1, Data) of + {value, {Key, Value}} -> + Data2 = lists:keydelete(Key, 1, Data), + {Value, Data2}; + false -> + {Default, Data} + end. diff --git a/lib/xmerl/src/xmerl_sax_old_dom.hrl b/lib/xmerl/src/xmerl_sax_old_dom.hrl new file mode 100644 index 0000000000..92000f8d7f --- /dev/null +++ b/lib/xmerl/src/xmerl_sax_old_dom.hrl @@ -0,0 +1,138 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% Contributor(s): +%% <[email protected]>: suggested #xmlDocument{} +%% +%%---------------------------------------------------------------------- +%% #0. BASIC INFORMATION +%%---------------------------------------------------------------------- +%% File: xmerl.hrl +%% Author : Ulf Wiger <[email protected]> +%% Date : 00-09-22 +%% Description : Record and macro definitions for xmerl +%%---------------------------------------------------------------------- + + + +%% records generated by the scanner +%% -------------------------------- + +%% XML declaration +-record(xmlDecl,{ + vsn, % string() XML version + encoding, % string() Character encoding + standalone, % (yes | no) + attributes % [#xmlAttribute()] Other attributes than above + }). + +%% Attribute +-record(xmlAttribute,{ + name, % atom() + expanded_name=[],% atom() | {string(),atom()} + nsinfo = [], % {Prefix, Local} | [] + namespace = [], % inherits the element's namespace + parents = [], % [{atom(),integer()}] + pos, % integer() + language = [], % inherits the element's language + value, % IOlist() | atom() | integer() + normalized % atom() one of (true | false) + }). + +%% namespace record +-record(xmlNamespace,{ + default = [], + nodes = [] + }). + +%% namespace node - i.e. a {Prefix, URI} pair +%% TODO: these are not currently used?? /RC +-record(xmlNsNode,{ + prefix, + uri = [] + }). + +%% XML Element +%% content = [#xmlElement()|#xmlText()|#xmlPI()|#xmlComment()|#xmlDecl()] +-record(xmlElement,{ + name, % atom() + expanded_name = [], % string() | {URI,Local} | {"xmlns",Local} + nsinfo = [], % {Prefix, Local} | [] + namespace=#xmlNamespace{}, + parents = [], % [{atom(),integer()}] + pos, % integer() + attributes = [], % [#xmlAttribute()] + content = [], + language = "", % string() + xmlbase="", % string() XML Base path, for relative URI:s + elementdef=undeclared % atom(), one of [undeclared | prolog | external | element] + }). + +%% plain text +%% IOlist = [char() | binary () | IOlist] +-record(xmlText,{ + parents = [], % [{atom(),integer()}] + pos, % integer() + language = [],% inherits the element's language + value, % IOlist() + type = text % atom() one of (text|cdata) + }). + +%% plain text +-record(xmlComment,{ + parents = [], % [{atom(),integer()}] + pos, % integer() + language = [], % inherits the element's language + value % IOlist() + }). + +%% processing instruction +-record(xmlPI,{ + name, % atom() + pos, % integer() + value % IOlist() + }). + +-record(xmlDocument,{ + content + }). + + +%% XPATH (xmerl_xpath, xmerl_pred_funcs) records + +-record(xmlContext, { + axis_type = forward, + context_node, + context_position = 1, + nodeset = [], + bindings = [], + functions = [], + namespace = [], + whole_document + }). + +-record(xmlNode, { + type = element, + node, + parents = [], + pos = 1 + }). + +-record(xmlObj, { + type, + value + }). diff --git a/lib/xmerl/src/xmerl_sax_parser.erl b/lib/xmerl/src/xmerl_sax_parser.erl new file mode 100644 index 0000000000..eb9f8deec6 --- /dev/null +++ b/lib/xmerl/src/xmerl_sax_parser.erl @@ -0,0 +1,399 @@ +%%-------------------------------------------------------------------- +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2008-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%%---------------------------------------------------------------------- +%% File : xmerl_sax_parser.erl +%% Description : XML SAX parse API module. +%% +%% Created : 4 Jun 2008 +%%---------------------------------------------------------------------- +-module(xmerl_sax_parser). + +%%---------------------------------------------------------------------- +%% Include files +%%---------------------------------------------------------------------- +-include("xmerl_sax_parser.hrl"). + +%%---------------------------------------------------------------------- +%% External exports +%%---------------------------------------------------------------------- +-export([ + file/2, + stream/2 + ]). + +%%---------------------------------------------------------------------- +%% Internal exports +%%---------------------------------------------------------------------- +-export([ + default_continuation_cb/1 + ]). + +%%---------------------------------------------------------------------- +%% Macros +%%---------------------------------------------------------------------- + +%%---------------------------------------------------------------------- +%% Records +%%---------------------------------------------------------------------- + +%%====================================================================== +%% External functions +%%====================================================================== +%%---------------------------------------------------------------------- +%% Function: file(Filename, Options) -> Result +%% Input: Filename = string() +%% Options = [{OptTag, term()}] +%% OptTag = event_state | event_fun | continuation_state | +%% continuation_fun | .... +%% Output: Result = {ok, EventState, Rest} +%% Rest = unicode_binary() | latin1_binary() +%% EventState = term() +%% Description: Parse file containing an XML document. +%%---------------------------------------------------------------------- +file(Name,Options) -> + case file:open(Name, [raw, read,binary]) of + {error, Reason} -> + {error,{Name, file:format_error(Reason)}}; + {ok, FD} -> + Dir = filename:dirname(Name), + CL = filename:absname(Dir), + File = filename:basename(Name), + ContinuationFun = fun default_continuation_cb/1, + Res = stream(<<>>, [{continuation_fun, ContinuationFun}, + {continuation_state, FD}, + {current_location, CL}, + {entity, File} + |Options]), + file:close(FD), + Res + end. + +%%---------------------------------------------------------------------- +%% Function: stream(Xml, Options) -> Result +%% Input: Xml = string() | binary() +%% Options = [{OptTag, term()}] +%% OptTag = event_state | event_fun | continuation_state | +%% continuation_fun | .... +%% Output: Result = {ok, EventState, Rest} +%% Rest = unicode_binary() | latin1_binary() | [unicode_char()] +%% EventState = term() +%% Description: Parse a stream containing an XML document. +%%---------------------------------------------------------------------- +stream(Xml, Options) when is_list(Xml), is_list(Options) -> + State = parse_options(Options, initial_state()), + case State#xmerl_sax_parser_state.file_type of + dtd -> + xmerl_sax_parser_list:parse_dtd(Xml, State#xmerl_sax_parser_state{encoding = list}); + normal -> + xmerl_sax_parser_list:parse(Xml, State#xmerl_sax_parser_state{encoding = list}) + end; +stream(Xml, Options) when is_binary(Xml), is_list(Options) -> + case parse_options(Options, initial_state()) of + {error, Reason} -> {error, Reason}; + State -> + ParseFunction = + case State#xmerl_sax_parser_state.file_type of + dtd -> + parse_dtd; + normal -> + parse + end, + case detect_charset(Xml, State) of + {error, Reason} -> {fatal_error, + { + State#xmerl_sax_parser_state.current_location, + State#xmerl_sax_parser_state.entity, + 1 + }, + Reason, + [], + State#xmerl_sax_parser_state.event_state}; + {Xml1, State1} -> + parse(Xml1, State1, ParseFunction) + end + end. + + +%%====================================================================== +%% Internal functions +%%====================================================================== + +%%---------------------------------------------------------------------- +%% Function: parse(Encoding, Xml, State, F) -> Result +%% Input: Encoding = atom() +%% Xml = [integer()] | binary() +%% State = #xmerl_sax_parser_state +%% F = atom() +%% Output: Result = {ok, Rest, EventState} +%% Rest = list() | binary() +%% EventState = term() +%% Description: Chooses the correct parser depending on the encoding. +%%---------------------------------------------------------------------- +parse(Xml, #xmerl_sax_parser_state{encoding=utf8}=State, F) -> + xmerl_sax_parser_utf8:F(Xml, State); +parse(Xml, #xmerl_sax_parser_state{encoding={utf16,little}}=State, F) -> + xmerl_sax_parser_utf16le:F(Xml, State); +parse(Xml, #xmerl_sax_parser_state{encoding={utf16,big}}=State, F) -> + xmerl_sax_parser_utf16be:F(Xml, State); +parse(Xml, #xmerl_sax_parser_state{encoding=latin1}=State, F) -> + xmerl_sax_parser_latin1:F(Xml, State); +parse(_, #xmerl_sax_parser_state{encoding=Enc}, _) -> + {error, lists:flatten(io_lib:format("Charcter set ~p not supported", [Enc]))}. + +%%---------------------------------------------------------------------- +%% Function: initial_state/0 +%% Input: - +%% Output: #xmerl_sax_parser_state{} +%% Description: Creates the initial state record. +%%---------------------------------------------------------------------- +initial_state() -> + #xmerl_sax_parser_state{ + event_fun = fun default_event_cb/3, + ns = [{"xml", "http://www.w3.org/XML/1998/namespace"}], + current_location = ".", + entity = "" + }. + +%%---------------------------------------------------------------------- +%% Function: parse_options(Options, State) +%% Input: Options = [Option] +%% Option = {event_state, term()} | {event_fun, fun()} | +%% {continuation_state, term()} | {continuation_fun, fun()} | +%% {encoding, Encoding} | {file_type, FT} +%% FT = normal | dtd +%% Encoding = utf8 | utf16le | utf16be | list | iso8859 +%% State = #xmerl_sax_parser_state{} +%% Output: #xmerl_sax_parser_state{} +%% Description: Checks the parser options. +%%---------------------------------------------------------------------- +parse_options([], State) -> + State; +parse_options([{event_state, CbState} |Options], State) -> + parse_options(Options, State#xmerl_sax_parser_state{event_state = CbState}); +parse_options([{event_fun, CbF} |Options], State) -> + parse_options(Options, State#xmerl_sax_parser_state{event_fun = CbF}); +parse_options([{continuation_state, CState} |Options], State) -> + parse_options(Options, State#xmerl_sax_parser_state{continuation_state = CState}); +parse_options([{continuation_fun, CF} |Options], State) -> + parse_options(Options, State#xmerl_sax_parser_state{continuation_fun = CF}); +parse_options([{file_type, FT} |Options], State) when FT==normal; FT==dtd -> + parse_options(Options, State#xmerl_sax_parser_state{file_type = FT}); +parse_options([{encoding, E} |Options], State) -> + case check_encoding_option(E) of + {error, Reason} -> + {error, Reason}; + Enc -> + parse_options(Options, State#xmerl_sax_parser_state{encoding = Enc}) + end; +parse_options([{current_location, CL} |Options], State) -> + parse_options(Options, State#xmerl_sax_parser_state{current_location = CL}); +parse_options([{entity, Entity} |Options], State) -> + parse_options(Options, State#xmerl_sax_parser_state{entity = Entity}); +parse_options([skip_external_dtd |Options], State) -> + parse_options(Options, State#xmerl_sax_parser_state{skip_external_dtd = true}); +parse_options([O |_], _State) -> + {error, + lists:flatten(io_lib:format("Option: ~p not supported", [O]))}. + + +check_encoding_option(E) when E==utf8; E=={utf16,little}; E=={utf16,big}; + E==latin1; E==list -> + E; +check_encoding_option(utf16) -> + {utf16,big}; +check_encoding_option(E) -> + {error, io_lib:format("Charcter set ~p not supported", [E])}. + +%%---------------------------------------------------------------------- +%% Function: detect_charset(Xml, State) +%% Input: Xml = list() | binary() +%% State = #xmerl_sax_parser_state{} +%% Output: {utf8|utf16le|utf16be|iso8859, Xml, State} +%% Description: Detects which character set is used in a binary stream. +%%---------------------------------------------------------------------- +detect_charset(<<>>, #xmerl_sax_parser_state{continuation_fun = undefined} = _) -> + throw({error, "Can't detect character encoding due to no indata"}); +detect_charset(<<>>, #xmerl_sax_parser_state{continuation_fun = CFun, + continuation_state = CState} = State) -> + case CFun(CState) of + {<<>>, _} -> + throw({error, "Can't detect character encoding due to lack of indata"}); + {NewBytes, NewContState} -> + detect_charset(NewBytes, State#xmerl_sax_parser_state{continuation_state = NewContState}) + end; +detect_charset(Bytes, State) -> + case unicode:bom_to_encoding(Bytes) of + {latin1, 0} -> + detect_charset_1(Bytes, State); + {Enc, Length} -> + <<_:Length/binary, RealBytes/binary>> = Bytes, + {RealBytes, State#xmerl_sax_parser_state{encoding=Enc}} + end. + +detect_charset_1(<<16#00, 16#3C, 16#00, 16#3F, _/binary>> = Xml, State) -> + {Xml, State#xmerl_sax_parser_state{encoding={utf16, big}}}; +detect_charset_1(<<16#3C, 16#00, 16#3F, 16#00, _/binary>> = Xml, State) -> + {Xml, State#xmerl_sax_parser_state{encoding={utf16, little}}}; +detect_charset_1(<<16#3C, 16#3F, 16#78, 16#6D, 16#6C, Xml2/binary>> = Xml, State) -> + case parse_xml_directive(Xml2) of + {error, Reason} -> + {error, Reason}; + AttrList -> + case lists:keysearch("encoding", 1, AttrList) of + {value, {_, E}} -> + case convert_encoding(E) of + {error, Reason} -> + {error, Reason}; + Enc -> + {Xml, State#xmerl_sax_parser_state{encoding=Enc}} + end; + _ -> + {Xml, State#xmerl_sax_parser_state{encoding=utf8}} + end + end; +detect_charset_1(Xml, State) -> + {Xml, State#xmerl_sax_parser_state{encoding=utf8}}. + +%%---------------------------------------------------------------------- +%% Function: convert_encoding(Enc) +%% Input: Enc = string() +%% Output: utf8 | iso8859 +%% Description: Converting 7,8 bit and utf8 encoding strings to internal format. +%%---------------------------------------------------------------------- +convert_encoding(Enc) -> %% Just for 7,8 bit + utf8 + case string:to_lower(Enc) of + "utf-8" -> utf8; + "iso-8859-1" -> latin1; % Handle all iso-8859 as latin1 + "iso-8859-2" -> latin1; + "iso-8859-3" -> latin1; + "iso-8859-4" -> latin1; + "iso-8859-5" -> latin1; + "iso-8859-6" -> latin1; + "iso-8859-7" -> latin1; + "iso-8859-8" -> latin1; + "iso-8859-9" -> latin1; + _ -> {error, "Unknown encoding: " ++ Enc} + end. + +%%---------------------------------------------------------------------- +%% Function: parse_xml_directive(Xml) +%% Input: Xml = binary() +%% Acc = list() +%% Output: +%% Description: Parsing the xml declaration from the input stream. +%%---------------------------------------------------------------------- +parse_xml_directive(<<C, Rest/binary>>) when ?is_whitespace(C) -> + parse_xml_directive_1(Rest, []). + +%%---------------------------------------------------------------------- +%% Function: parse_xml_directive_1(Xml, Acc) -> [{Name, Value}] +%% Input: Xml = binary() +%% Acc = [{Name, Value}] +%% Name = string() +%% Value = string() +%% Output: see above +%% Description: Parsing the xml declaration from the input stream. +%%---------------------------------------------------------------------- +parse_xml_directive_1(<<C, Rest/binary>>, Acc) when ?is_whitespace(C) -> + parse_xml_directive_1(Rest, Acc); +parse_xml_directive_1(<<"?>", _/binary>>, Acc) -> + Acc; +parse_xml_directive_1(<<C, Rest/binary>>, Acc) when 97 =< C, C =< 122 -> + {Name, Rest1} = parse_name(Rest, [C]), + Rest2 = parse_eq(Rest1), + {Value, Rest3} = parse_value(Rest2), + parse_xml_directive_1(Rest3, [{Name, Value} |Acc]); +parse_xml_directive_1(_, _) -> + {error, "Unknown attribute in xml directive"}. + +%%---------------------------------------------------------------------- +%% Function: parse_xml_directive_1(Xml, Acc) -> Name +%% Input: Xml = binary() +%% Acc = string() +%% Output: Name = string() +%% Description: Parsing an attribute name from the stream. +%%---------------------------------------------------------------------- +parse_name(<<C, Rest/binary>>, Acc) when 97 =< C, C =< 122 -> + parse_name(Rest, [C |Acc]); +parse_name(Rest, Acc) -> + {lists:reverse(Acc), Rest}. + +%%---------------------------------------------------------------------- +%% Function: parse_eq(Xml) -> Rest +%% Input: Xml = binary() +%% Output: Rest = binary() +%% Description: Reads an '=' from the stream. +%%---------------------------------------------------------------------- +parse_eq(<<C, Rest/binary>>) when ?is_whitespace(C) -> + parse_eq(Rest); +parse_eq(<<"=", Rest/binary>>) -> + Rest. + +%%---------------------------------------------------------------------- +%% Function: parse_value(Xml) -> {Value, Rest} +%% Input: Xml = binary() +%% Output: Value = string() +%% Rest = binary() +%% Description: Parsing an attribute value from the stream. +%%---------------------------------------------------------------------- +parse_value(<<C, Rest/binary>>) when ?is_whitespace(C) -> + parse_value(Rest); +parse_value(<<C, Rest/binary>>) when C == $'; C == $" -> + parse_value_1(Rest, C, []). + +%%---------------------------------------------------------------------- +%% Function: parse_value_1(Xml, Stop, Acc) -> {Value, Rest} +%% Input: Xml = binary() +%% Stop = $' | $" +%% Acc = list() +%% Output: Value = string() +%% Rest = binary() +%% Description: Parsing an attribute value from the stream. +%%---------------------------------------------------------------------- +parse_value_1(<<Stop, Rest/binary>>, Stop, Acc) -> + {lists:reverse(Acc), Rest}; +parse_value_1(<<C, Rest/binary>>, Stop, Acc) -> + parse_value_1(Rest, Stop, [C |Acc]). + +%%====================================================================== +%%Default functions +%%====================================================================== +%%---------------------------------------------------------------------- +%% Function: default_event_cb(Event, LineNo, State) -> Result +%% Input: Event = tuple() +%% LineNo = integer() +%% State = term() +%% Output: Result = {ok, State} +%% Description: Default event callback printing event. +%%---------------------------------------------------------------------- +default_event_cb(_Event, _LineNo, State) -> + State. + +%%---------------------------------------------------------------------- +%% Function: default_continuation_cb(IoDevice) -> Result +%% IoDevice = iodevice() +%% Output: Result = {[char()], State} +%% Description: Default continuation callback reading blocks. +%%---------------------------------------------------------------------- +default_continuation_cb(IoDevice) -> + case file:read(IoDevice, 1024) of + eof -> + {<<>>, IoDevice}; + {ok, FileBin} -> + {FileBin, IoDevice} + end. diff --git a/lib/xmerl/src/xmerl_sax_parser.hrl b/lib/xmerl/src/xmerl_sax_parser.hrl new file mode 100644 index 0000000000..736316e069 --- /dev/null +++ b/lib/xmerl/src/xmerl_sax_parser.hrl @@ -0,0 +1,93 @@ +%%-------------------------------------------------------------------- +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2008-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%%---------------------------------------------------------------------- +%% File : xmerl_sax_parser.hrl +%% Description : +%% +%% Created : 25 Jun 2008 +%%---------------------------------------------------------------------- +%%====================================================================== +%% Include files +%%====================================================================== + + +%%====================================================================== +%% Macros +%%====================================================================== + +%%---------------------------------------------------------------------- +%% Definition of XML whitespace characters. These are 'space', +%% 'carriage return', 'line feed' and 'tab' +%%---------------------------------------------------------------------- +-define(is_whitespace(C), C=:=?space ; C=:=?cr ; C=:=?lf ; C=:=?tab). +-define(space, 32). +-define(cr, 13). +-define(lf, 10). +-define(tab, 9). + +%%---------------------------------------------------------------------- +%% Definition of hexadecimal digits +%%---------------------------------------------------------------------- +-define(is_hex_digit(C), $0 =< C, C =< $9; $a =< C, C =< $f; $A =< C, C =< $F). + +%%---------------------------------------------------------------------- +%% Definition of XML charcters +%% +%% [2] Char #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] +%%---------------------------------------------------------------------- +-define(is_char(C), ?space =< C, C =< 55295; C=:=?cr ; C=:=?lf ; C=:=?tab; + 57344 =< C, C =< 65533; 65536 =< C, C =< 1114111). + +%% non-characters according to Unicode: 16#ffff and 16#fffe +%% -define(non_character(H1,H2), H1==16#ff,H2==16#fe;H1==16#ff,H2==16#ff). +%% -define(non_ascii(H), list(H),hd(H)>=128;integer(H),H>=128). + +%%---------------------------------------------------------------------- +%% Error handling +%%---------------------------------------------------------------------- +-define(fatal_error(State, Reason), + throw({fatal_error, {State, Reason}})). + +%%====================================================================== +%% Records +%%====================================================================== + +%%---------------------------------------------------------------------- +%% State record for the SAX parser +%%---------------------------------------------------------------------- +-record(xmerl_sax_parser_state, { + event_state, % User state for events + event_fun, % Fun used for each event + continuation_state, % User state for continuation calls + continuation_fun, % Fun used to fetch more input + encoding=utf8, % Which encoding is used + line_no = 1, % Current line number + ns = [], % List of current namespaces + current_tag = [], % Current tag + end_tags = [], % Stack of tags used for end tag matching + match_end_tags = true, % Flag which defines if the parser should match on end tags + ref_table, % Table containing entitity definitions + standalone = no, % yes if the document is standalone and don't need an external DTD. + file_type = normal, % Can be normal, dtd and entity + current_location, % Location of the currently parsed XML entity + entity, % Parsed XML entity + skip_external_dtd = false % If true the external DTD is skipped during parsing + }). + + + diff --git a/lib/xmerl/src/xmerl_sax_parser_base.erlsrc b/lib/xmerl/src/xmerl_sax_parser_base.erlsrc new file mode 100644 index 0000000000..9d184152d1 --- /dev/null +++ b/lib/xmerl/src/xmerl_sax_parser_base.erlsrc @@ -0,0 +1,3571 @@ +%%-*-erlang-*- +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2008-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%%---------------------------------------------------------------------- +%% Start of common source +%%---------------------------------------------------------------------- +%-compile(export_all). + +%%---------------------------------------------------------------------- +%% Include files +%%---------------------------------------------------------------------- +-include("xmerl_sax_parser.hrl"). + +%%---------------------------------------------------------------------- +%% External exports +%%---------------------------------------------------------------------- +-export([ + parse/2, + parse_dtd/2, + is_name_char/1, + is_name_start/1 + ]). + +%%---------------------------------------------------------------------- +%% Internal exports +%%---------------------------------------------------------------------- +-export([ + ]). + +%%---------------------------------------------------------------------- +%% Records +%%---------------------------------------------------------------------- + +%%---------------------------------------------------------------------- +%% Macros +%%---------------------------------------------------------------------- +-define(HTTP_DEF_PORT, 80). + +%%====================================================================== +%% External functions +%%====================================================================== +%%---------------------------------------------------------------------- +%% Function: parse(Xml, State) -> Result +%% Input: Xml = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Output: Result = {ok, Rest, EventState} | +%% EventState = term() +%% Description: Parsing XML from input stream. +%%---------------------------------------------------------------------- +parse(Xml, State) -> + RefTable = ets:new(xmerl_sax_entity_refs, [private]), + + State1 = event_callback(startDocument, State), + + case catch parse_document(Xml, State1#xmerl_sax_parser_state{ref_table=RefTable}) of + {ok, Rest, State2} -> + State3 = event_callback(endDocument, State2), + ets:delete(RefTable), + {ok, State3#xmerl_sax_parser_state.event_state, Rest}; + {fatal_error, {State2, Reason}} -> + State3 = event_callback(endDocument, State2), + ets:delete(RefTable), + format_error(fatal_error, State3, Reason); + {event_receiver_error, State2, {Tag, Reason}} -> + State3 = event_callback(endDocument, State2), + ets:delete(RefTable), + format_error(Tag, State3, Reason); + Other -> + _State2 = event_callback(endDocument, State1), + ets:delete(RefTable), + throw(Other) + end. + +%%---------------------------------------------------------------------- +%% Function: parse_dtd(Xml, State) -> Result +%% Input: Xml = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Output: Result = {ok, Rest, EventState} | +%% EventState = term() +%% Description: Parsing XML DTD from input stream. +%%---------------------------------------------------------------------- +parse_dtd(Xml, State) -> + RefTable = ets:new(xmerl_sax_entity_refs, [private]), + + State1 = event_callback(startDocument, State), + + case catch parse_external_entity_1(Xml, State1#xmerl_sax_parser_state{ref_table=RefTable}) of + {fatal_error, {State2, Reason}} -> + State3 = event_callback(endDocument, State2), + ets:delete(RefTable), + format_error(fatal_error, State3, Reason); + {event_receiver_error, State2, {Tag, Reason}} -> + State3 = event_callback(endDocument, State2), + format_error(Tag, State3, Reason); + {Rest, State2} when is_record(State2, xmerl_sax_parser_state) -> + State3 = event_callback(endDocument, State2), + ets:delete(RefTable), + {ok, State3#xmerl_sax_parser_state.event_state, Rest}; + Other -> + _State2 = event_callback(endDocument, State1), + ets:delete(RefTable), + throw(Other) + end. + + +%%====================================================================== +%% Internal functions +%%====================================================================== + +%%---------------------------------------------------------------------- +%% Function: parse_document(Rest, State) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Output: Result = {ok, Rest, State} +%% Description: Parsing an XML document +%% [1] document ::= prolog element Misc* +%%---------------------------------------------------------------------- +parse_document(Rest, State) when is_record(State, xmerl_sax_parser_state) -> + {Rest1, State1} = parse_xml_decl(Rest, State), + {Rest2, State2} = parse_misc(Rest1, State1, true), + {ok, Rest2, State2}. + + +%%---------------------------------------------------------------------- +%% Function: parse_xml_decl(Rest, State) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Output: Result = {Rest, State} +%% Description: Parsing the xml directive in the prolog. +%% [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? +%% [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' +%%---------------------------------------------------------------------- +parse_xml_decl(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_xml_decl/2); +parse_xml_decl(?BYTE_ORDER_MARK_1, State) -> + cf(?BYTE_ORDER_MARK_1, State, fun parse_xml_decl/2); +parse_xml_decl(?BYTE_ORDER_MARK_2, State) -> + cf(?BYTE_ORDER_MARK_2, State, fun parse_xml_decl/2); +parse_xml_decl(?BYTE_ORDER_MARK_REST(Rest), State) -> + cf(Rest, State, fun parse_xml_decl/2); +parse_xml_decl(?STRING("<"), State) -> + cf(?STRING("<"), State, fun parse_xml_decl/2); +parse_xml_decl(?STRING("<?"), State) -> + cf(?STRING("<?"), State, fun parse_xml_decl/2); +parse_xml_decl(?STRING("<?x"), State) -> + cf(?STRING("<?x"), State, fun parse_xml_decl/2); +parse_xml_decl(?STRING("<?xm"), State) -> + cf(?STRING("<?xm"), State, fun parse_xml_decl/2); +parse_xml_decl(?STRING("<?xml"), State) -> + cf(?STRING("<?xml"), State, fun parse_xml_decl/2); +parse_xml_decl(?STRING_REST("<?xml", Rest1), State) -> + parse_xml_decl_1(Rest1, State); +parse_xml_decl(Bytes, #xmerl_sax_parser_state{encoding=Enc} = State) when is_binary(Bytes) -> + case unicode:characters_to_list(Bytes, Enc) of + {incomplete, _, _} -> + cf(Bytes, State, fun parse_xml_decl/2); + {error, _Encoded, _Rest} -> + ?fatal_error(State, lists:flatten(io_lib:format("Bad character, not in ~p\n", [Enc]))); + _ -> + parse_prolog(Bytes, State) + end; +parse_xml_decl(Bytes, State) -> + parse_prolog(Bytes, State). + + +parse_xml_decl_1(?STRING_UNBOUND_REST(C, Rest) = Bytes, State) -> + if + ?is_whitespace(C) -> + {_XmlAttributes, Rest1, State1} = parse_version_info(Rest, State, []), + %State2 = event_callback({processingInstruction, "xml", XmlAttributes}, State1),% The XML decl. should not be reported as a PI + parse_prolog(Rest1, State1); + true -> + parse_prolog(?STRING_REST("<?xml", Bytes), State) + end; +parse_xml_decl_1(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_xml_decl_1/2], undefined). + + + +%%---------------------------------------------------------------------- +%% Function: parse_prolog(Rest, State) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Output: Result = {Rest, State} +%% Description: Parsing XML prolog +%% [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? +%%---------------------------------------------------------------------- +parse_prolog(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_prolog/2); +parse_prolog(?STRING("<"), State) -> + cf(?STRING("<"), State, fun parse_prolog/2); +parse_prolog(?STRING_REST("<?", Rest), State) -> + {Rest1, State1} = parse_pi(Rest, State), + parse_prolog(Rest1, State1); +parse_prolog(?STRING_REST("<!", Rest), State) -> + parse_prolog_1(Rest, State); +parse_prolog(?STRING_REST("<", Rest), State) -> + parse_stag(Rest, State); +parse_prolog(?STRING_UNBOUND_REST(C, _) = Rest, State) when ?is_whitespace(C) -> + {_WS, Rest1, State1} = whitespace(Rest, State, []), + parse_prolog(Rest1, State1); +parse_prolog(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_prolog/2], + "expecting < or whitespace"). + + +parse_prolog_1(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_prolog_1/2); +parse_prolog_1(?STRING("D"), State) -> + cf(?STRING("D"), State, fun parse_prolog_1/2); +parse_prolog_1(?STRING("DO"), State) -> + cf(?STRING("DO"), State, fun parse_prolog_1/2); +parse_prolog_1(?STRING("DOC"), State) -> + cf(?STRING("DOC"), State, fun parse_prolog_1/2); +parse_prolog_1(?STRING("DOCT"), State) -> + cf(?STRING("DOCT"), State, fun parse_prolog_1/2); +parse_prolog_1(?STRING("DOCTY"), State) -> + cf(?STRING("DOCTY"), State, fun parse_prolog_1/2); +parse_prolog_1(?STRING("DOCTYP"), State) -> + cf(?STRING("DOCTYP"), State, fun parse_prolog_1/2); +parse_prolog_1(?STRING_REST("DOCTYPE", Rest), State) -> + {Rest1, State1} = parse_doctype(Rest, State), + State2 = event_callback(endDTD, State1), + parse_prolog(Rest1, State2); +parse_prolog_1(?STRING("-"), State) -> + cf(?STRING("-"), State, fun parse_prolog_1/2); +parse_prolog_1(?STRING_REST("--", Rest), State) -> + {Rest1, State1} = parse_comment(Rest, State, []), + parse_prolog(Rest1, State1); +parse_prolog_1(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_prolog_1/2], + "expecting comment or DOCTYPE"). + + + +%%---------------------------------------------------------------------- +%% Function: parse_version_info(Rest, State, Acc) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = [{Name, Value}] +%% Name = string() +%% Value = string() +%% Output: Result = {[{Name, Value}], Rest, State} +%% Description: Parsing the version number in the XML directive. +%% [24] VersionInfo ::= S 'version' Eq (' VersionNum ' | " VersionNum ") +%%---------------------------------------------------------------------- +parse_version_info(?STRING_EMPTY, State, Acc) -> + cf(?STRING_EMPTY, State, Acc, fun parse_version_info/3); +parse_version_info(?STRING_UNBOUND_REST(C, _) = Rest, State, Acc) when ?is_whitespace(C) -> + {_WS, Rest1, State1} = whitespace(Rest, State, []), + parse_version_info(Rest1, State1, Acc); +parse_version_info(?STRING_UNBOUND_REST(C,Rest), State, Acc) -> + case is_name_start(C) of + true -> + case parse_name(Rest, State, [C]) of + {"version", Rest1, State1} -> + {Rest2, State2} = parse_eq(Rest1, State1), + {Version, Rest3, State3} = parse_att_value(Rest2, State2), + parse_xml_decl_rest(Rest3, State3, [{"version",Version}|Acc]); + {_, _, State1} -> + ?fatal_error(State1, "expecting attribute version") + end; + false -> + ?fatal_error(State, "expecting attribute version") + end; +parse_version_info(Bytes, State, Acc) -> + unicode_incomplete_check([Bytes, State, Acc, fun parse_version_info/3], + undefined). + + + +%%---------------------------------------------------------------------- +%% Function: parse_xml_decl_rest(Rest, State, Acc) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = [{Name, Value}] +%% Name = string() +%% Value = string() +%% Output: Result = {[{Name, Value}], Rest, State} +%% Description: Checks if there is more to parse in the XML directive. +%%---------------------------------------------------------------------- +parse_xml_decl_rest(?STRING_EMPTY, State, Acc) -> + cf(?STRING_EMPTY, State, Acc, fun parse_xml_decl_rest/3); +parse_xml_decl_rest(?STRING("?") = Rest, State, Acc) -> + cf(Rest, State, Acc, fun parse_xml_decl_rest/3); +parse_xml_decl_rest(?STRING_REST("?>", Rest), State, Acc) -> + {lists:reverse(Acc), Rest, State}; +parse_xml_decl_rest(?STRING_UNBOUND_REST(C, _) = Rest, State, Acc) when ?is_whitespace(C) -> + {_WS, Rest1, State1} = whitespace(Rest, State, []), + parse_xml_decl_encoding(Rest1, State1, Acc); +parse_xml_decl_rest(Bytes, State, Acc) -> + unicode_incomplete_check([Bytes, State, Acc, fun parse_xml_decl_rest/3], + "expecting encoding, standalone, whitespace or ?>"). + + +%%---------------------------------------------------------------------- +%% Function: parse_xml_decl_encoding(Rest, State, Acc) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = [{Name, Value}] +%% Name = string() +%% Value = string() +%% Output: Result = {[{Name, Value}], Rest, State} +%% Description: Parse the encoding attribute in the XML directive. +%% [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" ) +% [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* +%%---------------------------------------------------------------------- +parse_xml_decl_encoding(?STRING_EMPTY, State, Acc) -> + cf(?STRING_EMPTY, State, Acc, fun parse_xml_decl_encoding/3); +parse_xml_decl_encoding(?STRING_REST("e", Rest), State, Acc) -> + case parse_name(Rest, State,[$e]) of + {"encoding", Rest1, State1} -> + {Rest2, State2} = parse_eq(Rest1, State1), + {Enc, Rest3, State3} = parse_att_value(Rest2, State2), + parse_xml_decl_encoding_1(Rest3, State3, [{"encoding",Enc} |Acc]); + {Name, _Rest1, State1} -> + ?fatal_error(State1, "Attribute " ++ Name ++ + " not allowed in xml declaration") + end; +parse_xml_decl_encoding(?STRING_UNBOUND_REST(_C, _) = Bytes, State, Acc) -> + parse_xml_decl_standalone(Bytes, State, Acc); +parse_xml_decl_encoding(Bytes, State, Acc) -> + unicode_incomplete_check([Bytes, State, Acc, fun parse_xml_decl_encoding/3], + undefined). + + +parse_xml_decl_encoding_1(?STRING_UNBOUND_REST(C, _) = Bytes, State, Acc) when ?is_whitespace(C) -> + {_WS, Rest1, State1} = whitespace(Bytes, State, []), + parse_xml_decl_standalone(Rest1, State1, Acc); +parse_xml_decl_encoding_1(?STRING_UNBOUND_REST(_C, _) = Bytes, State, Acc) -> + parse_xml_decl_rest(Bytes, State, Acc); +parse_xml_decl_encoding_1(Bytes, State, Acc) -> + unicode_incomplete_check([Bytes, State, Acc, fun parse_xml_decl_encoding_1/3], + undefined). + + +%%---------------------------------------------------------------------- +%% Function: parse_xml_decl_standalone(Rest, State, Acc) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = [{Name, Value}] +%% Name = string() +%% Value = string() +%% Output: Result = {[{Name, Value}], Rest, State} +%% Description: Parse the standalone attribute in the XML directive. +%% [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | +%% ('"' ('yes' | 'no') '"')) +%%---------------------------------------------------------------------- +parse_xml_decl_standalone(?STRING_EMPTY, State, Acc) -> + cf(?STRING_EMPTY, State, Acc, fun parse_xml_decl_standalone/3); +parse_xml_decl_standalone(?STRING_REST("s", Rest), State, Acc) -> + case parse_name(Rest, State,[$s]) of + {"standalone", Rest1, State1} -> + {Rest2, State2} = parse_eq(Rest1, State1), + {Standalone, Rest3, State3} = parse_att_value(Rest2, State2), + case Standalone of + "yes" -> ok; + "no" -> ok; + _ -> + ?fatal_error(State3, "Wrong value of attribute standalone in xml declaration, must be yes or no") + end, + {_WS, Rest4, State4} = whitespace(Rest3, State3, []), + parse_xml_decl_rest(Rest4, State4#xmerl_sax_parser_state{standalone=list_to_atom(Standalone)}, + [{"standalone",Standalone} |Acc]); + {Name, _Rest1, State1} -> + ?fatal_error(State1, "Attribute " ++ Name ++ + " not allowed in xml declaration") + end; +parse_xml_decl_standalone(?STRING_UNBOUND_REST(_C, _) = Bytes, State, Acc) -> + parse_xml_decl_rest(Bytes, State, Acc); +parse_xml_decl_standalone(Bytes, State, Acc) -> + unicode_incomplete_check([Bytes, State, Acc, fun parse_xml_decl_standalone/3], + undefined). + + + +%%---------------------------------------------------------------------- +%% Function: parse_pi(Rest, State) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Output: Result = {Rest, State} +%% Description: Parse processing instructions. +%% [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' +%% [17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) +%%---------------------------------------------------------------------- +parse_pi(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_pi/2); +parse_pi(?STRING_UNBOUND_REST(C, Rest) = Bytes, State) -> + case is_name_start(C) of + true -> + {PiTarget, Rest1, State1} = + parse_name(Rest, State, [C]), + case string:to_lower(PiTarget) of + "xml" -> + case State#xmerl_sax_parser_state.end_tags of + [] -> + {Bytes, State}; + _ -> + ?fatal_error(State1, "<?xml ...?> not first in document") + end; + _ -> + {PiData, Rest2, State2} = parse_pi_1(Rest1, State1), + State3 = event_callback({processingInstruction, PiTarget, PiData}, State2), + {Rest2, State3} + end; + false -> + ?fatal_error(State, "expecting name") + end; +parse_pi(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_pi/2], undefined). + +%%---------------------------------------------------------------------- +%% Function: parse_pi_1(Rest, State) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Output: Result = {Rest, State} +%% Description: Parse processing instructions. +%%---------------------------------------------------------------------- +parse_pi_1(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_pi_1/2); +parse_pi_1(?STRING_UNBOUND_REST(C,_) = Rest, State) when ?is_whitespace(C) -> + {_WS, Rest1, State1} = + whitespace(Rest, State, []), + parse_pi_data(Rest1, State1, []); +parse_pi_1(?STRING_REST("?>", Rest), State) -> + {[], Rest, State}; +parse_pi_1(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_pi/2], + "expecting whitespace or '?>'"). + + +%%---------------------------------------------------------------------- +%% Function: parse_name(Rest, State, Acc) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = string() +%% Output: Result = {Name, Rest, State} +%% Name = string() +%% Description: Parse a name. Next character is put in the accumulator +%% if it's a valid name character. +%% [5] Name ::= (Letter | '_' | ':') (NameChar)* +%%---------------------------------------------------------------------- +parse_name(?STRING_EMPTY, State, Acc) -> + cf(?STRING_EMPTY, State, Acc, fun parse_name/3); +parse_name(?STRING_UNBOUND_REST(C, Rest) = Bytes, State, Acc) -> + case is_name_char(C) of + true -> + parse_name(Rest, State, [C|Acc]); + false -> + {lists:reverse(Acc), Bytes, State} + end; +parse_name(Bytes, State, Acc) -> + unicode_incomplete_check([Bytes, State, Acc, fun parse_name/3], undefined). + + +%%---------------------------------------------------------------------- +%% Function: parse_ns_name(Rest, State, Prefix, Name) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Prefix = string() +%% Name = string() +%% Output: Result = {{Prefix, Name}, Rest, State} +%% Name = string() +%% Description: Parse a namespace name. Next character is put in the +%% accumulator if it's a valid name character. +%% The difference between this function and parse_name/3 is +%% that a colon is interpreted as a separator between the +%% namespace prefix and the name. +%%---------------------------------------------------------------------- +parse_ns_name(?STRING_EMPTY, State, Prefix, Name) -> + cf(?STRING_EMPTY, State, Prefix, Name, fun parse_ns_name/4); +parse_ns_name(?STRING_UNBOUND_REST($:, Rest), State, [], Name) -> + parse_ns_name(Rest, State, lists:reverse(Name), []); +parse_ns_name(?STRING_UNBOUND_REST(C, Rest) = Bytes, State, Prefix, Name) -> + case is_name_char(C) of + true -> + parse_ns_name(Rest, State, Prefix, [C|Name]); + false -> + {{Prefix,lists:reverse(Name)}, Bytes, State} + end; +parse_ns_name(Bytes, State, Prefix, Name) -> + unicode_incomplete_check([Bytes, State, Prefix, Name, fun parse_ns_name/4], + undefined). + + +%%---------------------------------------------------------------------- +%% Function: parse_pi_data(Rest, State, Acc) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = string() +%% Output: Result = {PiData, Rest, State} +%% PiData = string() +%% Description: Parse the data part of the processing instruction. +%% If next character is valid it's put in the accumulator. +%%---------------------------------------------------------------------- +parse_pi_data(?STRING_EMPTY, State, Acc) -> + cf(?STRING_EMPTY, State, Acc, fun parse_pi_data/3); +parse_pi_data(?STRING("?"), State, Acc) -> + cf(?STRING("?"), State, Acc, fun parse_pi_data/3); +parse_pi_data(?STRING("\r"), State, Acc) -> + cf(?STRING("\r"), State, Acc, fun parse_pi_data/3); +parse_pi_data(?STRING_REST("?>", Rest), State, Acc) -> + {lists:reverse(Acc), Rest, State}; +parse_pi_data(?STRING_REST("\n", Rest), #xmerl_sax_parser_state{line_no=N} = State, Acc) -> + parse_pi_data(Rest, State#xmerl_sax_parser_state{line_no=N+1}, [?lf |Acc]); +parse_pi_data(?STRING_REST("\r\n", Rest), #xmerl_sax_parser_state{line_no=N} = State, Acc) -> + parse_pi_data(Rest, State#xmerl_sax_parser_state{line_no=N+1}, [?lf |Acc]); +parse_pi_data(?STRING_REST("\r", Rest), #xmerl_sax_parser_state{line_no=N} = State, Acc) -> + parse_pi_data(Rest, State#xmerl_sax_parser_state{line_no=N+1}, [?lf |Acc]); +parse_pi_data(?STRING_UNBOUND_REST(C, Rest), State, Acc) when ?is_char(C)-> + parse_pi_data(Rest, State, [C|Acc]); +parse_pi_data(Bytes, State, Acc) -> + unicode_incomplete_check([Bytes, State, Acc, fun parse_pi_data/3], + "not an character"). + + +%%---------------------------------------------------------------------- +%% Function: parse_cdata(Rest, State) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Output: Result = {Rest, State} +%% Description: Start the parsing of a CDATA block. +%% [18] CDSect ::= CDStart CData CDEnd +%% [19] CDStart ::= '<![CDATA[' +%% [20] CData ::= (Char* - (Char* ']]>' Char*)) +%% [21] CDEnd ::= ']]>' +%%---------------------------------------------------------------------- +parse_cdata(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_cdata/2); +parse_cdata(?STRING("["), State) -> + cf(?STRING("["), State, fun parse_cdata/2); +parse_cdata(?STRING("[C"), State) -> + cf(?STRING("[C"), State, fun parse_cdata/2); +parse_cdata(?STRING("[CD"), State) -> + cf(?STRING("[CD"), State, fun parse_cdata/2); +parse_cdata(?STRING("[CDA"), State) -> + cf(?STRING("[CDA"), State, fun parse_cdata/2); +parse_cdata(?STRING("[CDAT"), State) -> + cf(?STRING("[CDAT"), State, fun parse_cdata/2); +parse_cdata(?STRING("[CDATA"), State) -> + cf(?STRING("[CDATA"), State, fun parse_cdata/2); +parse_cdata(?STRING_REST("[CDATA[", Rest), State) -> + State1 = event_callback(startCDATA, State), + parse_cdata(Rest, State1, []); +parse_cdata(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_cdata/2], + "expecting comment or CDATA"). + + +%%---------------------------------------------------------------------- +%% Function: parse_cdata(Rest, State, Acc) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = string() +%% Output: Result = {Rest, State} +%% Description: Parse a CDATA block. +%%---------------------------------------------------------------------- +parse_cdata(?STRING_EMPTY, State, Acc) -> + cf(?STRING_EMPTY, State, Acc, fun parse_cdata/3); +parse_cdata(?STRING("\r"), State, Acc) -> + cf(?STRING("\r"), State, Acc, fun parse_cdata/3); +parse_cdata(?STRING("]"), State, Acc) -> + cf(?STRING("]"), State, Acc, fun parse_cdata/3); +parse_cdata(?STRING("]]"), State, Acc) -> + cf(?STRING("]]"), State, Acc, fun parse_cdata/3); +parse_cdata(?STRING_REST("]]>", Rest), State, Acc) -> + State1 = event_callback({characters, lists:reverse(Acc)}, State), + State2 = event_callback(endCDATA, State1), + parse_content(Rest, State2, [], true); +parse_cdata(?STRING_REST("\n", Rest), #xmerl_sax_parser_state{line_no=N} = State, Acc) -> + parse_cdata(Rest, State#xmerl_sax_parser_state{line_no=N+1}, [?lf |Acc]); +parse_cdata(?STRING_REST("\r\n", Rest), #xmerl_sax_parser_state{line_no=N} = State, Acc) -> + parse_cdata(Rest, State#xmerl_sax_parser_state{line_no=N+1}, [?lf |Acc]); +parse_cdata(?STRING_REST("\r", Rest), #xmerl_sax_parser_state{line_no=N} = State, Acc) -> + parse_cdata(Rest, State#xmerl_sax_parser_state{line_no=N+1}, [?lf |Acc]); +parse_cdata(?STRING_UNBOUND_REST(C, Rest), State, Acc) when ?is_char(C) -> + parse_cdata(Rest, State, [C|Acc]); +parse_cdata(?STRING_UNBOUND_REST(C, _), State, _) -> + ?fatal_error(State, "CDATA contains bad character value: " ++ [C]); +parse_cdata(Bytes, State, Acc) -> + unicode_incomplete_check([Bytes, State, Acc, fun parse_cdata/3], + undefined). + + +%%---------------------------------------------------------------------- +%% Function: parse_comment(Rest, State, Acc) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = string() +%% Output: Result = {Rest, State} +%% Description: Parse a comment. +%% [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' +%%---------------------------------------------------------------------- +parse_comment(?STRING_EMPTY, State, Acc) -> + cf(?STRING_EMPTY, State, Acc, fun parse_comment/3); +parse_comment(?STRING("\r"), State, Acc) -> + cf(?STRING("\r"), State, Acc, fun parse_comment/3); +parse_comment(?STRING("-"), State, Acc) -> + cf(?STRING("-"), State, Acc, fun parse_comment/3); +parse_comment(?STRING("--"), State, Acc) -> + cf(?STRING("--"), State, Acc, fun parse_comment/3); +parse_comment(?STRING_REST("-->", Rest), State, Acc) -> + State1 = event_callback({comment, lists:reverse(Acc)}, State), + {Rest, State1}; +parse_comment(?STRING_REST("--", _), State, _) -> + ?fatal_error(State, "comment contains '--'"); +parse_comment(?STRING_REST("\n", Rest), #xmerl_sax_parser_state{line_no=N} = State, Acc) -> + parse_comment(Rest, State#xmerl_sax_parser_state{line_no=N+1}, [?lf|Acc]); +parse_comment(?STRING_REST("\r\n", Rest), #xmerl_sax_parser_state{line_no=N} = State, Acc) -> + parse_comment(Rest, State#xmerl_sax_parser_state{line_no=N+1}, [?lf|Acc]); +parse_comment(?STRING_REST("\r", Rest), #xmerl_sax_parser_state{line_no=N} = State, Acc) -> + parse_comment(Rest, State#xmerl_sax_parser_state{line_no=N+1}, [?lf|Acc]); +parse_comment(?STRING_UNBOUND_REST(C, Rest), State, Acc) -> + if + ?is_char(C) -> + parse_comment(Rest, State, [C|Acc]); + true -> + ?fatal_error(State, "Bad character in comment: " ++ C) + end; +parse_comment(Bytes, State, Acc) -> + unicode_incomplete_check([Bytes, State, Acc, fun parse_comment/3], + undefined). + + +%%---------------------------------------------------------------------- +%% Function: parse_misc(Rest, State, Eod) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Eod = true |false +%% Output: Result = {Rest, State} +%% Description: Parse a misc clause, could be a comment, a processing +%% instruction or whitespace. If the input stream is empty +%% (Eod parameter true) then we return current state and quit. +%% [27] Misc ::= Comment | PI | S +%%---------------------------------------------------------------------- +parse_misc(?STRING_EMPTY, State, true) -> + {?STRING_EMPTY, State}; +parse_misc(?STRING_EMPTY, State, Eod) -> + cf(?STRING_EMPTY, State, Eod, fun parse_misc/3); +parse_misc(?STRING("<") = Rest, State, Eod) -> + cf(Rest, State, Eod, fun parse_misc/3); +parse_misc(?STRING_REST("<?", Rest), State, Eod) -> + {Rest1, State1} = parse_pi(Rest, State), + parse_misc(Rest1, State1, Eod); +parse_misc(?STRING("<!") = Rest, State, Eod) -> + cf(Rest, State, Eod, fun parse_misc/3); +parse_misc(?STRING("<!-") = Rest, State, Eod) -> + cf(Rest, State, Eod, fun parse_misc/3); +parse_misc(?STRING_REST("<!--", Rest), State, Eod) -> + {Rest1, State1} = parse_comment(Rest, State, []), + parse_misc(Rest1, State1, Eod); +parse_misc(?STRING_UNBOUND_REST(C, _) = Rest, State, Eod) when ?is_whitespace(C) -> + {_WS, Rest1, State1} = whitespace(Rest, State, []), + parse_misc(Rest1, State1, Eod); +parse_misc(Rest, State, _Eod) -> + {Rest, State}. +%% unicode_incomplete_check([Bytes, State, Eod, fun parse_misc/3], +%% "expecting comment or PI"). + +%%---------------------------------------------------------------------- +%% Function: parse_stag(Rest, State) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Output: Result = {Rest, State} +%% Description: Parsing a start tag. +%% [40] STag ::= '<' Name (S Attribute)* S? '>' +%%---------------------------------------------------------------------- +parse_stag(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_stag/2); +parse_stag(?STRING_UNBOUND_REST(C, Rest), State) -> + case is_name_start(C) of + true -> + {TagName, Rest1, State1} = + parse_ns_name(Rest, State, [], [C]), + parse_attributes(Rest1, State1, {TagName, [], []}); + false -> + ?fatal_error(State, "expecting name") + end; +parse_stag(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_stag/2], + undefined). + +%%---------------------------------------------------------------------- +%% Function: parse_attributes(Rest, State, CurrentTag) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% CurrentTag = {Name, AttList, NewNsList} +%% Name = string() +%% AttList = [{Name, Value}] +%% NewNsList = [{Name, Value}] +%% Output: Result = {Rest, State} +%% Description: Parsing the attribute list in the start tag. The current +%% tag tuple contains the tag name, a list of attributes +%% (exclusive NS attributes) and a list of new NS attributes. +%% [41] Attribute ::= Name Eq AttValue +%%---------------------------------------------------------------------- +parse_attributes(?STRING_EMPTY, State, CurrentTag) -> + cf(?STRING_EMPTY, State, CurrentTag, fun parse_attributes/3); +parse_attributes(?STRING("/"), State, CurrentTag) -> + cf(?STRING("/"), State, CurrentTag, fun parse_attributes/3); +parse_attributes(?STRING_REST("/>", Rest), State, {Tag, AttList, NewNsList}) -> + CompleteNsList = NewNsList ++ State#xmerl_sax_parser_state.ns, + {Uri, LocalName, QName, Attributes} = fix_ns(Tag, AttList, CompleteNsList), + State1 = send_start_prefix_mapping_event(lists:reverse(NewNsList), State), + State2 = event_callback({startElement, Uri, LocalName, QName, Attributes}, State1), + State3 = event_callback({endElement, Uri, LocalName, QName}, State2), + State4 = send_end_prefix_mapping_event(NewNsList, State3), + parse_content(Rest, State4, [], true); +parse_attributes(?STRING_REST(">", Rest), #xmerl_sax_parser_state{end_tags=ETags, ns = OldNsList} = State, + {Tag, AttList, NewNsList}) -> + CompleteNsList = NewNsList ++ OldNsList, + {Uri, LocalName, QName, Attributes} = fix_ns(Tag, AttList, CompleteNsList), + State1 = send_start_prefix_mapping_event(lists:reverse(NewNsList), State), + State2 = event_callback({startElement, Uri, LocalName, QName, Attributes}, State1), + parse_content(Rest, State2#xmerl_sax_parser_state{end_tags=[{Tag, Uri, LocalName, QName, + OldNsList, NewNsList} |ETags], + ns = CompleteNsList}, + [], true); +parse_attributes(?STRING_UNBOUND_REST(C, _) = Rest, State, CurrentTag) when ?is_whitespace(C) -> + {_WS, Rest1, State1} = whitespace(Rest, State, []), + parse_attributes(Rest1, State1, CurrentTag); +parse_attributes(?STRING_UNBOUND_REST(C, Rest), State, {Tag, AttList, NsList}) -> + case is_name_start(C) of + true -> + {AttrName, Rest1, State1} = + parse_ns_name(Rest, State, [], [C]), + {Rest2, State2} = parse_eq(Rest1, State1), + {AttValue, Rest3, State3} = parse_att_value(Rest2, State2), + case AttrName of + {"xmlns", NsName} -> + parse_attributes(Rest3, State3, {Tag, AttList, [{NsName, AttValue} |NsList]}); + {"", "xmlns"} -> + parse_attributes(Rest3, State3, {Tag, AttList, [{"", AttValue} |NsList]}); + {_Prefix, _LocalName} -> + case lists:keyfind(AttrName, 1, AttList) of + false -> + parse_attributes(Rest3, State3, {Tag, [{AttrName, AttValue}|AttList], NsList}); + _ -> + ElName = + case Tag of + {"", N} -> N; + {Ns, N} -> Ns ++ ":" ++ N + end, + ?fatal_error(State, "Attribute exist more than once in element: " ++ ElName) + end + end; + false -> + ?fatal_error(State, "Invalid start character in attribute name: " ++ C) + end; +parse_attributes(Bytes, State, CurrentTag) -> + unicode_incomplete_check([Bytes, State, CurrentTag, fun parse_attributes/3], + "expecting name, whitespace, /> or >"). + + + +%%---------------------------------------------------------------------- +%% Function: fix_ns({Prefix, Name}, Attributes, Ns) -> Result +%% Input: Prefix = string() +%% Name = string() +%% Attributes = [{Name, Value}] +%% Ns = [{Prefix, Uri}] +%% Uri = string() +%% Output: Result = {Uri, Name, QualifiedName, Attributes} +%% QualifiedName = string() +%% Description: Fix the name space prefixing for the attributes and start tag. +%%---------------------------------------------------------------------- +% fix_ns({"", Name}, Attributes, Ns) -> +% Attributes2 = fix_attributes_ns(Attributes, Ns, []), +% {"", Name, Name, Attributes2}; +fix_ns({Prefix, Name}, Attributes, Ns) -> + Uri = + case lists:keysearch(Prefix, 1, Ns) of + {value, {Prefix, U}} -> + U; + false -> + "" + end, + Attributes2 = fix_attributes_ns(Attributes, Ns, []), + + {Uri, Name, {Prefix, Name}, Attributes2}. + +%%---------------------------------------------------------------------- +%% Function: fix_attributes_ns(Attributes, Ns, Acc) -> Result +%% Input: Attributes = [{{Prefix, Name}, Value}] +%% Prefix = string() +%% Name = string() +%% Value = string() +%% Ns = [{Prefix, Uri}] +%% Uri = string() +%% Output: Result = [{Uri, Name, Value}] +%% Description: Fix the name spaces for the attributes. +%%---------------------------------------------------------------------- +fix_attributes_ns([], _, Acc) -> + Acc; +fix_attributes_ns([{{"", Name}, AttrValue} | Attrs], Ns, Acc) -> + fix_attributes_ns(Attrs, Ns, [{"", "", Name, AttrValue} |Acc]); +fix_attributes_ns([{{Prefix, Name}, AttrValue} | Attrs], Ns, Acc) -> + Uri = + case lists:keysearch(Prefix, 1, Ns) of + {value, {Prefix, U}} -> + U; + false -> + "" + end, + fix_attributes_ns(Attrs, Ns, [{Uri, Prefix, Name, AttrValue} |Acc]). + + +%%---------------------------------------------------------------------- +%% Function: send_start_prefix_mapping_event(Ns, State) -> Result +%% Input: Ns = [{Prefix, Uri}] +%% Prefix = string() +%% Uri = string() +%% State = #xmerl_sax_parser_state{} +%% Output: Result = #xmerl_sax_parser_state{} +%% Description: Loops over a name space list and sends startPrefixMapping events. +%%---------------------------------------------------------------------- +send_start_prefix_mapping_event([], State) -> + State; +send_start_prefix_mapping_event([{Prefix, Uri} |Ns], State) -> + State1 = event_callback({startPrefixMapping, Prefix, Uri}, State), + send_start_prefix_mapping_event(Ns, State1). + + +%%---------------------------------------------------------------------- +%% Function: send_end_prefix_mapping_event(Ns, State) -> Result +%% Input: Ns = [{Prefix, Uri}] +%% Prefix = string() +%% Uri = string() +%% State = #xmerl_sax_parser_state{} +%% Output: Result = #xmerl_sax_parser_state{} +%% Description: Loops over a name space list and sends endPrefixMapping events. +%%---------------------------------------------------------------------- +send_end_prefix_mapping_event([], State) -> + State; +send_end_prefix_mapping_event([{Prefix, _Uri} |Ns], State) -> + State1 = event_callback({endPrefixMapping, Prefix}, State), + send_end_prefix_mapping_event(Ns, State1). + + +%%---------------------------------------------------------------------- +%% Function: parse_eq(Rest, State) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Output: Result = {Rest, State} +%% Description: Parsing an '=' from the stream. +%% [25] Eq ::= S? '=' S? +%%---------------------------------------------------------------------- +parse_eq(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_eq/2); +parse_eq(?STRING_REST("=", Rest), State) -> + {Rest, State}; +parse_eq(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {_WS, Rest, State1} = + whitespace(Bytes, State, []), + parse_eq(Rest, State1); +parse_eq(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_eq/2], + "expecting = or whitespace"). + + +%%---------------------------------------------------------------------- +%% Function: parse_att_value(Rest, State) -> Result +%% Input: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Output: Result = {Rest, State} +%% Description: Start the parsing of an attribute value by checking the delimiter +%% [10] AttValue ::= '"' ([^<&"] | Reference)* '"' +%% | "'" ([^<&'] | Reference)* "'" +%%---------------------------------------------------------------------- +parse_att_value(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_att_value/2); +parse_att_value(?STRING_UNBOUND_REST(C, Rest), State) when C == $'; C == $" -> + parse_att_value(Rest, State, C, []); +parse_att_value(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {_WS, Rest, State1} = + whitespace(Bytes, State, []), + parse_att_value(Rest, State1); +parse_att_value(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_att_value/2], + "\', \" or whitespace expected"). + + +%%---------------------------------------------------------------------- +%% Function : parse_att_value(Rest, State, Stop, Acc) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Stop = $' | $" +%% Acc = string() +%% Result : {Value, Rest, State} +%% Value = string() +%% Description: Parse an attribute value +%%---------------------------------------------------------------------- +parse_att_value(?STRING_EMPTY, State, undefined, Acc) -> + {Acc, [], State}; %% stop clause when parsing references +parse_att_value(?STRING_EMPTY, State, Stop, Acc) -> + cf(?STRING_EMPTY, State, Stop, Acc, fun parse_att_value/4); +parse_att_value(?STRING("\r"), State, Stop, Acc) -> + cf(?STRING("\r"), State, Stop, Acc, fun parse_att_value/4); +parse_att_value(?STRING_REST("\n", Rest), #xmerl_sax_parser_state{line_no=N} = State, Stop, Acc) -> + parse_att_value(Rest, + State#xmerl_sax_parser_state{line_no=N+1}, Stop, [?space |Acc]); +parse_att_value(?STRING_REST("\r\n", Rest), #xmerl_sax_parser_state{line_no=N} = State, Stop, Acc) -> + parse_att_value(Rest, + State#xmerl_sax_parser_state{line_no=N+1}, Stop, [?space |Acc]); +parse_att_value(?STRING_REST("\r", Rest), #xmerl_sax_parser_state{line_no=N} = State, Stop, Acc) -> + parse_att_value(Rest, + State#xmerl_sax_parser_state{line_no=N+1}, Stop, [?space |Acc]); +parse_att_value(?STRING_REST("\t", Rest), #xmerl_sax_parser_state{line_no=N} = State, Stop, Acc) -> + parse_att_value(Rest, + State#xmerl_sax_parser_state{line_no=N+1}, Stop, [?space |Acc]); +parse_att_value(?STRING_REST("&", Rest), State, Stop, Acc) -> + {Ref, Rest1, State1} = parse_reference(Rest, State, true), + case Ref of + {character, _, CharValue} -> + parse_att_value(Rest1, State1, Stop, [CharValue | Acc]); + {internal_general, true, _, Value} -> + parse_att_value(Rest1, State1, Stop, Value ++ Acc); + {internal_general, false, _, Value} -> + {ParsedValue, [], State2} = parse_att_value(?TO_INPUT_FORMAT(Value), State1, undefined, []), + parse_att_value(Rest1, State2, Stop, ParsedValue ++ Acc); + {external_general, Name, _} -> + ?fatal_error(State1, "External parsed entity reference in attribute value: " ++ Name); + {unparsed, Name, _} -> + ?fatal_error(State1, "Unparsed entity reference in attribute value: " ++ Name) + end; +parse_att_value(?STRING_UNBOUND_REST(Stop, Rest), State, Stop, Acc) -> + {lists:reverse(Acc), Rest, State}; +parse_att_value(?STRING_UNBOUND_REST($<, _Rest), State, _Stop, _Acc) -> + ?fatal_error(State, "< not allowed in attribute value"); +parse_att_value(?STRING_UNBOUND_REST(C, Rest), State, Stop, Acc) -> + parse_att_value(Rest, State, Stop, [C|Acc]); +parse_att_value(Bytes, State, Stop, Acc) -> + unicode_incomplete_check([Bytes, State, Stop, Acc, fun parse_att_value/4], + undefined). + + +%%---------------------------------------------------------------------- +%% Function : parse_etag(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Rest, State} +%% Description: Parse the end tag +%% [42] ETag ::= '</' Name S? '>' +%%---------------------------------------------------------------------- +parse_etag(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_etag/2); +parse_etag(?STRING_UNBOUND_REST(C, Rest), + #xmerl_sax_parser_state{end_tags=[{ETag, _Uri, _LocalName, _QName, _OldNsList, _NewNsList} + |_RestOfETags]} = State) -> + case is_name_start(C) of + true -> + {Tag, Rest1, State1} = parse_ns_name(Rest, State, [], [C]), + case Tag == ETag of + true -> + {_WS, Rest2, State2} = whitespace(Rest1, State1, []), + parse_etag_1(Rest2, State2, Tag); + false -> + case State1#xmerl_sax_parser_state.match_end_tags of + true -> + {P,TN} = Tag, + ?fatal_error(State1, "EndTag: " ++ P ++ ":" ++ TN ++ + ", does not match StartTag"); + false -> + {_WS, Rest2, State2} = whitespace(Rest1, State1, []), + parse_etag_1(Rest2, State2, Tag) + end + end; + false -> + ?fatal_error(State, "Name expected") + end; +parse_etag(?STRING_UNBOUND_REST(_C, _) = Rest, #xmerl_sax_parser_state{end_tags=[]}= State) -> + {Rest, State}; +parse_etag(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_etag/2], + undefined). + + +parse_etag_1(?STRING_REST(">", Rest), + #xmerl_sax_parser_state{end_tags=[{_ETag, Uri, LocalName, QName, OldNsList, NewNsList} + |RestOfETags]} = State, _Tag) -> + State1 = event_callback({endElement, Uri, LocalName, QName}, State), + State2 = send_end_prefix_mapping_event(NewNsList, State1), + parse_content(Rest, + State2#xmerl_sax_parser_state{end_tags=RestOfETags, + ns = OldNsList}, + [], true); +parse_etag_1(?STRING_UNBOUND_REST(_C, _), State, Tag) -> + {P,TN} = Tag, + ?fatal_error(State, "Bad EndTag: " ++ P ++ ":" ++ TN); +parse_etag_1(Bytes, State, Tag) -> + unicode_incomplete_check([Bytes, State, Tag, fun parse_etag_1/3], + undefined). + +%%---------------------------------------------------------------------- +%% Function: parse_content(Rest, State, Acc, IgnorableWS) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = string() +%% IgnorableWS = true | false +%% Result : {Rest, State} +%% Description: Parsing the content part of tags +%% [43] content ::= (element | CharData | Reference | CDSect | PI | Comment)* +%%---------------------------------------------------------------------- + +parse_content(?STRING_EMPTY, State, Acc, IgnorableWS) -> + case catch cf(?STRING_EMPTY, State, Acc, IgnorableWS, fun parse_content/4) of + {Rest, State1} when is_record(State1, xmerl_sax_parser_state) -> + {Rest, State1}; + {fatal_error, {State1, Msg}} -> + case check_if_document_complete(State1, Msg) of + true -> + State2 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State1), + {?STRING_EMPTY, State2}; + false -> + ?fatal_error(State1, Msg) + end; + Other -> + throw(Other) + end; +parse_content(?STRING("\r"), State, Acc, IgnorableWS) -> + cf(?STRING("\r"), State, Acc, IgnorableWS, fun parse_content/4); +parse_content(?STRING("<"), State, Acc, IgnorableWS) -> + cf(?STRING("<"), State, Acc, IgnorableWS, fun parse_content/4); +parse_content(?STRING_REST("</", Rest), State, Acc, IgnorableWS) -> + State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), + parse_etag(Rest, State1); +parse_content(?STRING("<!"), State, _Acc, IgnorableWS) -> + cf(?STRING("<!"), State, [], IgnorableWS, fun parse_content/4); +parse_content(?STRING("<!-"), State, _Acc, IgnorableWS) -> + cf(?STRING("<!-"), State, [], IgnorableWS, fun parse_content/4); +parse_content(?STRING_REST("<!--", Rest), State, Acc, IgnorableWS) -> + State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), + {Rest1, State2} = parse_comment(Rest, State1, []), + parse_content(Rest1, State2, [], true); +parse_content(?STRING_REST("<?", Rest), State, Acc, IgnorableWS) -> + State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), + {Rest1, State2} = parse_pi(Rest, State1), + parse_content(Rest1, State2, [], true); +parse_content(?STRING_REST("<!", Rest1) = Rest, #xmerl_sax_parser_state{end_tags = ET} = State, Acc, IgnorableWS) -> + case ET of + [] -> + {Rest, State}; %%LATH : Skicka ignorable WS ??? + _ -> + State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), + parse_cdata(Rest1, State1) + end; +parse_content(?STRING_REST("<", Rest1) = Rest, #xmerl_sax_parser_state{end_tags = ET} = State, Acc, IgnorableWS) -> + case ET of + [] -> + {Rest, State}; %%LATH : Skicka ignorable WS ??? + _ -> + State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), + parse_stag(Rest1, State1) + end; +parse_content(?STRING_REST("\n", Rest), State, Acc, IgnorableWS) -> + N = State#xmerl_sax_parser_state.line_no, + parse_content(Rest, State#xmerl_sax_parser_state{line_no=N+1},[?lf |Acc], IgnorableWS); +parse_content(?STRING_REST("\r\n", Rest), State, Acc, IgnorableWS) -> + N = State#xmerl_sax_parser_state.line_no, + parse_content(Rest, State#xmerl_sax_parser_state{line_no=N+1},[?lf |Acc], IgnorableWS); +parse_content(?STRING_REST("\r", Rest), State, Acc, IgnorableWS) -> + N = State#xmerl_sax_parser_state.line_no, + parse_content(Rest, State#xmerl_sax_parser_state{line_no=N+1},[?lf |Acc], IgnorableWS); +parse_content(?STRING_REST(" ", Rest), State, Acc, IgnorableWS) -> + parse_content(Rest, State,[?space |Acc], IgnorableWS); +parse_content(?STRING_REST("\t", Rest), State, Acc, IgnorableWS) -> + parse_content(Rest, State,[?tab |Acc], IgnorableWS); +parse_content(?STRING_REST("]]>", _Rest), State, _Acc, _IgnorableWS) -> + ?fatal_error(State, "\"]]>\" is not allowed in content"); +parse_content(?STRING_UNBOUND_REST(_C, _) = Rest, + #xmerl_sax_parser_state{end_tags = []} = State, + _Acc, _IgnorableWS) -> + {Rest, State}; +parse_content(?STRING_REST("&", Rest), State, Acc, _IgnorableWS) -> + {Ref, Rest1, State1} = parse_reference(Rest, State, true), + case Ref of + {character, _, CharValue} -> + parse_content(Rest1, State1, [CharValue | Acc], false); + {internal_general, true, _, Value} -> + parse_content(Rest1, State1, Value ++ Acc, false); + {internal_general, false, _, Value} -> + IValue = ?TO_INPUT_FORMAT(Value), + parse_content(?APPEND_STRING(IValue, Rest1), State1, Acc, false); + {external_general, _, {PubId, SysId}} -> + State2 = parse_external_entity(State1, PubId, SysId), + parse_content(Rest1, State2, Acc, false); + {unparsed, Name, _} -> + ?fatal_error(State1, "Unparsed entity reference in content: " ++ Name) + end; +parse_content(?STRING_UNBOUND_REST(C, Rest), State, Acc, _IgnorableWS) -> + if + ?is_char(C) -> + parse_content(Rest, State, [C|Acc], false); + true -> + ?fatal_error(State, "Bad character in content: " ++ C) + end; +parse_content(Bytes, State, Acc, IgnorableWS) -> + unicode_incomplete_check([Bytes, State, Acc, IgnorableWS, fun parse_content/4], + undefined). + + +%%---------------------------------------------------------------------- +%% Function: check_if_document_complete(State, ErrorMsg) -> Result +%% Parameters: State = #xmerl_sax_parser_state{} +%% ErrorMsg = string() +%% Result : boolean() +%% Description: Checks that the document is complete if we don't have more data.. +%%---------------------------------------------------------------------- +check_if_document_complete(#xmerl_sax_parser_state{end_tags = []}, + "No more bytes") -> + true; +check_if_document_complete(#xmerl_sax_parser_state{end_tags = []}, + "Continuation function undefined") -> + true; +check_if_document_complete(_, _) -> + false. + +%%---------------------------------------------------------------------- +%% Function: send_character_event(Length, IgnorableWS, String, State) -> Result +%% Parameters: Length = integer() +%% IgnorableWS = true | false +%% String = string() +%% State = #xmerl_sax_parser_state{} +%% Result : #xmerl_sax_parser_state{} +%% Description: Sends the correct type of character event depending on if +%% it's whitespaces that can be ignored or not. +%%---------------------------------------------------------------------- +send_character_event(0, _, _, State) -> + State; +send_character_event(_, false, String, State) -> + event_callback({characters, String}, State); +send_character_event(_, true, String, State) -> + event_callback({ignorableWhitespace, String}, State). + + +%%---------------------------------------------------------------------- +%% Function: whitespace(Rest, State, Acc) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = string() +%% Result : {Rest, State} +%% Description: Parse whitespaces. +%% [3] S ::= (#x20 | #x9 | #xD | #xA)+ +%%---------------------------------------------------------------------- +whitespace(?STRING_EMPTY, State, Acc) -> + case cf(?STRING_EMPTY, State, Acc, fun whitespace/3) of + {?STRING_EMPTY, State} -> + {lists:reverse(Acc), ?STRING_EMPTY, State}; + Ret -> + Ret + end; +whitespace(?STRING("\r") = Bytes, State, Acc) -> + case cf(Bytes, State, Acc, fun whitespace/3) of + {?STRING("\r") = Bytes, State} -> + {lists:reverse(Acc), Bytes, State}; + Ret -> + Ret + end; +whitespace(?STRING_REST("\n", Rest), State, Acc) -> + N = State#xmerl_sax_parser_state.line_no, + whitespace(Rest, State#xmerl_sax_parser_state{line_no=N+1}, [?lf |Acc]); +whitespace(?STRING_REST("\r\n", Rest), State, Acc) -> + N = State#xmerl_sax_parser_state.line_no, + whitespace(Rest, State#xmerl_sax_parser_state{line_no=N+1}, [?lf |Acc]); +whitespace(?STRING_REST("\r", Rest), State, Acc) -> + N = State#xmerl_sax_parser_state.line_no, + whitespace(Rest, State#xmerl_sax_parser_state{line_no=N+1}, [?lf |Acc]); +whitespace(?STRING_UNBOUND_REST(C, Rest), State, Acc) when ?is_whitespace(C) -> + whitespace(Rest, State, [C|Acc]); +whitespace(?STRING_UNBOUND_REST(_C, _) = Bytes, State, Acc) -> + {lists:reverse(Acc), Bytes, State}; +whitespace(Bytes, #xmerl_sax_parser_state{encoding=Enc} = State, Acc) when is_binary(Bytes) -> + case unicode:characters_to_list(Bytes, Enc) of + {incomplete, _, _} -> + cf(Bytes, State, Acc, fun whitespace/3); + {error, _Encoded, _Rest} -> + ?fatal_error(State, lists:flatten(io_lib:format("Bad character, not in ~p\n", [Enc]))) + end. + + +%%---------------------------------------------------------------------- +%% Function: parse_reference(Rest, State, HaveToExist) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Value, Rest, State} +%% Description: Parse entity references. +%% [66] CharRef ::= '&#' [0-9]+ ';' +%% | '&#x' [0-9a-fA-F]+ ';' +%% [67] Reference ::= EntityRef | CharRef +%% [68] EntityRef ::= '&' Name ';' +%%---------------------------------------------------------------------- +parse_reference(?STRING_EMPTY, State, HaveToExist) -> + cf(?STRING_EMPTY, State, HaveToExist, fun parse_reference/3); +parse_reference(?STRING("#"), State, HaveToExist) -> + cf(?STRING("#"), State, HaveToExist, fun parse_reference/3); +parse_reference(?STRING_REST("#x", Rest), State, _HaveToExist) -> + {CharValue, RefString, Rest1, State1} = parse_hex(Rest, State, []), + if + ?is_char(CharValue) -> + {{character, is_delimiter(CharValue), CharValue}, + Rest1, State1}; + true -> + ?fatal_error(State1, "Not a legal character: #x" ++ RefString) %%WFC: Legal Character + end; +parse_reference(?STRING_REST("#", Rest), State, _HaveToExist) -> + {CharValue, RefString, Rest1, State1} = parse_digit(Rest, State, []), + if + ?is_char(CharValue) -> + {{character, is_delimiter(CharValue), CharValue}, + Rest1, State1}; + true -> + ?fatal_error(State1, "Not a legal character: #" ++ RefString)%%WFC: Legal Character + end; +parse_reference(?STRING_UNBOUND_REST(C, Rest), State, HaveToExist) -> + case is_name_start(C) of + true -> + {Name, Rest1, State1} = parse_name(Rest, State, [C]), + parse_reference_1(Rest1, State1, HaveToExist, Name); + false -> + ?fatal_error(State, "name expected") + end; +parse_reference(Bytes, State, HaveToExist) -> + unicode_incomplete_check([Bytes, State, HaveToExist, fun parse_reference/3], + underfined). + + +parse_reference_1(?STRING_REST(";", Rest), State, HaveToExist, Name) -> + case look_up_reference(Name, HaveToExist, State) of + {internal_general, Name, RefValue} -> + {{internal_general, is_delimiter(RefValue), Name, RefValue}, + Rest, State}; + Result -> + {Result, Rest, State} + end; +parse_reference_1(Bytes, State, HaveToExist, Name) -> + unicode_incomplete_check([Bytes, State, HaveToExist, Name, fun parse_reference_1/4], + "Missing semicolon after reference: " ++ Name). + + + +%%---------------------------------------------------------------------- +%% Function: is_delimiter(Character) -> Result +%% Parameters: Character +%% Result : +%%---------------------------------------------------------------------- +is_delimiter(38) -> + true; +is_delimiter(60) -> + true; +is_delimiter(62) -> + true; +is_delimiter(39) -> + true; +is_delimiter(34) -> + true; +is_delimiter("&") -> + true; +is_delimiter("<") -> + true; +is_delimiter(">") -> + true; +is_delimiter("'") -> + true; +is_delimiter("\"") -> + true; +is_delimiter(_) -> + false. + +%%---------------------------------------------------------------------- +%% Function: parse_pe_reference(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = string() +%% Result : {Result, Rest, State} +%% Description: Parse a parameter entity reference. +%% [69] PEReference ::= '%' Name ';' +%%---------------------------------------------------------------------- +parse_pe_reference(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_pe_reference/2); +parse_pe_reference(?STRING_UNBOUND_REST(C, Rest), State) -> + case is_name_start(C) of + true -> + {Name, Rest1, State1} = parse_name(Rest, State, [C]), + parse_pe_reference_1(Rest1, State1, Name); + false -> + ?fatal_error(State, "Name expected") + end; +parse_pe_reference(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_pe_reference/2], + underfined). + + +parse_pe_reference_1(?STRING_REST(";", Rest), State, Name) -> + Name1 = "%" ++ Name, + Result = look_up_reference(Name1, true, State), + {Result, Rest, State}; +parse_pe_reference_1(Bytes, State, Name) -> + unicode_incomplete_check([Bytes, State, Name, fun parse_pe_reference_1/3], + "missing ; after reference " ++ Name). + + + +%%---------------------------------------------------------------------- +%% Function: insert_reference(Reference, State) -> Result +%% Parameters: Reference = string() +%% State = #xmerl_sax_parser_state{} +%% Result : +%%---------------------------------------------------------------------- +insert_reference({Name, Type, Value}, Table) -> + case ets:lookup(Table, Name) of + [{Name, _, _}] -> + ok; + _ -> + ets:insert(Table, {Name, Type, Value}) + end. + + + +%%---------------------------------------------------------------------- +%% Function: look_up_reference(Reference, State) -> Result +%% Parameters: Reference = string() +%% State = #xmerl_sax_parser_state{} +%% Result : +%%---------------------------------------------------------------------- +look_up_reference("amp", _, _) -> + {internal_general, "amp", "&"}; +look_up_reference("lt", _, _) -> + {internal_general, "lt", "<"}; +look_up_reference("gt", _, _) -> + {internal_general, "gt", ">"}; +look_up_reference("apos", _, _) -> + {internal_general, "apos", "'"}; +look_up_reference("quot", _, _) -> + {internal_general, "quot", "\""}; +look_up_reference(Name, HaveToExist, State) -> + case ets:lookup(State#xmerl_sax_parser_state.ref_table, Name) of + [{Name, Type, Value}] -> + {Type, Name, Value}; + _ -> + case HaveToExist of + true -> + case State#xmerl_sax_parser_state.standalone of + yes -> + ?fatal_error(State, "Entity not declared: " ++ Name); %%WFC: Entity Declared + no -> + ?fatal_error(State, "Entity not declared: " ++ Name) %%VC: Entity Declared + end; + false -> + {not_found, Name} + end + end. + + +%%---------------------------------------------------------------------- +%% Function: parse_hex(Rest, State, Acc) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = string() +%% Result : {Value, Reference, Rest, State} +%% Value = integer() +%% Reference = string() +%% Description: Parse a hex reference. +%%---------------------------------------------------------------------- +parse_hex(?STRING_EMPTY, State, Acc) -> + cf(?STRING_EMPTY, State, Acc, fun parse_hex/3); +parse_hex(?STRING_REST(";", Rest), State, Acc) -> + RefString = lists:reverse(Acc), + {erlang:list_to_integer(RefString, 16), RefString, Rest, State}; +parse_hex(?STRING_UNBOUND_REST(C, Rest), State, Acc) when ?is_hex_digit(C) -> + parse_hex(Rest, State, [C |Acc]); +parse_hex(Bytes, State, Acc) -> + unicode_incomplete_check([Bytes, State, Acc, fun parse_hex/3], + "Bad hex value in reference: "). + + +%%---------------------------------------------------------------------- +%% Function: parse_digit(Rest, State, Acc) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = string() +%% Result : {Value, Reference, Rest, State} +%% Value = integer() +%% Reference = string() +%% Description: Parse a decimal reference. +%%---------------------------------------------------------------------- +parse_digit(?STRING_EMPTY, State, Acc) -> + cf(?STRING_EMPTY, State, Acc, fun parse_digit/3); +parse_digit(?STRING_REST(";", Rest), State, Acc) -> + RefString = lists:reverse(Acc), + {list_to_integer(RefString), RefString, Rest, State}; +parse_digit(?STRING_UNBOUND_REST(C, Rest), State, Acc) -> + case is_digit(C) of + true -> + parse_digit(Rest, State, [C |Acc]); + false -> + ?fatal_error(State, "Character in reference not a digit: " ++ [C]) + end; +parse_digit(Bytes, State, Acc) -> + unicode_incomplete_check([Bytes, State, Acc, fun parse_digit/3], + undefined). + +%%---------------------------------------------------------------------- +%% Function: parse_system_litteral(Rest, State, Stop, Acc) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Stop = $' | $" +%% Acc = string() +%% Result : {Value, Reference, Rest, State} +%% Value = integer() +%% Reference = string() +%% Description: Parse a system litteral. +%% [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") +%%---------------------------------------------------------------------- +parse_system_litteral(?STRING_EMPTY, State, Stop, Acc) -> + cf(?STRING_EMPTY, State, Stop, Acc, fun parse_system_litteral/4); +parse_system_litteral(?STRING_UNBOUND_REST(Stop, Rest), State, Stop, Acc) -> + {lists:reverse(Acc), Rest, State}; +parse_system_litteral(?STRING_UNBOUND_REST(C, Rest), State, Stop, Acc) -> + parse_system_litteral(Rest, State, Stop, [C |Acc]); +parse_system_litteral(Bytes, State, Stop, Acc) -> + unicode_incomplete_check([Bytes, State, Stop, Acc, fun parse_system_litteral/4], + undefined). + +%%---------------------------------------------------------------------- +%% Function: parse_pubid_litteral(Rest, State, Stop, Acc) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Stop = $' | $" +%% Acc = string() +%% Result : {Value, Reference, Rest, State} +%% Value = integer() +%% Reference = string() +%% Description: Parse a public idlitteral. +%% [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" +%%---------------------------------------------------------------------- +parse_pubid_litteral(?STRING_EMPTY, State, Stop, Acc) -> + cf(?STRING_EMPTY, State, Stop, Acc, fun parse_pubid_litteral/4); +parse_pubid_litteral(?STRING_UNBOUND_REST(Stop, Rest), State, Stop, Acc) -> + {lists:reverse(Acc), Rest, State}; +parse_pubid_litteral(?STRING_UNBOUND_REST(C, Rest), State, Stop, Acc) -> + case is_pubid_char(C) of + true -> + parse_pubid_litteral(Rest, State, Stop, [C |Acc]); + false -> + ?fatal_error(State, "Character not allowed in pubid litteral: " ++ [C]) + end; +parse_pubid_litteral(Bytes, State, Stop, Acc) -> + unicode_incomplete_check([Bytes, State, Stop, Acc, fun parse_pubid_litteral/4], + undefined). + +%%====================================================================== +%% DTD Parsing +%%====================================================================== + +%%---------------------------------------------------------------------- +%% Function : parse_doctype(Rest, State, Level, Acc) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Level = integer() +%% Acc = string() +%% Result : {string(), Rest, State} +%% Description: This function is just searching the end of the doctype +%% declaration and doesn't parse it. It's used when the +%% parse_dtd option is set to skip. +%%---------------------------------------------------------------------- +%% Just returns doctype as string +%% parse_doctype(?STRING_EMPTY, State, Level, Acc) -> +%% cf(?STRING_EMPTY, State, Level, Acc, fun parse_doctype/4); +%% parse_doctype(?STRING("\r"), State, Level, Acc) -> +%% cf(?STRING("\r"), State, Level, Acc, fun parse_doctype/4); +%% parse_doctype(?STRING_REST(">", Rest), State, 0, Acc) -> +%% {Acc, Rest, State}; +%% parse_doctype(?STRING_REST(">", Rest), State, Level, Acc) -> +%% parse_doctype(Rest, State, Level-1, Acc); +%% parse_doctype(?STRING_REST("<", Rest), State, Level, Acc) -> +%% parse_doctype(Rest, State, Level+1, [$<|Acc]); +%% parse_doctype(?STRING_REST("\n", Rest), #xmerl_sax_parser_state{line_no=N} = State, Level, Acc) -> +%% parse_doctype(Rest, State#xmerl_sax_parser_state{line_no=N+1}, Level, [?lf |Acc]); +%% parse_doctype(?STRING_REST("\r\n", Rest), #xmerl_sax_parser_state{line_no=N} = State, Level, Acc) -> +%% parse_doctype(Rest, State#xmerl_sax_parser_state{line_no=N+1}, Level, [?lf |Acc]); +%% parse_doctype(?STRING_REST("\r", Rest), #xmerl_sax_parser_state{line_no=N} = State, Level, Acc) -> +%% parse_doctype(Rest, State#xmerl_sax_parser_state{line_no=N+1}, Level, [?lf |Acc]); +%% parse_doctype(?STRING_UNBOUND_REST(C, Rest), State, Level, Acc) -> +%% parse_doctype(Rest, State, Level, [C|Acc]); +%% parse_doctype(Bytes, State, Level, Acc) -> +%% unicode_incomplete_check([Bytes, State, Level, Acc, fun parse_doctype/4], +%% undefined). + + +%%---------------------------------------------------------------------- +%% Function : parse_doctype(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Rest, State} +%% Description: This function starts an parsing of the DTD +%% that sends apropriate events. +%% [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? +%% ('[' (markupdecl | PEReference | S)* ']' S?)? '>' +%%---------------------------------------------------------------------- +parse_doctype(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_doctype/2); +parse_doctype(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {_WS, Rest, State1} = whitespace(Bytes, State, []), + parse_doctype(Rest, State1); +parse_doctype(?STRING_UNBOUND_REST(C, Rest), State) -> + case is_name_start(C) of + true -> + {Name, Rest1, State1} = parse_name(Rest, State, [C]), + parse_doctype_1(Rest1, State1, Name, false); + false -> + ?fatal_error(State, "expecting name or whitespace") + end; +parse_doctype(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_doctype/2], + undefined). + + +%%---------------------------------------------------------------------- +%% Function : parse_doctype_1(Rest, State, Name, Definition) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Name = string() +%% Definition = true |false +%% Result : {Rest, State} +%% Description: Gets the DTD name as a parameter and contine parse the DOCTYPE +%% directive +%%---------------------------------------------------------------------- +parse_doctype_1(?STRING_EMPTY, State, Name, Definition) -> + cf(?STRING_EMPTY, State, Name, Definition, fun parse_doctype_1/4); +parse_doctype_1(?STRING_REST(">", Rest), State, _, _) -> + {Rest, State}; +parse_doctype_1(?STRING_REST("[", Rest), State, Name, Definition) -> + State1 = + case Definition of + false -> + event_callback({startDTD, Name, "", ""}, State); + true -> + State + end, + {Rest1, State2} = parse_doctype_decl(Rest, State1), + {_WS, Rest2, State3} = whitespace(Rest1, State2, []), + parse_doctype_2(Rest2, State3); +parse_doctype_1(?STRING_UNBOUND_REST(C, _) = Rest, State, Name, Definition) when ?is_whitespace(C) -> + {_WS, Rest1, State1} = whitespace(Rest, State, []), + parse_doctype_1(Rest1, State1, Name, Definition); +parse_doctype_1(?STRING_UNBOUND_REST(C, _) = Rest, State, Name, _Definition) when C == $S; C == $P -> + {PubId, SysId, Rest1, State1} = parse_external_id(Rest, State, false), + State2 = event_callback({startDTD, Name, PubId, SysId}, State1), + State3 = + case State2#xmerl_sax_parser_state.skip_external_dtd of + false -> + parse_external_entity(State2#xmerl_sax_parser_state{file_type=dtd}, PubId, SysId); + true -> + State2 + end, + parse_doctype_1(Rest1, State3, Name, true); +parse_doctype_1(Bytes, State, Name, Definition) -> + unicode_incomplete_check([Bytes, State, Name, Definition, fun parse_doctype_1/4], + "expecting >, external id or declaration part"). + + +parse_doctype_2(?STRING_REST(">", Rest), State) -> + {Rest, State}; +parse_doctype_2(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_doctype_2/2], + "expecting >"). + + +%%---------------------------------------------------------------------- +%% Function : parse_external_entity(State, PubId, SysId) -> Result +%% Parameters: State = #xmerl_sax_parser_state{} +%% PubId = string() +%% SysId = string() +%% Result : {Rest, State} +%% Description: Starts the parsing of an external entity by calling the resolver and +%% then sends the input to the parsing function. +%%---------------------------------------------------------------------- +%% The public id is not handled +parse_external_entity(State, _PubId, SysId) -> + + ExtRef = check_uri(SysId, State#xmerl_sax_parser_state.current_location), + + SaveState = event_callback({startEntity, SysId}, State), + + State1 = State#xmerl_sax_parser_state{line_no=1, + continuation_state=undefined, + continuation_fun=fun xmerl_sax_parser:default_continuation_cb/1, + end_tags = []}, + + + EventState = handle_external_entity(ExtRef, State1), + + NewState = event_callback({endEntity, SysId}, SaveState#xmerl_sax_parser_state{event_state=EventState}), + NewState#xmerl_sax_parser_state{file_type=normal}. + + + +%%---------------------------------------------------------------------- +%% Function : handle_external_entity(ExtRef, State) -> Result +%% Parameters: ExtRef = {file, string()} | {http, string()} +%% State = #xmerl_sax_parser_state{} +%% Result : string() | binary() +%% Description: Returns working directory, entity and the opened +%% filedescriptor. +%%---------------------------------------------------------------------- +handle_external_entity({file, FileToOpen}, State) -> + + case file:open(FileToOpen, [raw, read, binary]) of + {error, Reason} -> + ?fatal_error(State, "Couldn't open external entity "++ FileToOpen ++ " : " + ++ file:format_error(Reason)); + {ok, FD} -> + {?STRING_EMPTY, EntityState} = + parse_external_entity_1(<<>>, + State#xmerl_sax_parser_state{continuation_state=FD, + current_location=filename:dirname(FileToOpen), + entity=filename:basename(FileToOpen)}), + file:close(FD), + EntityState#xmerl_sax_parser_state.event_state + end; +handle_external_entity({http, Url}, State) -> + + try + {Host, Port, Key} = http(Url), + TmpFile = http_get_file(Host, Port, Key), + case file:open(TmpFile, [raw, read, binary]) of + {error, Reason} -> + ?fatal_error(State, "Couldn't open temporary file " ++ TmpFile ++ " : " + ++ file:format_error(Reason)); + {ok, FD} -> + {?STRING_EMPTY, EntityState} = + parse_external_entity_1(<<>>, + State#xmerl_sax_parser_state{continuation_state=FD, + current_location=filename:dirname(Url), + entity=filename:basename(Url)}), + file:close(FD), + file:delete(TmpFile), + EntityState#xmerl_sax_parser_state.event_state + end + catch + throw:{error, Error} -> + ?fatal_error(State, Error) + end; +handle_external_entity({Tag, _Url}, State) -> + ?fatal_error(State, "Unsupported URI type: " ++ atom_to_list(Tag)). + +%%---------------------------------------------------------------------- +%% Function : parse_external_entity_1(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Rest, State} +%% Description: Parse the external entity. +%%---------------------------------------------------------------------- +parse_external_entity_1(?STRING_EMPTY, #xmerl_sax_parser_state{file_type=Type} = State) -> + case catch cf(?STRING_EMPTY, State, fun parse_external_entity_1/2) of + {Rest, State1} when is_record(State1, xmerl_sax_parser_state) -> + {Rest, State}; + {fatal_error, {State1, "No more bytes"}} when Type == dtd; Type == entity -> + {?STRING_EMPTY, State1}; + Other -> + throw(Other) + end; +parse_external_entity_1(?BYTE_ORDER_MARK_1, State) -> + cf(?BYTE_ORDER_MARK_1, State, fun parse_external_entity_1/2); +parse_external_entity_1(?BYTE_ORDER_MARK_2, State) -> + cf(?BYTE_ORDER_MARK_2, State, fun parse_external_entity_1/2); +parse_external_entity_1(?BYTE_ORDER_MARK_REST(Rest), State) -> + parse_external_entity_1(Rest, State); +parse_external_entity_1(?STRING("<"), State) -> + cf(?STRING("<"), State, fun parse_external_entity_1/2); +parse_external_entity_1(?STRING("<?"), State) -> + cf(?STRING("<?"), State, fun parse_external_entity_1/2); +parse_external_entity_1(?STRING("<?x"), State) -> + cf(?STRING("<?x"), State, fun parse_external_entity_1/2); +parse_external_entity_1(?STRING("<?xm"), State) -> + cf(?STRING("<?xm"), State, fun parse_external_entity_1/2); +parse_external_entity_1(?STRING("<?xml"), State) -> + cf(?STRING("<?xml"), State, fun parse_external_entity_1/2); +parse_external_entity_1(?STRING_REST("<?xml", Rest) = Bytes, + #xmerl_sax_parser_state{file_type=Type} = State) -> + {Rest1, State1} = + case is_next_char_whitespace(Rest, State) of + false -> + {Bytes, State}; + true -> + {_XmlAttributes, R, S} = parse_version_info(Rest, State, []), + %S1 = event_callback({processingInstruction, "xml", XmlAttributes}, S),% The XML decl. should not be reported as a PI + {R, S} + end, + case Type of + dtd -> + case catch parse_doctype_decl(Rest1, State1) of + {Rest2, State2} when is_record(State2, xmerl_sax_parser_state) -> + {Rest2, State2}; + {fatal_error, {State2, "No more bytes"}} -> + {?STRING_EMPTY, State2}; + Other -> + throw(Other) + end; + + _ -> % Type is normal or entity + parse_content(Rest1, State1, [], true) + end; +parse_external_entity_1(?STRING_UNBOUND_REST(_C, _) = Bytes, + #xmerl_sax_parser_state{file_type=Type} = State) -> + case Type of + normal -> + parse_content(Bytes, State, [], true); + dtd -> + parse_doctype_decl(Bytes, State); + entity -> + parse_doctype_decl(Bytes, State) end; +parse_external_entity_1(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_external_entity_1/2], + undefined). + +%%---------------------------------------------------------------------- +%% Function : is_next_char_whitespace(Bytes, State) -> Result +%% Parameters: Bytes = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : true | false +%% Description: Checks if first character is whitespace. +%%---------------------------------------------------------------------- +is_next_char_whitespace(?STRING_UNBOUND_REST(C, _), _) when ?is_whitespace(C) -> + true; +is_next_char_whitespace(?STRING_UNBOUND_REST(_C, _), _) -> + false; +is_next_char_whitespace(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun is_next_char_whitespace/2], + undefined). + +%%---------------------------------------------------------------------- +%% Function : parse_external_id(Rest, State, OptionalSystemId) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% OptionalSystemId = true | false +%% Result : {PubId, SysId, Rest, State} +%% PubId = string() +%% SysId = string() +%% Description: Parse an external id. The function is used in two cases one +%% where the system is optional and one where it's required +%% after a public id. +%% [75] ExternalID ::= 'SYSTEM' S SystemLiteral +%% | 'PUBLIC' S PubidLiteral S SystemLiteral +%%---------------------------------------------------------------------- +parse_external_id(?STRING_EMPTY, State, OptionalSystemId) -> + cf(?STRING_EMPTY, State, OptionalSystemId, fun parse_external_id/3); +parse_external_id(?STRING("S"), State,OptionalSystemId) -> + cf(?STRING("S"), State, OptionalSystemId, fun parse_external_id/3); +parse_external_id(?STRING("SY"), State, OptionalSystemId) -> + cf(?STRING("SY"), State, OptionalSystemId, fun parse_external_id/3); +parse_external_id(?STRING("SYS"), State, OptionalSystemId) -> + cf(?STRING("SYS"), State, OptionalSystemId, fun parse_external_id/3); +parse_external_id(?STRING("SYST"), State, OptionalSystemId) -> + cf(?STRING("SYST"), State, OptionalSystemId, fun parse_external_id/3); +parse_external_id(?STRING("SYSTE"), State, OptionalSystemId) -> + cf(?STRING("SYSTE"), State, OptionalSystemId, fun parse_external_id/3); +parse_external_id(?STRING_REST("SYSTEM", Rest), State, _) -> + {SysId, Rest1, State1} = parse_system_id(Rest, State, false), + {"", SysId, Rest1, State1}; +parse_external_id(?STRING("P"), State, OptionalSystemId) -> + cf(?STRING("P"), State, OptionalSystemId, fun parse_external_id/3); +parse_external_id(?STRING("PU"), State, OptionalSystemId) -> + cf(?STRING("PU"), State, OptionalSystemId, fun parse_external_id/3); +parse_external_id(?STRING("PUB"), State, OptionalSystemId) -> + cf(?STRING("PUB"), State, OptionalSystemId, fun parse_external_id/3); +parse_external_id(?STRING("PUBL"), State, OptionalSystemId) -> + cf(?STRING("PUBL"), State, OptionalSystemId, fun parse_external_id/3); +parse_external_id(?STRING("PUBLI"), State, OptionalSystemId) -> + cf(?STRING("PUBLI"), State, OptionalSystemId, fun parse_external_id/3); +parse_external_id(?STRING_REST("PUBLIC", Rest), State, OptionalSystemId) -> + parse_public_id(Rest, State, OptionalSystemId); +parse_external_id(Bytes, State, OptionalSystemId) -> + unicode_incomplete_check([Bytes, State, OptionalSystemId, fun parse_external_id/3], + "expecting SYSTEM or PUBLIC"). + + +%%---------------------------------------------------------------------- +%% Function : parse_system_id(Rest, State, OptionalSystemId) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% OptionalSystemId = true | false +%% Result : {SysId, Rest, State} +%% SysId = string() +%% Description: Parse a system id. The function is used in two cases one +%% where the system is optional and one where it's required. +%%---------------------------------------------------------------------- +parse_system_id(?STRING_UNBOUND_REST(C, _) = Bytes, State, OptionalSystemId) when ?is_whitespace(C) -> + {_WS, Rest, State1} = whitespace(Bytes, State, []), + check_system_litteral(Rest, State1, OptionalSystemId); +parse_system_id(?STRING_UNBOUND_REST(_C, _) = Bytes, State, true) -> + {"", Bytes, State}; +parse_system_id(Bytes, State, OptionalSystemId) -> + unicode_incomplete_check([Bytes, State, OptionalSystemId, fun parse_system_id/3], + "whitespace expected"). + +check_system_litteral(?STRING_UNBOUND_REST(C, Rest), State, _OptionalSystemId) when C == $'; C == $" -> + parse_system_litteral(Rest, State, C, []); +check_system_litteral(?STRING_UNBOUND_REST(_C, _) = Bytes, State, true) -> + {"", Bytes, State}; +check_system_litteral(Bytes, State, OptionalSystemId) -> + unicode_incomplete_check([Bytes, State, OptionalSystemId, fun check_system_litteral/3], + "\" or \' expected"). + + +%%---------------------------------------------------------------------- +%% Function : parse_public_id(Rest, State, OptionalSystemId) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% OptionalSystemId = true | false +%% Result : {PubId, SysId, Rest, State} +%% PubId = string() +%% SysId = string() +%% Description: Parse a public id. The function is used in two cases one +%% where the following system is optional and one where it's required. +%%---------------------------------------------------------------------- +parse_public_id(?STRING_UNBOUND_REST(C, _) = Bytes, State, OptionalSystemId) when ?is_whitespace(C) -> + {_WS, Rest, State1} = whitespace(Bytes, State, []), + check_public_litteral(Rest, State1, OptionalSystemId); +parse_public_id(Bytes, State,OptionalSystemId) -> + unicode_incomplete_check([Bytes, State, OptionalSystemId, fun parse_public_id/3], + "whitespace expected"). + + +check_public_litteral(?STRING_UNBOUND_REST(C, Rest), State, OptionalSystemId) when C == $'; C == $" -> + {PubId, Rest1, State1} = parse_pubid_litteral(Rest, State, C, []), + {SysId, Rest2, State2} = parse_system_id(Rest1, State1, OptionalSystemId), + {PubId, SysId, Rest2, State2}; +check_public_litteral(Bytes, State, OptionalSystemId) -> + unicode_incomplete_check([Bytes, State, OptionalSystemId, fun check_public_litteral/3], + "\" or \' expected"). + + +%%---------------------------------------------------------------------- +%% Function : parse_doctype_decl(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Rest, State} +%% Description: Parse the DOCTYPE declaration part +%% [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl +%% | NotationDecl | PI | Comment +%%---------------------------------------------------------------------- +parse_doctype_decl(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_doctype_decl/2); +parse_doctype_decl(?STRING("<"), State) -> + cf(?STRING("<"), State, fun parse_doctype_decl/2); +parse_doctype_decl(?STRING_REST("<?", Rest), State) -> + {Rest1, State1} = parse_pi(Rest, State), + parse_doctype_decl(Rest1, State1); +parse_doctype_decl(?STRING_REST("%", Rest), State) -> + {Ref, Rest1, State1} = parse_pe_reference(Rest, State), + case Ref of + {internal_parameter, _, RefValue} -> + IValue = ?TO_INPUT_FORMAT(" " ++ RefValue ++ " "), + parse_doctype_decl(?APPEND_STRING(IValue, Rest1), State1); + {external_parameter, _, {PubId, SysId}} -> + State2 = parse_external_entity(State1#xmerl_sax_parser_state{file_type = entity}, PubId, SysId), + parse_doctype_decl(Rest1, State2) + end; +parse_doctype_decl(?STRING_REST("<!", Rest1), State) -> + parse_doctype_decl_1(Rest1, State); +parse_doctype_decl(?STRING_REST("]", Rest), State) -> + {Rest, State}; +parse_doctype_decl(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {_WS, Rest, State1} = whitespace(Bytes, State, []), + parse_doctype_decl(Rest, State1); +parse_doctype_decl(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_doctype_decl/2], + "expecting ELEMENT, ATTLIST, ENTITY, NOTATION or comment"). + + +%%---------------------------------------------------------------------- +%% Function : parse_doctype_decl_1(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Rest, State} +%% Description: Main switching function for the different markup declarations +%% of the DOCTYPE. +%%---------------------------------------------------------------------- +parse_doctype_decl_1(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_doctype_decl_1/2); + +parse_doctype_decl_1(?STRING("E"), State) -> + cf(?STRING("E"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("EL"), State) -> + cf(?STRING("EL"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("ELE"), State) -> + cf(?STRING("ELE"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("ELEM"), State) -> + cf(?STRING("ELEM"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("ELEME"), State) -> + cf(?STRING("ELEME"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("ELEMEN"), State) -> + cf(?STRING("ELEMEN"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING_REST("ELEMENT", Rest), State) -> + {Rest1, State1} = parse_element_decl(Rest, State), + parse_doctype_decl(Rest1, State1); + +parse_doctype_decl_1(?STRING("A"), State) -> + cf(?STRING("A"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("AT"), State) -> + cf(?STRING("AT"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("ATT"), State) -> + cf(?STRING("ATT"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("ATTL"), State) -> + cf(?STRING("ATTL"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("ATTLI"), State) -> + cf(?STRING("ATTLI"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("ATTLIS"), State) -> + cf(?STRING("ATTLIS"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING_REST("ATTLIST", Rest), State) -> + {Rest1, State1} = parse_att_list_decl(Rest, State), + parse_doctype_decl(Rest1, State1); + +%% E clause not needed here because already taken care of above. +parse_doctype_decl_1(?STRING("EN"), State) -> + cf(?STRING("EN"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("ENT"), State) -> + cf(?STRING("ENT"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("ENTI"), State) -> + cf(?STRING("ENTI"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("ENTIT"), State) -> + cf(?STRING("ENTIT"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING_REST("ENTITY", Rest), State) -> + {Rest1, State1} = parse_entity_decl(Rest, State), + parse_doctype_decl(Rest1, State1); + +parse_doctype_decl_1(?STRING("N"), State) -> + cf(?STRING("N"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("NO"), State) -> + cf(?STRING("NO"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("NOT"), State) -> + cf(?STRING("NOT"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("NOTA"), State) -> + cf(?STRING("NOTA"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("NOTAT"), State) -> + cf(?STRING("NOTAT"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("NOTATI"), State) -> + cf(?STRING("NOTATI"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING("NOTATIO"), State) -> + cf(?STRING("NOTATIO"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING_REST("NOTATION", Rest), State) -> + {Rest1, State1} = parse_notation_decl(Rest, State), + parse_doctype_decl(Rest1, State1); +parse_doctype_decl_1(?STRING("-"), State) -> + cf(?STRING("-"), State, fun parse_doctype_decl_1/2); +parse_doctype_decl_1(?STRING_REST("--", Rest), State) -> + {Rest1, State1} = parse_comment(Rest, State, []), + parse_doctype_decl(Rest1, State1); +parse_doctype_decl_1(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_doctype_decl_1/2], + "expecting ELEMENT, ATTLIST, ENTITY, NOTATION or comment"). + + +%%---------------------------------------------------------------------- +%% Function : parse_element_decl(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Rest, State} +%% Description: Parse element declarations. +%% [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' +%%---------------------------------------------------------------------- +parse_element_decl(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_element_decl/2); +parse_element_decl(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {_WS, Rest, State1} = whitespace(Bytes, State, []), + parse_element_decl_1(Rest, State1); +parse_element_decl(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_element_decl/2], + "whitespace expected"). + +parse_element_decl_1(?STRING_UNBOUND_REST(C, Rest), State) -> + case is_name_start(C) of + true -> + {Name, Rest1, State1} = parse_name(Rest, State, [C]), + {Model, Rest2, State2} = parse_element_content(Rest1, State1), + State3 = event_callback({elementDecl, Name, Model}, State2), + {Rest2, State3}; + false -> + ?fatal_error(State, "name expected") + end; +parse_element_decl_1(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_element_decl_1/2], + undefined). + + +%%---------------------------------------------------------------------- +%% Function : parse_element_content(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Rest, State} +%% Description: Parse contents of an element declaration. +%% [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children +%%---------------------------------------------------------------------- +parse_element_content(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_element_content/2); +parse_element_content(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {_WS, Rest, State1} = whitespace(Bytes, State, []), + parse_element_content_1(Rest, State1, []); +parse_element_content(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_element_content/2], + "whitespace expected"). + + +%%---------------------------------------------------------------------- +%% Function : parse_element_content_1(Rest, State, Acc) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = string() +%% Result : {Content, Rest, State} +%% Content = string() +%% Description: Parse contents of an element declaration. +%%---------------------------------------------------------------------- +parse_element_content_1(?STRING_EMPTY, State, Acc) -> + cf(?STRING_EMPTY, State, Acc, fun parse_element_content_1/3); +parse_element_content_1(?STRING_REST(">", Rest), State, Acc) -> + {lists:reverse(Acc), Rest, State}; +parse_element_content_1(?STRING_UNBOUND_REST(C, Rest), State, Acc) -> + parse_element_content_1(Rest, State, [C|Acc]); +parse_element_content_1(Bytes, State, Acc) -> + unicode_incomplete_check([Bytes, State, Acc, fun parse_element_content_1/3], + undefined). + + +%%---------------------------------------------------------------------- +%% Function : parse_att_list_decl(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Rest, State} +%% Description: Parse an attribute list declaration. +%% [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' +%%---------------------------------------------------------------------- +parse_att_list_decl(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_att_list_decl/2); +parse_att_list_decl(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {_WS, Rest, State1} = whitespace(Bytes, State, []), + parse_att_list_decl_1(Rest, State1); +parse_att_list_decl(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_att_list_decl/2], + "whitespace expected"). + + +parse_att_list_decl_1(?STRING_UNBOUND_REST(C, Rest), State) -> + case is_name_start(C) of + true -> + {ElementName, Rest1, State1} = parse_name(Rest, State, [C]), + parse_att_defs(Rest1, State1, ElementName); + false -> + ?fatal_error(State, "name expected") + end; +parse_att_list_decl_1(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_att_list_decl_1/2], + undefined). + + +%%---------------------------------------------------------------------- +%% Function : parse_att_defs(Rest, State, ElementName) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% ElementName = string() +%% Result : {Rest, State} +%% Description: Parse an attribute definition. +%% [53] AttDef ::= S Name S AttType S DefaultDecl +%%---------------------------------------------------------------------- +parse_att_defs(?STRING_EMPTY, State, ElementName) -> + cf(?STRING_EMPTY, State, ElementName, fun parse_att_defs/3); +parse_att_defs(?STRING_REST(">", Rest), State, _ElementName) -> + {Rest, State}; +parse_att_defs(?STRING_UNBOUND_REST(C, _) = Rest, State, ElementName) when ?is_whitespace(C) -> + {_WS, Rest1, State1} = whitespace(Rest, State, []), + parse_att_defs(Rest1, State1, ElementName); +parse_att_defs(?STRING_UNBOUND_REST(C, Rest), State, ElementName) -> + case is_name_start(C) of + true -> + {AttrName, Rest1, State1} = parse_name(Rest, State, [C]), + {Type, Rest2, State2} = parse_att_type(Rest1, State1), + {Mode, Value, Rest3, State3} = parse_default_decl(Rest2, State2), + State4 = event_callback({attributeDecl, ElementName, AttrName, Type, Mode, Value}, State3), + parse_att_defs(Rest3, State4, ElementName); + false -> + ?fatal_error(State, "whitespace or name expected") + end; +parse_att_defs(Bytes, State, ElementName) -> + unicode_incomplete_check([Bytes, State, ElementName, fun parse_att_defs/3], + undefined). + + +%%---------------------------------------------------------------------- +%% Function : parse_att_type(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Type, Rest, State} +%% Type = string() +%% Description: Parse an attribute type. +%% [54] AttType ::= StringType | TokenizedType | EnumeratedType +%% [55] StringType ::= 'CDATA' +%% [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' +%% | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' +%% [57] EnumeratedType ::= NotationType | Enumeration +%% [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' +%% [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' +%%---------------------------------------------------------------------- +parse_att_type(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_att_type/2); +parse_att_type(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {_WS, Rest, State1} = whitespace(Bytes, State, []), + case parse_att_type_1(Rest, State1, []) of + {Type, Rest1, State2} when Type == "("; Type == "NOTATION" -> + {T, Rest2, State3} = parse_until_right_paren(Rest1, State2, []), + {Type ++ T, Rest2, State3}; + {Type, Rest1, State2} -> + case check_att_type(Type) of + true -> + {Type, Rest1, State2}; + false -> + ?fatal_error(State2, "wrong attribute type") + end + end; +parse_att_type(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_att_type/2], + "whitespace expected"). + + +%%---------------------------------------------------------------------- +%% Function : parse_att_type_1(Rest, State, Acc) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = string() +%% Result : {Type, Rest, State} +%% Type = string() +%% Description: Parse an attribute type. +%%---------------------------------------------------------------------- +parse_att_type_1(?STRING_EMPTY, State, Acc) -> + cf(?STRING_EMPTY, State, Acc, fun parse_att_type_1/3); +parse_att_type_1(?STRING_UNBOUND_REST(C, _) = Bytes, State, Acc) when ?is_whitespace(C) -> + {lists:reverse(Acc), Bytes, State}; +parse_att_type_1(?STRING_REST("(", Rest), State, []) -> + {"(", Rest, State}; +parse_att_type_1(?STRING_UNBOUND_REST(C, Rest), State, Acc) -> + parse_att_type_1(Rest, State, [C|Acc]); +parse_att_type_1(Bytes, State, Acc) -> + unicode_incomplete_check([Bytes, State, Acc, fun parse_att_type_1/3], + undefined). + +%%---------------------------------------------------------------------- +%% Function : check_att_type(Type) -> Result +%% Parameters: Type = string() +%% Result : true | false +%% Description:Check if an attribute type is valid. +%%---------------------------------------------------------------------- +check_att_type("CDATA") -> + true; +check_att_type("ID") -> + true; +check_att_type("IDREF") -> + true; +check_att_type("IDREFS") -> + true; +check_att_type("ENTITY") -> + true; +check_att_type("ENTITIES") -> + true; +check_att_type("NMTOKEN") -> + true; +check_att_type("NMTOKENS") -> + true; +check_att_type(_) -> + false. + + +%%---------------------------------------------------------------------- +%% Function : parse_until_right_paren(Rest, State, Acc) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = string() +%% Result : {Type, Rest, State} +%% Type = string() +%% Description: Parse an enumurated type until ')'. +%%---------------------------------------------------------------------- +parse_until_right_paren(?STRING_EMPTY, State, Acc) -> + cf(?STRING_EMPTY, State, Acc, fun parse_until_right_paren/3); +parse_until_right_paren(?STRING_REST(")", Rest), State, Acc) -> + {lists:reverse(")" ++ Acc), Rest, State}; +parse_until_right_paren(?STRING_UNBOUND_REST(C, Rest), State, Acc) -> + parse_until_right_paren(Rest, State, [C|Acc]); +parse_until_right_paren(Bytes, State, Acc) -> + unicode_incomplete_check([Bytes, State, Acc, fun parse_until_right_paren/3], + undefined). + + +%%---------------------------------------------------------------------- +%% Function : parse_default_decl(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Default, Rest, State} +%% Default = string() +%% Description: Parse a default declaration. +%% [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue) +%%---------------------------------------------------------------------- +parse_default_decl(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_default_decl/2); +parse_default_decl(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {_WS, Rest, State1} = whitespace(Bytes, State, []), + parse_default_decl_1(Rest, State1); +parse_default_decl(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_default_decl/2], + "whitespace expected"). + + +%%---------------------------------------------------------------------- +%% Function : parse_default_decl_1(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Default, Rest, State} +%% Default = string() +%% Description: Parse a default declaration. +%%---------------------------------------------------------------------- +parse_default_decl_1(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_default_decl_1/2); +parse_default_decl_1(?STRING_REST("#", Rest), State) -> + case Rest of + ?STRING("R") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING("RE") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING("REQ") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING("REQU") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING("REQUI") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING("REQUIR") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING("REQUIRE") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING_REST("REQUIRED", Rest1) -> + {"#REQUIRED", undefined, Rest1, State}; + + ?STRING("I") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING("IM") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING("IMP") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING("IMPL") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING("IMPLI") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING("IMPLIE") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING_REST("IMPLIED", Rest1) -> + {"#IMPLIED", undefined, Rest1, State}; + + ?STRING("F") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING("FI") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING("FIX") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING("FIXE") -> + cf(Rest, State, fun parse_default_decl_1/2); + ?STRING_REST("FIXED", Rest1) -> + parse_fixed(Rest1, State); + _ -> + ?fatal_error(State, "REQUIRED, IMPLIED or FIXED expected") + end; +parse_default_decl_1(?STRING_UNBOUND_REST(C, Rest), State) when C == $'; C == $" -> + {DefaultValue, Rest1, State1} = parse_att_value(Rest, State, C, []), + {"", DefaultValue, Rest1, State1}; +parse_default_decl_1(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_default_decl_1/2], + "bad default declaration"). + + +parse_fixed(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {DefaultValue, Rest, State1} = parse_att_value(Bytes, State), % parse_att_value removes leading WS + {"#FIXED", DefaultValue, Rest, State1}; +parse_fixed(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_fixed/2], + "whitespace expected"). + +%%---------------------------------------------------------------------- +%% Function : parse_entity_decl(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Rest, State} +%% Description: Parse an entity declaration. +%% [70] EntityDecl ::= GEDecl | PEDecl +%% [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>' +%% [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>' +%%---------------------------------------------------------------------- +parse_entity_decl(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_entity_decl/2); +parse_entity_decl(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {_WS, Rest, State1} = whitespace(Bytes, State, []), + parse_entity_decl_1(Rest, State1); +parse_entity_decl(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_entity_decl/2], + "whitespace expected"). + + +%%---------------------------------------------------------------------- +%% Function : parse_entity_decl_1(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Rest, State} +%% Description: Parse an entity declaration. +%%---------------------------------------------------------------------- +parse_entity_decl_1(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_entity_decl_1/2); +parse_entity_decl_1(?STRING_REST("%", Rest), State) -> + case is_next_char_whitespace(Rest, State) of + true -> + {_WS, Rest1, State1} = whitespace(Rest, State, []), + parse_pe_name(Rest1, State1); + false -> + ?fatal_error(State, "whitespace expected") + end; +parse_entity_decl_1(?STRING_UNBOUND_REST(C, Rest), State) -> + case is_name_start(C) of + true -> + {Name, Rest1, State1} = parse_name(Rest, State, [C]), + case is_next_char_whitespace(Rest1, State1) of + true -> + {_WS, Rest2, State2} = whitespace(Rest1, State1, []), + parse_entity_def(Rest2, State2, Name); + false -> + ?fatal_error(State1, "whitespace expected") + end; + false -> + ?fatal_error(State, "name or % expected") + end; +parse_entity_decl_1(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_entity_decl_1/2], + undefined). + + + + +parse_pe_name(?STRING_UNBOUND_REST(C, Rest), State) -> + case is_name_start(C) of + true -> + {Name, Rest1, State1} = parse_name(Rest, State, [C]), + case is_next_char_whitespace(Rest1, State1) of + true -> + {_WS, Rest2, State2} = whitespace(Rest1, State1, []), + parse_pe_def(Rest2, State2, Name); + false -> + ?fatal_error(State1, "whitespace expected") + end; + false -> + ?fatal_error(State, "name expected") + end; +parse_pe_name(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_pe_name/2], + undefined). + + + +%%---------------------------------------------------------------------- +%% Function : parse_entity_def(Rest, State, Name) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Name = string() +%% Result : {Rest, State} +%% Description: Parse an entity definition. +%% [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?) +%%---------------------------------------------------------------------- +parse_entity_def(?STRING_EMPTY, State, Name) -> + cf(?STRING_EMPTY, State, Name, fun parse_entity_def/3); +parse_entity_def(?STRING_UNBOUND_REST(C, Rest), State, Name) when C == $'; C == $" -> + {Value, Rest1, State1} = parse_entity_value(Rest, State, C, []), + insert_reference({Name, internal_general, Value}, State1#xmerl_sax_parser_state.ref_table), + State2 = event_callback({internalEntityDecl, Name, Value}, State1), + {_WS, Rest2, State3} = whitespace(Rest1, State2, []), + parse_def_end(Rest2, State3); +parse_entity_def(?STRING_UNBOUND_REST(C, _) = Rest, State, Name) when C == $S; C == $P -> + {PubId, SysId, Rest1, State1} = parse_external_id(Rest, State, false), + {Ndata, Rest2, State2} = parse_ndata(Rest1, State1), + case Ndata of + undefined -> + insert_reference({Name, external_general, {PubId, SysId}}, + State2#xmerl_sax_parser_state.ref_table), + State3 = event_callback({externalEntityDecl, Name, PubId, SysId}, State2), + {Rest2, State3}; + _ -> + insert_reference({Name, unparsed, {PubId, SysId, Ndata}}, + State2#xmerl_sax_parser_state.ref_table), + State3 = event_callback({unparsedEntityDecl, Name, PubId, SysId, Ndata}, State2), + {Rest2, State3} + end; +parse_entity_def(Bytes, State, Name) -> + unicode_incomplete_check([Bytes, State, Name, fun parse_entity_def/3], + "\", \', SYSTEM or PUBLIC expected"). + + +parse_def_end(?STRING_REST(">", Rest), State) -> + {Rest, State}; +parse_def_end(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_def_end/2], + "> expected"). + + + +%%---------------------------------------------------------------------- +%% Function : parse_ndata(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Rest, State} +%% Description: Parse an NDATA declaration. +%% [76] NDataDecl ::= S 'NDATA' S Name +%%---------------------------------------------------------------------- +parse_ndata(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_ndata/2); +parse_ndata(?STRING_REST(">", Rest), State) -> + {undefined, Rest, State}; +parse_ndata(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {_WS, Rest1, State1} = whitespace(Bytes, State, []), + parse_ndata_decl(Rest1, State1); +parse_ndata(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_ndata/2], + "Space before NDATA or > expected"). + +%%---------------------------------------------------------------------- +%% Function : parse_entity_value(Rest, State, Stop, Acc) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Stop = $' | $" +%% Acc = string() +%% Result : {Value, Rest, State} +%% Value = string() +%% Description: Parse an attribute value +%%---------------------------------------------------------------------- +parse_entity_value(?STRING_EMPTY, State, undefined, Acc) -> + {Acc, [], State}; %% stop clause when parsing references +parse_entity_value(?STRING_EMPTY, State, Stop, Acc) -> + cf(?STRING_EMPTY, State, Stop, Acc, fun parse_entity_value/4); +parse_entity_value(?STRING("\r"), State, Stop, Acc) -> + cf(?STRING("\r"), State, Stop, Acc, fun parse_entity_value/4); +parse_entity_value(?STRING_REST("\n", Rest), #xmerl_sax_parser_state{line_no=N} = State, Stop, Acc) -> + parse_entity_value(Rest, + State#xmerl_sax_parser_state{line_no=N+1}, Stop, [?space |Acc]); +parse_entity_value(?STRING_REST("\r\n", Rest), #xmerl_sax_parser_state{line_no=N} = State, Stop, Acc) -> + parse_entity_value(Rest, + State#xmerl_sax_parser_state{line_no=N+1}, Stop, [?space |Acc]); +parse_entity_value(?STRING_REST("\r", Rest), #xmerl_sax_parser_state{line_no=N} = State, Stop, Acc) -> + parse_entity_value(Rest, + State#xmerl_sax_parser_state{line_no=N+1}, Stop, [?space |Acc]); +parse_entity_value(?STRING_REST("\t", Rest), #xmerl_sax_parser_state{line_no=N} = State, Stop, Acc) -> + parse_entity_value(Rest, + State#xmerl_sax_parser_state{line_no=N+1}, Stop, [?space |Acc]); +parse_entity_value(?STRING_REST("&", Rest), State, Stop, Acc) -> + {Ref, Rest1, State1} = parse_reference(Rest, State, false), + case Ref of + {character, _, CharValue} -> + parse_entity_value(Rest1, State1, Stop, [CharValue | Acc]); + {internal_general, _, Name, _} -> + parse_entity_value(Rest1, State1, Stop, ";" ++ lists:reverse(Name) ++ "&" ++ Acc); + {external_general, Name, _} -> + parse_entity_value(Rest1, State1, Stop, ";" ++ lists:reverse(Name) ++ "&" ++ Acc); + {not_found, Name} -> + parse_entity_value(Rest1, State1, Stop, ";" ++ lists:reverse(Name) ++ "&" ++ Acc); + {unparsed, Name, _} -> + ?fatal_error(State1, "Unparsed entity reference in entity value: " ++ Name) + end; +parse_entity_value(?STRING_REST("%", Rest), #xmerl_sax_parser_state{file_type=Type} = State, Stop, Acc) -> + {Ref, Rest1, State1} = parse_pe_reference(Rest, State), + case Type of + normal -> %WFC: PEs in Internal Subset + {_, Name, _} = Ref, + ?fatal_error(State1, "A parameter reference may not occur not within " + "markup declarations in the internal DTD subset: " ++ Name); + _ -> + case Ref of + {internal_parameter, _, RefValue} -> + IValue = ?TO_INPUT_FORMAT(" " ++ RefValue ++ " "), + parse_entity_value(?APPEND_STRING(IValue, Rest1), State1, Stop, Acc); + {external_parameter, _, {_PubId, _SysId}} -> + ?fatal_error(State1, "Parameter references in entity value not supported yet.") + end + end; +parse_entity_value(?STRING_UNBOUND_REST(Stop, Rest), State, Stop, Acc) -> + {lists:reverse(Acc), Rest, State}; +parse_entity_value(?STRING_UNBOUND_REST(C, Rest), State, Stop, Acc) -> + parse_entity_value(Rest, State, Stop, [C|Acc]); +parse_entity_value(Bytes, State, Stop, Acc) -> + unicode_incomplete_check([Bytes, State, Stop, Acc, fun parse_entity_value/4], + undefined). + +%%---------------------------------------------------------------------- +%% Function : parse_ndata_decl(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Name, Rest, State} +%% Name = string() +%% Description: Parse an NDATA declaration. +%% [76] NDataDecl ::= S 'NDATA' S Name +%%---------------------------------------------------------------------- +parse_ndata_decl(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_ndata_decl/2); +parse_ndata_decl(?STRING_REST(">", Rest), State) -> + {undefined, Rest, State}; +parse_ndata_decl(?STRING("N") = Rest, State) -> + cf(Rest, State, fun parse_ndata_decl/2); +parse_ndata_decl(?STRING("ND") = Rest, State) -> + cf(Rest, State, fun parse_ndata_decl/2); +parse_ndata_decl(?STRING("NDA") = Rest, State) -> + cf(Rest, State, fun parse_ndata_decl/2); +parse_ndata_decl(?STRING("NDAT") = Rest, State) -> + cf(Rest, State, fun parse_ndata_decl/2); +parse_ndata_decl(?STRING_REST("NDATA", Rest), State) -> + parse_ndata_decl_1(Rest, State); +parse_ndata_decl(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_ndata_decl/2], + "NDATA or > expected"). + + +parse_ndata_decl_1(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {_WS, Rest, State1} = whitespace(Bytes, State, []), + parse_ndecl_name(Rest, State1); +parse_ndata_decl_1(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_ndata_decl/2], + "whitespace expected"). + + +parse_ndecl_name(?STRING_UNBOUND_REST(C, Rest), State) -> + case is_name_start(C) of + true -> + {Name, Rest1, State1} = parse_name(Rest, State, [C]), + {_WS, Rest2, State2} = whitespace(Rest1, State1, []), + {Rest3, State3} = parse_def_end(Rest2, State2), + {Name, Rest3, State3}; + false -> + ?fatal_error(State, "name expected") + end; +parse_ndecl_name(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_ndecl_name/2], + undefined). + +%%---------------------------------------------------------------------- +%% Function : parse_pe_def(Rest, State, Name) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Name = string() +%% Result : {Rest, State} +%% Description: Parse an parameter entity definition. +%% [74] PEDef ::= EntityValue | ExternalID +%%---------------------------------------------------------------------- +parse_pe_def(?STRING_EMPTY, State, Name) -> + cf(?STRING_EMPTY, State, Name, fun parse_pe_def/3); +parse_pe_def(?STRING_UNBOUND_REST(C, Rest), State, Name) when C == $'; C == $" -> + {Value, Rest1, State1} = parse_entity_value(Rest, State, C, []), + Name1 = "%" ++ Name, + insert_reference({Name1, internal_parameter, Value}, + State1#xmerl_sax_parser_state.ref_table), + State2 = event_callback({internalEntityDecl, Name1, Value}, State1), + {_WS, Rest2, State3} = whitespace(Rest1, State2, []), + parse_def_end(Rest2, State3); +parse_pe_def(?STRING_UNBOUND_REST(C, _) = Bytes, State, Name) when C == $S; C == $P -> + {PubId, SysId, Rest1, State1} = parse_external_id(Bytes, State, false), + Name1 = "%" ++ Name, + insert_reference({Name1, external_parameter, {PubId, SysId}}, + State1#xmerl_sax_parser_state.ref_table), + State2 = event_callback({externalEntityDecl, Name1, PubId, SysId}, State1), + {_WS, Rest2, State3} = whitespace(Rest1, State2, []), + parse_def_end(Rest2, State3); +parse_pe_def(Bytes, State, Name) -> + unicode_incomplete_check([Bytes, State, Name, fun parse_pe_def/3], + "\", \', SYSTEM or PUBLIC expected"). + + +%%---------------------------------------------------------------------- +%% Function : parse_notation_decl(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Rest, State} +%% Description: Parse a NOTATION declaration. +%% [82] NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>' +%%---------------------------------------------------------------------- +parse_notation_decl(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_notation_decl/2); +parse_notation_decl(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {_WS, Rest, State1} = whitespace(Bytes, State, []), + parse_notation_decl_1(Rest, State1); +parse_notation_decl(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_notation_decl/2], + "whitespace expected"). + + +parse_notation_decl_1(?STRING_UNBOUND_REST(C, Rest), State) -> + case is_name_start(C) of + true -> + {Name, Rest1, State1} = parse_name(Rest, State, [C]), + {PubId, SysId, Rest2, State2} = parse_notation_id(Rest1, State1), + State3 = event_callback({notationDecl, Name, PubId, SysId}, State2), + {Rest2, State3}; + false -> + ?fatal_error(State, "name expected") + end; +parse_notation_decl_1(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_notation_decl_1/2], + undefined). + +%%---------------------------------------------------------------------- +%% Function : parse_notation_id(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {PubId, SysId, Rest, State} +%% PubId = string() +%% SysId = string() +%% Description: Parse a NOTATION identity. The public id case is a special +%% variant of extenal id where just the public part is allowed. +%% This is allowed if the third parameter in parse_external_id/3 +%% is true. +%% [83] PublicID ::= 'PUBLIC' S PubidLiteral +%%---------------------------------------------------------------------- +parse_notation_id(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_notation_id/2); +%parse_notation_id(?STRING_REST(">", Rest), State) -> +% {"", "", Rest, State}; +parse_notation_id(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {_WS, Rest, State1} = whitespace(Bytes, State, []), + parse_notation_id_1(Rest, State1); +parse_notation_id(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_notation_id/2], + "whitespace expected"). + +%%---------------------------------------------------------------------- +%% Function : parse_notation_id_1(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {PubId, SysId, Rest, State} +%% PubId = string() +%% SysId = string() +%% Description: Parse a NOTATION identity. +%%---------------------------------------------------------------------- +parse_notation_id_1(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_notation_id_1/2); +parse_notation_id_1(?STRING_UNBOUND_REST(C, _) = Bytes, State) when C == $S; C == $P -> + {PubId, SysId, Rest1, State1} = parse_external_id(Bytes, State, true), + {_WS, Rest2, State2} = whitespace(Rest1, State1, []), + {Rest3, State3} = parse_def_end(Rest2, State2), + {PubId, SysId, Rest3, State3}; +%parse_notation_id_1(?STRING_REST(">", Rest), State) -> +% {"", "", Rest, State}; +parse_notation_id_1(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_notation_id_1/2], + "external id or public id expected"). + + +%%====================================================================== +%% Character checks and definitions +%%====================================================================== + +%%---------------------------------------------------------------------- +%% Definitions of the first 256 characters +%% 0 - not classified, +%% 1 - base_char or ideographic, +%% 2 - combining_char or digit or extender, +%% 3 - $. or $- or $_ or $: +%%---------------------------------------------------------------------- +-define(SMALL, {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,0,2,2,2,2,2,2,2,2,2,2,3,0, + 0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,0,0,0,0,3,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,2,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1}). + + +%%---------------------------------------------------------------------- +%% Function : is_name_start(Char) -> Result +%% Parameters: Char = char() +%% Result : true | false +%% Description: Check if character is a valid start of a name. +%% [5] Name ::= (Letter | '_' | ':') (NameChar)* +%%---------------------------------------------------------------------- +is_name_start($_) -> + true; +is_name_start($:) -> + true; +is_name_start(C) -> + is_letter(C). + + +%%---------------------------------------------------------------------- +%% Function : is_name_start(Char) -> Result +%% Parameters: Char = char() +%% Result : true | false +%% Description: Check if character is a valid name character. +%% [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' +%% | CombiningChar | Extender +%%---------------------------------------------------------------------- +is_name_char(C) -> + try element(C, ?SMALL) > 0 + catch _:_ -> + case is_letter(C) of + true -> + true; + false -> + case is_digit(C) of + true -> true; + false -> + case is_combining_char(C) of + true -> true; + false -> + is_extender(C) + end + end + end + end. + + +%%---------------------------------------------------------------------- +%% Function : is_pubid_char(Char) -> Result +%% Parameters: Char = char() +%% Result : true | false +%% Description: Check if character is a public identity character. +%% [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] +%% | [-'()+,./:=?;!*#@$_%] +%%---------------------------------------------------------------------- +is_pubid_char(?space) -> + true; +is_pubid_char(?cr) -> + true; +is_pubid_char(?lf) -> + true; +is_pubid_char($!) -> + true; +is_pubid_char($:) -> + true; +is_pubid_char($;) -> + true; +is_pubid_char($=) -> + true; +is_pubid_char($@) -> + true; +is_pubid_char($_) -> + true; +is_pubid_char(C) when $# =< C, C =< $% -> + true; +is_pubid_char(C) when $' =< C, C =< $/ -> + true; +is_pubid_char(C) -> + case is_letter(C) of + true -> + true; + false -> + is_digit(C) + end. + + +%%---------------------------------------------------------------------- +%% Function : is_letter(Char) -> Result +%% Parameters: Char = char() +%% Result : true | false +%% Description: Check if character is a letter. +%% [84] Letter ::= BaseChar | Ideographic +%%---------------------------------------------------------------------- +is_letter(C) -> + try element(C, ?SMALL) =:= 1 + catch _:_ -> + case is_base_char(C) of + false -> + is_ideographic(C); + true -> + true + end + end. + + +%%---------------------------------------------------------------------- +%% Function : is_letter(Char) -> Result +%% Parameters: Char = char() +%% Result : true | false +%% Description: Check if character is a basic character. +%% [85] BaseChar +%%---------------------------------------------------------------------- +is_base_char(C) when C >= 16#0041, C =< 16#005A -> true; %% ASCII Latin +is_base_char(C) when C >= 16#0061, C =< 16#007A -> true; +is_base_char(C) when C >= 16#00C0, C =< 16#00D6 -> true; %% ISO Latin +is_base_char(C) when C >= 16#00D8, C =< 16#00F6 -> true; +is_base_char(C) when C >= 16#00F8, C =< 16#00FF -> true; +is_base_char(C) when C >= 16#0100, C =< 16#0131 -> true; %% Accented Latin +is_base_char(C) when C >= 16#0134, C =< 16#013E -> true; +is_base_char(C) when C >= 16#0141, C =< 16#0148 -> true; +is_base_char(C) when C >= 16#014A, C =< 16#017E -> true; +is_base_char(C) when C >= 16#0180, C =< 16#01C3 -> true; +is_base_char(C) when C >= 16#01CD, C =< 16#01F0 -> true; +is_base_char(C) when C >= 16#01F4, C =< 16#01F5 -> true; +is_base_char(C) when C >= 16#01FA, C =< 16#0217 -> true; +is_base_char(C) when C >= 16#0250, C =< 16#02A8 -> true; %% IPA +is_base_char(C) when C >= 16#02BB, C =< 16#02C1 -> true; %% Spacing Modifiers +is_base_char(16#0386) -> true; %% Greek +is_base_char(C) when C >= 16#0388, C =< 16#038A -> true; +is_base_char(16#038C) -> true; +is_base_char(C) when C >= 16#038E, C =< 16#03A1 -> true; +is_base_char(C) when C >= 16#03A3, C =< 16#03CE -> true; +is_base_char(C) when C >= 16#03D0, C =< 16#03D6 -> true; +is_base_char(16#03DA) -> true; +is_base_char(16#03DC) -> true; +is_base_char(16#03DE) -> true; +is_base_char(16#03E0) -> true; +is_base_char(C) when C >= 16#03E2, C =< 16#03F3 -> true; +is_base_char(C) when C >= 16#0401, C =< 16#040C -> true; %% Cyrillic +is_base_char(C) when C >= 16#040E, C =< 16#044F -> true; +is_base_char(C) when C >= 16#0451, C =< 16#045C -> true; +is_base_char(C) when C >= 16#045E, C =< 16#0481 -> true; +is_base_char(C) when C >= 16#0490, C =< 16#04C4 -> true; +is_base_char(C) when C >= 16#04C7, C =< 16#04C8 -> true; +is_base_char(C) when C >= 16#04CB, C =< 16#04CC -> true; +is_base_char(C) when C >= 16#04D0, C =< 16#04EB -> true; +is_base_char(C) when C >= 16#04EE, C =< 16#04F5 -> true; +is_base_char(C) when C >= 16#04F8, C =< 16#04F9 -> true; +is_base_char(C) when C >= 16#0531, C =< 16#0556 -> true; %% Armenian +is_base_char(16#0559) -> true; +is_base_char(C) when C >= 16#0561, C =< 16#0586 -> true; +is_base_char(C) when C >= 16#05D0, C =< 16#05EA -> true; %% Hebrew +is_base_char(C) when C >= 16#05F0, C =< 16#05F2 -> true; +is_base_char(C) when C >= 16#0621, C =< 16#063A -> true; %% Arabic +is_base_char(C) when C >= 16#0641, C =< 16#064A -> true; +is_base_char(C) when C >= 16#0671, C =< 16#06B7 -> true; +is_base_char(C) when C >= 16#06BA, C =< 16#06BE -> true; +is_base_char(C) when C >= 16#06C0, C =< 16#06CE -> true; +is_base_char(C) when C >= 16#06D0, C =< 16#06D3 -> true; +is_base_char(16#06D5) -> true; +is_base_char(C) when C >= 16#06E5, C =< 16#06E6 -> true; +is_base_char(C) when C >= 16#0905, C =< 16#0939 -> true; %% Devanagari +is_base_char(16#093D) -> true; +is_base_char(C) when C >= 16#0958, C =< 16#0961 -> true; +is_base_char(C) when C >= 16#0985, C =< 16#098C -> true; %% Bengali +is_base_char(C) when C >= 16#098F, C =< 16#0990 -> true; +is_base_char(C) when C >= 16#0993, C =< 16#09A8 -> true; +is_base_char(C) when C >= 16#09AA, C =< 16#09B0 -> true; +is_base_char(16#09B2) -> true; +is_base_char(C) when C >= 16#09B6, C =< 16#09B9 -> true; +is_base_char(C) when C >= 16#09DC, C =< 16#09DD -> true; +is_base_char(C) when C >= 16#09DF, C =< 16#09E1 -> true; +is_base_char(C) when C >= 16#09F0, C =< 16#09F1 -> true; +is_base_char(C) when C >= 16#0A05, C =< 16#0A0A -> true; %% Gurmukhi +is_base_char(C) when C >= 16#0A0F, C =< 16#0A10 -> true; +is_base_char(C) when C >= 16#0A13, C =< 16#0A28 -> true; +is_base_char(C) when C >= 16#0A2A, C =< 16#0A30 -> true; +is_base_char(C) when C >= 16#0A32, C =< 16#0A33 -> true; +is_base_char(C) when C >= 16#0A35, C =< 16#0A36 -> true; +is_base_char(C) when C >= 16#0A38, C =< 16#0A39 -> true; +is_base_char(C) when C >= 16#0A59, C =< 16#0A5C -> true; +is_base_char(16#0A5E) -> true; +is_base_char(C) when C >= 16#0A72, C =< 16#0A74 -> true; +is_base_char(C) when C >= 16#0A85, C =< 16#0A8B -> true; %% Gujarati +is_base_char(16#0A8D) -> true; +is_base_char(C) when C >= 16#0A8F, C =< 16#0A91 -> true; +is_base_char(C) when C >= 16#0A93, C =< 16#0AA8 -> true; +is_base_char(C) when C >= 16#0AAA, C =< 16#0AB0 -> true; +is_base_char(C) when C >= 16#0AB2, C =< 16#0AB3 -> true; +is_base_char(C) when C >= 16#0AB5, C =< 16#0AB9 -> true; +is_base_char(16#0ABD) -> true; +is_base_char(16#0AE0) -> true; +is_base_char(C) when C >= 16#0B05, C =< 16#0B0C -> true; %% Oriya +is_base_char(C) when C >= 16#0B0F, C =< 16#0B10 -> true; +is_base_char(C) when C >= 16#0B13, C =< 16#0B28 -> true; +is_base_char(C) when C >= 16#0B2A, C =< 16#0B30 -> true; +is_base_char(C) when C >= 16#0B32, C =< 16#0B33 -> true; +is_base_char(C) when C >= 16#0B36, C =< 16#0B39 -> true; +is_base_char(16#0B3D) -> true; +is_base_char(C) when C >= 16#0B5C, C =< 16#0B5D -> true; +is_base_char(C) when C >= 16#0B5F, C =< 16#0B61 -> true; +is_base_char(C) when C >= 16#0B85, C =< 16#0B8A -> true; %% Tamil +is_base_char(C) when C >= 16#0B8E, C =< 16#0B90 -> true; +is_base_char(C) when C >= 16#0B92, C =< 16#0B95 -> true; +is_base_char(C) when C >= 16#0B99, C =< 16#0B9A -> true; +is_base_char(16#0B9C) -> true; +is_base_char(C) when C >= 16#0B9E, C =< 16#0B9F -> true; +is_base_char(C) when C >= 16#0BA3, C =< 16#0BA4 -> true; +is_base_char(C) when C >= 16#0BA8, C =< 16#0BAA -> true; +is_base_char(C) when C >= 16#0BAE, C =< 16#0BB5 -> true; +is_base_char(C) when C >= 16#0BB7, C =< 16#0BB9 -> true; +is_base_char(C) when C >= 16#0C05, C =< 16#0C0C -> true; %% Telugu +is_base_char(C) when C >= 16#0C0E, C =< 16#0C10 -> true; +is_base_char(C) when C >= 16#0C12, C =< 16#0C28 -> true; +is_base_char(C) when C >= 16#0C2A, C =< 16#0C33 -> true; +is_base_char(C) when C >= 16#0C35, C =< 16#0C39 -> true; +is_base_char(C) when C >= 16#0C60, C =< 16#0C61 -> true; +is_base_char(C) when C >= 16#0C85, C =< 16#0C8C -> true; %% Kannada +is_base_char(C) when C >= 16#0C8E, C =< 16#0C90 -> true; +is_base_char(C) when C >= 16#0C92, C =< 16#0CA8 -> true; +is_base_char(C) when C >= 16#0CAA, C =< 16#0CB3 -> true; +is_base_char(C) when C >= 16#0CB5, C =< 16#0CB9 -> true; +is_base_char(16#0CDE) -> true; +is_base_char(C) when C >= 16#0CE0, C =< 16#0CE1 -> true; +is_base_char(C) when C >= 16#0D05, C =< 16#0D0C -> true; %% Malayalam +is_base_char(C) when C >= 16#0D0E, C =< 16#0D10 -> true; +is_base_char(C) when C >= 16#0D12, C =< 16#0D28 -> true; +is_base_char(C) when C >= 16#0D2A, C =< 16#0D39 -> true; +is_base_char(C) when C >= 16#0D60, C =< 16#0D61 -> true; +is_base_char(C) when C >= 16#0E01, C =< 16#0E2E -> true; %% Thai +is_base_char(16#0E30) -> true; +is_base_char(C) when C >= 16#0E32, C =< 16#0E33 -> true; +is_base_char(C) when C >= 16#0E40, C =< 16#0E45 -> true; +is_base_char(C) when C >= 16#0E81, C =< 16#0E82 -> true; %% Lao +is_base_char(16#0E84) -> true; +is_base_char(C) when C >= 16#0E87, C =< 16#0E88 -> true; +is_base_char(16#0E8A) -> true; +is_base_char(16#0E8D) -> true; +is_base_char(C) when C >= 16#0E94, C =< 16#0E97 -> true; +is_base_char(C) when C >= 16#0E99, C =< 16#0E9F -> true; +is_base_char(C) when C >= 16#0EA1, C =< 16#0EA3 -> true; +is_base_char(16#0EA5) -> true; +is_base_char(16#0EA7) -> true; +is_base_char(C) when C >= 16#0EAA, C =< 16#0EAB -> true; +is_base_char(C) when C >= 16#0EAD, C =< 16#0EAE -> true; +is_base_char(16#0EB0) -> true; +is_base_char(C) when C >= 16#0EB2, C =< 16#0EB3 -> true; +is_base_char(16#0EBD) -> true; +is_base_char(C) when C >= 16#0EC0, C =< 16#0EC4 -> true; +is_base_char(C) when C >= 16#0F40, C =< 16#0F47 -> true; %% Tibetan +is_base_char(C) when C >= 16#0F49, C =< 16#0F69 -> true; +is_base_char(C) when C >= 16#10A0, C =< 16#10C5 -> true; %% Hangul Jamo +is_base_char(C) when C >= 16#10D0, C =< 16#10F6 -> true; +is_base_char(16#1100) -> true; +is_base_char(C) when C >= 16#1102, C =< 16#1103 -> true; +is_base_char(C) when C >= 16#1105, C =< 16#1107 -> true; +is_base_char(16#1109) -> true; +is_base_char(C) when C >= 16#110B, C =< 16#110C -> true; +is_base_char(C) when C >= 16#110E, C =< 16#1112 -> true; +is_base_char(16#113C) -> true; +is_base_char(16#113E) -> true; +is_base_char(16#1140) -> true; +is_base_char(16#114C) -> true; +is_base_char(16#114E) -> true; +is_base_char(16#1150) -> true; +is_base_char(C) when C >= 16#1154, C =< 16#1155 -> true; +is_base_char(16#1159) -> true; +is_base_char(C) when C >= 16#115F, C =< 16#1161 -> true; +is_base_char(16#1163) -> true; +is_base_char(16#1165) -> true; +is_base_char(16#1167) -> true; +is_base_char(16#1169) -> true; +is_base_char(C) when C >= 16#116D, C =< 16#116E -> true; +is_base_char(C) when C >= 16#1172, C =< 16#1173 -> true; +is_base_char(16#1175) -> true; +is_base_char(16#119E) -> true; +is_base_char(16#11A8) -> true; +is_base_char(16#11AB) -> true; +is_base_char(C) when C >= 16#11AE, C =< 16#11AF -> true; +is_base_char(C) when C >= 16#11B7, C =< 16#11B8 -> true; +is_base_char(16#11BA) -> true; +is_base_char(C) when C >= 16#11BC, C =< 16#11C2 -> true; +is_base_char(16#11EB) -> true; +is_base_char(16#11F0) -> true; +is_base_char(16#11F9) -> true; +is_base_char(C) when C >= 16#1E00, C =< 16#1E9B -> true; %% Latin Extended Additional +is_base_char(C) when C >= 16#1EA0, C =< 16#1EF9 -> true; +is_base_char(C) when C >= 16#1F00, C =< 16#1F15 -> true; %% Greek Extended +is_base_char(C) when C >= 16#1F18, C =< 16#1F1D -> true; +is_base_char(C) when C >= 16#1F20, C =< 16#1F45 -> true; +is_base_char(C) when C >= 16#1F48, C =< 16#1F4D -> true; +is_base_char(C) when C >= 16#1F50, C =< 16#1F57 -> true; +is_base_char(16#1F59) -> true; +is_base_char(16#1F5B) -> true; +is_base_char(16#1F5D) -> true; +is_base_char(C) when C >= 16#1F5F, C =< 16#1F7D -> true; +is_base_char(C) when C >= 16#1F80, C =< 16#1FB4 -> true; +is_base_char(C) when C >= 16#1FB6, C =< 16#1FBC -> true; +is_base_char(16#1FBE) -> true; +is_base_char(C) when C >= 16#1FC2, C =< 16#1FC4 -> true; +is_base_char(C) when C >= 16#1FC6, C =< 16#1FCC -> true; +is_base_char(C) when C >= 16#1FD0, C =< 16#1FD3 -> true; +is_base_char(C) when C >= 16#1FD6, C =< 16#1FDB -> true; +is_base_char(C) when C >= 16#1FE0, C =< 16#1FEC -> true; +is_base_char(C) when C >= 16#1FF2, C =< 16#1FF4 -> true; +is_base_char(C) when C >= 16#1FF6, C =< 16#1FFC -> true; +is_base_char(16#2126) -> true; %% Letterlike Symbols +is_base_char(C) when C >= 16#212A, C =< 16#212B -> true; +is_base_char(16#212E) -> true; +is_base_char(C) when C >= 16#2180, C =< 16#2182 -> true; %% Number Forms +is_base_char(C) when C >= 16#3041, C =< 16#3094 -> true; %% Hiragana +is_base_char(C) when C >= 16#30A1, C =< 16#30FA -> true; %% Katakana +is_base_char(C) when C >= 16#3105, C =< 16#312C -> true; %% Bopomofo +is_base_char(C) when C >= 16#ac00, C =< 16#d7a3 -> true; %% Hangul Syllables +is_base_char(_) -> + false. + +%%---------------------------------------------------------------------- +%% Function : is_ideographic(Char) -> Result +%% Parameters: Char = char() +%% Result : true | false +%% Description: Check if character is an ideographic letter. +%% [86] Ideographic +%%---------------------------------------------------------------------- +is_ideographic(C) when C >= 16#4e00, C =< 16#9fa5 -> true; %% Unified CJK Ideographs +is_ideographic(16#3007) -> true; %% CJK Symbols and Punctuation +is_ideographic(C) when C >= 16#3021, C =< 16#3029 -> true; +is_ideographic(_) -> + false. + +%%---------------------------------------------------------------------- +%% Function : is_ideographic(Char) -> Result +%% Parameters: Char = char() +%% Result : true | false +%% Description: Check if character is a combining character. +%% [87] CombiningChar +%%---------------------------------------------------------------------- +is_combining_char(C) when C >= 16#0300, C =< 16#0345 -> true; %% Combining Diacritics +is_combining_char(C) when C >= 16#0360, C =< 16#0361 -> true; +is_combining_char(C) when C >= 16#0483, C =< 16#0486 -> true; %% Cyrillic Combining Diacritics +is_combining_char(C) when C >= 16#0591, C =< 16#05a1 -> true; %% Hebrew Combining Diacritics +is_combining_char(C) when C >= 16#05a3, C =< 16#05b9 -> true; +is_combining_char(C) when C >= 16#05bb, C =< 16#05bd -> true; +is_combining_char(16#05bf) -> true; +is_combining_char(C) when C >= 16#05c1, C =< 16#05c2 -> true; +is_combining_char(16#05c4) -> true; +is_combining_char(C) when C >= 16#064b, C =< 16#0652 -> true; %% Arabic Combining Diacritics +is_combining_char(16#0670) -> true; +is_combining_char(C) when C >= 16#06d6, C =< 16#06dc -> true; +is_combining_char(C) when C >= 16#06dd, C =< 16#06df -> true; +is_combining_char(C) when C >= 16#06e0, C =< 16#06e4 -> true; +is_combining_char(C) when C >= 16#06e7, C =< 16#06e8 -> true; +is_combining_char(C) when C >= 16#06ea, C =< 16#06ed -> true; +is_combining_char(C) when C >= 16#0901, C =< 16#0903 -> true; %% Devanagari Combining Diacritics +is_combining_char(16#093c) -> true; +is_combining_char(C) when C >= 16#093e, C =< 16#094c -> true; +is_combining_char(16#094d) -> true; +is_combining_char(C) when C >= 16#0951, C =< 16#0954 -> true; +is_combining_char(C) when C >= 16#0962, C =< 16#0963 -> true; +is_combining_char(C) when C >= 16#0981, C =< 16#0983 -> true; %% Bengali Combining Diacritics +is_combining_char(16#09bc) -> true; +is_combining_char(16#09be) -> true; +is_combining_char(16#09bf) -> true; +is_combining_char(C) when C >= 16#09c0, C =< 16#09c4 -> true; +is_combining_char(C) when C >= 16#09c7, C =< 16#09c8 -> true; +is_combining_char(C) when C >= 16#09cb, C =< 16#09cd -> true; +is_combining_char(16#09d7) -> true; +is_combining_char(C) when C >= 16#09e2, C =< 16#09e3 -> true; +is_combining_char(16#0a02) -> true; %% Gurmukhi Combining Diacritics +is_combining_char(16#0a3c) -> true; +is_combining_char(16#0a3e) -> true; +is_combining_char(16#0a3f) -> true; +is_combining_char(C) when C >= 16#0a40, C =< 16#0a42 -> true; +is_combining_char(C) when C >= 16#0a47, C =< 16#0a48 -> true; +is_combining_char(C) when C >= 16#0a4b, C =< 16#0a4d -> true; +is_combining_char(C) when C >= 16#0a70, C =< 16#0a71 -> true; +is_combining_char(C) when C >= 16#0a81, C =< 16#0a83 -> true; %% Gujarati Combining Diacritics +is_combining_char(16#0abc) -> true; +is_combining_char(C) when C >= 16#0abe, C =< 16#0ac5 -> true; +is_combining_char(C) when C >= 16#0ac7, C =< 16#0ac9 -> true; +is_combining_char(C) when C >= 16#0acb, C =< 16#0acd -> true; +is_combining_char(C) when C >= 16#0b01, C =< 16#0b03 -> true; %% Oriya Combining Diacritics +is_combining_char(16#0b3c) -> true; +is_combining_char(C) when C >= 16#0b3e, C =< 16#0b43 -> true; +is_combining_char(C) when C >= 16#0b47, C =< 16#0b48 -> true; +is_combining_char(C) when C >= 16#0b4b, C =< 16#0b4d -> true; +is_combining_char(C) when C >= 16#0b56, C =< 16#0b57 -> true; +is_combining_char(C) when C >= 16#0b82, C =< 16#0b83 -> true; %% Tamil Combining Diacritics +is_combining_char(C) when C >= 16#0bbe, C =< 16#0bc2 -> true; +is_combining_char(C) when C >= 16#0bc6, C =< 16#0bc8 -> true; +is_combining_char(C) when C >= 16#0bca, C =< 16#0bcd -> true; +is_combining_char(16#0bd7) -> true; +is_combining_char(C) when C >= 16#0c01, C =< 16#0c03 -> true; %% Telugu Combining Diacritics +is_combining_char(C) when C >= 16#0c3e, C =< 16#0c44 -> true; +is_combining_char(C) when C >= 16#0c46, C =< 16#0c48 -> true; +is_combining_char(C) when C >= 16#0c4a, C =< 16#0c4d -> true; +is_combining_char(C) when C >= 16#0c55, C =< 16#0c56 -> true; +is_combining_char(C) when C >= 16#0c82, C =< 16#0c83 -> true; %% Kannada Combining Diacritics +is_combining_char(C) when C >= 16#0cbe, C =< 16#0cc4 -> true; +is_combining_char(C) when C >= 16#0cc6, C =< 16#0cc8 -> true; +is_combining_char(C) when C >= 16#0cca, C =< 16#0ccd -> true; +is_combining_char(C) when C >= 16#0cd5, C =< 16#0cd6 -> true; +is_combining_char(C) when C >= 16#0d02, C =< 16#0d03 -> true; %% Malayalam Combining Diacritics +is_combining_char(C) when C >= 16#0d3e, C =< 16#0d43 -> true; +is_combining_char(C) when C >= 16#0d46, C =< 16#0d48 -> true; +is_combining_char(C) when C >= 16#0d4a, C =< 16#0d4d -> true; +is_combining_char(16#0d57) -> true; +is_combining_char(16#0e31) -> true; %% Thai Combining Diacritics +is_combining_char(C) when C >= 16#0e34, C =< 16#0e3a -> true; +is_combining_char(C) when C >= 16#0e47, C =< 16#0e4e -> true; +is_combining_char(16#0eb1) -> true; %% Lao Combining Diacritics +is_combining_char(C) when C >= 16#0eb4, C =< 16#0eb9 -> true; +is_combining_char(C) when C >= 16#0ebb, C =< 16#0ebc -> true; +is_combining_char(C) when C >= 16#0ec8, C =< 16#0ecd -> true; +is_combining_char(C) when C >= 16#0f18, C =< 16#0f19 -> true; %% Tibetan Combining Diacritics +is_combining_char(16#0f35) -> true; +is_combining_char(16#0f37) -> true; +is_combining_char(16#0f39) -> true; +is_combining_char(16#0f3e) -> true; +is_combining_char(16#0f3f) -> true; +is_combining_char(C) when C >= 16#0f71, C =< 16#0f84 -> true; +is_combining_char(C) when C >= 16#0f86, C =< 16#0f8b -> true; +is_combining_char(C) when C >= 16#0f90, C =< 16#0f95 -> true; +is_combining_char(16#0f97) -> true; +is_combining_char(C) when C >= 16#0f99, C =< 16#0fad -> true; +is_combining_char(C) when C >= 16#0fb1, C =< 16#0fb7 -> true; +is_combining_char(16#0fb9) -> true; +is_combining_char(C) when C >= 16#20d0, C =< 16#20dc -> true; %% Math/Technical Combining Diacritics +is_combining_char(16#20e1) -> true; +is_combining_char(C) when C >= 16#302a, C =< 16#302f -> true; %% Ideographic Diacritics +is_combining_char(16#3099) -> true; %% Hiragana/Katakana Combining Diacritics +is_combining_char(16#309a) -> true; +is_combining_char(_) -> false. + + +%%---------------------------------------------------------------------- +%% Function : is_digit(Char) -> Result +%% Parameters: Char = char() +%% Result : true | false +%% Description: Check if character is a digit. +%% [88] Digit +%%---------------------------------------------------------------------- +is_digit(C) when C >= 16#0030, C =< 16#0039 -> true; %% Basic ASCII digits 0-9 +is_digit(C) when C >= 16#0660, C =< 16#0669 -> true; %% Arabic Digits 0-9 +is_digit(C) when C >= 16#06F0, C =< 16#06F9 -> true; %% Eastern Arabic-Indic Digits 0-9 +is_digit(C) when C >= 16#0966, C =< 16#096f -> true; %% Devanagari Digits 0-9 +is_digit(C) when C >= 16#09e6, C =< 16#09ef -> true; %% Bengali Digits 0-9 +is_digit(C) when C >= 16#0a66, C =< 16#0a6f -> true; %% Gurmukhi Digits 0-9 +is_digit(C) when C >= 16#0ae6, C =< 16#0aef -> true; %% Gujarati Digits 0-9 +is_digit(C) when C >= 16#0b66, C =< 16#0b6f -> true; %% Oriya Digits 0-9 +is_digit(C) when C >= 16#0be7, C =< 16#0bef -> true; %% Tamil Digits 0-9 +is_digit(C) when C >= 16#0c66, C =< 16#0c6f -> true; %% Telugu Digits 0-9 +is_digit(C) when C >= 16#0ce6, C =< 16#0cef -> true; %% Kannada Digits 0-9 +is_digit(C) when C >= 16#0d66, C =< 16#0d6f -> true; %% Malayalam Digits 0-9 +is_digit(C) when C >= 16#0e50, C =< 16#0e59 -> true; %% Thai Digits 0-9 +is_digit(C) when C >= 16#0ed0, C =< 16#0ed9 -> true; %% Lao Digits 0-9 +is_digit(C) when C >= 16#0f20, C =< 16#0f29 -> true; %% Tibetan Digits 0-9 +is_digit(_) -> false. + + +%%---------------------------------------------------------------------- +%% Function : is_extender(Char) -> Result +%% Parameters: Char = char() +%% Result : true | false +%% Description: Check if character is an extender character. +%% [89] Extender +%%---------------------------------------------------------------------- +is_extender(16#00b7) -> true; %% Middle Dot +is_extender(16#02d0) -> true; %% Triangular Colon and Half Colon +is_extender(16#02d1) -> true; +is_extender(16#0387) -> true; %% Greek Ano Teleia +is_extender(16#0640) -> true; %% Arabic Tatweel +is_extender(16#0e46) -> true; %% Thai Maiyamok +is_extender(16#0ec6) -> true; %% Lao Ko La +is_extender(16#3005) -> true; %% Ideographic Iteration Mark +is_extender(C) when C >= 16#3031, C =< 16#3035 -> true; %% Japanese Kana Repetition Marks +is_extender(C) when C >= 16#309d, C =< 16#309e -> true; %% Japanese Hiragana Iteration Marks +is_extender(C) when C >= 16#30fc, C =< 16#30fe -> true; %% Japanese Kana Iteration Marks +is_extender(_) -> false. + + + +%%====================================================================== +%% Callback and Continuation function handling +%%====================================================================== +%%---------------------------------------------------------------------- +%% Function : event_callback(Event, State) -> Result +%% Parameters: Event = term() +%% State = #xmerl_sax_parser_state{} +%% Result : #xmerl_sax_parser_state{} +%% Description: Function that uses provided fun to send parser events. +%%---------------------------------------------------------------------- +event_callback(Event, + #xmerl_sax_parser_state{ + event_fun=CbFun, + event_state=EventState, + line_no=N, + entity=E, + current_location=L + } = State) -> + try + NewEventState = CbFun(Event, {L, E, N}, EventState), + State#xmerl_sax_parser_state{event_state=NewEventState} + catch + throw:ErrorTerm -> + throw({event_receiver_error, State, ErrorTerm}); + exit:Reason -> + throw({event_receiver_error, State, {'EXIT', Reason}}) + end. + +%%---------------------------------------------------------------------- +%% Function : cf(Rest, State, NextCall) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% NextCall = fun() +%% Result : {Rest, State} +%% Description: Function that uses provided fun to read another chunk from +%% input stream and calls the fun in NextCall. +%%---------------------------------------------------------------------- +cf(_Rest, #xmerl_sax_parser_state{continuation_fun = undefined} = State, _) -> + ?fatal_error(State, "Continuation function undefined"); +cf(Rest, #xmerl_sax_parser_state{continuation_fun = CFun, continuation_state = CState} = State, + NextCall) -> + Result = + try + CFun(CState) + catch + throw:ErrorTerm -> + ?fatal_error(State, ErrorTerm); + exit:Reason -> + ?fatal_error(State, {'EXIT', Reason}) + end, + case Result of + {?STRING_EMPTY, _} -> + ?fatal_error(State, "No more bytes"); + {NewBytes, NewContState} -> + NextCall(?APPEND_STRING(Rest, NewBytes), + State#xmerl_sax_parser_state{continuation_state = NewContState}) + end. + +%%---------------------------------------------------------------------- +%% Function : cf(Rest, State, NextCall, P) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% NextCall = fun() +%% P = term() +%% Result : {Rest, State} +%% Description: Function that uses provided fun to read another chunk from +%% input stream and calls the fun in NextCall with P as last parameter. +%%---------------------------------------------------------------------- +cf(_Rest, #xmerl_sax_parser_state{continuation_fun = undefined} = State, _P, _) -> + ?fatal_error(State, "Continuation function undefined"); +cf(Rest, #xmerl_sax_parser_state{continuation_fun = CFun, continuation_state = CState} = State, + P, NextCall) -> + Result = + try + CFun(CState) + catch + throw:ErrorTerm -> + ?fatal_error(State, ErrorTerm); + exit:Reason -> + ?fatal_error(State, {'EXIT', Reason}) + end, + case Result of + {?STRING_EMPTY, _} -> + ?fatal_error(State, "No more bytes"); + {NewBytes, NewContState} -> + NextCall(?APPEND_STRING(Rest, NewBytes), + State#xmerl_sax_parser_state{continuation_state = NewContState}, + P) + end. + + +%%---------------------------------------------------------------------- +%% Function : cf(Rest, State, P1, P2, NextCall) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% NextCall = fun() +%% P1 = term() +%% P2 = term() +%% Result : {Rest, State} +%% Description: Function that uses provided fun to read another chunk from +%% input stream and calls the fun in NextCall with P1 and +%% P2 as last parameters. +%%---------------------------------------------------------------------- +cf(_Rest, #xmerl_sax_parser_state{continuation_fun = undefined} = State, _P1, _P2, _) -> + ?fatal_error(State, "Continuation function undefined"); +cf(Rest, #xmerl_sax_parser_state{continuation_fun = CFun, continuation_state = CState} = State, + P1, P2, NextCall) -> + Result = + try + CFun(CState) + catch + throw:ErrorTerm -> + ?fatal_error(State, ErrorTerm); + exit:Reason -> + ?fatal_error(State, {'EXIT', Reason}) + end, + case Result of + {?STRING_EMPTY, _} -> + ?fatal_error(State, "No more bytes"); + {NewBytes, NewContState} -> + NextCall(?APPEND_STRING(Rest, NewBytes), + State#xmerl_sax_parser_state{continuation_state = NewContState}, + P1, P2) + end. + + + +%%---------------------------------------------------------------------- +%% Function : unicode_incomplete_check(Args, ErrString) -> Result +%% Parameters: Args = [Bytes, State | RestOfArgs] +%% Bytes = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% RestOfArgs = +%% ErrString = string() +%% Result : {Rest, State} +%% Description: +%%---------------------------------------------------------------------- +unicode_incomplete_check([Bytes, #xmerl_sax_parser_state{encoding=Enc} = State | _] = Args, ErrString) when is_binary(Bytes) -> + case unicode:characters_to_list(Bytes, Enc) of + {incomplete, _, _} -> + apply(?MODULE, cf, Args); + {error, _Encoded, _Rest} -> + ?fatal_error(State, lists:flatten(io_lib:format("Bad character, not in ~p\n", [Enc]))); + _ when ErrString =/= undefined -> + ?fatal_error(State, ErrString) + end; +unicode_incomplete_check([Bytes,State | _], ErrString) when is_list(Bytes), ErrString =/= undefined -> + ?fatal_error(State, ErrString). + + +%%---------------------------------------------------------------------- +%% Function : check_uri(Uri, CL) -> Result +%% Parameters: Uri = string() +%% CL = string() +%% Result : {atom(), string()} +%% Description: +%%---------------------------------------------------------------------- +check_uri("http://" ++ _ = Url, _CL) -> + {http, Url}; +check_uri("file://" ++ Path, _CL) -> + {file, Path}; +check_uri(Path, CL) -> % ordinary filepath other URI's not supported yet + %% "file://" already removed when current_location set + Tag = get_uri_tag(CL), + case filename:pathtype(Path) of + relative -> + case Tag of + false -> + {file, filename:join(CL, Path)}; + T -> + {T, CL ++ "/" ++ Path} + end; + absolute -> + case Tag of + false -> + {file, filename:absname(Path)}; + T -> + {T, CL ++ "/" ++ Path} + end; + volumerelative -> % only windows + case Tag of + false -> + [Vol | _] = re:split(CL, ":", [{return,list}]), + {file, filename:join(Vol ++ ":", Path)}; + T -> + {T, CL ++ "/" ++ Path} + end + end. + +%%---------------------------------------------------------------------- +%% Function : get_uri_tag(Uri) -> Result +%% Parameters: Uri = string() +%% Result : true |false +%% Description: http / file is the only supported URI for the moment +%%---------------------------------------------------------------------- +get_uri_tag(Uri) -> + case re:split(Uri, "://", [{return,list}]) of + [Tag, _] -> + list_to_atom(Tag); + [_] -> + false + end. + +%%---------------------------------------------------------------------- +%% Function : http_get_file(Host, Port, Key) -> Result +%% Parameters: Host = string() +%% Port = integer() +%% Key = string() +%% Result : string() +%% Description: +%%---------------------------------------------------------------------- +http_get_file(Host, Port, Key) -> + ConnectTimeOut = 10000, + SendTimeout = 10000, + FilenameTempl = filename:basename(Key), + + {Filename, FD} = create_tempfile(FilenameTempl), + Socket = create_connection(Host, Port, ConnectTimeOut), + Request = "GET " ++ Key ++ " HTTP/1.0\r\n\r\n", + + case gen_tcp:send(Socket, Request) of + ok -> + try + receive_msg(Socket, FD, true, SendTimeout) + catch + throw:{error, Error} -> + file:close(FD), + file:delete(Filename), + throw({error, Error}) + end; + {error, _Reason} -> + file:close(FD), + file:delete(Filename), + throw({error, lists:flatten(io_lib:format("Couldn't fetch http://~s:~p/~s", + [Host, Port, Key]))}) + end, + file:close(FD), + Filename. + +%%---------------------------------------------------------------------- +%% Function : receive_msg(Socket, FD, WaitForHeader, Timeout) -> Result +%% Parameters: Socket = io_device() +%% FD = io_device() +%% WaitForHeader = boolean() +%% Timeout = integer() +%% Result : ok +%% Description: +%%---------------------------------------------------------------------- +receive_msg(Socket, FD, WaitForHeader, Timeout) -> + receive + {tcp_closed, Socket} -> + ok; + {tcp, Socket, Response} when WaitForHeader == false -> + file:write(FD, Response), + receive_msg(Socket, FD, WaitForHeader, Timeout); + {tcp, Socket, Response} -> + MsgBody = remove_header(Response), + file:write(FD, MsgBody), + receive_msg(Socket, FD, false, Timeout); + {tcp_error, Socket, _Reason} -> + gen_tcp:close(Socket), + throw({error, "http connection failed"}) + after Timeout -> + gen_tcp:close(Socket), + throw({error, "http connection timedout"}) + end. + + +remove_header(<<"\r\n\r\n", MsgBody/binary>>) -> + MsgBody; +remove_header(<<_C, Rest/binary>>) -> + remove_header(Rest). + +%%---------------------------------------------------------------------- +%% Function : create_connection(Host, Port, Timeout) -> Result +%% Parameters: Host = string() +%% Port = integer() +%% Timeout = integer() +%% Result : io_device() +%% Description: +%%---------------------------------------------------------------------- +create_connection(Host, Port, Timeout) -> + case gen_tcp:connect(Host, Port,[{packet,0}, binary, {reuseaddr,true}], Timeout) of + {ok,Socket} -> + Socket; + {error, Reason} -> + throw({error, lists:flatten(io_lib:format("Can't connect to ~s:~p ~p\n", + [Host, Port, Reason]))}) + end. + +%%---------------------------------------------------------------------- +%% Function : http(Url) -> Result +%% Parameters: Url = string() +%% Result : {Host, PortInt, Key} +%% Description: +%%---------------------------------------------------------------------- +http("http://" ++ Address) -> + case string:tokens(Address, ":") of + [Host, Rest] -> + %% At his stage we know that address contains a Port number. + {Port, Key} = split_to_slash(Rest, []), + case catch list_to_integer(Port) of + PortInt when is_integer(PortInt) -> + {Host, PortInt, Key}; + _ -> + throw({error, "Malformed key; port not an integer, should be http://Host:Port/path or http://Host/path"}) + end; + [Address] -> + %% Use default port + {Host, Key} = split_to_slash(Address, []), + {Host, ?HTTP_DEF_PORT, Key}; + _What -> + throw({error, "Malformed key; should be http://Host:Port/path or http://Host/path"}) + end. + +%%---------------------------------------------------------------------- +%% Function : split_to_slash(String, Acc) -> Result +%% Parameters: String = string() +%% Acc = string() +%% Result : {string(), string()} +%% Description: +%%---------------------------------------------------------------------- +split_to_slash([], _Acc) -> + throw({error, "No Key given Host:Port/Key"}); +split_to_slash([$/|Rest], Acc) -> + {lists:reverse(Acc), [$/|Rest]}; +split_to_slash([H|T], Acc) -> + split_to_slash(T, [H|Acc]). + + +%%---------------------------------------------------------------------- +%% Function : create_tempfile(Template) -> Result +%% Parameters: Template = string() +%% Result : string() +%% Description: +%%---------------------------------------------------------------------- +create_tempfile(Template) -> + TmpDir = + case os:type() of + {unix, _} -> + case file:read_file_info("/tmp") of + {ok, _} -> + "/tmp"; + {error,enoent} -> + throw({error, "/tmp doesn't exist"}) + end; + {win32, _} -> + case os:getenv("TMP") of + false -> + case os:getenv("TEMP") of + false -> + throw({error, "Variabel TMP or TEMP doesn't exist"}); + P2 -> + P2 + end; + P1 -> + P1 + end + end, + TmpNameBase = filename:join([TmpDir, os:getpid() ++ Template ++ "."]), + create_tempfile_1(TmpNameBase, 1). + +create_tempfile_1(TmpNameBase, N) -> + FileName = TmpNameBase ++ integer_to_list(N), + case file:open(FileName, [write, binary]) of + {error, _Reason} -> + create_tempfile_1(TmpNameBase, N+1); + {ok, FD} -> + {FileName, FD} + end. + + +%%---------------------------------------------------------------------- +%% Function : filter_endtag_stack(EndTagStack) -> Result +%% Parameters: EndTagStack = [{term(), string(), string(), +%% term(), nslist(), nslist()}] +%% Result : [string()] +%% Description: Returns a stack with just local names. +%%---------------------------------------------------------------------- +filter_endtag_stack(EndTagStack) -> + filter_endtag_stack(EndTagStack,[]). + +filter_endtag_stack([], Acc) -> + lists:reverse(Acc); +filter_endtag_stack([{_,_,N,_,_,_}| Ts], Acc) -> + filter_endtag_stack(Ts, [N |Acc]). + + +%%---------------------------------------------------------------------- +%% Function : format_error(Tag, State, Reason) -> Result +%% Parameters: Tag = atom(), +%% State = xmerl_sax_parser_state() +%% Reason = string() +%% Result : {atom(), {string(), string(), integer()}, string(), [string()], event_state()} +%% Description: Format the resulting error tuple +%%---------------------------------------------------------------------- +format_error(Tag, State, Reason) -> + {Tag, + { + State#xmerl_sax_parser_state.current_location, + State#xmerl_sax_parser_state.entity, + State#xmerl_sax_parser_state.line_no + }, + Reason, + filter_endtag_stack(State#xmerl_sax_parser_state.end_tags), + State#xmerl_sax_parser_state.event_state}. + diff --git a/lib/xmerl/src/xmerl_sax_parser_latin1.erlsrc b/lib/xmerl/src/xmerl_sax_parser_latin1.erlsrc new file mode 100644 index 0000000000..f27758069d --- /dev/null +++ b/lib/xmerl/src/xmerl_sax_parser_latin1.erlsrc @@ -0,0 +1,40 @@ +%%-*-erlang-*- +%%-------------------------------------------------------------------- +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2008-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%%---------------------------------------------------------------------- +%% File : xmerl_sax_parser_latin1.erl +%% Description : +%% +%% Created : 26 May 2008 +%%---------------------------------------------------------------------- +-module(xmerl_sax_parser_latin1). + +%%---------------------------------------------------------------------- +%% Macros +%%---------------------------------------------------------------------- +-define(STRING_EMPTY, <<>>). +-define(STRING(MatchStr), <<MatchStr>>). +-define(STRING_REST(MatchStr, Rest), <<MatchStr, Rest/binary>>). +-define(APPEND_STRING(Rest, New), <<Rest/binary, New/binary>>). +-define(TO_INPUT_FORMAT(Val), unicode:characters_to_binary(Val, unicode, latin1)). + +%% STRING_REST and STRING_UNBOUND_REST is only different in the list case +-define(STRING_UNBOUND_REST(MatchChar, Rest), <<MatchChar, Rest/binary>>). +-define(BYTE_ORDER_MARK_1, undefined_bom1). +-define(BYTE_ORDER_MARK_2, undefined_bom2). +-define(BYTE_ORDER_MARK_REST(Rest), <<undefined, Rest/binary>>). diff --git a/lib/xmerl/src/xmerl_sax_parser_list.erlsrc b/lib/xmerl/src/xmerl_sax_parser_list.erlsrc new file mode 100644 index 0000000000..26fec41cb9 --- /dev/null +++ b/lib/xmerl/src/xmerl_sax_parser_list.erlsrc @@ -0,0 +1,40 @@ +%%-*-erlang-*- +%%-------------------------------------------------------------------- +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2008-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%%---------------------------------------------------------------------- +%% File : xmerl_sax_parser_list.erl +%% Description : +%% +%% Created : 25 Apr 2008 +%%---------------------------------------------------------------------- +-module(xmerl_sax_parser_list). + +%%---------------------------------------------------------------------- +%% Macros +%%---------------------------------------------------------------------- +-define(STRING_EMPTY, []). +-define(STRING(MatchStr), MatchStr). +-define(STRING_REST(MatchStr, Rest), MatchStr ++ Rest). +-define(APPEND_STRING(Rest, New), Rest ++ New). +-define(TO_INPUT_FORMAT(Val), Val). + +%% In the list case we can't use a '++' when matchin against an unbound variable +-define(STRING_UNBOUND_REST(MatchChar, Rest), [MatchChar | Rest]). +-define(BYTE_ORDER_MARK_1, undefined_bom1). +-define(BYTE_ORDER_MARK_2, undefined_bom2). +-define(BYTE_ORDER_MARK_REST(Rest), [undefined|Rest]). diff --git a/lib/xmerl/src/xmerl_sax_parser_utf16be.erlsrc b/lib/xmerl/src/xmerl_sax_parser_utf16be.erlsrc new file mode 100644 index 0000000000..fae5346e6a --- /dev/null +++ b/lib/xmerl/src/xmerl_sax_parser_utf16be.erlsrc @@ -0,0 +1,40 @@ +%%-*-erlang-*- +%%-------------------------------------------------------------------- +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2008-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%%---------------------------------------------------------------------- +%% File : xmerl_sax_parser_utf16be.erl +%% Description : +%% +%% Created : 26 May 2008 +%%---------------------------------------------------------------------- +-module(xmerl_sax_parser_utf16be). + +%%---------------------------------------------------------------------- +%% Macros +%%---------------------------------------------------------------------- +-define(STRING_EMPTY, <<>>). +-define(STRING(MatchStr), <<MatchStr/big-utf16>>). +-define(STRING_REST(MatchStr, Rest), <<MatchStr/big-utf16, Rest/binary>>). +-define(APPEND_STRING(Rest, New), <<Rest/binary, New/binary>>). +-define(TO_INPUT_FORMAT(Val), unicode:characters_to_binary(Val, unicode, {utf16, big})). + +%% STRING_REST and STRING_UNBOUND_REST is only different in the list case +-define(STRING_UNBOUND_REST(MatchChar, Rest), <<MatchChar/big-utf16, Rest/binary>>). +-define(BYTE_ORDER_MARK_1, undefined). +-define(BYTE_ORDER_MARK_2, <<16#FE>>). +-define(BYTE_ORDER_MARK_REST(Rest), <<16#FE, 16#FF, Rest/binary>>). diff --git a/lib/xmerl/src/xmerl_sax_parser_utf16le.erlsrc b/lib/xmerl/src/xmerl_sax_parser_utf16le.erlsrc new file mode 100644 index 0000000000..5e1f0a217c --- /dev/null +++ b/lib/xmerl/src/xmerl_sax_parser_utf16le.erlsrc @@ -0,0 +1,40 @@ +%%-*-erlang-*- +%%-------------------------------------------------------------------- +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2008-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%%---------------------------------------------------------------------- +%% File : xmerl_sax_parser_utf16le.erl +%% Description : +%% +%% Created : 26 May 2008 +%%---------------------------------------------------------------------- +-module(xmerl_sax_parser_utf16le). + +%%---------------------------------------------------------------------- +%% Macros +%%---------------------------------------------------------------------- +-define(STRING_EMPTY, <<>>). +-define(STRING(MatchStr), <<MatchStr/little-utf16>>). +-define(STRING_REST(MatchStr, Rest), <<MatchStr/little-utf16, Rest/binary>>). +-define(APPEND_STRING(Rest, New), <<Rest/binary, New/binary>>). +-define(TO_INPUT_FORMAT(Val), unicode:characters_to_binary(Val, unicode, {utf16, little})). + +%% STRING_REST and STRING_UNBOUND_REST is only different in the list case +-define(STRING_UNBOUND_REST(MatchChar, Rest), <<MatchChar/little-utf16, Rest/binary>>). +-define(BYTE_ORDER_MARK_1, undefined). +-define(BYTE_ORDER_MARK_2, <<16#FF>>). +-define(BYTE_ORDER_MARK_REST(Rest), <<16#FF, 16#FE, Rest/binary>>). diff --git a/lib/xmerl/src/xmerl_sax_parser_utf8.erlsrc b/lib/xmerl/src/xmerl_sax_parser_utf8.erlsrc new file mode 100644 index 0000000000..542aca5a3a --- /dev/null +++ b/lib/xmerl/src/xmerl_sax_parser_utf8.erlsrc @@ -0,0 +1,43 @@ +%%-*-erlang-*- +%%-------------------------------------------------------------------- +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2008-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%%---------------------------------------------------------------------- +%% File : xmerl_sax_parser_utf8.erl +%% Description : +%% +%% Created : 27 May 2008 +%%---------------------------------------------------------------------- +-module(xmerl_sax_parser_utf8). + +%%---------------------------------------------------------------------- +%% Macros +%%---------------------------------------------------------------------- +-define(STRING_EMPTY, <<>>). +-define(STRING(MatchStr), <<MatchStr/utf8>>). +-define(STRING_REST(MatchStr, Rest), <<MatchStr/utf8, Rest/binary>>). +-define(APPEND_STRING(Rest, New), <<Rest/binary, New/binary>>). +-define(TO_INPUT_FORMAT(Val), unicode:characters_to_binary(Val, unicode, utf8)). + + +%% STRING_REST and STRING_UNBOUND_REST is only different in the list case +-define(STRING_UNBOUND_REST(MatchChar, Rest), <<MatchChar/utf8, Rest/binary>>). +-define(BYTE_ORDER_MARK_1, <<16#EF>>). +-define(BYTE_ORDER_MARK_2, <<16#EF, 16#BB>>). +-define(BYTE_ORDER_MARK_REST(Rest), <<16#EF, 16#BB, 16#BF, Rest/binary>>). + + diff --git a/lib/xmerl/src/xmerl_sax_simple_dom.erl b/lib/xmerl/src/xmerl_sax_simple_dom.erl new file mode 100644 index 0000000000..58a11f70fe --- /dev/null +++ b/lib/xmerl/src/xmerl_sax_simple_dom.erl @@ -0,0 +1,263 @@ +%%-*-erlang-*- +%%-------------------------------------------------------------------- +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%%---------------------------------------------------------------------- +%% File : xmerl_sax_simple_dom.erl +%% Description : +%% +%% Created : 17 Apr 2009 +%%---------------------------------------------------------------------- +-module(xmerl_sax_simple_dom). + +%%---------------------------------------------------------------------- +%% Include files +%%---------------------------------------------------------------------- +-include("xmerl_sax_old_dom.hrl"). + +%%---------------------------------------------------------------------- +%% External exports +%%---------------------------------------------------------------------- +-export([ + initial_state/0, + get_dom/1, + event/3 + ]). + +%%---------------------------------------------------------------------- +%% Internal exports +%%---------------------------------------------------------------------- +-export([ + ]). + +%%====================================================================== +%% Macros +%%====================================================================== +%%---------------------------------------------------------------------- +%% Error handling +%%---------------------------------------------------------------------- +-define(error(Reason), + throw({sax_simple_dom_error, Reason})). + +%%---------------------------------------------------------------------- + +%%====================================================================== +%% Records +%%====================================================================== + +%%---------------------------------------------------------------------- +%% State record for the validator +%%---------------------------------------------------------------------- +-record(xmerl_sax_simple_dom_state, { + tags=[], %% Tag stack + cno=[], %% Current node number + namespaces = [], %% NameSpace stack + dom=[] %% DOM structure + }). + +%%====================================================================== +%% External functions +%%====================================================================== + +%%---------------------------------------------------------------------- +%% Function: initial_state() -> Result +%% Parameters: +%% Result: +%% Description: +%%---------------------------------------------------------------------- +initial_state() -> + #xmerl_sax_simple_dom_state{}. + +%%---------------------------------------------------------------------- +%% Function: get_dom(State) -> Result +%% Parameters: +%% Result: +%% Description: +%%---------------------------------------------------------------------- +get_dom(#xmerl_sax_simple_dom_state{dom=Dom}) -> + Dom. + +%%---------------------------------------------------------------------- +%% Function: event(Event, LineNo, State) -> Result +%% Parameters: +%% Result: +%% Description: +%%---------------------------------------------------------------------- +event(Event, _LineNo, State) -> + build_dom(Event, State). + + +%%====================================================================== +%% Internal functions +%%====================================================================== + +%%---------------------------------------------------------------------- +%% Function : build_dom(Event, State) -> Result +%% Parameters: Event = term() +%% State = #xmerl_sax_simple_dom_state{} +%% Result : #xmerl_sax_simple_dom_state{} | +%% Description: +%%---------------------------------------------------------------------- + +%% Document +%%---------------------------------------------------------------------- +build_dom(startDocument, State) -> + State#xmerl_sax_simple_dom_state{dom=[startDocument]}; +build_dom(endDocument, + #xmerl_sax_simple_dom_state{dom=[{Tag, Attributes, Content} |D]} = State) -> + case D of + [startDocument] -> + State#xmerl_sax_simple_dom_state{dom=[{Tag, Attributes, + lists:reverse(Content)}]}; + [#xmlDecl{} = Decl, startDocument] -> + State#xmerl_sax_simple_dom_state{dom=[Decl, {Tag, Attributes, + lists:reverse(Content)}]}; + _ -> + io:format("~p\n", [D]), + ?error("we're not at end the document when endDocument event is encountered.") + end; + +%% Element +%%---------------------------------------------------------------------- +build_dom({startElement, _Uri, LocalName, _QName, Attributes}, + #xmerl_sax_simple_dom_state{tags=T, cno=CN, dom=D} = State) -> + + A = parse_attributes(LocalName, Attributes), + {Num, NewCN} = + case CN of + [] -> + {1, [1]}; + [ N |CNs] -> + {N, [1, N+1 |CNs]} + end, + + State#xmerl_sax_simple_dom_state{tags=[{list_to_atom(LocalName), Num} |T], + cno=NewCN, + dom=[{list_to_atom(LocalName), + lists:reverse(A), + [] + } | D]}; +build_dom({endElement, _Uri, LocalName, _QName}, + #xmerl_sax_simple_dom_state{tags=[_ |T], + cno=[_ |CN], + dom=[{CName, CAttributes, CContent}, + {PName, PAttributes, PContent} | D]} = State) -> + case list_to_atom(LocalName) of + CName -> + State#xmerl_sax_simple_dom_state{tags=T, + cno=CN, + dom=[{PName, PAttributes, + [{CName, CAttributes, + lists:reverse(CContent)} + |PContent] + } | D]}; + _ -> + ?error("Got end of element: " ++ LocalName ++ " but expected: " ++ + CName) + end; + +%% Text +%%---------------------------------------------------------------------- +build_dom({characters, String}, + #xmerl_sax_simple_dom_state{tags=_T, + cno=[Num |CN], + dom=[{Name, Attributes, Content}| D]} = State) -> + State#xmerl_sax_simple_dom_state{cno=[Num+1 |CN], + dom=[{Name, Attributes, [String|Content]} | D]}; + +build_dom({ignorableWhitespace, String}, + #xmerl_sax_simple_dom_state{tags=T, + cno=[Num |CN], + dom=[{Name, Attributes, Content}| D]} = State) -> + State#xmerl_sax_simple_dom_state{cno=[Num+1 |CN], + dom=[{Name, Attributes, [#xmlText{value=String, parents=T, pos=Num, type=text}|Content]} | D]}; + +%% Comments +%%---------------------------------------------------------------------- +build_dom({comment, String}, + #xmerl_sax_simple_dom_state{tags=T, + cno=[Num |CN], + dom=[#xmlElement{content=C} = Current| D]} = State) -> + State#xmerl_sax_simple_dom_state{cno=[Num+1 |CN], + dom=[Current#xmlElement{content=[#xmlComment{parents=T, pos=Num, value=String}|C]} | D]}; + +%% NameSpaces +%%---------------------------------------------------------------------- +build_dom({startPrefixMapping, [], _Uri}, State) -> + State; +build_dom({startPrefixMapping, Prefix, Uri}, + #xmerl_sax_simple_dom_state{namespaces=NS} = State) -> + State#xmerl_sax_simple_dom_state{namespaces=[{Prefix, list_to_atom(Uri)} |NS]}; +build_dom({endPrefixMapping, Prefix}, + #xmerl_sax_simple_dom_state{namespaces=[{Prefix, _} |NS]} = State) -> + State#xmerl_sax_simple_dom_state{namespaces=NS}; + +%% Processing instructions +%%---------------------------------------------------------------------- +build_dom({processingInstruction,"xml", PiData}, + #xmerl_sax_simple_dom_state{dom=D} = State) -> + {Vsn, PiData1} = find_and_remove_attribute("version", PiData, []), + {Enc, PiData2} = find_and_remove_attribute("encoding", PiData1, []), + {Standalone, PiData3} = find_and_remove_attribute("standalone", PiData2, yes), + State#xmerl_sax_simple_dom_state{dom=[#xmlDecl{vsn=Vsn, encoding=Enc, standalone=Standalone, attributes=PiData3}| D]}; +build_dom({processingInstruction, PiTarget, PiData}, + #xmerl_sax_simple_dom_state{cno=[Num |CN], + dom=[#xmlElement{content=C} = Current| D]} = State) -> + State#xmerl_sax_simple_dom_state{cno=[Num+1 |CN], + dom=[Current#xmlElement{content=[#xmlPI{name=PiTarget,pos=Num, value=PiData} + |C]} | D]}; + + +%% Default +%%---------------------------------------------------------------------- +build_dom(_E, State) -> + State. + + + + + +%%---------------------------------------------------------------------- +%% Function : parse_attributes(ElName, Attributes) -> Result +%% Parameters: +%% Result : +%% Description: +%%---------------------------------------------------------------------- +parse_attributes(ElName, Attributes) -> + parse_attributes(ElName, Attributes, 1, []). + +parse_attributes(_, [], _, Acc) -> + Acc; +parse_attributes(ElName, [{_Uri, _Prefix, LocalName, AttrValue} |As], N, Acc) -> + parse_attributes(ElName, As, N+1, [{list_to_atom(LocalName), AttrValue} |Acc]). + + + +%%---------------------------------------------------------------------- +%% Function : find_and_remove_attribute(Key, Data, Default) -> Result +%% Parameters: +%% Result : +%% Description: +%%---------------------------------------------------------------------- +find_and_remove_attribute(Key, Data, Default) -> + case lists:keysearch(Key, 1, Data) of + {value, {Key, Value}} -> + Data2 = lists:keydelete(Key, 1, Data), + {Value, Data2}; + false -> + {Default, Data} + end. diff --git a/lib/xmerl/src/xmerl_scan.erl b/lib/xmerl/src/xmerl_scan.erl new file mode 100644 index 0000000000..4e5cc59d8f --- /dev/null +++ b/lib/xmerl/src/xmerl_scan.erl @@ -0,0 +1,4088 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% Description : Simgle-pass XML scanner. See xmerl.hrl for data defs. + +%% @doc This module is the interface to the XML parser, it handles XML 1.0. +%% The XML parser is activated through +%% <tt>xmerl_scan:string/[1,2]</tt> or +%% <tt>xmerl_scan:file/[1,2]</tt>. +%% It returns records of the type defined in xmerl.hrl. +%% See also <a href="xmerl_examples.html">tutorial</a> on customization +%% functions. +%% @type global_state(). <p> +%% The global state of the scanner, represented by the #xmerl_scanner{} record. +%% </p> +%% @type option_list(). <p>Options allow to customize the behaviour of the +%% scanner. +%% See also <a href="xmerl_examples.html">tutorial</a> on customization +%% functions. +%% </p> +%% Possible options are: +%% <dl> +%% <dt><code>{acc_fun, Fun}</code></dt> +%% <dd>Call back function to accumulate contents of entity.</dd> +%% <dt><code>{continuation_fun, Fun} | +%% {continuation_fun, Fun, ContinuationState}</code></dt> +%% <dd>Call back function to decide what to do if the scanner runs into EOF +%% before the document is complete.</dd> +%% <dt><code>{event_fun, Fun} | +%% {event_fun, Fun, EventState}</code></dt> +%% <dd>Call back function to handle scanner events.</dd> +%% <dt><code>{fetch_fun, Fun} | +%% {fetch_fun, Fun, FetchState}</code></dt> +%% <dd>Call back function to fetch an external resource.</dd> +%% <dt><code>{hook_fun, Fun} | +%% {hook_fun, Fun, HookState}</code></dt> +%% <dd>Call back function to process the document entities once +%% identified.</dd> +%% <dt><code>{close_fun, Fun}</code></dt> +%% <dd>Called when document has been completely parsed.</dd> +%% <dt><code>{rules, ReadFun, WriteFun, RulesState} | +%% {rules, Rules}</code></dt> +%% <dd>Handles storing of scanner information when parsing.</dd> +%% <dt><code>{user_state, UserState}</code></dt> +%% <dd>Global state variable accessible from all customization functions</dd> +%% +%% <dt><code>{fetch_path, PathList}</code></dt> +%% <dd>PathList is a list of +%% directories to search when fetching files. If the file in question +%% is not in the fetch_path, the URI will be used as a file +%% name.</dd> +%% <dt><code>{space, Flag}</code></dt> +%% <dd>'preserve' (default) to preserve spaces, 'normalize' to +%% accumulate consecutive whitespace and replace it with one space.</dd> +%% <dt><code>{line, Line}</code></dt> +%% <dd>To specify starting line for scanning in document which contains +%% fragments of XML.</dd> +%% <dt><code>{namespace_conformant, Flag}</code></dt> +%% <dd>Controls whether to behave as a namespace conformant XML parser, +%% 'false' (default) to not otherwise 'true'.</dd> +%% <dt><code>{validation, Flag}</code></dt> +%% <dd>Controls whether to process as a validating XML parser: +%% 'off' (default) no validation, or validation 'dtd' by DTD or 'schema' +%% by XML Schema. 'false' and 'true' options are obsolete +%% (i.e. they may be removed in a future release), if used 'false' +%% equals 'off' and 'true' equals 'dtd'.</dd> +%% <dt><code>{schemaLocation, [{Namespace,Link}|...]}</code></dt> +%% <dd>Tells explicitly which XML Schema documents to use to validate +%% the XML document. Used together with the +%% <code>{validation,schema}</code> option.</dd> +%% <dt><code>{quiet, Flag}</code></dt> +%% <dd>Set to 'true' if xmerl should behave quietly and not output any +%% information to standard output (default 'false').</dd> +%% <dt><code>{doctype_DTD, DTD}</code></dt> +%% <dd>Allows to specify DTD name when it isn't available in the XML +%% document. This option has effect only together with +%% <code>{validation,'dtd'</code> option.</dd> +%% <dt><code>{xmlbase, Dir}</code></dt> +%% <dd>XML Base directory. If using string/1 default is current directory. +%% If using file/1 default is directory of given file.</dd> +%% <dt><code>{encoding, Enc}</code></dt> +%% <dd>Set default character set used (default UTF-8). +%% This character set is used only if not explicitly given by the XML +%% declaration. </dd> +%% </dl> + +-module(xmerl_scan). +-vsn('0.20'). +-date('03-09-16'). + +%% main API +-export([string/1, string/2, + file/1, file/2]). + +%% access functions for various states +-export([user_state/1, user_state/2, + event_state/1, event_state/2, + hook_state/1, hook_state/2, + rules_state/1, rules_state/2, + fetch_state/1, fetch_state/2, + cont_state/1, cont_state/2]). + +%% helper functions. To xmerl_lib ?? +-export([accumulate_whitespace/4]). + +%-define(debug, 1). +-include("xmerl.hrl"). % record def, macros +-include("xmerl_internal.hrl"). +-include_lib("kernel/include/file.hrl"). + + +-define(fatal(Reason, S), + if + S#xmerl_scanner.quiet -> + ok; + true -> + ok=io:format("~p- fatal: ~p~n", [?LINE, Reason]) + end, + fatal(Reason, S)). + + +-define(ustate(U, S), S#xmerl_scanner{user_state = U}). + + +%% Functions to access the various states + +%%% @spec user_state(S::global_state()) -> global_state() +%%% @equiv user_state(UserState,S) +user_state(#xmerl_scanner{user_state = S}) -> S. + +%%% @spec event_state(S::global_state()) -> global_state() +%%% @equiv event_state(EventState,S) +event_state(#xmerl_scanner{fun_states = #xmerl_fun_states{event = S}}) -> S. + +%%% @spec hook_state(S::global_state()) -> global_state() +%%% @equiv hook_state(HookState,S) +hook_state(#xmerl_scanner{fun_states = #xmerl_fun_states{hook = S}}) -> S. + +%%% @spec rules_state(S::global_state()) -> global_state() +%%% @equiv rules_state(RulesState,S) +rules_state(#xmerl_scanner{fun_states = #xmerl_fun_states{rules = S}}) -> S. + +%%% @spec fetch_state(S::global_state()) -> global_state() +%%% @equiv fetch_state(FetchState,S) +fetch_state(#xmerl_scanner{fun_states = #xmerl_fun_states{fetch = S}}) -> S. + +%%% @spec cont_state(S::global_state()) -> global_state() +%%% @equiv cont_state(ContinuationState,S) +cont_state(#xmerl_scanner{fun_states = #xmerl_fun_states{cont = S}}) -> S. + + +%%%% Functions to modify the various states + +%%% @spec user_state(UserState, S::global_state()) -> global_state() +%%% @doc For controlling the UserState, to be used in a user function. +%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions. +user_state(X, S) -> + S#xmerl_scanner{user_state = X}. + +%%% @spec event_state(EventState, S::global_state()) -> global_state() +%%% @doc For controlling the EventState, to be used in an event +%%% function, and called at the beginning and at the end of a parsed entity. +%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions. +event_state(X, S=#xmerl_scanner{fun_states = FS}) -> + FS1 = FS#xmerl_fun_states{event = X}, + S#xmerl_scanner{fun_states = FS1}. + +%%% @spec hook_state(HookState, S::global_state()) -> global_state() +%%% @doc For controlling the HookState, to be used in a hook +%%% function, and called when the parser has parsed a complete entity. +%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions. +hook_state(X, S=#xmerl_scanner{fun_states = FS}) -> + FS1 = FS#xmerl_fun_states{hook = X}, + S#xmerl_scanner{fun_states = FS1}. + +%%% @spec rules_state(RulesState, S::global_state()) -> global_state() +%%% @doc For controlling the RulesState, to be used in a rules +%%% function, and called when the parser store scanner information in a rules +%%% database. +%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions. +rules_state(X, S=#xmerl_scanner{fun_states = FS}) -> + FS1 = FS#xmerl_fun_states{rules = X}, + S#xmerl_scanner{fun_states = FS1}. + +%%% @spec fetch_state(FetchState, S::global_state()) -> global_state() +%%% @doc For controlling the FetchState, to be used in a fetch +%%% function, and called when the parser fetch an external resource (eg. a DTD). +%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions. +fetch_state(X, S=#xmerl_scanner{fun_states = FS}) -> + FS1 = FS#xmerl_fun_states{fetch = X}, + S#xmerl_scanner{fun_states = FS1}. + +%%% @spec cont_state(ContinuationState, S::global_state()) -> global_state() +%%% @doc For controlling the ContinuationState, to be used in a continuation +%%% function, and called when the parser encounters the end of the byte stream. +%%% See <a href="xmerl_examples.html">tutorial</a> on customization functions. +cont_state(X, S=#xmerl_scanner{fun_states = FS}) -> + FS1 = FS#xmerl_fun_states{cont = X}, + S#xmerl_scanner{fun_states = FS1}. + + +%% @spec file(Filename::string()) -> {xmlElement(),Rest} +%% Rest = list() +%% @equiv file(Filename, []) +file(F) -> + file(F, []). + +%% @spec file(Filename::string(), Options::option_list()) -> {xmlElement(),Rest} +%% Rest = list() +%%% @doc Parse file containing an XML document +file(F, Options) -> + ExtCharset=case lists:keysearch(encoding,1,Options) of + {value,{_,Val}} -> Val; + false -> undefined + end, + case int_file(F,Options,ExtCharset) of + {Res, Tail,S=#xmerl_scanner{close_fun=Close}} -> + Close(S), % for side effects only - final state is dropped + {Res,Tail}; + {error, Reason} -> + {error, Reason} + end. + +int_file(F, Options,_ExtCharset) -> + %%io:format("int_file F=~p~n",[F]), + case file:read_file(F) of + {ok, Bin} -> + int_string(binary_to_list(Bin), Options, filename:dirname(F),F); + Error -> + Error + end. + +int_file_decl(F, Options,_ExtCharset) -> +% io:format("int_file_decl F=~p~n",[F]), + case file:read_file(F) of + {ok, Bin} -> + int_string_decl(binary_to_list(Bin), Options, filename:dirname(F),F); + Error -> + Error + end. + +%% @spec string(Text::list()) -> {xmlElement(),Rest} +%% Rest = list() +%% @equiv string(Test, []) +string(Str) -> + string(Str, []). + +%% @spec string(Text::list(),Options::option_list()) -> {xmlElement(),Rest} +%% Rest = list() +%%% @doc Parse string containing an XML document +string(Str, Options) -> + {Res, Tail, S=#xmerl_scanner{close_fun = Close}} = + int_string(Str, Options,file_name_unknown), + Close(S), % for side effects only - final state is dropped + {Res,Tail}. + +int_string(Str, Options,FileName) -> + {ok, XMLBase} = file:get_cwd(), + int_string(Str, Options, XMLBase, FileName). + +int_string(Str, Options, XMLBase, FileName) -> + S0=initial_state0(Options,XMLBase), + S = S0#xmerl_scanner{filename=FileName}, + %%io:format("int_string1, calling xmerl_lib:detect_charset~n",[]), + + %% In case of no encoding attribute in document utf-8 is default, but + %% another character set may be detected with help of Byte Order Marker or + %% with help of the encoding of the first 4 bytes. + case xmerl_lib:detect_charset(S#xmerl_scanner.encoding,Str) of + {auto,'iso-10646-utf-1',Str2} -> + scan_document(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"}); + {external,'iso-10646-utf-1',Str2} -> + scan_document(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"}); + {undefined,undefined,Str2} -> %% no auto detection + scan_document(Str2, S); + {external,ExtCharset,Str2} -> + %% no auto detection, ExtCharset is an explicitly provided + %% 7 bit,8 bit or utf-8 encoding + scan_document(Str2, S#xmerl_scanner{encoding=atom_to_list(ExtCharset)}) + end. + +int_string_decl(Str, Options, XMLBase, FileName) -> + S0=initial_state0(Options,XMLBase), + S = S0#xmerl_scanner{filename=FileName}, + case xmerl_lib:detect_charset(S#xmerl_scanner.encoding,Str) of + {auto,'iso-10646-utf-1',Str2} -> + scan_decl(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"}); + {external,'iso-10646-utf-1',Str2} -> + scan_decl(Str2, S#xmerl_scanner{encoding="iso-10646-utf-1"}); + {undefined,undefined,Str2} -> + scan_decl(Str2, S); + {external,ExtCharset,Str2} -> + scan_decl(Str2, S#xmerl_scanner{encoding=atom_to_list(ExtCharset)}) + end. + + + +initial_state0(Options,XMLBase) -> + CommonData = common_data(), + initial_state(Options, #xmerl_scanner{ + event_fun = fun event/2, + hook_fun = fun hook/2, + acc_fun = fun acc/3, + fetch_fun = fun fetch/2, + close_fun = fun close/1, + continuation_fun = fun cont/3, + rules_read_fun = fun rules_read/3, + rules_write_fun = fun rules_write/4, + rules_delete_fun= fun rules_delete/3, + xmlbase = XMLBase, + common_data = CommonData + }). + +initial_state([{event_fun, F}|T], S) -> + initial_state(T, S#xmerl_scanner{event_fun = F}); +initial_state([{event_fun, F, ES}|T], S) -> + S1 = event_state(ES, S#xmerl_scanner{event_fun = F}), + initial_state(T, S1); +initial_state([{acc_fun, F}|T], S) -> + initial_state(T, S#xmerl_scanner{acc_fun = F}); +initial_state([{hook_fun, F}|T], S) -> + initial_state(T, S#xmerl_scanner{hook_fun = F}); +initial_state([{hook_fun, F, HS}|T], S) -> + S1 = hook_state(HS, S#xmerl_scanner{hook_fun = F}), + initial_state(T, S1); +initial_state([{close_fun, F}|T], S) -> + initial_state(T, S#xmerl_scanner{close_fun = F}); +initial_state([{fetch_fun, F}|T], S) -> + initial_state(T, S#xmerl_scanner{fetch_fun = F}); +initial_state([{fetch_fun, F, FS}|T], S) -> + S1 = fetch_state(FS, S#xmerl_scanner{fetch_fun = F}), + initial_state(T, S1); +initial_state([{fetch_path, P}|T], S) -> + initial_state(T, S#xmerl_scanner{fetch_path = P}); +initial_state([{continuation_fun, F}|T], S) -> + initial_state(T, S#xmerl_scanner{continuation_fun = F}); +initial_state([{continuation_fun, F, CS}|T], S) -> + S1 = cont_state(CS, S#xmerl_scanner{continuation_fun = F}), + initial_state(T, S1); +initial_state([{rules, R}|T], S) -> + initial_state(T, S#xmerl_scanner{rules = R, + keep_rules = true}); +initial_state([{rules, Read, Write, RS}|T], S) -> + S1 = rules_state(RS, S#xmerl_scanner{rules_read_fun = Read, + rules_write_fun = Write, + keep_rules = true}), + initial_state(T, S1); +initial_state([{user_state, F}|T], S) -> + initial_state(T, S#xmerl_scanner{user_state = F}); +initial_state([{space, L}|T], S) -> + initial_state(T, S#xmerl_scanner{space = L}); +initial_state([{line, L}|T], S) -> + initial_state(T, S#xmerl_scanner{line = L}); +initial_state([{namespace_conformant, F}|T], S) when F==true; F==false -> + initial_state(T, S#xmerl_scanner{namespace_conformant = F}); +initial_state([{validation, F}|T], S) + when F==off; F==dtd; F==schema; F==true; F==false -> + initial_state(T, S#xmerl_scanner{validation = validation_value(F)}); +initial_state([{schemaLocation, SL}|T], S) when is_list(SL) -> + initial_state(T, S#xmerl_scanner{schemaLocation=SL}); +initial_state([{quiet, F}|T], S) when F==true; F==false -> + initial_state(T, S#xmerl_scanner{quiet = F}); +initial_state([{doctype_DTD,DTD}|T], S) -> + initial_state(T,S#xmerl_scanner{doctype_DTD = DTD}); +initial_state([{text_decl,Bool}|T], S) -> + initial_state(T,S#xmerl_scanner{text_decl=Bool}); +initial_state([{environment,Env}|T], S) -> + initial_state(T,S#xmerl_scanner{environment=Env}); +initial_state([{xmlbase, D}|T], S) -> + initial_state(T, S#xmerl_scanner{xmlbase = D}); +initial_state([{encoding, Enc}|T], S) -> + initial_state(T, S#xmerl_scanner{encoding = Enc}); +initial_state([], S=#xmerl_scanner{rules = undefined}) -> + Tab = ets:new(rules, [set, public]), + S#xmerl_scanner{rules = Tab}; +initial_state([], S) -> + S. + +validation_value(true) -> + dtd; +validation_value(false) -> + off; +validation_value(F) -> + F. + +%% Used for compacting (some) indentations. +%% See also fast_accumulate_whitespace(). +common_data() -> + {comdata(lists:duplicate(60, $\s), []), + comdata(lists:duplicate(15, $\t), []), + "\n"}. + +comdata([], CD)-> + list_to_tuple(CD); +comdata([_ | T]=L, CD) -> + comdata(T, [[$\n | L] | CD]). + +%%% ----------------------------------------------------- +%%% Default modifier functions + +%%% Hooks: +%%% - {element, Line, Name, Attrs, Content} +%%% - {processing_instruction, Line, Data} + +hook(X, State) -> + {X, State}. + +%%% Events: +%%% +%%% #xmerl_event{event : started | ended, +%%% line : integer(), +%%% col : integer(), +%%% data} +%%% +%%% Data Events +%%% document started, ended +%%% #xmlElement started, ended +%%% #xmlAttribute ended +%%% #xmlPI ended +%%% #xmlComment ended +%%% #xmlText ended +event(_X, S) -> + S. + +%% The acc/3 function can return either {Acc�, S'} or {Acc', Pos', S'}, +%% where Pos' can be derived from X#xmlElement.pos, X#xmlText.pos, or +%% X#xmlAttribute.pos (whichever is the current object type.) +%% The acc/3 function is not allowed to redefine the type of object +%% being defined, but _is_ allowed to either ignore it or split it +%% into multiple objects (in which case {Acc',Pos',S'} should be returned.) +%% If {Acc',S'} is returned, Pos will be incremented by 1 by default. +%% Below is an example of an acceptable operation +acc(X = #xmlText{value = Text}, Acc, S) -> + {[X#xmlText{value = Text}|Acc], S}; +acc(X, Acc, S) -> + {[X|Acc], S}. + +fetch({system, URI}, S) -> + fetch_URI(URI, S); +fetch({public, _PublicID, URI}, S) -> + fetch_URI(URI, S). + +%%% Always assume an external resource can be found locally! Thus +%%% don't bother fetching with e.g. HTTP. Returns the path where the +%%% resource is found. The path to the external resource is given by +%%% URI directly or the option fetch_path (additional paths) or +%%% directory (base path to external resource) +fetch_URI(URI, S) -> + %% assume URI is a filename + Split = filename:split(URI), + Filename = fun([])->[];(X)->lists:last(X) end (Split), + Fullname = + case Split of %% how about Windows systems? + ["file:"|Name]-> %% absolute path, see RFC2396 sect 3 + %% file:/dtd_name + filename:join(["/"|Name]); + ["/"|Rest] when Rest /= [] -> + %% absolute path name + URI; + ["http:"|_Rest] -> + {http,URI}; + [] -> %% empty systemliteral + []; + _ -> + filename:join(S#xmerl_scanner.xmlbase, URI) + end, + Path = path_locate(S#xmerl_scanner.fetch_path, Filename, Fullname), + ?dbg("fetch(~p) -> {file, ~p}.~n", [URI, Path]), + {ok, Path, S}. + +path_locate(_, _, {http,_}=URI) -> + URI; +path_locate(_, _, []) -> + []; +path_locate([Dir|Dirs], FN, FullName) -> + F = filename:join(Dir, FN), + case file:read_file_info(F) of + {ok, #file_info{type = regular}} -> + {file,F}; + _ -> + path_locate(Dirs, FN, FullName) + end; +path_locate([], _FN, FullName) -> + {file,FullName}. + + +cont(_F, Exception, US) -> + Exception(US). + +close(S) -> + S. + + +%%% ----------------------------------------------------- +%%% Scanner + +%%% [1] document ::= prolog element Misc* +scan_document(Str0, S=#xmerl_scanner{event_fun = Event, + line = L, col = C, + environment=Env, + encoding=Charset, + validation=ValidateResult}) -> + S1 = Event(#xmerl_event{event = started, + line = L, + col = C, + data = document}, S), + + %% Transform to given character set. + %% Note that if another character set is given in the encoding + %% attribute in a XML declaration that one will be used later + Str=if + Charset == "utf-8" -> + Str0; + Charset=/=undefined -> % Default character set is UTF-8 + xmerl_ucs:to_unicode(Str0,list_to_atom(Charset)); + true -> %% Charset is undefined if no external input is + %% given, and no auto detection of character + %% encoding was made. + Str0 + end, +%% M1 = erlang:memory(), +%% io:format("Memory status before prolog: ~p~n",[M1]), + {T1, S2} = scan_prolog(Str, S1, _StartPos = 1), +%% M2 = erlang:memory(), +%% io:format("Memory status after prolog: ~p~n",[M2]), + %%io:format("scan_document 2, prolog parsed~n",[]), + T2 = scan_mandatory("<",T1,1,S2,expected_element_start_tag), +%% M3 = erlang:memory(), +%% io:format("Memory status before element: ~p~n",[M3]), + {Res, T3, S3} =scan_element(T2,S2,_StartPos = 1), +%% M4 = erlang:memory(), +%% io:format("Memory status after element: ~p~n",[M4]), + {Tail, S4}=scan_misc(T3, S3, _StartPos = 1), +%% M5 = erlang:memory(), +%% io:format("Memory status after misc: ~p~n",[M5]), + + S5 = #xmerl_scanner{} = Event(#xmerl_event{event = ended, + line = S4#xmerl_scanner.line, + col = S4#xmerl_scanner.col, + data = document}, S4), + + {Res2,S6} = case validation_mode(ValidateResult) of + off -> + {Res,cleanup(S5)}; + dtd when Env == element; Env == prolog -> + check_decl2(S5), + case xmerl_validate:validate(S5,Res) of + {'EXIT',{error,Reason}} -> + S5b=cleanup(S5), + ?fatal({failed_validation,Reason}, S5b); + {'EXIT',Reason} -> + S5b=cleanup(S5), + ?fatal({failed_validation,Reason}, S5b); + {error,Reason} -> + S5b=cleanup(S5), + ?fatal({failed_validation,Reason}, S5b); + {error,Reason,_Next} -> + S5b=cleanup(S5), + ?fatal({failed_validation,Reason}, S5b); + _XML -> + {Res,cleanup(S5)} + end; + schema -> + case schemaLocations(Res,S5) of + {ok,Schemas} -> + cleanup(S5), + %%io:format("Schemas: ~p~nRes: ~p~ninhertih_options(S): ~p~n", + %% [Schemas,Res,inherit_options(S5)]), + XSDRes = xmerl_xsd:process_validate(Schemas,Res, + inherit_options(S5)), + handle_schema_result(XSDRes,S5); + _ -> + {Res,cleanup(S5)} + end; + _ -> + {Res,cleanup(S5)} + end, + + {Res2, Tail, S6}. + + +scan_decl(Str, S=#xmerl_scanner{event_fun = Event, + line = L, col = C, + environment=_Env, + encoding=_Charset, + validation=_ValidateResult}) -> + S1 = Event(#xmerl_event{event = started, + line = L, + col = C, + data = document}, S), + + case scan_prolog(Str, S1, _StartPos = 1) of + {T2="<"++_, S2} -> + {{S2#xmerl_scanner.user_state,T2},[],S2}; + {[], S2}-> + {[],[],S2}; + {T2, S2} -> + {_,_,S3} = scan_content(T2,S2,[],_Attrs=[],S2#xmerl_scanner.space, + _Lang=[],_Parents=[],#xmlNamespace{}), + {T2,[],S3} + end. + + +%%% [22] Prolog +%%% prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? +%%% +%% empty text declarations are handled by the first function clause. +scan_prolog([], S=#xmerl_scanner{continuation_fun = F}, Pos) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_prolog(MoreBytes, S1, Pos) end, + fun(S1) -> {[], S1} end, + S); +scan_prolog("<?xml"++T,S0=#xmerl_scanner{encoding=Charset0,col=Col,line=L},Pos) + when ?whitespace(hd(T)) -> + {Charset,T3, S3}= + if + Col==1,L==1,S0#xmerl_scanner.text_decl==true -> + ?dbg("prolog(\"<?xml\")~n", []), + ?bump_col(5), + {_,T1,S1} = mandatory_strip(T,S), + {Decl,T2, S2}=scan_text_decl(T1,S1), + Encoding=Decl#xmlDecl.encoding, + {Encoding,T2, S2#xmerl_scanner{encoding=Encoding}}; + Col==1,L==1 -> + ?dbg("prolog(\"<?xml\")~n", []), + ?bump_col(5), + {Decl,T2, S2}=scan_xml_decl(T, S), + Encoding=Decl#xmlDecl.encoding, + {Encoding,T2, S2#xmerl_scanner{encoding=Encoding}}; + true -> + ?fatal({xml_declaration_must_be_first_in_doc,Col,L},S0) + end, + %% Charset0 is either (1) 'iso-10646-utf-1' (transformation by + %% auto detection), (2) undefined (no auto detection and no + %% external encoding), (3) any other encoding format that must be + %% conformant to the internal explicitly given encoding. The two + %% former cases implies that the explicit internal encoding + %% (Charset) may be different from Charset0. + + %% Now transform to declared character set. + if + Charset==Charset0 -> % Document already transformed to this charset! + scan_prolog(T3, S3, Pos); + Charset0=/=undefined -> + %% For example may an external entity + %% have the BOM for utf-16 and the internal + %% explicit encoding='utf-16', then it will be auto + %% detected and transformed, Charset0 will be + %% 'iso-10646-utf-1', and Charset will be 'utf-16', all + %% legal. + %% + scan_prolog(T3,S3#xmerl_scanner{encoding=Charset0},Pos); + Charset == "utf-8" -> + scan_prolog(T3, S3, Pos); + Charset=/=undefined -> % Document not previously transformed + T4=xmerl_ucs:to_unicode(T3,list_to_atom(Charset)), + scan_prolog(T4, S3, Pos); + true -> % No encoding info given + scan_prolog(T3, S3, Pos) + end; +scan_prolog("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog, + encoding=_Charset}, Pos) -> + ?dbg("prolog(\"<!DOCTYPE\")~n", []), + ?bump_col(9), + %% If no known character set assume it is UTF-8 + T1=if + %% Charset==undefined -> xmerl_ucs:to_unicode(T,'utf-8'); + true -> T + end, + {T2, S1} = scan_doctype(T1, S), + scan_misc(T2, S1, Pos); +scan_prolog(Str="%"++_T,S=#xmerl_scanner{environment={external,_}},_Pos) -> + scan_ext_subset(Str,S); +scan_prolog(Str, S0 = #xmerl_scanner{user_state=_US,encoding=_Charset},Pos) -> + ?dbg("prolog(\"<\")~n", []), + + %% Check for Comments, PI before possible DOCTYPE declaration + ?bump_col(1), + %% If no known character set assume it is UTF-8 + T=if +%% Charset==undefined -> xmerl_ucs:to_unicode(Str,'utf-8'); + true -> Str + end, + {T1, S1}=scan_misc(T, S, Pos), + scan_prolog2(T1,S1,Pos). + + + +scan_prolog2([], S=#xmerl_scanner{continuation_fun = F}, Pos) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_prolog2(MoreBytes, S1, Pos) end, + fun(S1) -> {[], S1} end, + S); +scan_prolog2("<!DOCTYPE" ++ T, S0=#xmerl_scanner{environment=prolog}, Pos) -> + ?dbg("prolog(\"<!DOCTYPE\")~n", []), + ?bump_col(9), + {T1, S1} = scan_doctype(T, S), + scan_misc(T1, S1, Pos); +scan_prolog2(Str = "<!" ++ _, S, _Pos) -> + ?dbg("prolog(\"<!\")~n", []), + %% In e.g. a DTD, we jump directly to markup declarations + scan_ext_subset(Str, S); +scan_prolog2(Str, S0 = #xmerl_scanner{user_state=_US},Pos) -> + ?dbg("prolog(\"<\")~n", []), + + %% Here we consider the DTD provided by doctype_DTD option, + S1 = + case S0 of + #xmerl_scanner{validation=dtd,doctype_DTD=DTD} when is_list(DTD) -> + S=fetch_DTD(undefined,S0), + check_decl(S), + S; + _ -> S0 + end, + %% Check for more Comments and PI after DOCTYPE declaration +% ?bump_col(1), + scan_misc(Str, S1, Pos). + + + + +%%% [27] Misc ::= Comment | PI | S +%% Note: +%% - Neither of Comment and PI are returned in the resulting parsed +%% structure. +%% - scan_misc/3 implements Misc* as that is how the rule is always used +scan_misc([], S=#xmerl_scanner{continuation_fun = F}, Pos) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_misc(MoreBytes, S1, Pos) end, + fun(S1) -> {[], S1} end, + S); +scan_misc("<!--" ++ T, S0, Pos) -> % Comment + ?bump_col(4), + {_, T1, S1} = scan_comment(T, S, Pos, _Parents = [], _Lang = []), + scan_misc(T1,S1,Pos); +scan_misc("<?" ++ T, S0, Pos) -> % PI + ?dbg("prolog(\"<?\")~n", []), + ?bump_col(2), + {_PI, T1, S1} = scan_pi(T, S, Pos), + scan_misc(T1,S1,Pos); +scan_misc(T=[H|_T], S, Pos) when ?whitespace(H) -> + ?dbg("prolog(whitespace)~n", []), + {_,T1,S1}=strip(T,S), + scan_misc(T1,S1,Pos); +scan_misc(T,S,_Pos) -> + {T,S}. + + +cleanup(S=#xmerl_scanner{keep_rules = false, + rules = Rules}) -> + ets:delete(Rules), + S#xmerl_scanner{rules = undefined}; +cleanup(S) -> + S. + +%%% Prolog and Document Type Declaration XML 1.0 Section 2.8 +%% [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' +%% [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') +scan_xml_decl(T, S) -> + %% VersionInfo [24] is mandatory + {_,T1,S1} = mandatory_strip(T,S), + {T2,S2} = + case T1 of + "version" ++ _T2 -> + {_T2,S1#xmerl_scanner{col=S1#xmerl_scanner.col+7}}; + _ -> ?fatal(expected_version_attribute,S1) + end, + {T3, S3} = scan_eq(T2, S2), + {Vsn, T4, S4} = scan_xml_vsn(T3, S3), + Attr = #xmlAttribute{name = version, + parents = [{xml, _XMLPos = 1}], + value = Vsn}, + scan_xml_decl(T4, S4, #xmlDecl{attributes = [Attr]}). + +scan_xml_decl([], S=#xmerl_scanner{continuation_fun = F}, Decl) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_xml_decl(MoreBytes, S1, Decl) end, + fun(S1) -> {[], [], S1} end, + S); +scan_xml_decl("?>" ++ T, S0, Decl) -> + ?bump_col(2), + return_xml_decl(T,S,Decl); +scan_xml_decl(T,S=#xmerl_scanner{event_fun = _Event},Decl) when ?whitespace(hd(T)) -> + {_,T1,S1}=mandatory_strip(T,S), + scan_xml_decl2(T1,S1,Decl); +scan_xml_decl(_T,S=#xmerl_scanner{event_fun = _Event},_Decl) -> + ?fatal(preformat([expected,one,'of:'],['?>',whitespace_character],","),S). + +scan_xml_decl2("?>" ++ T, S0,Decl) -> + ?bump_col(2), + return_xml_decl(T,S,Decl); +scan_xml_decl2("encoding" ++ T, S0 = #xmerl_scanner{event_fun = Event}, + Decl0 = #xmlDecl{attributes = Attrs}) -> + %% [80] EncodingDecl + ?bump_col(8), + {T1, S1} = scan_eq(T, S), + {EncName, T2, S2} = scan_enc_name(T1, S1), + LowEncName=xmerl_lib:to_lower(EncName), + Attr = #xmlAttribute{name = encoding, + parents = [{xml, _XMLPos = 1}], + value = LowEncName}, + Decl = Decl0#xmlDecl{encoding = LowEncName, + attributes = [Attr|Attrs]}, + S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended, + line = S0#xmerl_scanner.line, + col = S0#xmerl_scanner.col, + data = Attr}, S2), + case T2 of + "?>" ++ _T3 -> + scan_xml_decl3(T2,S3,Decl); + _ -> + {_,T3,S4} = mandatory_strip(T2,S3), + scan_xml_decl3(T3, S4, Decl) + end; +scan_xml_decl2(T="standalone" ++ _T,S,Decl) -> + scan_xml_decl3(T,S,Decl); +scan_xml_decl2(_BadString,S,_Decl) -> + ?fatal(preformat([expected,one,'of:'],['?>',standalone,encoding],","),S). +% ?fatal(lists:flatten(io_lib:format("~s ~s ~s: ~s, ~s, ~s",[expected,one,'of','?>',standalone,encoding])),S). +% ?fatal({expected_one_of,"?>",standalone,encoding},S). + +scan_xml_decl3("?>" ++ T, S0,Decl) -> + ?bump_col(2), + return_xml_decl(T,S,Decl); +scan_xml_decl3("standalone" ++ T,S0 = #xmerl_scanner{event_fun = Event}, + Decl0 = #xmlDecl{attributes = Attrs}) -> + %% [32] SDDecl + ?bump_col(10), + {T1, S1} = scan_eq(T, S), + {StValue,T2,S2}=scan_standalone_value(T1,S1), + Attr = #xmlAttribute{name = standalone, + parents = [{xml, _XMLPos = 1}], + value = StValue}, + Decl = Decl0#xmlDecl{standalone = StValue, + attributes = [Attr|Attrs]}, + S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended, + line = S0#xmerl_scanner.line, + col = S0#xmerl_scanner.col, + data = Attr}, S2), + {_,T3,S4} = strip(T2,S3), + T4 = scan_mandatory("?>",T3,2,S4,expected_xml_decl_endtag), +%% "?>" ++ T4 = T3, + return_xml_decl(T4, S4#xmerl_scanner{col=S4#xmerl_scanner.col+2}, Decl). + + +return_xml_decl(T,S=#xmerl_scanner{hook_fun = _Hook, + event_fun = Event}, + Decl0 = #xmlDecl{attributes = Attrs}) -> + ?strip1, + Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)}, + S2 = #xmerl_scanner{} = Event(#xmerl_event{event = ended, + line = S#xmerl_scanner.line, + col = S#xmerl_scanner.col, + data = Decl}, S1), +%% {Ret, S3} = Hook(Decl, S2), +%% {Ret, T1, S3}. + {Decl, T1, S2}. + + +scan_standalone_value("'yes'" ++T,S0)-> + ?bump_col(5), + {'yes',T,S#xmerl_scanner{standalone=yes}}; +scan_standalone_value("\"yes\"" ++T,S0)-> + ?bump_col(5), + {'yes',T,S#xmerl_scanner{standalone=yes}}; +scan_standalone_value("'no'" ++T,S0) -> + ?bump_col(4), + {'no',T,S}; +scan_standalone_value("\"no\"" ++T,S0) -> + ?bump_col(4), + {'no',T,S}. + +%%% +%%% Text declaration XML 1.0 section 4.3.1 +%%% [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' +scan_text_decl(T,S=#xmerl_scanner{event_fun = Event}) -> + {#xmlDecl{attributes=Attrs}=Decl0,T1,S1} = scan_optional_version(T,S), + T2 = + case T1 of + "encoding" ++ _T2 -> _T2; + _ -> + ?fatal(expected_encoding_attribute,S1) + end, + S2 = S1#xmerl_scanner{col = S1#xmerl_scanner.col + 8}, + {T3, S3} = scan_eq(T2, S2), + {EncName, T4, S4} = scan_enc_name(T3, S3), + LowEncName=xmerl_lib:to_lower(EncName), + ?strip5, + Attr = #xmlAttribute{name = encoding, + parents = [{xml,1}], + value = LowEncName}, + Decl = Decl0#xmlDecl{encoding = LowEncName, + attributes = [Attr|Attrs]}, + S6=#xmerl_scanner{} = Event(#xmerl_event{event = ended, + line = S5#xmerl_scanner.line, + col = S5#xmerl_scanner.col, + data = Attr}, S5), + scan_text_decl(T5,S6,Decl). + +scan_text_decl("?>"++T,S0 = #xmerl_scanner{hook_fun = _Hook, + event_fun = Event}, + Decl0 = #xmlDecl{attributes = Attrs}) -> + ?bump_col(2), + ?strip1, + Decl = Decl0#xmlDecl{attributes = lists:reverse(Attrs)}, + S2 = #xmerl_scanner{} = Event(#xmerl_event{event = ended, + line = S0#xmerl_scanner.line, + col = S0#xmerl_scanner.col, + data = Decl}, S1), +%% {Ret, S3} = Hook(Decl, S2), +%% {Ret, T1, S3}; + {Decl, T1, S2}; +scan_text_decl([H|_T],S,_) -> + ?fatal({unexpected_character_in_text_declaration,H},S). + +scan_optional_version("version"++T,S0) -> + ?bump_col(7), + ?strip1, + {T2, S2} = scan_eq(T1, S1), + {Vsn, T3, S3} = scan_xml_vsn(T2, S2), + {_,T4,S4} = mandatory_strip(T3,S3), + Attr = #xmlAttribute{name = version,parents = [{xml,1}],value = Vsn}, + {#xmlDecl{attributes=[Attr]},T4,S4}; +scan_optional_version(T,S) -> + {#xmlDecl{attributes=[]},T,S}. + + + +%%%%%%% [81] EncName +scan_enc_name([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1) end, + fun(S1) -> ?fatal(expected_encoding_name, S1) end, + S); +scan_enc_name([H|T], S0) when H >= $"; H =< $' -> + ?bump_col(1), + scan_enc_name(T, S, H, []). + + +scan_enc_name([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_enc_name(MoreBytes, S1, Delim, Acc) end, + fun(S1) -> ?fatal(expected_encoding_name, S1) end, + S); +scan_enc_name([H|T], S0, Delim, Acc) when H >= $a, H =< $z -> + ?bump_col(1), + scan_enc_name2(T, S, Delim, [H|Acc]); +scan_enc_name([H|T], S0, Delim, Acc) when H >= $A, H =< $Z -> + ?bump_col(1), + scan_enc_name2(T, S, Delim, [H|Acc]); +scan_enc_name([H|_T],S,_Delim,_Acc) -> + ?fatal({error,{unexpected_character_in_Enc_Name,H}},S). + +scan_enc_name2([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_enc_name2(MoreBytes, S1, Delim, Acc) end, + fun(S1) -> ?fatal(expected_encoding_name, S1) end, + S); +scan_enc_name2([H|T], S0, H, Acc) -> + ?bump_col(1), + {lists:reverse(Acc), T, S}; +scan_enc_name2([H|T], S0, Delim, Acc) when H >= $a, H =< $z -> + ?bump_col(1), + scan_enc_name2(T, S, Delim, [H|Acc]); +scan_enc_name2([H|T], S0, Delim, Acc) when H >= $A, H =< $Z -> + ?bump_col(1), + scan_enc_name2(T, S, Delim, [H|Acc]); +scan_enc_name2([H|T], S0, Delim, Acc) when H >= $0, H =< $9 -> + ?bump_col(1), + scan_enc_name2(T, S, Delim, [H|Acc]); +scan_enc_name2([H|T], S0, Delim, Acc) when H == $.; H == $_; H == $- -> + ?bump_col(1), + scan_enc_name2(T, S, Delim, [H|Acc]). + + +%%%%%%% [26] VersionNum +%%% VersionNum ::= ([a-zA-Z0-9_.:] | '-')+ +scan_xml_vsn([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_xml_vsn(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_xml_vsn([H|T], S) when H==$"; H==$'-> + xml_vsn(T, S#xmerl_scanner{col = S#xmerl_scanner.col+1}, H, []). + +xml_vsn([], S=#xmerl_scanner{continuation_fun = F}, Delim, Acc) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> xml_vsn(MoreBytes, S1, Delim, Acc) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +xml_vsn([H|T], S=#xmerl_scanner{col = C}, H, Acc) -> + {lists:reverse(Acc), T, S#xmerl_scanner{col = C+1}}; +xml_vsn([H|T], S=#xmerl_scanner{col = C},Delim, Acc) when H >= $a, H =< $z -> + xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]); +xml_vsn([H|T], S=#xmerl_scanner{col = C},Delim, Acc) when H >= $A, H =< $Z -> + xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]); +xml_vsn([H|T], S=#xmerl_scanner{col = C},Delim, Acc) when H >= $0, H =< $9 -> + xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]); +xml_vsn([H|T], S=#xmerl_scanner{col = C}, Delim, Acc) -> + case lists:member(H, "_.:-") of + true -> + xml_vsn(T, S#xmerl_scanner{col = C+1}, Delim, [H|Acc]); + false -> + ?fatal({invalid_vsn_char, H}, S) + end. + +%%%%%%% [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' + +scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Pos) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Pos) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_pi(Str = [H1,H2,H3 | T],S0=#xmerl_scanner{line = L, col = C}, Pos) + when H1==$x;H1==$X -> + %% names beginning with [xX][mM][lL] are reserved for future use. + ?bump_col(3), + if + ((H2==$m) or (H2==$M)) and + ((H3==$l) or (H3==$L)) -> + scan_wellknown_pi(T,S,Pos); + true -> + {Target, _NamespaceInfo, T1, S1} = scan_name(Str, S), + scan_pi(T1, S1, Target, L, C, Pos, []) + end; +scan_pi(Str, S=#xmerl_scanner{line = L, col = C}, Pos) -> + {Target, _NamespaceInfo, T1, S1} = scan_name(Str, S), + scan_pi(T1, S1, Target, L, C, Pos,[]). + + +%%% More info on xml-stylesheet can be found at: +%%% "Associating Style Sheets with XML documents", Version 1.0, +%%% W3C Recommendation 29 June 1999 (http://www.w3.org/TR/xml-stylesheet/) +scan_wellknown_pi("-stylesheet"++T, S0=#xmerl_scanner{line=L,col=C},Pos) -> + ?dbg("prolog(\"<?xml-stylesheet\")~n", []), + ?bump_col(16), + scan_pi(T, S, "xml-stylesheet",L,C,Pos,[]); +scan_wellknown_pi(Str,S,_Pos) -> + ?fatal({invalid_target_name, lists:sublist(Str, 1, 10)}, S). + + + +scan_pi([], S=#xmerl_scanner{continuation_fun = F}, Target,L, C, Pos, Acc) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_pi(MoreBytes, S1, Target, L, C, Pos, Acc) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_pi("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook, + event_fun = Event}, + Target, L, C, Pos, Acc) -> + ?bump_col(2), + PI = #xmlPI{name = Target, + pos = Pos, + value = lists:reverse(Acc)}, + S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended, + line = L, + col = C, + data = PI}, S), + {Ret, S2} = Hook(PI, S1), + {Ret, T, S2}; +scan_pi([H|T], S, Target, L, C, Pos, Acc) when ?whitespace(H) -> + ?strip1, + scan_pi2(T1, S1, Target, L, C, Pos, Acc); +scan_pi([H|_T],S,_Target, _L, _C, _Pos, _Acc) -> + ?fatal({expected_whitespace_OR_end_of_PI,{char,H}}, S). + +scan_pi2([], S=#xmerl_scanner{continuation_fun = F}, Target,L, C, Pos, Acc) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_pi2(MoreBytes, S1, Target, L, C, Pos, Acc) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_pi2("?>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook, + event_fun = Event}, + Target, L, C, Pos, Acc) -> + ?bump_col(2), + PI = #xmlPI{name = Target, + pos = Pos, + value = lists:reverse(Acc)}, + S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended, + line = L, + col = C, + data = PI}, S), + {Ret, S2} = Hook(PI, S1), + {Ret, T, S2}; +scan_pi2(Str, S0, Target, L, C, Pos, Acc) -> + ?bump_col(1), + {Ch,T} = wfc_legal_char(Str,S), + scan_pi2(T, S, Target, L, C, Pos, [Ch|Acc]). + + + +%% [28] doctypedecl ::= +%% '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' +scan_doctype([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_doctype(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_doctype(T, S) -> + {_,T1,S1} = mandatory_strip(T,S), + {DTName, _NamespaceInfo, T2, S2} = scan_name(T1, S1), + ?strip3, + scan_doctype1(T3, S3#xmerl_scanner{doctype_name = DTName}). + + +%% [75] ExternalID ::= 'SYSTEM' S SystemLiteral +%% | 'PUBLIC' S PubidLiteral S SystemLiteral +scan_doctype1([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_doctype1(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_doctype1("PUBLIC" ++ T, S0) -> + ?bump_col(6), + {_,T1,S1} = mandatory_strip(T,S), + {PIDL, T2, S2} = scan_pubid_literal(T1, S1), + {_,T3,S3} = mandatory_strip(T2,S2), + {SL, T4, S4} = scan_system_literal(T3, S3), + ?strip5, + scan_doctype2(T5, S5, {public, PIDL, SL}); +scan_doctype1("SYSTEM" ++ T, S0) -> + ?bump_col(6), + {_,T1,S1} = mandatory_strip(T,S), + {SL, T2, S2} = scan_system_literal(T1, S1), + ?strip3, + scan_doctype2(T3, S3, {system, SL}); +scan_doctype1(T, S) -> + scan_doctype2(T, S, undefined). + + +scan_doctype2([], S=#xmerl_scanner{continuation_fun = F},DTD) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_doctype2(MoreBytes, S1, DTD) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_doctype2("[" ++ T, S0, DTD) -> + ?bump_col(1), + ?strip1, + scan_doctype3(T1, S1, DTD); +scan_doctype2(">" ++ T, S0, DTD) -> + ?bump_col(1), + ?strip1, + S2 = fetch_DTD(DTD, S1), + check_decl(S2), + {T1, S2}; +scan_doctype2(_T,S,_DTD) -> + ?fatal(expected_end_of_DOCTYPE_declaration, S). + +%% [28a] DeclSep ::= PEReference | S +%% [28b] intSubset ::= (markupdecl | DeclSep)* +scan_doctype3([], S=#xmerl_scanner{continuation_fun = F},DTD) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_doctype3(MoreBytes, S1,DTD) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_doctype3("%" ++ T, S0, DTD) -> + ?bump_col(1), + {PERefName, T1, S1} = scan_pe_reference(T, S), + ?strip2, + case expand_pe_reference(PERefName, S2,as_PE) of + {system, _} = Name -> + S3 = fetch_DTD(Name, S2), + check_decl(S3), + scan_doctype3(T2, S3, DTD); + {public, _} = Name -> + S3 = fetch_DTD(Name, S2), + check_decl(S3), + scan_doctype3(T2, S3, DTD); + {public, _, _} = Name -> + S3 = fetch_DTD(Name, S2), + check_decl(S3), + scan_doctype3(T2, S3, DTD); + ExpRef when is_list(ExpRef) -> % Space added, see Section 4.4.8 + {_,T3,S3} = strip(ExpRef++T2,S2), + scan_doctype3(T3,S3,DTD) + end; +scan_doctype3("]" ++ T, S0, DTD) -> + ?bump_col(1), + ?strip1, + S2 = fetch_DTD(DTD, S1), + check_decl(S2), + T2 = scan_mandatory(">",T1,1,S2,expected_doctype_end_tag), +%% ">" ++ T2 = T1, + {T2, S2}; +scan_doctype3(T, S, DTD) -> + {_, T1, S1} = scan_markup_decl(T, S), + scan_doctype3(T1, S1, DTD). + + + +fetch_DTD(undefined, S=#xmerl_scanner{doctype_DTD=URI}) when is_list(URI)-> + %% allow to specify DTD name when it isn't available in xml stream + fetch_DTD({system,URI},S#xmerl_scanner{doctype_DTD=option_provided}); +fetch_DTD(undefined, S) -> + S; +% fetch_DTD(_,S=#xmerl_scanner{validation=false}) -> +% S; +fetch_DTD(DTDSpec, S)-> + case fetch_and_parse(DTDSpec,S,[{text_decl,true}, + {environment,{external,subset}}]) of + NewS when is_record(NewS,xmerl_scanner) -> + NewS; + {_Res,_Tail,_Sx} -> % Continue with old scanner data, result in Rules + S + end. + +fetch_and_parse(ExtSpec,S=#xmerl_scanner{fetch_fun=Fetch, + rules=Rules, + xmlbase = XMLBase}, + Options0) -> + RetS = + case Fetch(ExtSpec, S) of + {ok, NewS} -> + %% For backward compatibility only. This will be removed later!! + NewS; + {ok, not_fetched,NewS} -> + NewS; + {ok, DataRet, NewS = #xmerl_scanner{ + fetch_path=FetchPath, + user_state = UState, + event_fun = Event, + hook_fun = Hook, + fetch_fun = Fetch1, + close_fun = Close1, + continuation_fun = Cont, + acc_fun = Acc, + rules_read_fun = Read, + rules_write_fun = Write, + validation = Valid, + quiet = Quiet, + encoding = Charset + }} -> + EvS = event_state(NewS), + HoS = hook_state(NewS), + FeS = fetch_state(NewS), + CoS = cont_state(NewS), + Options = Options0++[{fetch_path,FetchPath}, + {user_state, UState}, + {rules, Rules}, + {event_fun, Event, EvS}, + {hook_fun, Hook, HoS}, + {fetch_fun, Fetch1, FeS}, + {close_fun, Close1}, + {continuation_fun, Cont, CoS}, + {rules, Read, Write, ""}, + {acc_fun, Acc}, + {validation,Valid}, + {quiet,Quiet}, + {encoding,Charset}], + + case DataRet of + {file, F} -> + int_file_decl(F, Options,Charset); + {string, String} -> + int_string_decl(String, Options,XMLBase,file_name_unknown); + _ -> + %% other scheme + {DataRet,[],NewS} + end; + Error -> + ?fatal({error_fetching_DTD, {ExtSpec, Error}}, S) + end, + case RetS of + #xmerl_scanner{} -> + RetS#xmerl_scanner{text_decl=false, + environment=S#xmerl_scanner.environment}; + _ -> RetS + end. + + +fetch_not_parse(ExtSpec,S=#xmerl_scanner{fetch_fun=Fetch}) -> + case Fetch(ExtSpec,S) of + {ok, not_fetched,_NewS} -> + ?fatal({error_fetching_external_source,ExtSpec},S); + {ok, DataRet, NewS} -> + {String,LocationName} = + case DataRet of + {file,F} -> + {get_file(F,S),F}; + {string,Str} -> + {binary_to_list(Str),file_name_unknown}; + {http,URI} -> + {{http,URI},URI}; + _ -> DataRet + end, + {String, NewS#xmerl_scanner{filename=LocationName}}; + _ -> + ?fatal({error_fetching_external_resource,ExtSpec},S) + end. + +get_file(F,S) -> +% io:format("get_file F=~p~n",[F]), + case file:read_file(F) of + {ok,Bin} -> + binary_to_list(Bin); + Err -> + ?fatal({error_reading_file,F,Err},S) + end. +%% check_decl/1 +%% Now it is necessary to check that all referenced types is declared, +%% since it is legal to reference some xml types before they are +%% declared. +check_decl(#xmerl_scanner{validation=V}) when V =/= dtd -> + ok; +check_decl(#xmerl_scanner{rules=Tab} = S) -> + check_notations(Tab,S), + check_elements(Tab,S), %% check also attribute defs for element + check_entities(Tab,S). + +check_notations(Tab,S) -> + case ets:match(Tab,{{notation,'$1'},undeclared}) of + [[]] -> ok; + [] -> ok; + [L] when is_list(L) -> + ?fatal({error_missing_declaration_in_DTD,hd(L)},S); + Err -> + ?fatal({error_missing_declaration_in_DTD,Err},S) + end. + +check_elements(Tab,S) -> + case catch ets:match(Tab,{{elem_def,'_'},'$2'},10) of + {_,_}=M -> + Fun = fun({Match,'$end_of_table'},_F) -> + lists:foreach(fun(X)->check_elements2(X,S) end, + Match), + ok; + ('$end_of_table',_) -> + ok; + ({Match,Cont},F) -> + lists:foreach(fun(X)->check_elements2(X,S) end, + Match), + F(ets:match(Cont),F) + end, + Fun(M,Fun); + '$end_of_table' -> ok; + Err -> ?fatal({error_missing_declaration_in_DTD,Err},S) + end. + +% it is not an error to declare attributes for an element that is not +% declared. +check_elements2([#xmlElement{attributes=Attrs}],S) -> + check_attributes(Attrs,S); +check_elements2(_,_) -> + ok. + +check_attributes([{N1,'ID',_,_,_}=Attr|Rest],S) -> + case lists:keysearch('ID',2,Rest) of + {value,Att2} -> + ?fatal({error_more_than_one_ID_def,N1,element(1,Att2)},S); + _ -> + ok + end, + vc_ID_Attribute_Default(Attr,S), + check_attributes(Rest,S); +check_attributes([{_,{enumeration,_},_,_,_}=Attr|T],S) -> + vc_Enumeration(Attr,S), + check_attributes(T,S); +check_attributes([{_,Ent,_,_,_}=Attr|T],S) + when Ent=='ENTITY';Ent=='ENTITIES' -> + vc_Entity_Name(Attr,S), + check_attributes(T,S); +check_attributes([_|T],S) -> + check_attributes(T,S); +check_attributes([],_S) -> + ok. + +check_entities(Tab,S=#xmerl_scanner{validation=dtd}) -> + case ets:match(Tab,{{entity,'$1'},undeclared}) of + [[]] -> ok; + [] -> ok; + [L] when is_list(L) -> + ?fatal({error_missing_declaration_in_DTD,hd(L)},S); + Err -> + ?fatal({error_missing_declaration_in_DTD,Err},S) + end; +check_entities(_,_) -> + ok. + + +%% check_decl2/1: checks that all referenced ID attributes are declared +check_decl2(S=#xmerl_scanner{rules=Tab}) -> + check_referenced_ids(Tab,S). + + +check_referenced_ids(Tab,S) -> + case ets:match(Tab,{{id,'$1'},undeclared}) of + [[]] -> ok; + [] -> ok; + [L] when is_list(L) -> + ?fatal({error_missing_declaration_in_DTD,hd(L)},S); + Err -> + ?fatal({error_missing_declaration_in_DTD,Err},S) + end. + +%%%%%%% [30] extSubSet ::= TextDecl? extSubsetDecl + +scan_ext_subset([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_ext_subset(MoreBytes, S1) end, + fun(S1) -> {[], S1} end, + S); +scan_ext_subset("%" ++ T, S0) -> + %% DeclSep [28a]: WFC: PE Between Declarations. + %% The replacement text of a parameter entity reference in a + %% DeclSep must match the production extSubsetDecl. + ?bump_col(1), + {T1,S1} = scan_decl_sep(T,S), + scan_ext_subset(T1, S1); +scan_ext_subset("<![" ++ T, S0) -> + ?bump_col(3), + ?strip1, + {_, T2, S2} = scan_conditional_sect(T1, S1), + scan_ext_subset(T2,S2); +scan_ext_subset(T, S) when ?whitespace(hd(T)) -> + {_,T1,S1} = strip(T,S), + scan_ext_subset(T1, S1); +scan_ext_subset(T, S) -> + {_, T1, S1} = scan_markup_decl(T, S), + scan_ext_subset(T1, S1). + + +%%%%%%% [28a] DeclSep ::= PEReference | S +scan_decl_sep(T,S) -> + {PERefName, T1, S1} = scan_pe_reference(T, S), + {ExpandedRef,S2} = + case expand_pe_reference(PERefName,S1,as_PE) of + Tuple when is_tuple(Tuple) -> + %% {system,URI} or {public,URI} + {ExpRef,_Sx}=fetch_not_parse(Tuple,S1), + {ExpRef,S1}; + ExpRef -> + {ExpRef,S1} + end, + {_,TRef,S3} = strip(ExpandedRef,S2), + {_,S4}=scan_ext_subset(TRef,S3), + {T1,S4}. +% scan_decl_sep(T,S=#xmerl_scanner{rules_read_fun=Read, +% rules_write_fun=Write, +% rules_delete_fun=Delete}) -> +% {PERefName, T1, S1} = scan_pe_reference(T, S), +% {ExpandedRef,S2} = +% case expand_pe_reference(PERefName,S1,as_PE) of +% Tuple when tuple(Tuple) -> +% %% {system,URI} or {public,URI} +% {ExpRef,Sx}=fetch_not_parse(Tuple,S1), +% {EntV,_,_S2} = scan_entity_value(ExpRef, Sx, no_delim, +% PERefName,parameter), +% %% should do an update Write(parameter_entity) so next +% %% expand_pe_reference is faster +% Delete(parameter_entity,PERefName,_S2), +% _S3 = Write(parameter_entity,PERefName,EntV,_S2), +% EntV2 = Read(parameter_entity,PERefName,_S3), +% {" " ++ EntV2 ++ " ",_S3}; +% ExpRef -> +% {ExpRef,S1} +% end, +% {_, T3, S3} = strip(ExpandedRef,S2), +% {_T4,S4} = scan_ext_subset(T3,S3), +% strip(T1,S4). + +%%%%%%% [61] ConditionalSect ::= includeSect | ignoreSect + +scan_conditional_sect([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_conditional_sect(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_conditional_sect("IGNORE" ++ T, S0) -> + ?bump_col(6), + ?strip1, + T2 = scan_mandatory("[",T1,1,S,expected_IGNORE_bracket), +% "[" ++ T2 = T1, + {_,T3,S3} = strip(T2,S1), + scan_ignore(T3,S3); +scan_conditional_sect("INCLUDE" ++ T, S0) -> + ?bump_col(7), + ?strip1, + T2 = scan_mandatory("[",T1,1,S,expected_INCLUDE_bracket), +% "[" ++ T2 = T1, + {_,T3,S3} = strip(T2,S1), + scan_include(T3, S3); +scan_conditional_sect("%"++T,S0) -> + ?bump_col(1), + {PERefName, T1, S1} = scan_pe_reference(T, S), + ExpRef = expand_pe_reference(PERefName, S1,as_PE), + {_,T2,S2} = strip(ExpRef ++ T1,S1), + scan_conditional_sect(T2,S2). + + +%%%% [63] ignoreSect ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>' +%%%% [64] ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>' Ignore)* +%%%% [65] Ignore ::= Char* - (Char* ('<![' | ']]>') Char*) +scan_ignore(Str,S) -> + scan_ignore(Str,S,0). + +scan_ignore([], S=#xmerl_scanner{continuation_fun = F},Level) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_ignore(MoreBytes, S1,Level) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_ignore("<![" ++ T, S0,Level) -> + %% nested conditional section. Topmost condition is ignore, though + ?bump_col(3), + scan_ignore(T, S,Level+1); +scan_ignore("]]>" ++ T, S0,0) -> + ?bump_col(3), + {[], T, S}; +scan_ignore("]]>" ++ T, S0,Level) -> + ?bump_col(3), + scan_ignore(T, S,Level-1); +scan_ignore([_H|T],S0,Level) -> + ?bump_col(1), + scan_ignore(T,S,Level). + + +%%%%%%% [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>' +scan_include([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_include(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_include("]]>" ++ T, S0) -> + ?bump_col(3), + {[], T, S}; +scan_include("%" ++ T, S0) -> + ?bump_col(1), + {PERefName, T1, S1} = scan_pe_reference(T, S), + ExpRef = expand_pe_reference(PERefName, S1,as_PE), + {_,T2,S2} = strip(ExpRef ++ T1,S1), + scan_include(T2, S2); +scan_include("<![" ++ T, S0) -> + ?bump_col(3), + ?strip1, + {_, T2, S2} = scan_conditional_sect(T1, S1), + ?strip3, + scan_include(T3,S3); +scan_include(T, S) -> + {_, T1, S1} = scan_markup_decl(T, S), + scan_include(T1, S1). + + +%%%%%%% [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | +%%%%%%% NotationDecl | PI |Comment +%%%%%%% [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' + +%% Validity constraint: Unique Type Declaration: No element type may be +%% declared more than once. +%% +scan_markup_decl([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_markup_decl(MoreBytes, S1) end, + fun(S1) -> {[], [], S1} end, + S); +scan_markup_decl("<!--" ++ T, S0) -> + ?bump_col(4), + scan_comment(T, S); +scan_markup_decl("<?" ++ T, S0) -> + ?bump_col(2), + {_PI, T1, S1} = scan_pi(T, S,_Pos=markup), + strip(T1, S1); +scan_markup_decl("<!ELEMENT" ++ T, + #xmerl_scanner{rules_read_fun = Read, + rules_write_fun = Write, + rules_delete_fun = Delete} = S0) -> + ?bump_col(9), + {_,T1,S1} = mandatory_strip(T,S), + {Ename, _NamespaceInfo, T2, S2} = scan_name(T1, S1), + Element = + case Read(elem_def, Ename, S2) of + El = #xmlElement{elementdef=Decl} when Decl =/= undeclared -> + case S2#xmerl_scanner.validation of + dtd -> + ?fatal({already_defined, Ename}, S2); + _ -> + Delete(elem_def,Ename,S2), + El + end; + El = #xmlElement{} -> + Delete(elem_def,Ename,S2), + El; + undefined -> + #xmlElement{} + end, + {_,T3,S3} = mandatory_strip(T2,S2), + {Edef, T4, S4} = scan_contentspec(T3, S3), + ?strip5, + {">" ++ T6,S6} = scan_element_completion(T5,S5), + S7 = Write(elem_def, Ename, + Element#xmlElement{name = Ename, + content = Edef, + elementdef=S6#xmerl_scanner.environment}, + S6#xmerl_scanner{col=S6#xmerl_scanner.col+1}), + strip(T6,S7); +scan_markup_decl("<!ENTITY" ++ T, S0) -> + %% <!ENTITY [%] entity.name NDATA notation.name> + %% <!ENTITY [%] entity.name "replacement text"> + %% <!ENTITY [%] entity.name SYSTEM "system.identifier"> + %% <!ENTITY [%] entity.name PUBLIC public.identifier "system.identifier"> + ?bump_col(8), + {_,T1,S1} = mandatory_strip(T,S), + {T2, S2} = scan_entity(T1, S1), + strip(T2,S2); +scan_markup_decl("<!NOTATION" ++ T, S0) -> + %% <!NOTATION notation.name "public.identifier" "helper.application"> + ?bump_col(10), + {_,T1,S1} = mandatory_strip(T,S), + {T2, S2} = scan_notation_decl(T1, S1), + strip(T2,S2); +scan_markup_decl("<!ATTLIST" ++ T, + #xmerl_scanner{rules_read_fun = Read, + rules_write_fun = Write, + rules_delete_fun= Delete} = S0) -> + %% <!ATTLIST Ename ( AttrName Type Value )*> + ?bump_col(9), + {_,T1,S1} = mandatory_strip(T,S), + {Ename, _NamespaceInfo, T2, S2} = scan_name(T1, S1), +% ?strip3, + {Attributes, T4, S4} = scan_attdef(T2, S2), + {EDEF,MergedAttrs} = + case Read(elem_def, Ename, S4) of + undefined -> %% this may happen when the ELEMENT is declared in + %% the external DTD but the ATTLIST in the + %% internal DTD. + {#xmlElement{},update_attributes(Attributes,[])}; + Edef = #xmlElement{attributes = OldAttrs} -> + Delete(elem_def,Ename,S4), + %% the slot in rules table must be empty so that the + %% later write has the assumed effect. Read maybe + %% should empty the table slot. + {Edef,update_attributes(Attributes, OldAttrs)} + end, + NewEdef = EDEF#xmlElement{name=Ename,attributes = MergedAttrs}, + S5 = Write(elem_def, Ename, NewEdef, S4), + T5 = T4, + strip(T5,S5); +scan_markup_decl(_Str,S) -> + ?fatal(expected_markup,S). + +scan_element_completion(T,S) -> + scan_markup_completion_gt(T,S). + +update_attributes(NewAttrs, OldAttrs) -> + update_attributes1(NewAttrs,lists:reverse(OldAttrs)). + +update_attributes1([A = {Name,_Type,_DefaultV,_DefaultD,_Env}|Attrs], + OldAttrs) -> + case lists:keymember(Name, 1, OldAttrs) of + true -> + update_attributes1(Attrs, OldAttrs); + false -> + update_attributes1(Attrs, [A|OldAttrs]) + end; +update_attributes1([],Acc) -> + lists:reverse(Acc). + + +%%%%%%% [53] AttDef + +scan_attdef([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_attdef(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_attdef(T, S) -> + scan_attdef(T, S, _AttrAcc = []). + + +scan_attdef([], S=#xmerl_scanner{continuation_fun = F}, Attrs) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_attdef(MoreBytes, S1, Attrs) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_attdef(">" ++ T, S0, Attrs) -> + ?bump_col(1), + {lists:reverse(Attrs), T, S}; +scan_attdef("%" ++ _T, S=#xmerl_scanner{environment=prolog}, _Attrs) -> + ?fatal({error,{wfc_PEs_In_Internal_Subset}},S); +scan_attdef("%" ++ T, S0, Attrs) -> + ?bump_col(1), + {PERefName, T1, S1} = scan_pe_reference(T, S), + ExpRef = expand_pe_reference(PERefName, S1,as_PE), + {_,T2,S2} = strip(ExpRef ++ T1,S1), + scan_attdef(T2, S2, Attrs); +scan_attdef(T,S,Attrs) -> + {_,T1,S1} = mandatory_strip(T,S), + scan_attdef2(T1,S1,Attrs). + +scan_attdef2(">" ++ T, S0, Attrs) -> + ?bump_col(1), + {lists:reverse(Attrs), T, S}; +scan_attdef2("%" ++ _T, S=#xmerl_scanner{environment=prolog}, _Attrs) -> + ?fatal({error,{wfc_PEs_In_Internal_Subset}},S); +scan_attdef2("%" ++ T, S0, Attrs) -> + ?bump_col(1), + {PERefName, T1, S1} = scan_pe_reference(T, S), + ExpRef = expand_pe_reference(PERefName, S1,as_PE), + {_,T2,S2} = strip(ExpRef ++ T1,S1), + scan_attdef2(T2, S2, Attrs); +scan_attdef2(T, S, Attrs) -> + {AttName, _NamespaceInfo, T1, S1} = scan_name(T, S), + {_,T2,S2} = mandatory_strip(T1,S1), + {AttType, T3, S3} = scan_att_type(T2, S2), + {_,T4,S4} = mandatory_strip(T3,S3), + {{DefaultDecl,DefaultValue}, T5, S5} = scan_default_decl(T4, S4, AttType), + ?strip6, + Attr = {AttName, AttType,DefaultValue,DefaultDecl, + S#xmerl_scanner.environment}, + scan_attdef2(T6, S6, [Attr|Attrs]). + + +%%% [54] StringType +scan_att_type([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_att_type(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_att_type("CDATA" ++ T, S0) -> + ?bump_col(5), + {'CDATA', T, S}; +%%% [55] TokenizedType +scan_att_type("IDREFS" ++ T, S0) -> + ?bump_col(6), + {'IDREFS', T, S}; +scan_att_type("IDREF" ++ T, S0) -> + ?bump_col(5), + {'IDREF', T, S}; +scan_att_type("ID" ++ T, S0) -> + ?bump_col(2), + {'ID', T, S}; +scan_att_type("ENTITY" ++ T, S0) -> + ?bump_col(6), + {'ENTITY', T, S}; +scan_att_type("ENTITIES" ++ T, S0) -> + ?bump_col(8), + {'ENTITIES', T, S}; +scan_att_type("NMTOKENS" ++ T, S0) -> + ?bump_col(8), + {'NMTOKENS', T, S}; +scan_att_type("NMTOKEN" ++ T, S0) -> + ?bump_col(7), + {'NMTOKEN', T, S}; +%%% [57] EnumeratedType +scan_att_type("NOTATION" ++ T, S0) -> + ?bump_col(8), + {_,T1,S1} = mandatory_strip(T,S), + T2 = scan_mandatory("(",T1,1,S1,expected_parenthesis_after_NOTATION), +% "(" ++ T2 = T1, + S2 = S1, + ?strip3, + {Name, _NamespaceInfo, T4, S4} = scan_name(T3, S3), + notation_exists(Name, S4), + ?strip5, + scan_notation_type(T5, S5, [Name]); +scan_att_type("(" ++ T, S0) -> + ?bump_col(1), + ?strip1, + {NmToken, _NamespaceInfo, T2, S2} = scan_nmtoken(T1, S1), + ?strip3, + scan_enumeration(T3, S3, [NmToken]); +scan_att_type("%" ++ _T, S=#xmerl_scanner{environment=prolog}) -> + ?fatal({error,{wfc_PEs_In_Internal_Subset}},S); +scan_att_type("%" ++ T, S0) -> + ?bump_col(1), + {PERefName, T1, S1} = scan_pe_reference(T, S), + ExpRef = expand_pe_reference(PERefName, S1,in_literal), + {ExpRef,T1,S1}. + +%%% [58] NotationType + +scan_notation_type([], S=#xmerl_scanner{continuation_fun = F}, Acc) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_notation_type(MoreBytes, S1, Acc) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_notation_type(")" ++ T, S0, Acc) -> + ?bump_col(1), + {{notation, lists:reverse(Acc)}, T, S}; +scan_notation_type("|" ++ T, S0, Acc) -> + ?bump_col(1), + ?strip1, + {Name, _NamespaceInfo, T2, S2} = scan_name(T1, S1), + notation_exists(Name, S2), + ?strip3, + scan_notation_type(T3, S3, [Name | Acc]). + +%%% Validity constraint for NotationType: +%%% The used notation names must be declared in the DTD, but they may +%%% be declared later. +notation_exists(Name, #xmerl_scanner{rules_read_fun = Read, + rules_write_fun = Write } = S) -> + case Read(notation, Name, S) of + undefined -> + %% this is legal, since the referenced NOTATION + %% may be declared later in internal or external + %% subset. + Write(notation,Name,undeclared,S); + _Value -> + ok + end. + +%%% [59] Enumeration + +scan_enumeration([], S=#xmerl_scanner{continuation_fun = F}, Acc) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_enumeration(MoreBytes, S1, Acc) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_enumeration(")" ++ T, S0, Acc) -> + ?bump_col(1), + {{enumeration, lists:reverse(Acc)}, T, S}; +scan_enumeration("|" ++ T, S0, Acc) -> + ?bump_col(1), + ?strip1, + {NmToken, _NamespaceInfo, T2, S2} = scan_nmtoken(T1, S1), + ?strip3, + scan_enumeration(T3, S3, [NmToken|Acc]). + + +%%%%%%% [60] DefaultDecl + +scan_default_decl([], S=#xmerl_scanner{continuation_fun = F}, Type) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_default_decl(MoreBytes, S1, Type) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_default_decl("#REQUIRED" ++ T, S0, _Type) -> + ?bump_col(9), + {{'#REQUIRED',no_value}, T, S}; +scan_default_decl("#IMPLIED" ++ T, S0, _Type) -> + ?bump_col(8), + {{'#IMPLIED',no_value}, T, S}; +scan_default_decl("#FIXED" ++ T, S0, Type) -> + ?bump_col(6), + {_,T1,S1} = mandatory_strip(T,S), + {Value,T2,S2,_} = default_value(T1, S1, Type), + {{'#FIXED',Value},T2,S2}; +scan_default_decl(Str, S, Type) -> + {Value,T1,S1,_} = default_value(Str, S, Type), + {{no_decl,Value},T1,S1}. + + +%% There is room here to validate against Type, but we don't do it at +%% the moment. +default_value(T, S, Type) -> + {_Val, _T1, _S1,_} = scan_att_value(T, S, Type). + + +%%%%%%% [71] EntityDef + +scan_entity([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_entity(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_entity("%" ++ T, #xmerl_scanner{rules_write_fun = Write} = S0) -> + %% parameter entity + ?bump_col(1), + {_,T1,S1} = mandatory_strip(T,S), + {PEName, _NamespaceInfo, T2, S2} = scan_name_no_colons(T1, S1), + {_,T3,S3} = mandatory_strip(T2,S2), + {PEDef, T4, S4} = scan_pe_def(T3, S3, PEName), + ?strip5, + {">" ++ T6,S6} = scan_entity_completion(T5,S5), + S7 = Write(parameter_entity, PEName, PEDef, S6), + {T6, S7}; +scan_entity(T, #xmerl_scanner{rules_write_fun = Write, + rules_read_fun = Read, + rules_delete_fun = Delete} = S) -> + %% generic entity + {EName, _NamespaceInfo, T1, S1} = scan_name_no_colons(T, S), + {_,T2,S2} = mandatory_strip(T1,S1), + {EDef, EntType, T3, S3} = scan_entity_def(T2, S2, EName), + check_entity_recursion(EName,S3), + ?strip4, + {">" ++ T5,S5} = scan_entity_completion(T4,S4), + case Read(entity,EName,S5) of + undeclared -> Delete(entity,EName,S5); + _ -> ok + end, + S6 = Write(entity, EName, {S5#xmerl_scanner.environment,EntType,EDef}, S5), + {T5, S6}. + +scan_entity_completion(T,S) -> + scan_markup_completion_gt(T,S). + +%%%%%%% [73] EntityDef + +scan_entity_def([], S=#xmerl_scanner{continuation_fun = F}, EName) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_entity_def(MoreBytes, S1, EName) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_entity_def("'" ++ T, S0, EName) -> + ?bump_col(1), + {EVal,T1,S1}=scan_entity_value(T, S, $', EName,general), + {EVal,internal,T1,S1}; +scan_entity_def("\"" ++ T, S0, EName) -> + ?bump_col(1), + {EVal,T1,S1}=scan_entity_value(T, S, $", EName,general), + {EVal,internal,T1,S1}; +%% external general entity, parsed or unparsed. +scan_entity_def(Str, S, EName) -> + {ExtID, T1, S1} = scan_external_id(Str, S), + {NData, T2, S2} = scan_ndata_decl(T1, S1), + case NData of + {ndata,_} -> + %% if NDATA exists it is an unparsed ENTITY + {{ExtID,NData},external,T2,S2}; + _ -> + case fetch_and_parse(ExtID,S2, + [{text_decl,true}, + {environment,{external,{entity,EName}}}]) of + {{_USret,Entity},_Tail,_Sx} -> + {Entity, external,T2, S2}; + {Entity,_Tail,Sx} -> + OldRef=S2#xmerl_scanner.entity_references, + NewRef=Sx#xmerl_scanner.entity_references, + {Entity,external,T2, + S2#xmerl_scanner{entity_references=OldRef++NewRef}}; + {error,enoent} -> % this bad entity is declared, + % but it may not be referenced, + % then it would not be an + % error. + {{error,enoent},external,T2,S2} + end + end. + + +scan_ndata_decl([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_ndata_decl(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_ndata_decl(Str = ">"++_T, S) -> + {[], Str, S}; +scan_ndata_decl(T, S) -> + {_,T1,S1} = mandatory_strip(T,S), + scan_ndata_decl2(T1,S1). +scan_ndata_decl2(Str = ">"++_T,S) -> + {[], Str, S}; +scan_ndata_decl2("NDATA" ++ T,S0 = #xmerl_scanner{rules_read_fun = Read, + rules_write_fun = Write}) -> + ?bump_col(5), + {_,T1,S1} = mandatory_strip(T,S), + {Name, _NamespaceInfo, T2, S2} = scan_name(T1, S1), + case Read(notation, Name, S2) of + undefined -> %% this is legal, since the referenced NOTATION + %% may be declared later in internal or external + %% subset. + Write(notation,Name,undeclared,S2), + {{ndata,Name},T2,S2}; + _Value -> + {{ndata, Name}, T2, S2} + end. + +%%%%%%% [39] element + +scan_element(T, S, Pos) -> + scan_element(T, S, Pos, S#xmerl_scanner.space, + _Lang = [], _Parents = [], #xmlNamespace{}). + +scan_element(T, S=#xmerl_scanner{line=L,col=C}, + Pos, SpaceDefault,Lang, Parents, NS) -> + {Name, NamespaceInfo, T1, S1} = scan_name(T, S), + vc_Element_valid(Name,S), + ?strip2, + scan_element(T2, S2, Pos, Name, L, C, _Attrs = [], + Lang, Parents, NamespaceInfo, NS, + SpaceDefault). + + +scan_element("/", S=#xmerl_scanner{continuation_fun = F}, + Pos, Name, StartL, StartC, Attrs, Lang, Parents, + NSI, NS, SpaceDefault) -> + ?dbg("trailing / detected~n", []), + F(fun(MoreBytes, S1) -> scan_element("/" ++ MoreBytes, S1, + Pos, Name, StartL, StartC, Attrs, + Lang,Parents,NSI,NS,SpaceDefault) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_element([], S=#xmerl_scanner{continuation_fun = F}, + Pos, Name, StartL, StartC, Attrs, Lang, Parents, + NSI, NS, SpaceDefault) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_element(MoreBytes, S1, + Pos, Name, StartL, StartC, Attrs, + Lang,Parents,NSI,NS,SpaceDefault) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_element("/>" ++ T, S0 = #xmerl_scanner{hook_fun = Hook, + event_fun = Event, + line = L, col = C, + xmlbase_cache=XMLBase}, Pos, + Name, _StartL, _StartC, Attrs0, Lang, Parents, NSI, + Namespace, _SpaceDefault) -> + ?bump_col(2), + Attrs = lists:reverse(Attrs0), + E=processed_whole_element(S, Pos, Name, Attrs, Lang, Parents,NSI,Namespace), + + wfc_unique_att_spec(Attrs,S), + S1 = #xmerl_scanner{} = Event(#xmerl_event{event = ended, + line = L, + col = C, + data = E}, S0), + {Ret, S2} = Hook(E, S1), + S2b=S2#xmerl_scanner{xmlbase=XMLBase}, + {Ret, T, S2b}; +scan_element(">", S=#xmerl_scanner{continuation_fun = F}, + Pos, Name, StartL, StartC, Attrs, Lang, Parents, + NSI, NS, SpaceDefault) -> + ?dbg("trailing > detected~n", []), + F(fun(MoreBytes, S1) -> scan_element(">" ++ MoreBytes, S1, + Pos, Name, StartL, StartC, Attrs, + Lang,Parents,NSI,NS,SpaceDefault) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_element(">" ++ T, S0 = #xmerl_scanner{event_fun = Event, + hook_fun = Hook, + line = L, col = C, + xmlbase_cache=XMLBase, + space = SpaceOption}, + Pos, Name, StartL, StartC, Attrs0, Lang, Parents, + NSI, Namespace, SpaceDefault) -> + ?bump_col(1), + Attrs = lists:reverse(Attrs0), + wfc_unique_att_spec(Attrs,S), + XMLSpace = case lists:keysearch('xml:space', #xmlAttribute.name, Attrs) of + false -> SpaceDefault; + {value, #xmlAttribute{value="default"}} -> SpaceOption; + {value, #xmlAttribute{value="preserve"}} -> preserve; + _ -> SpaceDefault + end, + + E0=processed_whole_element(S,Pos,Name,Attrs,Lang,Parents,NSI,Namespace), + S1 = #xmerl_scanner{} = Event(#xmerl_event{event = started, + line = StartL, + col = StartC, + data = E0}, S), + + {Content, T1, S2} = scan_content(T, S1, Name, Attrs, XMLSpace, + E0#xmlElement.language, + [{Name, Pos}|Parents], Namespace), + + Element=E0#xmlElement{content=Content, + xmlbase=E0#xmlElement.xmlbase}, + S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended, + line = L, + col = C, + data = Element}, S2), + {Ret, S4} = Hook(Element, S3), + S4b=S4#xmerl_scanner{xmlbase=XMLBase}, + {Ret, T1, S4b}; +scan_element(T, S, Pos, Name, StartL, StartC, Attrs, Lang, Parents, + NSI, NS, SpaceDefault) -> + {AttName, NamespaceInfo, T1, S1} = scan_name(T, S), + {T2, S2} = scan_eq(T1, S1), + {AttType,_DefaultDecl} = get_att_type(S2,AttName,Name), + {AttValue, T3, S3,IsNorm} = scan_att_value(T2, S2, AttType), +%% check_default_value(S3,DefaultDecl,AttValue), + NewNS = check_namespace(AttName, NamespaceInfo, AttValue, NS), + wfc_whitespace_betw_attrs(hd(T3),S3), + ?strip4, + AttrPos = case Attrs of + [] -> + 1; + [#xmlAttribute{pos = P}|_] -> + P+1 + end, + Attr = #xmlAttribute{name = AttName, + pos = AttrPos, + language = Lang, + namespace = NamespaceInfo, + value = AttValue, + normalized = IsNorm}, + XMLBase=if + AttName=='xml:base' -> + resolve_relative_uri(AttValue,S4#xmerl_scanner.xmlbase); + true -> + S4#xmerl_scanner.xmlbase + end, + + #xmerl_scanner{event_fun = Event, + line = Line, + col = Col} = S4, + S5 = Event(#xmerl_event{event = ended, + line = Line, + col = Col, + data = Attr}, + S4#xmerl_scanner{xmlbase=XMLBase, + xmlbase_cache=S#xmerl_scanner.xmlbase}), + scan_element(T4, S5, Pos, Name, StartL, StartC, [Attr|Attrs], + Lang, Parents, NSI, NewNS, SpaceDefault). + +get_att_type(S=#xmerl_scanner{rules_read_fun=Read},AttName,ElemName) -> + case Read(elem_def,ElemName,S) of + #xmlElement{attributes = Attrs} -> + case lists:keysearch(AttName,1,Attrs) of + {value,{_,AttType,_,DefaultDecl,_}} -> + {AttType,DefaultDecl}; + _ -> {'CDATA',no_value} %% undefined attribute shall be treated as CDATA + end; + _ -> {'CDATA',no_value} + end. + +resolve_relative_uri(NewBase="/"++_,CurrentBase) -> + case xmerl_uri:parse(CurrentBase) of + {error,_Reason} -> + NewBase; + {Scheme,Host,Port,_Path,_Query} -> + atom_to_list(Scheme)++Host++":"++integer_to_list(Port)++NewBase + end; +resolve_relative_uri(NewBase,CurrentBase) -> + filename:join(CurrentBase,NewBase). + + +processed_whole_element(S=#xmerl_scanner{hook_fun = _Hook, + xmlbase = XMLBase, + line = _L, col = _C, + event_fun = _Event}, + Pos, Name, Attrs, Lang, Parents, NSI, Namespace) -> + Language = check_language(Attrs, Lang), + + {ExpName, ExpAttrs} = + case S#xmerl_scanner.namespace_conformant of + true -> + %% expand attribute names. We need to do this after having + %% scanned all attributes of the element, since (as far as + %% I can tell), XML Names only specifies that namespace attrs + %% are valid within the whole scope of the element in which + %% they are declared, which should also mean that even if they + %% are declared after some other attributes, the namespace + %% should apply to those attributes as well. + %% Note that the default URI does not apply to attrbute names. + TempNamespace = Namespace#xmlNamespace{default = []}, + ExpAttrsX = + [A#xmlAttribute{ + expanded_name=expanded_name( + A#xmlAttribute.name, + A#xmlAttribute.namespace, + % NSI, + TempNamespace, S)} || A <- Attrs], + {expanded_name(Name, NSI, Namespace, S), ExpAttrsX}; + false -> + {Name, Attrs} + end, + + #xmlElement{name = Name, + xmlbase = XMLBase, + pos = Pos, + parents = Parents, + attributes = ExpAttrs, + language = Language, + expanded_name = ExpName, + nsinfo = NSI, + namespace = Namespace}. + + +check_language([#xmlAttribute{name='xml:lang',value=Lang}|_], _) -> + Lang; +check_language([_|T], Lang) -> + check_language(T, Lang); +check_language([], Lang) -> + Lang. + + +check_namespace(xmlns, _, Value, NS) -> + NS#xmlNamespace{default = list_to_atom(Value)}; +check_namespace(_, {"xmlns", Prefix}, Value, + NS = #xmlNamespace{nodes = Ns}) -> + NS#xmlNamespace{nodes = keyreplaceadd( + Prefix, 1, Ns, {Prefix, list_to_atom(Value)})}; +check_namespace(_, _, _, NS) -> + NS. + + +expanded_name(Name, [], #xmlNamespace{default = []}, _S) -> + Name; +expanded_name(Name, [], #xmlNamespace{default = URI}, _S) -> + {URI, Name}; +expanded_name(_Name, {"xmlns", Local}, _NS, _S) -> % CHECK THIS /JB + {"xmlns",Local}; +expanded_name(_Name, {Prefix, Local}, #xmlNamespace{nodes = Ns}, S) -> + case lists:keysearch(Prefix, 1, Ns) of + {value, {_, URI}} -> + {URI, list_to_atom(Local)}; + false -> + %% A namespace constraint of XML Names is that the prefix + %% must be declared + ?fatal({namespace_prefix_not_declared, Prefix}, S) + end. + + + + +keyreplaceadd(K, Pos, [H|T], Obj) when K == element(Pos, H) -> + [Obj|T]; +keyreplaceadd(K, Pos, [H|T], Obj) -> + [H|keyreplaceadd(K, Pos, T, Obj)]; +keyreplaceadd(_K, _Pos, [], Obj) -> + [Obj]. + +%%%%%%% [10] AttValue +%% normalize the attribute value according to XML 1.0 section 3.3.3 + +scan_att_value([], S=#xmerl_scanner{continuation_fun = F},AT) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_att_value(MoreBytes, S1, AT) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_att_value("%"++_T,S=#xmerl_scanner{environment=prolog},_AttType) -> + ?fatal({error,{wfc_PEs_In_Internal_Subset}},S); +scan_att_value("%"++T,S0=#xmerl_scanner{rules_read_fun=Read, + rules_write_fun=Write, + rules_delete_fun=Delete},AttType) -> + ?bump_col(1), + {Name,T1,S1} = scan_pe_reference(T,S), + {ExpandedRef,S2} = + case expand_pe_reference(Name,S1,in_literal) of + Tuple when is_tuple(Tuple) -> + %% {system,URI} or {public,URI} + %% Included in literal, just get external file. + {ExpRef,Sx}=fetch_not_parse(Tuple,S1), + {EntV,_,_S2} = scan_entity_value(ExpRef, Sx, no_delim, + Name,parameter), + %% should do an update Write(parameter_entity) so next + %% expand_pe_reference is faster + Delete(parameter_entity,Name,_S2), + _S3 = Write(parameter_entity,Name,EntV,_S2), + EntV2 = Read(parameter_entity,Name,_S3), + {EntV2,_S3}; + ExpRef -> + {ExpRef,S1} + end, + {_,T2,S3} = strip(ExpandedRef ++ T1,S2), + scan_att_value(T2,S3,AttType); +scan_att_value([H|T], S0,'CDATA'=AT) when H == $"; H == $' -> + ?bump_col(1), + scan_att_chars(T, S, H, [],[], AT,false); +scan_att_value([H|T], S0,AttType) when H == $"; H == $' -> + ?bump_col(1), + {T1,S1,IsNorm} = normalize(T,S,false), + scan_att_chars(T1, S1, H, [],[], AttType,IsNorm). + +scan_att_chars([],S=#xmerl_scanner{continuation_fun=F},H,Acc,TmpAcc,AT,IsNorm)-> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> + scan_att_chars(MoreBytes, S1, H, Acc,TmpAcc,AT,IsNorm) + end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_att_chars([H|T], S0, H, Acc, TmpAcc,AttType,IsNorm) -> % End quote + ?bump_col(1), + check_att_default_val(S#xmerl_scanner.validation,TmpAcc,AttType,S), + {Acc2,S2,IsNorm2} = + if + AttType == 'CDATA' -> {Acc,S,IsNorm}; + true -> + normalize(Acc,S,IsNorm) + end, + {lists:reverse(Acc2), T, S2,IsNorm2}; +scan_att_chars("&" ++ T, S0, Delim, Acc, TmpAcc,AT,IsNorm) -> % Reference + ?bump_col(1), + {ExpRef, T1, S1} = scan_reference(T, S), + case markup_delimeter(ExpRef) of + true -> + scan_att_chars(T1,S1,Delim,[ExpRef|Acc],[ExpRef|TmpAcc],AT,IsNorm); + _ -> + scan_att_chars(string_to_char_set(S#xmerl_scanner.encoding,ExpRef) + ++ T1, S1, Delim, Acc,TmpAcc, AT,IsNorm) + end; +scan_att_chars("<" ++ _T, S0, _Delim, _Acc,_, _,_) -> % Tags not allowed here + ?fatal(unexpected_char, S0); +scan_att_chars([H|T], S0, Delim, Acc, _TmpAcc,'CDATA',IsNorm) + when ?whitespace(H) -> + ?bump_col(1), + scan_att_chars(T, S, Delim, [$\s|Acc], [],'CDATA',IsNorm); +scan_att_chars([H|T], S0, Delim, Acc, TmpAcc,AT,IsNorm) + when ?whitespace(H) -> + ?bump_col(1), + {T1,S1,IsNorm2} = normalize(T,S,IsNorm), + check_att_default_val(S#xmerl_scanner.validation,TmpAcc,AT,S1), + scan_att_chars(T1, S1, Delim, [$\s|Acc],[], AT,IsNorm2); +scan_att_chars(Str, S0, Delim, Acc, TmpAcc,AT,IsNorm) -> + ?bump_col(1), + {Ch,T} = to_ucs(S#xmerl_scanner.encoding,Str), + valid_Char(S#xmerl_scanner.validation,AT,Ch,S), + scan_att_chars(T, S, Delim, [Ch|Acc], [Ch|TmpAcc],AT,IsNorm). + +markup_delimeter("&")-> true; +markup_delimeter("\"") -> true; +markup_delimeter("\'") -> true; +markup_delimeter("<") -> true; +markup_delimeter(">") -> true; +markup_delimeter("%") -> true; +markup_delimeter(_) -> false. + +check_att_default_val(dtd,[],_Ent,_S) -> + ok; +check_att_default_val(dtd,RevName,Ent,S) -> + check_att_default_val(lists:reverse(RevName),Ent,S); +check_att_default_val(_,_,_,_) -> + ok. + +check_att_default_val(Name,Ent,S=#xmerl_scanner{rules_write_fun=Write}) + when Ent == 'ENTITY'; Ent == 'ENTITIES' -> + case xmerl_lib:is_letter(hd(Name)) of + true -> ok; + _ -> ?fatal({illegal_first_character,Ent,Name},S) + end, + SName = list_to_atom(Name), + Write(entity,SName,undeclared,S); +check_att_default_val(Name,IDR,S=#xmerl_scanner{rules_write_fun=Write}) + when IDR == 'IDREF'; IDR == 'IDREFS' -> + case xmerl_lib:is_letter(hd(Name)) of + true -> ok; + _ -> ?fatal({illegal_first_character,IDR,Name},S) + end, + SName = list_to_atom(Name), + Write(id,SName,undeclared,S); +check_att_default_val(Name,'ID',S=#xmerl_scanner{rules_write_fun=Write, + rules_read_fun=Read, + rules_delete_fun=Delete}) -> + case xmerl_lib:is_name(Name) of + false -> + ?fatal({'ID_names_must_be_Name_production',Name},S); + _ -> + ok + end, + SName = if + is_list(Name) -> list_to_atom(Name); + true -> Name + end, + case Read(id,SName,S) of + undeclared -> %% was referenced in IDREF/IDREFS before defined + Delete(id,SName,S); + SName -> ?fatal({values_must_be_unique,'ID',SName},S); + undefined -> ok + end, + Write(id,SName,SName,S); +check_att_default_val(_,_,_) -> + ok. + +valid_Char(dtd,AT,C,S) when AT=='NMTOKEN';AT=='NMTOKENS' -> + vc_Valid_Char(AT,C,S); +valid_Char(_,_,[C],S) -> + case xmerl_lib:is_char(C) of + true -> + ok; + false -> + ?fatal({unexpected_char,C}, S) + end; +valid_Char(_,_,C,S) -> + case xmerl_lib:is_char(C) of + true -> + ok; + false -> + ?fatal({unexpected_char,C}, S) + end. + + + +%%%%%%% [43] content + +scan_content(T, S, Name, Attrs, Space, Lang, Parents, NS) -> + scan_content(T, S, _Pos = 1, Name, Attrs, Space, + Lang, Parents, NS, _Acc = [],_MarkupDel=[]). + +scan_content("<", S= #xmerl_scanner{continuation_fun = F}, + Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,_) -> + ?dbg("trailing < detected~n", []), + F(fun(MoreBytes, S1) -> scan_content("<" ++ MoreBytes, S1, + Pos, Name, Attrs, + Space, Lang, Parents, NS, Acc,[]) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_content([], S=#xmerl_scanner{environment={external,{entity,_}}}, + _Pos, _Name, _Attrs, _Space, _Lang, _Parents, _NS, Acc,_) -> + {lists:reverse(Acc),[],S}; +scan_content([], S=#xmerl_scanner{environment=internal_parsed_entity}, + _Pos, _Name, _Attrs, _Space, _Lang, _Parents, _NS, Acc,_) -> + {lists:reverse(Acc),[],S}; +scan_content([], S=#xmerl_scanner{continuation_fun = F}, + Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,_) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_content(MoreBytes, S1, + Pos, Name, Attrs, + Space, Lang, Parents, NS, Acc,[]) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_content("</" ++ T, S0, _Pos, Name, _Attrs, _Space, _Lang, + _Parents, _NS, Acc,[]) -> + ?bump_col(2), + {ETagName, _NamespaceInfo, T1, S1} = scan_name(T, S), + if ETagName == Name -> + ok; + true -> + ?fatal({endtag_does_not_match, {was,ETagName,should_have_been, Name}}, S) + end, + ?strip2, + case T2 of + ">" ++ T3 -> + {lists:reverse(Acc), T3, S2}; + _ -> + ?fatal({error,{unexpected_end_of_STag}},S) + end; +scan_content([$&|_T]=Str, + #xmerl_scanner{environment={external,{entity,EName}}} = S0, + Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,_) -> + {_EntV,T1,S1}=scan_entity_value(Str,S0 ,[],EName,general), + %%This is a problem. All referenced entities in the external entity must be checked for recursion, thus parse the contentbut,skip result. + scan_content(T1,S1,Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]); +scan_content("&"++T, + #xmerl_scanner{environment=internal_parsed_entity} = S, + Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,_) -> + {_, T1, S1} = scan_reference(T, S), + scan_content(T1,S1,Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]); +scan_content("&" ++ T, S0, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]) -> + ?bump_col(1), + {ExpRef, T1, S1} = scan_reference(T, S), + case markup_delimeter(ExpRef) of + true -> scan_content(ExpRef++T1,S1,Pos,Name,Attrs,Space,Lang,Parents,NS,Acc,ExpRef); + _ -> + scan_content(string_to_char_set(S1#xmerl_scanner.encoding,ExpRef)++T1,S1,Pos,Name,Attrs,Space,Lang,Parents,NS,Acc,[]) + end; +scan_content("<!--" ++ T, S, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]) -> + {_, T1, S1} = scan_comment(T, S, Pos, Parents, Lang), + scan_content(T1, S1, Pos+1, Name, Attrs, Space, Lang, Parents, NS, Acc,[]); +scan_content("<" ++ T, S0, Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]) -> + ?bump_col(1), + {Markup, T1, S1} = + scan_content_markup(T, S, Pos, Name, Attrs, Space, Lang, Parents, NS), + AccF = S1#xmerl_scanner.acc_fun, + {NewAcc, NewPos, NewS} = case AccF(Markup, Acc, S1) of + {Acc2, S2} -> + {Acc2, Pos+1, S2}; + {Acc2, Pos2, S2} -> + {Acc2, Pos2, S2} + end, + scan_content(T1, NewS, NewPos, Name, Attrs, Space, Lang, + Parents, NS, NewAcc,[]); +scan_content([_H|T], S= #xmerl_scanner{environment={external,{entity,_}}}, + Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,_) -> + %% Guess we have to scan the content to find any internal entity + %% references. + scan_content(T,S,Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,[]); +scan_content(T, S=#xmerl_scanner{acc_fun = F, + event_fun = Event, + hook_fun=Hook, + line = _L}, + Pos, Name, Attrs, Space, Lang, Parents, NS, Acc,MarkupDel) -> + Text0 = #xmlText{pos = Pos, + parents = Parents}, + S1 = #xmerl_scanner{} = Event(#xmerl_event{event = started, + line = S#xmerl_scanner.line, + data = Text0}, S), + {Data, T1, S2} = scan_char_data(T, S1, Space,MarkupDel), + Text = Text0#xmlText{value = Data}, + {Ret,S2b} = Hook(Text,S2), + S3 = #xmerl_scanner{} = Event(#xmerl_event{event = ended, + line = S2b#xmerl_scanner.line, + data = Ret}, S2b), + {NewAcc, NewPos, NewS} = case F(Ret, Acc, S3) of + {Acc4, S4} -> + {Acc4, Pos+1, S4}; + {Acc4, Pos4, S4} -> + {Acc4, Pos4, S4} + end, + scan_content(T1, NewS, NewPos, Name, Attrs, Space, Lang, + Parents, NS, NewAcc,[]). + + +scan_content_markup([], S=#xmerl_scanner{continuation_fun = F}, + Pos, Name, Attrs, Space, Lang, Parents, NS) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_content_markup( + MoreBytes,S1,Pos,Name, + Attrs,Space,Lang,Parents,NS) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_content_markup("![CDATA[" ++ T, S0, Pos, _Name, _Attrs, + _Space, _Lang, Parents, _NS) -> + ?bump_col(8), + scan_cdata(T, S, Pos, Parents); +scan_content_markup("?"++T,S0,Pos,_Name,_Attrs,_Space,_Lang,_Parents,_NS) -> + ?bump_col(1), + scan_pi(T, S, Pos); +scan_content_markup(T, S, Pos, _Name, _Attrs, Space, Lang, Parents, NS) -> + scan_element(T, S, Pos, Space, Lang, Parents, NS). + +scan_char_data(T, S, Space,MUD) -> + scan_char_data(T, S, Space,MUD, _Acc = []). + +%%%%%%% [14] CharData + +scan_char_data([], S=#xmerl_scanner{environment={external,{entity,_}}}, + _Space,_MUD, Acc) -> + + {lists:reverse(Acc), [], S}; +scan_char_data([], S=#xmerl_scanner{environment=internal_parsed_entity}, + _Space, _MUD,Acc) -> + + {lists:reverse(Acc), [], S}; +scan_char_data([], S=#xmerl_scanner{continuation_fun = F}, Space, _MUD,Acc) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_char_data(MoreBytes,S1,Space,_MUD,Acc) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_char_data([$&|T], S,Space,"&",Acc) -> + scan_char_data(T, S, Space,[], [$&|Acc]); +scan_char_data(T=[$&|_], S,_Space,_MUD,Acc) -> + + {lists:reverse(Acc), T, S}; +scan_char_data("]]>" ++ _T, S, _Space,_MUD, _Acc) -> + %% See Section 2.4: Especially: + %% "The right angle bracket (>) MAY be represented using the string ">", + %% and MUST, for compatibility, be escaped using either ">" or a + %% character reference when it appears in the string "]]>" in content, when + %% that string is not marking the end of a CDATA section. + ?fatal(unexpected_cdata_end, S); +scan_char_data([$<|T],S,Space,"<", Acc) -> + scan_char_data(T, S, Space,[], [$<|Acc]); +scan_char_data(T = [$<|_], S, _Space,_MUD,Acc) -> + + {lists:reverse(Acc), T, S}; +scan_char_data(T = [H|R], S, Space,MUD, Acc) when ?whitespace(H) -> + if + MUD =:= [], Acc =:= [], H =:= $\n, Space =:= preserve -> + case fast_accumulate_whitespace(R, S, T) of + {done, Reply} -> + Reply; + {NewAcc, T1, S1} -> + scan_char_data(T1, S1, Space, MUD, NewAcc) + end; + true -> + {NewAcc, T1, S1} = accumulate_whitespace(T, S, Space, Acc), + scan_char_data(T1, S1, Space,MUD,NewAcc) + end; +scan_char_data([H1,H2|_T],S,_Space,_MUD,_Acc) when ?non_character(H1,H2) -> + ?fatal({error,{not_allowed_to_use_Unicode_noncharacters}},S); +scan_char_data("]]>"++_T,S,_Space,_MUD,_Acc) -> + ?fatal({error,{illegal_character_in_content,"]]>"}},S); +scan_char_data(Str,S0,Space,MUD,Acc) -> + ?bump_col(1), + {Ch,T} = wfc_legal_char(Str,S), + scan_char_data(T,S,Space,MUD,[Ch|Acc]). + + + +%%%%%%% [18]-[21] CDATA + +scan_cdata(Str, S, Pos, Parents) -> + scan_cdata(Str, S, Pos, Parents, _Acc = []). + + +scan_cdata([], S=#xmerl_scanner{continuation_fun = F}, Pos, Parents, Acc) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_cdata(MoreBytes, S1, Pos, Parents, Acc) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_cdata("]]>" ++ T, S0, Pos, Parents, Acc) -> + ?bump_col(3), + {#xmlText{pos = Pos, + parents = Parents, + value = lists:reverse(Acc), + type = cdata}, T, S}; +scan_cdata(Str, S0, Pos, Parents, Acc) -> + {Ch,T} = to_ucs(S0#xmerl_scanner.encoding,Str), + case xmerl_lib:is_char(Ch) of + true -> + ?bump_col(1), + scan_cdata(T, S, Pos, Parents, [Ch|Acc]); + false -> + ?fatal({unexpected_char,Ch}, S0) + end. + + +%%%%%%% [67] Reference +%% returns a three tuple {Result,RestBuf,State} + +scan_reference([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_reference(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_reference("#x" ++ T, S0) -> + %% [66] CharRef + ?bump_col(1), + if hd(T) /= $; -> + {[Ch], T2, S2} = scan_char_ref_hex(T, S, 0), + {to_char_set(S2#xmerl_scanner.encoding,Ch),T2,S2}; + true -> + ?fatal(invalid_char_ref, S) + end; +scan_reference("#" ++ T, S0) -> + %% [66] CharRef + ?bump_col(1), + if hd(T) /= $; -> + scan_char_ref_dec(T, S, []); + true -> + ?fatal(invalid_char_ref, S) + end; +scan_reference(T, S) -> + case catch scan_entity_ref(T, S) of + {'EXIT', _} -> + ?fatal(error_scanning_entity_ref,S); + Other -> + Other + end. + + +%% Chapter 4.4.2: ... the replacement text of entities used to escape +%% markup delimiters (the entities amp, lt, gt, apos, quot) is always treated +%% as data. (The string "AT&T;" expands to "AT&T;" and the remaining +%% ampersand is not recognized as an entity-reference delimiter.)" +%% +%% How to achieve this? My current approach is to insert the *strings* "&", +%% "<", ">", "'", and "\"" instead of the characters. The processor will +%% ignore them when performing multiple expansions. This means, for now, that +%% the character data output by the processor is (1-2 levels) deep. +%% At some suitable point, we should flatten these, so that application-level +%% processors should not have to be aware of this detail. + +scan_entity_ref([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_entity_ref(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_entity_ref("amp;" ++ T, S0) -> + ?bump_col(4), + {"&", T, S}; +scan_entity_ref("lt;" ++ T, S0) -> + ?bump_col(3), + {"<", T, S}; +scan_entity_ref("gt;" ++ T, S0) -> + ?bump_col(3), + {">", T, S}; +scan_entity_ref("apos;" ++ T, S0) -> + ?bump_col(5), + {"'", T, S}; +scan_entity_ref("quot;" ++ T, S0) -> + ?bump_col(5), + {"\"", T, S}; +scan_entity_ref(T, S) -> + {Name, _NamespaceInfo, T1, S1} = scan_name(T, S), + T2 = scan_mandatory(";",T1,1,S1,expected_entity_reference_semicolon), +% ";" ++ T2 = T1, + S2 = S1, + Entity = expand_reference(Name, S2), + {Entity, T2, S2}. + + +%%%%%%% [69] PEReference + +scan_pe_reference(T, S) -> + {Name, _NamespaceInfo, T1, S1} = scan_name(T, S), + T2 = scan_mandatory(";",T1,1,S1,expected_parsed_entity_reference_semicolon), +% ";" ++ T2 = T1, + {Name, T2, S1#xmerl_scanner{col = S1#xmerl_scanner.col+1}}. + +expand_pe_reference(Name, #xmerl_scanner{rules_read_fun = Read} = S,WS) -> + case Read(parameter_entity, Name, S) of + undefined -> + ?fatal({unknown_parameter_entity, Name}, S); % WFC or VC failure + Err={error,_Reason} -> + ?fatal(Err,S); + Tuple when is_tuple(Tuple) -> + Tuple; + Result -> + if + WS == in_literal -> Result; + true -> " "++Result++" " + end + end. + +% Currently unused +% +% expand_external_pe_reference(Name, #xmerl_scanner{rules_read_fun = Read} = S) -> +% case Read(parameter_entity, Name, S) of +% undefined -> +% ?fatal({unknown_parameter_entity, Name}, S); +% Result -> +% fetch_DTD(Result,S) +% end. + + +%%%%%%% [68] EntityReference + +expand_reference(Name, #xmerl_scanner{environment={external,{entity,_}}}) -> + atom_to_list(Name); +expand_reference(Name, #xmerl_scanner{environment=internal_parsed_entity}) -> + atom_to_list(Name); +expand_reference(Name, #xmerl_scanner{rules_read_fun = Read} = S) -> + case Read(entity, Name, S) of + undefined -> + ?fatal({unknown_entity_ref, Name}, S); + {_,external,{error,enoent}} -> + ?fatal({error,{entity_target_not_found,{error,enoent},Name}},S); + {DefEnv,EntType,Value} -> + wfc_Entity_Declared(DefEnv,S,Name), + Value2 = string_to_char_set(S#xmerl_scanner.encoding,Value), + wfc_Internal_parsed_entity(EntType,Value2,S), + Value + end. + + +%%%%%%% [66] CharRef + +scan_char_ref_dec([], S=#xmerl_scanner{continuation_fun = F}, Acc) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_char_ref_dec(MoreBytes, S1, Acc) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_char_ref_dec([H|T], S0, Acc) when H >= $0, H =< $9 -> + ?bump_col(1), + scan_char_ref_dec(T, S, [H|Acc]); +scan_char_ref_dec(";" ++ T, S0, Acc) -> + ?bump_col(1), + Ref = list_to_integer(lists:reverse(Acc)), + {Ch,_} = wfc_legal_char(Ref,S), + {[Ch], T, S}. %% changed return value from [[Ref]] + + +scan_char_ref_hex([], S=#xmerl_scanner{continuation_fun = F}, Acc) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_char_ref_hex(MoreBytes, S1, Acc) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_char_ref_hex([H|T], S0, Acc) when H >= $0, H =< $9 -> + ?bump_col(1), + Dec = H - $0, + scan_char_ref_hex(T, S, (Dec bor (Acc bsl 4))); +scan_char_ref_hex([H|T], S0, Acc) when H >= $a, H =< $f -> + ?bump_col(1), + Dec = (H - $a) + 10, + scan_char_ref_hex(T, S, (Dec bor (Acc bsl 4))); +scan_char_ref_hex([H|T], S0, Acc) when H >= $A, H =< $F -> + ?bump_col(1), + Dec = (H - $A) + 10, + scan_char_ref_hex(T, S, (Dec bor (Acc bsl 4))); +scan_char_ref_hex(";" ++ T, S0, Acc) -> + ?bump_col(1), + {Ch,_} = wfc_legal_char(Acc,S), + {[Ch], T, S}. %% changed return value from [[Acc]] + + + +%%%%%%% [25] Eq +%%% Eq ::= S? '=' S? +scan_eq(T, S) -> + ?strip1, + case T1 of + [$=|T2] -> + S2 = S1#xmerl_scanner{col=S1#xmerl_scanner.col+1}, + ?strip3, + {T3, S3}; + _ -> + ?fatal(assignment_expected,S) + end. + + +%% scan_name/2 +%% +%% We perform some checks here to make sure that the names conform to +%% the "Namespaces in XML" specification. This is an option. +%% +%% Qualified Name: +%% [6] QName ::= (Prefix ':')? LocalPart +%% [7] Prefix ::= NCName +%% [8] LocalPart ::= NCName +%% [4] NCName ::= (Letter | '_') (NCNameChar)* +%% [5] NCNameChar ::= Letter | Digit | '.' | '-' | '_' +%% | CombiningChar | Extender + + +%% The effect of XML Names (namespace) conformance is that: +%% - All element types and attribute names contain either zero or one colon +%% - No entity names, PI targets, or notation names contain any colons. +%% +%% scan_name_no_colons/2 will ensure that the name contains no colons iff +%% the scanner has been told to be namespace conformant. Otherwise, it will +%% behave exactly like scan_name/2. +%% +scan_name_no_colons(Str, S) -> + NSC = S#xmerl_scanner.namespace_conformant, + case NSC of + true -> + {Target, NSI, T1, S1} = + scan_name(Str,S#xmerl_scanner{namespace_conformant=no_colons}), + {Target,NSI,T1,S1#xmerl_scanner{namespace_conformant=NSC}}; + false -> + scan_name(Str, S) + end. + + + +%% [5] Name ::= (Letter | '_' | ':') (NameChar)* +scan_name([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_name(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_name(Str = [$:|T], S0 = #xmerl_scanner{namespace_conformant = NSC}) -> + if NSC == false -> + ?bump_col(1), + scan_nmtoken(T, S, [$:], NSC); + NSC == no_colons -> + ?fatal({invalid_NCName, lists:sublist(Str, 1, 6)}, S0); + true -> + %% In order to conform with the "Namespaces in XML" spec, + %% we cannot allow names to begin with ":" + ?fatal({invalid_NCName, lists:sublist(Str, 1, 6)}, S0) + end; +scan_name([$_|T], S0 = #xmerl_scanner{namespace_conformant = NSC}) -> + ?bump_col(1), + scan_nmtoken(T, S, [$_], NSC); +scan_name("%"++_T,S=#xmerl_scanner{environment=prolog}) -> + ?fatal({error,{wfc_PEs_In_Internal_Subset}},S); +scan_name("%"++T,S0=#xmerl_scanner{environment={external,_}}) -> + %% parameter entity that expands to a name + ?bump_col(1), + {PERefName, T1, S1} = scan_pe_reference(T, S), + ExpRef = expand_pe_reference(PERefName, S1,as_PE), + {_,T2,S2} = strip(ExpRef ++ T1,S1), + scan_name(T2,S2); +scan_name(Str, S0 = #xmerl_scanner{namespace_conformant = NSC}) -> + {Ch,T} = to_ucs(S0#xmerl_scanner.encoding,Str), + case xmerl_lib:is_letter(Ch) of + true -> + ?bump_col(1), + scan_nmtoken(T, S, [Ch], NSC); + false -> + ?fatal({invalid_name, lists:sublist(Str, 1, 6)}, S0) + end; +scan_name(Str, S) -> + ?fatal({invalid_name, Str}, S). + + + + + + +scan_nmtoken(Str, S, Acc, NSC) -> + scan_nmtoken(Str, S, Acc, _Prefix = [], _Local = Acc, NSC,isLatin1(hd(Acc),true)). + +%% scan_nmtoken/2 +%% [7] NmToken ::= (NameChar)+ +scan_nmtoken([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_nmtoken(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_nmtoken("%"++T, S0=#xmerl_scanner{environment={external,_}}) -> + ?bump_col(1), + {PERefName, T1, S1} = scan_pe_reference(T, S), + ExpRef = expand_pe_reference(PERefName, S1,as_PE), + {_,T2,S2} = strip(ExpRef ++ T1,S1), + scan_nmtoken(T2,S2); +scan_nmtoken(Str, S) -> + {Ch,T} = to_ucs(S#xmerl_scanner.encoding,Str), + case xmerl_lib:is_namechar(Ch) of + true -> + scan_nmtoken(T, S#xmerl_scanner{col = S#xmerl_scanner.col+1}, + _Acc = [Ch], _Prefix = [], _Local = [Ch], + _NamespaceConformant = false,isLatin1(Ch,true)); + false -> + ?fatal({invalid_nmtoken, lists:sublist(Str, 1, 6)}, S) + end. + + +scan_nmtoken([], S=#xmerl_scanner{continuation_fun = F}, + Acc, Prefix, Local, NSC,IsLatin1) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_nmtoken(MoreBytes,S1,Acc,Prefix,Local,NSC,IsLatin1) end, + fun(S1) -> {list_to_atom(lists:reverse(Acc)), + namespace_info(Prefix, Local),[],S1} end, + S); +%% whitespace marks the end of a name +scan_nmtoken(Str = [H|_], S, Acc, Prefix, Local, _NSC,true) when ?whitespace(H) -> + %% we don't strip here because the occurrence of whitespace may be an error + %% e.g. <!ELEMENT spec (front, body, back ?)> + NmString = lists:reverse(Acc), + {list_to_atom(NmString), namespace_info(Prefix, Local), Str, S}; +scan_nmtoken(Str = [$:|_], S, Acc, [], _Local, no_colons,_IsLatin1) -> + ?fatal({invalid_NCName, + lists:sublist(lists:reverse(Acc) ++ Str, 1, 6)}, S); +scan_nmtoken([$:|T], S0, Acc, [], Local, NSC, IsLatin1) -> + ?bump_col(1), + scan_nmtoken(T, S, [$:|Acc], lists:reverse(Local), [], NSC,IsLatin1); +scan_nmtoken(Str = [$:|_T], S, Acc, _Prefix, _Local, _NSC = true,_IsLatin1) -> + %% non-empty Prefix means that we've encountered a ":" already. + %% Conformity with "Namespaces in XML" requires + %% at most one colon in a name + ?fatal({invalid_NCName, + lists:sublist(lists:reverse(Acc) ++ Str, 1, 6)}, S); + +%% non-namechar also marks the end of a name +scan_nmtoken(Str, S0, Acc, Prefix, Local, NSC,IsLatin1) -> + ?bump_col(1), + {Ch,T} = to_ucs(S#xmerl_scanner.encoding,Str), + case {xmerl_lib:is_namechar(Ch),IsLatin1} of + {true,_} -> + scan_nmtoken(T, S, [Ch|Acc], Prefix, [Ch|Local], NSC,isLatin1(Ch,IsLatin1)); + {_,true} -> + NmStr = lists:reverse(Acc), + {list_to_atom(NmStr), namespace_info(Prefix, Local), Str, S}; + _ -> + {lists:reverse(Acc), namespace_info(Prefix, Local), Str, S} + end. + +namespace_info([], _) -> + []; +namespace_info(Prefix, Local) -> + {Prefix, lists:reverse(Local)}. + +isLatin1(_Ch,false) -> + false; +isLatin1(Ch,_) when Ch > 255 -> + false; +isLatin1(_,_) -> + true. + +%%%%%%% [11] SystemLiteral + +scan_system_literal([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_system_literal(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_system_literal("\"" ++ T, S) -> + scan_system_literal(T, S, $", []); +scan_system_literal("'" ++ T, S) -> + scan_system_literal(T, S, $', []). + + +scan_system_literal([], S=#xmerl_scanner{continuation_fun = F}, + Delimiter, Acc) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_system_literal(MoreBytes,S1,Delimiter,Acc) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_system_literal([H|T], S, H, Acc) -> + {lists:reverse(Acc), T, S#xmerl_scanner{col = S#xmerl_scanner.col+1}}; +scan_system_literal("#"++_R, S, _H, _Acc) -> + %% actually not a fatal error + ?fatal(fragment_identifier_in_system_literal,S); +scan_system_literal(Str, S, Delimiter, Acc) -> + {Ch,T} = to_ucs(S#xmerl_scanner.encoding,Str), + scan_system_literal(T, S#xmerl_scanner{col = S#xmerl_scanner.col+1}, + Delimiter, [Ch|Acc]). + + +%%%%%%% [12] PubidLiteral + +scan_pubid_literal([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_pubid_literal(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_pubid_literal([H|T], S) when H == $"; H == $' -> + scan_pubid_literal(T, S#xmerl_scanner{col = S#xmerl_scanner.col+1}, H, []); +scan_pubid_literal([H|_T], S) -> + ?fatal({invalid_pubid_char, H}, S). + + +scan_pubid_literal([], S=#xmerl_scanner{continuation_fun = F}, + Delimiter, Acc) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_pubid_literal(MoreBytes,S1,Delimiter,Acc) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_pubid_literal([H|T], S, H, Acc) -> + {lists:reverse(Acc), T, S#xmerl_scanner{col = S#xmerl_scanner.col+1}}; +scan_pubid_literal(Str = [H|_], S, Delimiter, Acc) when ?whitespace(H) -> + %% Before matching public identifiers, all whitespace must be normalized, + %% so we do that here + {_, T, S1} = pub_id_strip(Str, S), + scan_pubid_literal(T, S1, Delimiter, [16#20|Acc]); +scan_pubid_literal([H|T], S, Delimiter, Acc) -> + case is_pubid_char(H) of + true -> + scan_pubid_literal( + T, S#xmerl_scanner{col = S#xmerl_scanner.col+1}, + Delimiter, [H|Acc]); + false -> + ?fatal({invalid_pubid_char, H}, S) + end. + +%% We do not match whitespace here, even though they're allowed in public +%% identifiers. This is because we normalize this whitespace as we scan +%% (see above in scan_pubid_literal()) +%% +is_pubid_char(X) when X >= $a, X =< $z -> true; +is_pubid_char(X) when X >= $A, X =< $Z -> true; +is_pubid_char(X) when X >= $0, X =< $9 -> true; +is_pubid_char(X) -> + lists:member(X, "-'()+,./:=?;!*#@$_%"). + + +%%%%%%% [46] contentspec + +scan_contentspec([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_contentspec(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_contentspec("EMPTY" ++ T, S0) -> + ?bump_col(5), + {empty, T, S}; +scan_contentspec("ANY" ++ T, S0) -> + ?bump_col(3), + {any, T, S}; +scan_contentspec("%" ++ _T, S=#xmerl_scanner{environment=prolog}) -> + ?fatal({error,{wfc_PEs_In_Internal_Subset}},S); +scan_contentspec("%" ++ T, S0) -> + ?bump_col(1), + {PERefName, T1, S1} = scan_pe_reference(T, S), + ExpRef = expand_pe_reference(PERefName, S1,as_PE), + {_,T2,S2} = strip(ExpRef ++ T1,S1), + scan_contentspec(T2, S2); +scan_contentspec("(" ++ T, S0) -> + ?bump_col(1), + ?strip1, + scan_elem_content(T1, S1); +scan_contentspec(_Str,S) -> + ?fatal(unexpected_character,S). + + +%%%%%%% [47] children +%%%%%%% [51] Mixed + +scan_elem_content(T, S) -> + scan_elem_content(T, S, _Context = children, _Mode = unknown, _Acc = []). + +scan_elem_content([], S=#xmerl_scanner{continuation_fun = F}, + Context, Mode, Acc) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes,S1) -> scan_elem_content(MoreBytes,S1,Context,Mode,Acc) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_elem_content(")" ++ T, S0, Context, Mode0, Acc0) -> + ?bump_col(1), + {Mode, Acc} = case {Mode0, Acc0} of + {unknown, [_X]} -> + {seq, Acc0}; + {M, _L} when M == seq; M == choice -> + {Mode0, lists:reverse(Acc0)} + end, + {Occurrence, T1, S1} = scan_occurrence(T, S), + vc_No_Duplicate_Types(S,Context,Acc), + case {Occurrence, Context,Acc} of + {once, mixed,['#PCDATA']} -> ok; % It is not ok when there are + % more names than '#PCDATA' + % and no '*'. + {'*', mixed,_} -> ok; + {Other, mixed,_} -> + ?fatal({illegal_for_mixed_content, Other}, S1); + _ -> + ok + end, + ?strip2, + {format_elem_content({Occurrence, {Mode, Acc}}), T2, S2}; +scan_elem_content("#PCDATA" ++ _T, S, not_mixed, _Mode, _Acc) -> + ?fatal({error,{extra_set_of_parenthesis}},S); +scan_elem_content("#PCDATA" ++ _T, S, _Cont, Mode, Acc) + when Mode==choice;Mode==seq;Acc/=[] -> + ?fatal({error,{invalid_format_of_mixed_content}},S); +scan_elem_content("#PCDATA" ++ T, S0, _Context, Mode, Acc) -> + ?bump_col(7), + ?strip1, + scan_elem_content(T1, S1, mixed, Mode, ['#PCDATA'|Acc]); +scan_elem_content("," ++ _T, S, _Context, choice, _Acc) -> + ?fatal({mixing_comma_and_vertical_bar_in_content_model},S); +scan_elem_content("," ++ T, S0, Context, _Mode, Acc) -> + ?bump_col(1), + ?strip1, + scan_elem_content2(T1, S1, Context, seq, Acc); +scan_elem_content("|" ++ _T, S, _Context, seq, _Acc) -> + ?fatal({mixing_comma_and_vertical_bar_in_content_model},S); +scan_elem_content("|" ++ T, S0, Context, _Mode, Acc) -> + ?bump_col(1), + ?strip1, + scan_elem_content2(T1, S1, Context, choice, Acc); +scan_elem_content(T, S, Context, Mode, Acc) -> + scan_elem_content2(T, S, Context, Mode, Acc). + +scan_elem_content2("(" ++ _T, S, mixed, _Mode, _Acc) -> + ?fatal({error, + {element_names_must_not_be_parenthesized_in_mixed_content}},S); +scan_elem_content2("(" ++ T, S0, Context, Mode, Acc) -> + ?bump_col(1), + ?strip1, + {Inner, T2, S2} = scan_elem_content(T1, S1, not_mixed, unknown, []), + scan_elem_content(T2, S2, Context, Mode, [Inner|Acc]); +scan_elem_content2("%" ++ _T,S=#xmerl_scanner{environment=prolog},_Context,_Mode,_Acc) -> + ?fatal({error,{wfc_PEs_In_Internal_Subset}},S); +scan_elem_content2("%" ++ T, S0, Context, Mode, Acc) -> + ?bump_col(1), + {PERefName, T1, S1} = scan_pe_reference(T, S), + ExpRef = expand_pe_reference(PERefName, S1,as_PE), + {_,T2,S2}=strip(ExpRef++T1,S1), + scan_elem_content(T2, S2, Context, Mode, Acc); +scan_elem_content2(T, S, Context, Mode, Acc) -> + {Name, _NameStr, T1, S1} = scan_name(T, S), + {Occurrence, T2, S2} = scan_occurrence(T1, S1), + case {Occurrence, Context} of + {once, mixed} -> ok; + {Other, mixed} -> + ?fatal({illegal_for_mixed_content, Other}, S1); + _ -> + ok + end, + ?strip3, + mandatory_delimeter_wfc(T3,S3), + NewAcc = [format_elem_content({Occurrence, Name}) | Acc], + scan_elem_content(T3, S3, Context, Mode, NewAcc). + + +format_elem_content({once, What}) -> What; +format_elem_content(Other) -> Other. + + +scan_occurrence([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_occurrence(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_occurrence([$?|T], S0) -> + ?bump_col(1), + {'?', T, S}; +scan_occurrence([$+|T], S0) -> + ?bump_col(1), + {'+', T, S}; +scan_occurrence([$*|T], S0) -> + ?bump_col(1), + {'*', T, S}; +scan_occurrence(T, S) -> + {once, T , S}. + +%%% Tests of Validity Constraints + + +%% first part of VC: Name Token +vc_Valid_Char(_AT,C,S) -> + case xmerl_lib:is_namechar(C) of + true -> + ok; + _ -> + ?fatal({error,{validity_constraint_Name_Token,C}},S) + end. + + + +vc_ID_Attribute_Default(_,#xmerl_scanner{validation=Valid}) + when Valid /= dtd -> + ok; +vc_ID_Attribute_Default({_,'ID',_,Def,_},_S) + when Def=='#IMPLIED';Def=='#REQUIRED' -> + ok; +vc_ID_Attribute_Default({_,'ID',_,Def,_},S) -> + ?fatal({error,{validity_constraint_error_ID_Attribute_Default,Def}},S). + +vc_Enumeration({_Name,{_,NameList},DefaultVal,_,_},S) + when is_list(DefaultVal) -> + case lists:member(list_to_atom(DefaultVal),NameList) of + true -> + ok; + _ -> + ?fatal({error,{vc_enumeration,list_to_atom(DefaultVal),NameList}},S) + end; +vc_Enumeration({_Name,{_,_NameList},_DefaultVal,_,_},_S) -> + ok. + +vc_Entity_Name({_Name,'ENTITY',DefaultVal,_,_},S) when is_list(DefaultVal) -> + Read = S#xmerl_scanner.rules_read_fun, + case Read(entity,list_to_atom(DefaultVal),S) of + {_,external,{_,{ndata,_}}} -> + ok; + _ -> ?fatal({error,{vc_Entity_Name,list_to_atom(DefaultVal)}},S) + end; +vc_Entity_Name({_Name,'ENTITY',_,_,_},_S) -> + ok; +vc_Entity_Name({_,'ENTITIES',DefaultVal,_,_},S) when is_list(DefaultVal) -> + Read = S#xmerl_scanner.rules_read_fun, + NameListFun = fun([],Acc,_St,_Fun) -> + lists:reverse(Acc); + (Str,Acc,St,Fun) -> + {N,_,St2,Str2} = scan_name(Str,St), + Fun(Str2,[N|Acc],St2,Fun) + end, + NameList = NameListFun(DefaultVal,[],S,NameListFun), + VcFun = + fun(X) -> + case Read(entity,X,S) of + {_,external,{_,{ndata,_}}} -> + ok; + _ -> ?fatal({error,{vc_Entity_Name,X}},S) + end + end, + lists:foreach(VcFun,NameList); +vc_Entity_Name({_,'ENTITIES',_,_,_},_S) -> + ok. + +vc_No_Duplicate_Types(#xmerl_scanner{validation=dtd} = S,mixed,Acc) -> + CheckDupl = + fun([H|T],F) -> + case lists:member(H,T) of + true -> + ?fatal({no_duplicate_types_allowed,H},S); + _ -> F(T,F) + end; + ([],_) -> ok + end, + CheckDupl(Acc,CheckDupl); +vc_No_Duplicate_Types(_,_,_) -> + ok. + + +%%% Tests of Well-Formededness Constraints + + +mandatory_delimeter_wfc(","++_T,_S) -> + ok; +mandatory_delimeter_wfc("|"++_T,_S) -> + ok; +mandatory_delimeter_wfc(")"++_T,_S) -> + ok; +mandatory_delimeter_wfc("%"++_T,_S) -> + %% a parameter reference is ok + ok; +mandatory_delimeter_wfc(T,S) -> + ?fatal({comma_or_vertical_bar_mandatory_between_names_in_content_model,T},S). + + +wfc_unique_att_spec([],_S) -> + ok; +wfc_unique_att_spec([#xmlAttribute{name=N}|Atts],S) -> + case lists:keymember(N,#xmlAttribute.name,Atts) of + true -> + ?fatal({error,{unique_att_spec_required,N}},S); + _ -> + wfc_unique_att_spec(Atts,S) + end. + +wfc_legal_char(Chars,S) when is_list(Chars)-> + {Ch,Rest} = to_ucs(S#xmerl_scanner.encoding,Chars), + case xmerl_lib:is_char(Ch) of + true -> + {Ch,Rest}; + _ -> + ?fatal({error,{wfc_Legal_Character,Ch}},S) + end; +wfc_legal_char(Ch,S) -> + case xmerl_lib:is_char(Ch) of + true -> + {Ch,[]}; + _ -> + ?fatal({error,{wfc_Legal_Character,Ch}},S) + end. + + +wfc_whitespace_betw_attrs(WS,_S) when ?whitespace(WS) -> + ok; +wfc_whitespace_betw_attrs($/,_S) -> + ok; +wfc_whitespace_betw_attrs($>,_S) -> + ok; +wfc_whitespace_betw_attrs(_,S) -> + ?fatal({whitespace_required_between_attributes},S). + +wfc_Entity_Declared({external,_},S=#xmerl_scanner{standalone=yes},Name) -> + ?fatal({reference_to_externally_defed_entity_standalone_doc,Name},S); +wfc_Entity_Declared({external,_},_S,_) -> + ok; +wfc_Entity_Declared(_Env,_S,_) -> + ok. + +wfc_Internal_parsed_entity(internal,Value,S) -> + %% WFC test that replacement text matches production content + scan_content(Value,S#xmerl_scanner{environment=internal_parsed_entity}, + _Name=[],[],S#xmerl_scanner.space,_Lang=[],_Prnt=[], + #xmlNamespace{}); +wfc_Internal_parsed_entity(_,_,_) -> + ok. + + +vc_Element_valid(_Name,#xmerl_scanner{environment=internal_parsed_entity}) -> + ok; +vc_Element_valid(Name,S=#xmerl_scanner{rules_read_fun=Read, + validation=dtd}) -> + case Read(elem_def,Name,S) of + #xmlElement{elementdef=undeclared} -> + ?fatal({error,{error_missing_element_declaration_in_DTD,Name}},S); undefined -> + ?fatal({error,{error_missing_element_declaration_in_DTD,Name}},S); _ -> ok + end; +vc_Element_valid(_,_) -> + ok. + +%%%%%%% [74] PEDef + + +scan_pe_def([], S=#xmerl_scanner{continuation_fun = F}, PEName) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_pe_def(MoreBytes, S1, PEName) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_pe_def("'" ++ T, S0, PEName) -> + ?bump_col(1), + scan_entity_value(T, S, $', PEName,parameter); +scan_pe_def("\"" ++ T, S0, PEName) -> + ?bump_col(1), + scan_entity_value(T, S, $", PEName,parameter); +scan_pe_def(Str, S, _PEName) -> + scan_external_id(Str, S). + + +%%%%%%% [82] NotationDecl + +scan_notation_decl(T, #xmerl_scanner{rules_write_fun = Write, + rules_read_fun=Read, + rules_delete_fun=Delete} = S) -> + {Name, _NameStr, T1, S1} = scan_name_no_colons(T, S), + {_,T2,S2} = mandatory_strip(T1,S1), + {Def, T3, S3} = scan_notation_decl1(T2, S2), + ?strip4, + T5 = scan_mandatory(">",T4,1,S4,expected_end_tag_notation_declaration), +% ">" ++ T5 = T4, + case Read(notation,Name,S) of + undeclared -> Delete(notation,Name,S4); + _ -> ok + end, + S5 = Write(notation, Name, Def, S4), + {T5, S5}. + +scan_notation_decl1([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_notation_decl1(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_notation_decl1("SYSTEM" ++ T, S0) -> + ?bump_col(6), + {_,T1,S1} = mandatory_strip(T,S), + {SL, T2, S2} = scan_system_literal(T1, S1), + {{system, SL}, T2, S2}; +scan_notation_decl1("PUBLIC" ++ T, S0) -> + ?bump_col(6), + {_,T1,S1} = mandatory_strip(T,S), + {PIDL, T2, S2} = scan_pubid_literal(T1, S1), + ?strip3, + case T3 of + ">" ++ _ -> + {{public, PIDL}, T3, + S3#xmerl_scanner{col = S3#xmerl_scanner.col+1}}; + _ -> + {SL, T4, S4} = scan_system_literal(T3, S3), + {{public, PIDL, SL}, T4, S4} + end. + +%%%%%%% [75] ExternalID + +scan_external_id([], S=#xmerl_scanner{continuation_fun = F}) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_external_id(MoreBytes, S1) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_external_id("SYSTEM" ++ T, S0) -> + ?bump_col(6), + {_,T1,S1} = mandatory_strip(T,S), + {SL, T2, S2} = scan_system_literal(T1, S1), + {{system, SL}, T2, S2}; +scan_external_id("PUBLIC" ++ T, S0) -> + ?bump_col(6), + {_,T1,S1} = mandatory_strip(T,S), + {PIDL, T2, S2} = scan_pubid_literal(T1, S1), + {_,T3,S3} = mandatory_strip(T2,S2), + {SL, T4, S4} = scan_system_literal(T3, S3), + {{public, PIDL, SL}, T4, S4}. + + +%%%%%%% [9] EntityValue + +%% Note that we have two different scan functions for EntityValue +%% They differ in that this one checks for recursive calls to the same +%% parameter entity. + +scan_entity_value(Str, S, Delim, Name, Namespace) -> + scan_entity_value(Str, S, Delim, _Acc = [], Name, Namespace,[]). + + +scan_entity_value([], S=#xmerl_scanner{environment={external,{entity,_}}}, + _Delim,Acc,_,_,[]) -> + {lists:flatten(lists:reverse(Acc)), [], S}; +scan_entity_value([], S=#xmerl_scanner{environment={external,{entity,_}}, + validation=dtd}, + _Delim,_Acc,PEName,_,_) -> + {{error,{failed_VC_Proper_Declaration_PE_Nesting,1,PEName}},[],S}; +scan_entity_value([],S, + no_delim,Acc,_,_,[]) -> + {lists:flatten(lists:reverse(Acc)),[],S}; +scan_entity_value([],S=#xmerl_scanner{validation=dtd}, + no_delim,_Acc,PEName,_,_PENesting) -> + {{error,{failed_VC_Proper_Declaration_PE_Nesting,2,PEName}},[],S}; +scan_entity_value([], S=#xmerl_scanner{continuation_fun = F}, + Delim, Acc, PEName,Namespace,PENesting) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> + scan_entity_value(MoreBytes,S1, + Delim,Acc,PEName,Namespace,PENesting) + end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_entity_value([Delim|T], S=#xmerl_scanner{validation=dtd}, + Delim,_Acc,PEName,_NS,PENesting) when length(PENesting) /= 0 -> + {{error,{failed_VC_Proper_Declaration_PE_Nesting,3,PEName}},T,S}; +scan_entity_value([Delim|T], S0, + Delim, Acc, _PEName,_NS,_PENesting) -> + ?bump_col(1), + {lists:flatten(lists:reverse(Acc)), T, S}; +scan_entity_value("%" ++ _T,S=#xmerl_scanner{environment=prolog},_,_,_,_,_) -> + ?fatal({error,{wfc_PEs_In_Internal_Subset}},S); +% %% This is a PEdecl in an external entity +% scan_entity_value([$%,WS|T], S0, Delim, Acc, PEName,Namespace,PENesting) +% when ?whitespace(WS) -> +% ?bump_col(2), +% scan_entity_value(T, S, Delim, [WS,$%|Acc], PEName,Namespace,PENesting); +scan_entity_value("%" ++ T, S0, Delim, Acc, PEName,Namespace,PENesting) -> + ?bump_col(1), + {PERefName, T1, S1} = scan_pe_reference(T, S), + if PERefName == PEName,Namespace==parameter -> + ?fatal({illegal_recursion_in_PE, PEName}, S1); + true -> + {ExpandedRef,S2} = + case expand_pe_reference(PERefName, S1, in_literal) of + %% actually should pe ref be expanded as_PE but + %% handle whitespace explicitly in this case. + Tuple when is_tuple(Tuple) -> + %% {system,URI} or {public,URI} + %% Included in literal. + {ExpRef,Sx}=fetch_not_parse(Tuple,S1), + {EntV,_,_S2} = + scan_entity_value(ExpRef, Sx, no_delim,[], + PERefName,parameter,[]), + %% should do an update Write(parameter_entity) + %% so next expand_pe_reference is faster + {EntV,_S2}; + ExpRef -> + {ExpRef,S1} + end, + %% single or duoble qoutes are not treated as delimeters + %% in passages "included in literal" + S3 = S2#xmerl_scanner{col=S2#xmerl_scanner.col+1}, + {Acc2,_,S4} = scan_entity_value(ExpandedRef,S3,no_delim,Acc, + PEName,Namespace,[]), +% {_,T2,S5} = strip(" "++T1,S4), + scan_entity_value(T1,S4#xmerl_scanner{line=S3#xmerl_scanner.line, + col=S3#xmerl_scanner.col}, + Delim,lists:reverse(Acc2), + PEName,Namespace,PENesting) +% scan_entity_value(T1,S4,Delim,lists:reverse(Acc2), +% PEName,Namespace,PENesting) + end; +scan_entity_value("&" ++ T, S0, Delim, Acc, PEName,Namespace,PENesting) -> + %% This is either a character entity or a general entity (internal + %% or external) reference. An internal general entity shall not be + %% expanded in an entity def XML1.0 section 4.5. + ?bump_col(1), + case T of + "#"++_T -> + {ExpRef, T1, S1} = scan_reference(T, S), + Tok = pe_nesting_token(ExpRef++T1,Namespace,S1#xmerl_scanner.validation), + case markup_delimeter(ExpRef) of + true -> + scan_entity_value(T1, S1, Delim, [ExpRef|Acc], PEName, + Namespace,pe_push(Tok,PENesting,S1)); + _ -> + ExpRef2 = string_to_char_set(S#xmerl_scanner.encoding,ExpRef), + scan_entity_value(ExpRef2 ++ T1, S1, Delim, Acc, PEName, + Namespace,pe_push(Tok,PENesting,S1)) + end; + _ -> %% General Entity is bypassed, though must check for + %% recursion: save referenced name now and check for + %% recursive reference after the whole entity definition is + %% completed. + {Name, _NamespaceInfo, T1, S1} = scan_name(T,S), + T2=scan_mandatory(";",T1,1,S1,expected_entity_reference_semicolon), + S2=save_refed_entity_name(Name,PEName,S1), + scan_entity_value(T2,S2,Delim,[";",atom_to_list(Name),"&"|Acc],PEName,Namespace,PENesting) + end; +%% The following clauses is for PE Nesting VC constraint +%% Start delimeter for ConditionalSection +scan_entity_value("<!["++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)-> + ?bump_col(3), + scan_entity_value(T,S,Delim,["<!["|Acc],PEName,NS, + pe_push("<![",PENesting,S)); +%% Start delimeter for ConditionalSection (2) +scan_entity_value("["++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)-> + ?bump_col(1), + scan_entity_value(T,S,Delim,["["|Acc],PEName,NS, + pe_push("[",PENesting,S)); +%% Start delimeter for comment +scan_entity_value("<!--"++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)-> + ?bump_col(4), + scan_entity_value(T,S,Delim,["<!--"|Acc],PEName,NS, + pe_push("<!--",PENesting,S)); +%% Start delimeter for ElementDecl, AttListDecl,EntityDecl,NotationDecl +scan_entity_value("<!"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> + ?bump_col(2), + scan_entity_value(T,S,Delim,["<!"|Acc],PEName,NS, + pe_push("<!",PENesting,S)); +%% Start delimeter for PI +scan_entity_value("<?"++T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> + ?bump_col(2), + scan_entity_value(T,S,Delim,["<?"|Acc],PEName,NS, + pe_push("<?",PENesting,S)); +%% Start delimeter for elements that matches the proper stop delimeter +%% for a markupdecl +scan_entity_value("</"++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)-> + ?bump_col(2), + scan_entity_value(T,S,Delim,["</"|Acc],PEName,NS, + pe_push("</",PENesting,S)); +scan_entity_value("<"++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)-> + ?bump_col(1), + scan_entity_value(T,S,Delim,["<"|Acc],PEName,NS, + pe_push("<",PENesting,S)); +%% Delimeter for contentspecs +scan_entity_value("("++T,S0,Delim,Acc,PEName,parameter=NS,PENesting)-> + ?bump_col(1), + scan_entity_value(T,S,Delim,["("|Acc],PEName,NS, + pe_push("(",PENesting,S)); +%% Stop delimeter for ElementDecl, AttListDecl,EntityDecl,NotationDecl +scan_entity_value(">"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> + ?bump_col(1), + scan_entity_value(T,S,Delim,[">"|Acc],PEName,NS, + pe_pop(">",PENesting,S)); +%% Stop delimeter for PI +scan_entity_value("?>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> + ?bump_col(2), + scan_entity_value(T,S,Delim,["?>"|Acc],PEName,NS, + pe_pop("?>",PENesting,S)); +%% Stop delimeter for comment +scan_entity_value("-->"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> + ?bump_col(3), + scan_entity_value(T,S,Delim,["-->"|Acc],PEName,NS, + pe_pop("-->",PENesting,S)); +%% Stop delimeter for ConditionalSection +scan_entity_value("]]>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> + ?bump_col(3), + scan_entity_value(T,S,Delim,["]]>"|Acc],PEName,NS, + pe_pop("]]>",PENesting,S)); +%% Stop delimeter added to match a content start delimeter included +scan_entity_value("/>"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> + ?bump_col(2), + scan_entity_value(T,S,Delim,["/>"|Acc],PEName,NS, + pe_pop("/>",PENesting,S)); +scan_entity_value(")"++ T,S0,Delim,Acc,PEName, parameter=NS,PENesting) -> + ?bump_col(1), + scan_entity_value(T,S,Delim,[")"|Acc],PEName,NS, + pe_pop(")",PENesting,S)); +scan_entity_value("\n"++T, S, Delim, Acc, PEName,Namespace,PENesting) -> + scan_entity_value(T, S#xmerl_scanner{line=S#xmerl_scanner.line+1}, + Delim, ["\n"|Acc], PEName,Namespace,PENesting); +scan_entity_value(Str, S0, Delim, Acc, PEName,Namespace,PENesting) -> + {Ch,T} = to_ucs(S0#xmerl_scanner.encoding,Str), + case xmerl_lib:is_char(Ch) of + true -> + ?bump_col(1), + scan_entity_value(T, S, Delim, [Ch|Acc], PEName,Namespace,PENesting); + false -> + ?fatal({unexpected_char,Ch}, S0) + end. + + + +save_refed_entity_name(Name,PEName,S) -> + case predefined_entity(Name) of + true -> + S; + _ -> + save_refed_entity_name1(Name,PEName,S) + end. + +save_refed_entity_name1(Name,PEName, + S=#xmerl_scanner{entity_references=ERefs}) -> + case lists:keysearch(PEName,1,ERefs) of + {value,{_,Refs}} -> + NewRefs = + case lists:member(Name,Refs) of + true ->Refs; + _ -> [Name|Refs] + end, + S#xmerl_scanner{entity_references=lists:keyreplace(PEName,1,ERefs, + {PEName,NewRefs}) + }; + _ -> + S#xmerl_scanner{entity_references=[{PEName,[Name]}|ERefs]} + end. + + + +pe_push(Tok,Stack,_S) when Tok=="<!";Tok=="<?";Tok=="<!--";Tok=="<!["; + Tok=="[";Tok=="<";Tok=="</";Tok=="(" -> + [Tok|Stack]; +pe_push(Tok,Stack,#xmerl_scanner{validation=dtd}) + when Tok==")";Tok==">";Tok=="?>";Tok=="]]>";Tok=="-->";Tok=="/>"-> + [Tok|Stack]; +pe_push(_,Stack,_S) -> + Stack. + +pe_pop(">",["<!"|Rest],_S) -> Rest; +pe_pop("?>",["<?"|Rest],_S) -> Rest; +pe_pop("-->",["<!--"|Rest],_S) -> Rest; +pe_pop("]]>",["[","<!["|Rest],_S) -> Rest; +pe_pop("/>",["<"|Rest],_S) -> Rest; +pe_pop(">",["<"|Rest],_S) -> Rest; +pe_pop(">",["</"|Rest],_S) -> Rest; +pe_pop(")",["("|Rest],_S) -> Rest; +pe_pop(Token,_Stack,S=#xmerl_scanner{validation=dtd}) -> + ?fatal({error,{failed_VC_Proper_Declaration_PE_Nesting,5,Token}},S); +pe_pop(_,Rest,_) -> + Rest. + +pe_nesting_token("<!"++_T,parameter,dtd) -> "<!"; +pe_nesting_token("<?"++_T,parameter,dtd) -> "<?"; +pe_nesting_token("<!--"++_T,parameter,dtd) -> "<!--"; +pe_nesting_token("<!["++_T,parameter,dtd) -> "<!["; +pe_nesting_token("["++_T,parameter,dtd) -> "["; +pe_nesting_token("("++_T,parameter,dtd) -> "("; +pe_nesting_token(">"++_T,parameter,dtd) -> ">"; +pe_nesting_token("?>"++_T,parameter,dtd) -> "?>"; +pe_nesting_token("-->"++_T,parameter,dtd) -> "-->"; +pe_nesting_token("]]>"++_T,parameter,dtd) -> "]]>"; +pe_nesting_token(")"++_T,parameter,dtd) -> ")"; +pe_nesting_token("/>"++_T,parameter,dtd) -> "/>"; +pe_nesting_token(_,_,_) -> false. + +predefined_entity(amp) -> true; +predefined_entity(lt) -> true; +predefined_entity(gt) -> true; +predefined_entity(apos) -> true; +predefined_entity(quot) -> true; +predefined_entity(_) -> false. + +check_entity_recursion(EName, + S=#xmerl_scanner{entity_references=EntityRefList}) -> + Set = sofs:family(EntityRefList), + case catch sofs:family_to_digraph(Set, [acyclic]) of + {'EXIT',{cyclic,_}} -> + ?fatal({illegal_recursion_in_Entity, EName}, S); + DG -> + digraph:delete(DG), + ok + end. + + + + +%%%%%%% [15] Comment +scan_comment(Str, S) -> + scan_comment(Str, S, _Pos = undefined, _Parents = [], _Lang = []). + +scan_comment(Str,S=#xmerl_scanner{col=C,event_fun=Event}, Pos, Parents, Lang) -> + Comment = #xmlComment{pos = Pos, + parents = Parents, + language = Lang, + value = undefined}, + S1 = #xmerl_scanner{} = Event(#xmerl_event{event = started, + line = S#xmerl_scanner.line, + col = C, + pos = Pos, + data = Comment}, S), + + scan_comment1(Str, S1, Pos, Comment, _Acc = []). + +scan_comment1([], S=#xmerl_scanner{continuation_fun = F}, + Pos, Comment, Acc) -> + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> scan_comment1(MoreBytes, S1, Pos, Comment, Acc) end, + fun(S1) -> ?fatal(unexpected_end, S1) end, + S); +scan_comment1("-->" ++ T, S0 = #xmerl_scanner{col = C, + event_fun = Event, + hook_fun = Hook}, + _Pos, Comment, Acc) -> + ?bump_col(3), + Comment1 = Comment#xmlComment{value = lists:reverse(Acc)}, + S1=#xmerl_scanner{}=Event(#xmerl_event{event = ended, + line=S#xmerl_scanner.line, + col = C, + data = Comment1}, S), + {Ret, S2} = Hook(Comment1, S1), + {_,T3,S3}=strip(T,S2), + {Ret,T3,S3}; +scan_comment1("--"++T,S,_Pos,_Comment,_Acc) -> + ?fatal({invalid_comment,"--"++[hd(T)]}, S); +scan_comment1("\n" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) -> + scan_comment1(T, S#xmerl_scanner{line=L+1,col=1},Pos, Cmt, "\n" ++ Acc); +scan_comment1("\r\n" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) -> + %% CR followed by LF is read as a single LF + scan_comment1(T, S#xmerl_scanner{line=L+1,col=1}, Pos, Cmt, "\n" ++ Acc); +scan_comment1("\r" ++ T, S=#xmerl_scanner{line = L}, Pos, Cmt, Acc) -> + %% CR not followed by LF is read as a LF + scan_comment1(T, S#xmerl_scanner{line=L+1,col=1}, Pos, Cmt, "\n" ++ Acc); +scan_comment1(Str, S=#xmerl_scanner{col = C}, Pos, Cmt, Acc) -> + {Ch,T} = wfc_legal_char(Str,S), + scan_comment1(T, S#xmerl_scanner{col=C+1}, Pos, Cmt, [Ch|Acc]). + +%%%%%%% + +scan_markup_completion_gt([$>|_R]=T,S) -> + {T,S}; +scan_markup_completion_gt([$%|T],S0) -> + ?bump_col(1), + {Name,T1,S1} = scan_pe_reference(T,S), + ExpandedRef = expand_pe_reference(Name,S1,as_PE), + {_,T2,S2} = strip(ExpandedRef++T1,S1), + scan_markup_completion_gt(T2,S2); +scan_markup_completion_gt(T,S) -> + ?fatal({error,{malformed_syntax_entity_completion,T}},S). + + +scan_mandatory(Pattern,T,N,S,ErrorMsg) -> + case lists:prefix(Pattern,T) of + true -> + lists:nthtail(N,T); + _ -> + ?fatal(ErrorMsg,S) + end. + + +strip(Str,S) -> + strip(Str,S,all). + +strip([], S=#xmerl_scanner{continuation_fun = F},_) -> + ?dbg("cont()... stripping whitespace~n", []), + F(fun(MoreBytes, S1) -> strip(MoreBytes, S1) end, + fun(S1) -> {[], [], S1} end, + S); +strip("\s" ++ T, S=#xmerl_scanner{col = C},Lim) -> + strip(T, S#xmerl_scanner{col = C+1},Lim); +strip("\t" ++ _T, S ,no_tab) -> + ?fatal({error,{no_tab_allowed}},S); +strip("\t" ++ T, S=#xmerl_scanner{col = C},Lim) -> + strip(T, S#xmerl_scanner{col = expand_tab(C)},Lim); +strip("\n" ++ T, S=#xmerl_scanner{line = L},Lim) -> + strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim); +strip("\r\n" ++ T, S=#xmerl_scanner{line = L},Lim) -> + %% CR followed by LF is read as a single LF + strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim); +strip("\r" ++ T, S=#xmerl_scanner{line = L},Lim) -> + %% CR not followed by LF is read as a LF + strip(T, S#xmerl_scanner{line = L+1, col = 1},Lim); +strip(Str, S,_Lim) -> + {[], Str, S}. + +%% demands a whitespace, though a parameter entity is ok, it will +%% expand with a whitespace on each side. +mandatory_strip([],S) -> + ?fatal({error,{whitespace_was_expected}},S); +mandatory_strip(T,S) when ?whitespace(hd(T)) -> + strip(T,S,all); +mandatory_strip([$%|T],S) when ?whitespace(hd(T)) -> %this is not a PERefence, but an PEDeclaration + ?fatal({error,{whitespace_was_expected}},S); +mandatory_strip([$%|_T]=T,S) -> + {[],T,S}; +mandatory_strip(_T,S) -> + ?fatal({error,{whitespace_was_expected}},S). + +%% strip but don't accept tab +pub_id_strip(Str, S) -> + strip(Str,S,no_tab). + + +normalize("&"++T,S,IsNorm) -> + case scan_reference(T, S) of + {ExpRef, T1, S1} when ?whitespace(hd(ExpRef)) -> + ExpRef2 = string_to_char_set(S#xmerl_scanner.encoding,ExpRef), + normalize(ExpRef2++T1,S1,IsNorm); + _ -> + {"&"++T,S,IsNorm} + end; +normalize(T,S,IsNorm) -> + case strip(T,S) of + {_,T,S} -> + {T,S,IsNorm}; + {_,T1,S1} -> + {T1,S1,true} + end. + + +%% Optimization: +%% - avoid building list of spaces or tabs; +%% - avoid reverse; +%% - compact two common indentation patterns. +%% Note: only to be called when a \n was found. +fast_accumulate_whitespace(" " ++ T, S, _) -> + fast_acc_spaces(T, S, 1); +fast_accumulate_whitespace("\t"++T, S, _) -> + fast_acc_tabs(T, S, 1); +fast_accumulate_whitespace("<"++_=R, S, _T) -> + #xmerl_scanner{common_data = CD, line = Line} = S, + {done, {element(3, CD), R, S#xmerl_scanner{col = 1, line = Line + 1}}}; +fast_accumulate_whitespace(_, S, T) -> + accumulate_whitespace(T, S, []). + +fast_acc_spaces(" " ++ T, S, N) -> + fast_acc_spaces(T, S, N + 1); +fast_acc_spaces(T, S, N) -> + fast_acc_end(T, S, N, N, $\s, 1). + +fast_acc_tabs("\t" ++ T, S, N) -> + fast_acc_tabs(T, S, N + 1); +fast_acc_tabs(T, S, N) -> + fast_acc_end(T, S, N, N * 8 + 1, $\t, 2). + +fast_acc_end(T, S, N, Col, C, CD_I) -> + #xmerl_scanner{common_data = CD, line = Line0} = S, + Line = Line0 + 1, + try + $< = hd(T), + {done,{element(N, element(CD_I, CD)), T, + S#xmerl_scanner{col = Col, line = Line}}} + catch _:_ -> + accumulate_whitespace(T, S, Line, Col, lists:duplicate(N, C)++"\n") + end. + + +%%% @spec accumulate_whitespace(T::string(),S::global_state(), +%%% atom(),Acc::string()) -> {Acc, T1, S1} +%%% +%%% @doc Function to accumulate and normalize whitespace. +accumulate_whitespace(T, S, preserve, Acc) -> + accumulate_whitespace(T, S, Acc); +accumulate_whitespace(T, S, normalize, Acc) -> + {_WsAcc, T1, S1} = accumulate_whitespace(T, S, []), + {[$\s|Acc], T1, S1}. + +accumulate_whitespace(T, S, Acc) -> + #xmerl_scanner{line = Line, col = Col} = S, + accumulate_whitespace(T, S, Line, Col, Acc). + +accumulate_whitespace([], S0, Line, Col, Acc) -> + #xmerl_scanner{continuation_fun = F} = S0, + S = S0#xmerl_scanner{line = Line, col = Col}, + ?dbg("cont()...~n", []), + F(fun(MoreBytes, S1) -> accumulate_whitespace(MoreBytes, S1, Acc) end, + fun(S1) -> {Acc, [], S1} end, + S); +accumulate_whitespace("\s" ++ T, S, Line, Col, Acc) -> + accumulate_whitespace(T, S, Line, Col+1, [$\s|Acc]); +accumulate_whitespace("\t" ++ T, S, Line, Col, Acc) -> + accumulate_whitespace(T, S, Line, expand_tab(Col), [$\t|Acc]); +accumulate_whitespace("\n" ++ T, S, Line, _Col, Acc) -> + accumulate_whitespace(T, S, Line+1, 1, [$\n|Acc]); +accumulate_whitespace("\r\n" ++ T, S, Line, _Col, Acc) -> + %% CR followed by LF is read as a single LF + accumulate_whitespace(T, S, Line+1, 1, [$\n|Acc]); +accumulate_whitespace("\r" ++ T, S, Line, _Col, Acc) -> + %% CR not followed by LF is read as a LF + accumulate_whitespace(T, S, Line+1, 1, [$\n|Acc]); +accumulate_whitespace(Str, S, Line, Col, Acc) -> + {Acc, Str, S#xmerl_scanner{line = Line, col = Col}}. + +expand_tab(Col) -> + Rem = (Col-1) rem 8, + _NewCol = Col + 8 - Rem. + +%% validation_mode(Validation) +%% Validation = off | dtd | schema | true | false +%% true and false are obsolete +validation_mode(false) -> + off; +validation_mode(true) -> + dtd; +validation_mode(Other) -> + Other. + + +schemaLocations(El,#xmerl_scanner{schemaLocation=[]}) -> + schemaLocations(El); +schemaLocations(El,#xmerl_scanner{schemaLocation=SL}) -> + case SL of + [{_,_}|_] -> + {ok,SL}; + _ -> + schemaLocations(El) + end. + +schemaLocations(#xmlElement{attributes=Atts,xmlbase=_Base}) -> + Pred = fun(#xmlAttribute{name=schemaLocation}) -> false; + (#xmlAttribute{namespace={_,"schemaLocation"}}) -> false; + (_) -> true + end, + case lists:dropwhile(Pred,Atts) of + [#xmlAttribute{value=Paths}|_] -> + + case string:tokens(Paths," ") of + L when length(L) > 0 -> + case length(L) rem 2 of + 0 -> + PairList = + fun([],_Fun) -> + []; + ([SLNS,SLLoc|Rest],Fun) -> + [{SLNS,SLLoc}|Fun(Rest,Fun)] + end, + {ok,PairList(L,PairList)}; + _ -> + {error,{schemaLocation_attribute,namespace_location_not_in_pair}} + end; + _ -> + {error,{missing_schemaLocation}} + end; + [] -> + {error,{missing_schemaLocation}} + end. + +inherit_options(S) -> + %%io:format("xsdbase: ~p~n",[S#xmerl_scanner.xmlbase]), + [{xsdbase,S#xmerl_scanner.xmlbase}]. + +handle_schema_result({XSDRes=#xmlElement{},_},S5) -> + {XSDRes,S5}; +handle_schema_result({error,Reason},S5) -> + ?fatal({failed_schema_validation,Reason},S5). + +%%% Helper functions + +fatal(Reason, S) -> + exit({fatal, {Reason, + {file,S#xmerl_scanner.filename}, + {line,S#xmerl_scanner.line}, + {col,S#xmerl_scanner.col}}}). + +%% preformat formats tokens in L1 and L2, L2 separated by Sep into a +%% list +preformat(L1,L2,Sep) -> + Format1= lists:flatten(lists:duplicate(length(L1)-1,"~s ")++"~s"), + Format2 = lists:flatten(lists:duplicate(length(L2)-1, + " ~s"++Sep)++" ~s"), + + lists:flatten(io_lib:format(Format1++Format2,L1++L2)). + + +%% BUG when we are many <!ATTLIST ..> balise none attributes has save in rules +rules_write(Context, Name, Value, #xmerl_scanner{rules = T} = S) -> + case ets:lookup(T, {Context, Name}) of + [] -> + ets:insert(T, {{Context, Name}, Value}); + _ -> + ok + end, + S. + + +rules_read(Context, Name, #xmerl_scanner{rules = T}) -> + case ets:lookup(T, {Context, Name}) of + [] -> + undefined; + [{_, V}] -> + V + end. + +rules_delete(Context,Name,#xmerl_scanner{rules = T}) -> + ets:delete(T,{Context,Name}). + +to_ucs(Encoding, Chars) when Encoding=="utf-8"; Encoding == undefined -> + utf8_2_ucs(Chars); +to_ucs(_,[C|Rest]) -> + {C,Rest}. + +utf8_2_ucs([A,B,C,D|Rest]) when A band 16#f8 =:= 16#f0, + B band 16#c0 =:= 16#80, + C band 16#c0 =:= 16#80, + D band 16#c0 =:= 16#80 -> + %% 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv + case ((D band 16#3f) bor ((C band 16#3f) bsl 6) bor + ((B band 16#3f) bsl 12) bor ((A band 16#07) bsl 18)) of + Ch when Ch >= 16#10000 -> + {Ch,Rest}; + Ch -> + {{error,{bad_character,Ch}},Rest} + end; +utf8_2_ucs([A,B,C|Rest]) when A band 16#f0 =:= 16#e0, + B band 16#c0 =:= 16#80, + C band 16#c0 =:= 16#80 -> + %% 1110vvvv 10vvvvvv 10vvvvvv + case ((C band 16#3f) bor ((B band 16#3f) bsl 6) bor + ((A band 16#0f) bsl 12)) of + Ch when Ch >= 16#800 -> + {Ch,Rest}; + Ch -> + {{error,{bad_character,Ch}},Rest} + end; +utf8_2_ucs([A,B|Rest]) when A band 16#e0 =:= 16#c0, + B band 16#c0 =:= 16#80 -> + %% 110vvvvv 10vvvvvv + case ((B band 16#3f) bor ((A band 16#1f) bsl 6)) of + Ch when Ch >= 16#80 -> + {Ch,Rest}; + Ch -> + {{error,{bad_character,Ch}},Rest} + end; +utf8_2_ucs([A|Rest]) when A < 16#80 -> + {A,Rest}; +utf8_2_ucs([A|Rest]) -> + {{error,{bad_character,A}},Rest}. + +to_char_set("iso-10646-utf-1",Ch) -> + [Ch]; +to_char_set(UTF8,Ch) when UTF8 =:= "utf-8"; UTF8 =:= undefined -> + ucs_2_utf8(Ch); +to_char_set(_,Ch) -> + [Ch]. + +ucs_2_utf8(Ch) when Ch < 128 -> + %% 0vvvvvvv + [Ch]; +ucs_2_utf8(Ch) when Ch < 16#0800 -> + %% Ch: -----vvv vvvvvvvv + %% 110vvvvv 10vvvvvv + %% O1 = (Ch band 16#07c0) bsr 6, + %% O2 = (Ch band 16#003f), + [((Ch band 16#07c0) bsr 6) bor 16#c0,(Ch band 16#003f) bor 16#80]; +ucs_2_utf8(Ch) when Ch < 16#10000 -> + %% Ch: vvvvvvvv vvvvvvvv + %% 1110vvvv 10vvvvvv 10vvvvvv + %% O1 = (Ch band 16#f000) bsr 12 + %% O2 = (Ch band 16#0fc0) bsr 6 + %% O3 = (Ch band 16#003f) + [((Ch band 16#f000) bsr 12) bor 16#e0, + ((Ch band 16#0fc0) bsr 6) bor 16#80, + (Ch band 16#003f) bor 16#80]; +ucs_2_utf8(Ch) when Ch < 16#200000 -> + %% Ch: ---vvvvv vvvvvvvv vvvvvvvv + %% 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv + %% O1 = (Ch band 16#1c0000) bsr 18 + %% O2 = (Ch band 16#03f000) bsr 12 + %% O3 = (Ch band 16#000fc0) bsr 6 + %% O4 = (Ch band 16#00003f) + [((Ch band 16#1c0000) bsr 18) bor 16#f0, + ((Ch band 16#03f000) bsr 12) bor 16#80, + ((Ch band 16#000fc0) bsr 6) bor 16#80, + (Ch band 16#00003f) bor 16#80]. + + +string_to_char_set(Enc,Str) when Enc =:= "utf-8"; Enc =:= undefined -> + lists:flatten([ucs_2_utf8(X)||X <- Str]); +string_to_char_set(_,Str) -> + Str. + +%% diagnose(Line) -> +%% Mem=erlang:memory(), +%% {OldTot,OldLine} = get_total(), +%% NewTot = +%% case {lists:keysearch(total,1,Mem),OldTot*1.1} of +%% {{_,{_,Tot}},Tot110} when Tot > Tot110 -> +%% io:format("From ~p to ~p, total memory: ~p (~p)~n",[OldLine,Line,Tot,OldTot]), +%% Tot; +%% {{_,{_,Tot}},_} -> +%% Tot +%% end, +%% put_total({NewTot,Line}). + +%% get_total() -> +%% case get(xmerl_mem) of +%% undefined -> +%% put(xmerl_mem,{0,0}), +%% {0,0}; +%% M -> M +%% end. + +%% put_total(M) -> +%% put(xmerl_mem,M). diff --git a/lib/xmerl/src/xmerl_sgml.erl b/lib/xmerl/src/xmerl_sgml.erl new file mode 100644 index 0000000000..1fe6453e7f --- /dev/null +++ b/lib/xmerl/src/xmerl_sgml.erl @@ -0,0 +1,65 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2004-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + + %% Description : Callback module for exporting XML to SGML. + +-module(xmerl_sgml). + +-export(['#xml-inheritance#'/0]). + +%% Note: we assume XML data, so all tags are lowercase! + +-export(['#root#'/4, + '#element#'/5, + '#text#'/1]). + +-import(xmerl_lib, [markup/3, find_attribute/2, export_text/1]). + +-include("xmerl.hrl"). + + +'#xml-inheritance#'() -> []. + + +%% The '#text#' function is called for every text segment. + +'#text#'(Text) -> + export_text(Text). + + +%% The '#root#' tag is called when the entire structure has been +%% exported. It does not appear in the structure itself. + +'#root#'(Data, Attrs, [], _E) -> + case find_attribute(header, Attrs) of + {value, Hdr} -> + [Hdr, Data]; + false -> + Data + end. + + +%% Note that SGML does not have the <Tag/> empty-element form. +%% Furthermore, for some element types, the end tag may be forbidden - +%% this can be handled by extending this module - see xmerl_otpsgml.erl +%% for an example. (By default, we always generate the end tag, to make +%% sure that the scope of a markup is not extended by mistake.) + +'#element#'(Tag, Data, Attrs, _Parents, _E) -> + markup(Tag, Attrs, Data). diff --git a/lib/xmerl/src/xmerl_simple.erl b/lib/xmerl/src/xmerl_simple.erl new file mode 100644 index 0000000000..573dcd2a62 --- /dev/null +++ b/lib/xmerl/src/xmerl_simple.erl @@ -0,0 +1,109 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + + +%% Description : Simple event-based processor (front-end to xmerl_scanner) + + +-module(xmerl_simple). + +-export([file/2, + string/2]). + +-include("xmerl.hrl"). + +-record(state, {content_acc = [], + attr_acc = [], + content_stack = [], + attr_stack = []}). + +file(Fname, Opts) -> + Opts1 = scanner_options(Opts), + xmerl_scan:file(Fname, Opts1). + +string(Str, Opts) -> + Opts1 = scanner_options(Opts), + xmerl_scan:string(Str, Opts1). + +scanner_options(Opts) -> + EventS = #state{}, + scanner_options(Opts, + [{event_fun, fun event/2, EventS}, + {acc_fun, fun(_, Acc, S) -> {Acc,S} end}, + {close_fun, fun close/1}]). + +scanner_options([H|T], Opts) -> + case catch keyreplace(H, 1, Opts) of + false -> + scanner_options(T, [H|Opts]); + NewOpts -> + scanner_options(T, NewOpts) + end; +scanner_options([], Opts) -> + Opts. + +keyreplace(X, Pos, [H|T]) when element(Pos, X) == element(Pos, H) -> + [X|T]; +keyreplace(X, Pos, [H|T]) -> + [H|keyreplace(X, Pos, T)]; +keyreplace(_X, _Pos, []) -> + throw(false). + + +close(S) -> + ES = xmerl_scan:event_state(S), + #state{attr_stack = [], + content_stack = [], + %% attr_acc may contain document attributes + content_acc = Content} = ES, + lists:reverse(Content). + +event(#xmerl_event{event = started, data = #xmlElement{}}, S) -> + #state{content_acc = CAcc, + attr_acc = AAcc, + content_stack = CSt, + attr_stack = ASt} = ES = xmerl_scan:event_state(S), + xmerl_scan:event_state(ES#state{content_acc = [], + attr_acc = [], + content_stack = [CAcc | CSt], + attr_stack = [AAcc | ASt]}, S); + +event(#xmerl_event{event = ended, data = #xmlElement{name = Name}}, S) -> + #state{content_acc = CAcc, + attr_acc = AAcc, + content_stack = [PrevCAcc | CSt], + attr_stack = [PrevAAcc | ASt]} = ES = xmerl_scan:event_state(S), + Simple = {Name, lists:reverse(AAcc), lists:reverse(CAcc)}, + xmerl_scan:event_state(ES#state{content_acc = [Simple|PrevCAcc], + attr_acc = PrevAAcc, + content_stack = CSt, + attr_stack = ASt}, S); + +event(#xmerl_event{event = ended, data = #xmlAttribute{name = Name, + value = Value}}, S) -> + #state{attr_acc = AAcc} = ES = xmerl_scan:event_state(S), + Simple = {Name, Value}, + xmerl_scan:event_state(ES#state{attr_acc = [Simple|AAcc]}, S); + +event(#xmerl_event{event = ended, data = #xmlText{value = Text}}, S) -> + #state{content_acc = CAcc} = ES = xmerl_scan:event_state(S), + xmerl_scan:event_state(ES#state{content_acc = [Text|CAcc]}, S); + +event(_E, S) -> + S. diff --git a/lib/xmerl/src/xmerl_text.erl b/lib/xmerl/src/xmerl_text.erl new file mode 100644 index 0000000000..70df8369ff --- /dev/null +++ b/lib/xmerl/src/xmerl_text.erl @@ -0,0 +1,50 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% Description : Callback module for exporting as raw text. + + +-module(xmerl_text). + +-export(['#xml-inheritance#'/0]). + +-export(['#root#'/4, + '#element#'/5, + '#text#'/1]). + +-include("xmerl.hrl"). + + +'#xml-inheritance#'() -> []. + + +%% The '#text#' function is called for every text segment. + +'#text#'(Text) -> Text. + + +%% The '#root#' tag is called when the entire structure has been +%% exported. It does not appear in the structure itself. + +'#root#'(Data, _Attrs, [], _E) -> Data. + + +%% The '#element#' function is the default handler for XML elements. + +'#element#'(_Tag, Data, _Attrs, _Parents, _E) -> Data. diff --git a/lib/xmerl/src/xmerl_ucs.erl b/lib/xmerl/src/xmerl_ucs.erl new file mode 100644 index 0000000000..7c45c838ab --- /dev/null +++ b/lib/xmerl/src/xmerl_ucs.erl @@ -0,0 +1,556 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2005-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +-module(xmerl_ucs). + +-compile([verbose,report_warnings,warn_unused_vars]). + + + +%%% Conversion to/from IANA recognised character sets +-export([to_unicode/2]). + +%%% Micellaneous predicates +-export([is_iso10646/1, is_unicode/1, is_bmpchar/1, is_latin1/1, is_ascii/1, + is_visible_latin1/1, is_visible_ascii/1, is_iso646_basic/1, + is_incharset/2]). + +%%% Conversion to/from RFC-1345 style mnemonic strings consisting +%%% of subsets of ISO-10646 with "escape" sequences. +%-export([from_mnemonic/1, from_mnemonic/2]). + +%%% UCS-2, UCS-4, UTF-16, and UTF-8 encoding and decoding +-export([to_ucs2be/1,from_ucs2be/1, from_ucs2be/2]). +-export([to_ucs2le/1,from_ucs2le/1, from_ucs2le/2]). +-export([to_ucs4be/1,from_ucs4be/1, from_ucs4be/2]). +-export([to_ucs4le/1,from_ucs4le/1, from_ucs4le/2]). +-export([to_utf16be/1, from_utf16be/1, from_utf16be/2]). +-export([to_utf16le/1, from_utf16le/1, from_utf16le/2]). +-export([to_utf8/1, from_utf8/1]). + +%%% NB: Non-canonical UTF-8 encodings and incorrectly used +%%% surrogate-pair codes are disallowed by this code. There are +%%% important security implications concerning them. DO NOT REMOVE +%%% THE VARIOUS GUARDS AND TESTS THAT ENFORCE THIS POLICY. + +%%% Test if Ch is a legitimate ISO-10646 character code +is_iso10646(Ch) when is_integer(Ch), Ch >= 0 -> + if Ch < 16#D800 -> true; + Ch < 16#E000 -> false; % Surrogates + Ch < 16#FFFE -> true; + Ch =< 16#FFFF -> false; % FFFE and FFFF (not characters) + Ch =< 16#7FFFFFFF -> true; + true -> false + end; +is_iso10646(_) -> false. + +%%% Test if Ch is a legitimate ISO-10646 character code capable of +%%% being encoded in a UTF-16 string. +is_unicode(Ch) when Ch < 16#110000 -> is_iso10646(Ch); +is_unicode(_) -> false. + +%%% Test if Ch is a legitimate ISO-10646 character code belonging to +%%% the basic multi-lingual plane (BMP). +is_bmpchar(Ch) when is_integer(Ch), Ch >= 0 -> + if Ch < 16#D800 -> true; + Ch < 16#E000 -> false; % Surrogates + Ch < 16#FFFE -> true; + true -> false + end; +is_bmpchar(_) -> false. + +%%% Test for legitimate Latin-1 code +is_latin1(Ch) when is_integer(Ch), Ch >= 0, Ch =< 255 -> true; +is_latin1(_) -> false. + +%%% Test for legitimate ASCII code +is_ascii(Ch) when is_integer(Ch), Ch >= 0, Ch =< 127 -> true; +is_ascii(_) -> false. + +%%% Test for char an element of ISO-646.basic set +is_iso646_basic(Ch) when is_integer(Ch), Ch >= $\s -> + if Ch =< $Z -> + %% Everything in this range except $# $$ and $@ + if Ch > $$ -> Ch =/= $@; + true -> Ch < $# + end; + %% Only $_ and $a .. $z in range above $Z + Ch > $z -> false; + Ch >= $a -> true; + true -> Ch =:= $_ + end; +is_iso646_basic(_) -> + false. + +%%% Test for char a visible Latin-1 char, i.e. a non-control Latin-1 char, +%%% excepting non-break space (but including space). +is_visible_latin1(Ch) when is_integer(Ch), Ch >= $\s -> + if Ch =< $~ -> true; + Ch >= 161 -> Ch =< 255 + end; +is_visible_latin1(_) -> + false. + +%%% Test for char a visible ASCII char, i.e. a non-control ASCII char +%%% (including space). +is_visible_ascii(Ch) when is_integer(Ch), Ch >= $\s -> Ch =< $~; +is_visible_ascii(_) -> false. + + +%%% UCS-4, big and little endian versions, encoding and decoding +to_ucs4be(List) when is_list(List) -> lists:flatmap(fun to_ucs4be/1, List); +to_ucs4be(Ch) -> char_to_ucs4be(Ch). + +from_ucs4be(Bin) when is_binary(Bin) -> from_ucs4be(Bin,[],[]); +from_ucs4be(List) -> from_ucs4be(list_to_binary(List),[],[]). + +from_ucs4be(Bin,Tail) when is_binary(Bin) -> from_ucs4be(Bin,[],Tail); +from_ucs4be(List,Tail) -> from_ucs4be(list_to_binary(List),[],Tail). + +to_ucs4le(List) when is_list(List) -> lists:flatmap(fun to_ucs4le/1, List); +to_ucs4le(Ch) -> char_to_ucs4le(Ch). + +from_ucs4le(Bin) when is_binary(Bin) -> from_ucs4le(Bin,[],[]); +from_ucs4le(List) -> from_ucs4le(list_to_binary(List),[],[]). + +from_ucs4le(Bin,Tail) when is_binary(Bin) -> from_ucs4le(Bin,[],Tail); +from_ucs4le(List,Tail) -> from_ucs4le(list_to_binary(List),[],Tail). + +%%% UCS-2, big and little endian versions, encoding and decoding +to_ucs2be(List) when is_list(List) -> lists:flatmap(fun to_ucs2be/1, List); +to_ucs2be(Ch) -> char_to_ucs2be(Ch). + +from_ucs2be(Bin) when is_binary(Bin) -> from_ucs2be(Bin,[],[]); +from_ucs2be(List) -> from_ucs2be(list_to_binary(List),[],[]). + +from_ucs2be(Bin,Tail) when is_binary(Bin) -> from_ucs2be(Bin,[],Tail); +from_ucs2be(List,Tail) -> from_ucs2be(list_to_binary(List),[],Tail). + +to_ucs2le(List) when is_list(List) -> lists:flatmap(fun to_ucs2le/1, List); +to_ucs2le(Ch) -> char_to_ucs2le(Ch). + +from_ucs2le(Bin) when is_binary(Bin) -> from_ucs2le(Bin,[],[]); +from_ucs2le(List) -> from_ucs2le(list_to_binary(List),[],[]). + +from_ucs2le(Bin,Tail) when is_binary(Bin) -> from_ucs2le(Bin,[],Tail); +from_ucs2le(List,Tail) -> from_ucs2le(list_to_binary(List),[],Tail). + + +%%% UTF-16, big and little endian versions, encoding and decoding +to_utf16be(List) when is_list(List) -> lists:flatmap(fun to_utf16be/1, List); +to_utf16be(Ch) -> char_to_utf16be(Ch). + +from_utf16be(Bin) when is_binary(Bin) -> from_utf16be(Bin,[],[]); +from_utf16be(List) -> from_utf16be(list_to_binary(List),[],[]). + +from_utf16be(Bin,Tail) when is_binary(Bin) -> from_utf16be(Bin,[],Tail); +from_utf16be(List,Tail) -> from_utf16be(list_to_binary(List),[],Tail). + +to_utf16le(List) when is_list(List) -> lists:flatmap(fun to_utf16le/1, List); +to_utf16le(Ch) -> char_to_utf16le(Ch). + +from_utf16le(Bin) when is_binary(Bin) -> from_utf16le(Bin,[],[]); +from_utf16le(List) -> from_utf16le(list_to_binary(List),[],[]). + +from_utf16le(Bin,Tail) when is_binary(Bin) -> from_utf16le(Bin,[],Tail); +from_utf16le(List,Tail) -> from_utf16le(list_to_binary(List),[],Tail). + + +%%% UTF-8 encoding and decoding +to_utf8(List) when is_list(List) -> lists:flatmap(fun to_utf8/1, List); +to_utf8(Ch) -> char_to_utf8(Ch). + +from_utf8(Bin) when is_binary(Bin) -> from_utf8(binary_to_list(Bin)); +from_utf8(List) -> + case expand_utf8(List) of + {Result,0} -> Result; + {_Res,_NumBadChar} -> + exit({ucs,{bad_utf8_character_code}}) + end. + + + + +%%% UCS-4 support +%%% Possible errors encoding UCS-4: +%%% - Non-character values (something other than 0 .. 2^31-1) +%%% - Surrogate-pair code in string. +%%% - 16#FFFE or 16#FFFF character in string. +%%% Possible errors decoding UCS-4: +%%% - Element out of range (i.e. the "sign" bit is set). +%%% - Surrogate-pair code in string. +%%% - 16#FFFE or 16#FFFF character in string. +char_to_ucs4be(Ch) -> + true = is_iso10646(Ch), + [(Ch bsr 24), + (Ch bsr 16) band 16#FF, + (Ch bsr 8) band 16#FF, + Ch band 16#FF]. + +from_ucs4be(<<Ch:32/big-signed-integer, Rest/binary>>,Acc,Tail) -> + if Ch < 0; Ch >= 16#D800, Ch < 16#E000; Ch =:= 16#FFFE; Ch =:= 16#FFFF -> + exit({bad_character_code,Ch}); + true -> + from_ucs4be(Rest,[Ch|Acc],Tail) + end; +from_ucs4be(<<>>,Acc,Tail) -> + lists:reverse(Acc,Tail); +from_ucs4be(Bin,Acc,Tail) -> + io:format("ucs Error: Bin=~p~n Acc=~p~n Tail=~p~n",[Bin,Acc,Tail]), + {error,not_ucs4be}. + +char_to_ucs4le(Ch) -> + true = is_iso10646(Ch), + [Ch band 16#FF, + (Ch bsr 8) band 16#FF, + (Ch bsr 16) band 16#FF, + (Ch bsr 24)]. + + +from_ucs4le(<<Ch:32/little-signed-integer, Rest/binary>>,Acc,Tail) -> + if Ch < 0; Ch >= 16#D800, Ch < 16#E000; Ch =:= 16#FFFE; Ch =:= 16#FFFF -> + exit({bad_character_code,Ch}); + true -> + from_ucs4le(Rest,[Ch|Acc],Tail) + end; +from_ucs4le(<<>>,Acc,Tail) -> + lists:reverse(Acc,Tail); +from_ucs4le(Bin,Acc,Tail) -> + io:format("ucs Error: Bin=~p~n Acc=~p~n Tail=~p~n",[Bin,Acc,Tail]), + {error,not_ucs4le}. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% UCS-2 support +%%% FIXME! Don't know how to encode UCS-2!! +%%% Currently I just encode as UCS-4, but strips the 16 higher bits. +char_to_ucs2be(Ch) -> + true = is_iso10646(Ch), + [(Ch bsr 8) band 16#FF, + Ch band 16#FF]. + +from_ucs2be(<<Ch:16/big-signed-integer, Rest/binary>>,Acc,Tail) -> + if Ch < 0; Ch >= 16#D800, Ch < 16#E000; Ch =:= 16#FFFE; Ch =:= 16#FFFF -> + exit({bad_character_code,Ch}); + true -> + from_ucs2be(Rest,[Ch|Acc],Tail) + end; +from_ucs2be(<<>>,Acc,Tail) -> + lists:reverse(Acc,Tail); +from_ucs2be(Bin,Acc,Tail) -> + io:format("ucs Error: Bin=~p~n Acc=~p~n Tail=~p~n",[Bin,Acc,Tail]), + {error,not_ucs2be}. + +char_to_ucs2le(Ch) -> + true = is_iso10646(Ch), + [(Ch bsr 16) band 16#FF, + (Ch bsr 24)]. + + +from_ucs2le(<<Ch:16/little-signed-integer, Rest/binary>>,Acc,Tail) -> + if Ch < 0; Ch >= 16#D800, Ch < 16#E000; Ch =:= 16#FFFE; Ch =:= 16#FFFF -> + exit({bad_character_code,Ch}); + true -> + from_ucs4le(Rest,[Ch|Acc],Tail) + end; +from_ucs2le(<<>>,Acc,Tail) -> + lists:reverse(Acc,Tail); +from_ucs2le(Bin,Acc,Tail) -> + io:format("ucs Error: Bin=~p~n Acc=~p~n Tail=~p~n",[Bin,Acc,Tail]), + {error,not_ucs2le}. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% UTF-16 support +%%% Possible errors encoding UTF-16 +%%% - Non-character values (something other than 0 .. 2^31-1) +%%% - Surrogate-pair code in string. +%%% - 16#FFFE or 16#FFFF character in string. +%%% NB: the UCS replacement char (U+FFFD) will be quietly substituted +%%% for unrepresentable chars (i.e. those geq to 2^20+2^16). +%%% Possible errors decoding UTF-16: +%%% - Unmatched surrogate-pair code in string. +%%% - 16#FFFE or 16#FFFF character in string. +char_to_utf16be(Ch) when is_integer(Ch), Ch >= 0 -> + if Ch =< 16#FFFF -> + if Ch < 16#D800; Ch >= 16#E000, Ch < 16#FFFE -> + [Ch bsr 8, Ch band 16#FF] + end; + Ch < 16#110000 -> + %% Encode with surrogate pair + X = Ch - 16#10000, + [16#D8 + (X bsr 18), + (X bsr 10) band 16#FF, + 16#DC + ((X bsr 8) band 3), + X band 16#FF]; + Ch =< 16#7FFFFFFF -> + %% Unrepresentable char: use REPLACEMENT CHARACTER (U+FFFD) + [16#FF, 16#FD] + end. + +from_utf16be(<<Ch:16/big-unsigned-integer, Rest/binary>>, Acc, Tail) + when Ch < 16#D800; Ch > 16#DFFF -> + if Ch < 16#FFFE -> from_utf16be(Rest,[Ch|Acc],Tail) end; +from_utf16be(<<Hi:16/big-unsigned-integer, Lo:16/big-unsigned-integer, + Rest/binary>>, Acc, Tail) + when Hi >= 16#D800, Hi < 16#DC00, Lo >= 16#DC00, Lo =< 16#DFFF -> + %% Surrogate pair + Ch = ((Hi band 16#3FF) bsl 10) + (Lo band 16#3FF) + 16#10000, + from_utf16be(Rest, [Ch|Acc], Tail); +from_utf16be(<<>>,Acc,Tail) -> + lists:reverse(Acc,Tail); +from_utf16be(Bin,Acc,Tail) -> + io:format("ucs Error: Bin=~p~n Acc=~p~n Tail=~p~n",[Bin,Acc,Tail]), + {error,not_utf16be}. + +char_to_utf16le(Ch) when is_integer(Ch), Ch >= 0 -> + if Ch =< 16#FFFF -> + if Ch < 16#D800; Ch >= 16#E000, Ch < 16#FFFE -> + [Ch band 16#FF, Ch bsr 8] + end; + Ch < 16#110000 -> + %% Encode with surrogate pair + X = Ch - 16#10000, + [(X bsr 10) band 16#FF, + 16#D8 + (X bsr 18), + X band 16#FF, + 16#DC + ((X bsr 8) band 3)]; + Ch =< 16#7FFFFFFF -> + %% Unrepresentable char: use REPLACEMENT CHARACTER (U+FFFD) + [16#FD, 16#FF] + end. + +from_utf16le(<<Ch:16/little-unsigned-integer, Rest/binary>>, Acc, Tail) + when Ch < 16#D800; Ch > 16#DFFF -> + if Ch < 16#FFFE -> from_utf16le(Rest, [Ch|Acc], Tail) end; +from_utf16le(<<Hi:16/little-unsigned-integer, Lo:16/little-unsigned-integer, + Rest/binary>>, Acc, Tail) + when Hi >= 16#D800, Hi < 16#DC00, Lo >= 16#DC00, Lo =< 16#DFFF -> + %% Surrogate pair + Ch = ((Hi band 16#3FF) bsl 10) + (Lo band 16#3FF) + 16#10000, + from_utf16le(Rest, [Ch|Acc], Tail); +from_utf16le(<<>>,Acc,Tail) -> + lists:reverse(Acc,Tail); +from_utf16le(Bin,Acc,Tail) -> + io:format("ucs Error: Bin=~p~n Acc=~p~n Tail=~p~n",[Bin,Acc,Tail]), + {error,not_utf16le}. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% UTF-8 support +%%% Possible errors encoding UTF-8: +%%% - Non-character values (something other than 0 .. 2^31-1). +%%% - Surrogate pair code in string. +%%% - 16#FFFE or 16#FFFF character in string. +%%% Possible errors decoding UTF-8: +%%% - 10xxxxxx or 1111111x as initial byte. +%%% - Insufficient number of 10xxxxxx octets following an initial octet of +%%% multi-octet sequence. +%%% - Non-canonical encoding used. +%%% - Surrogate-pair code encoded as UTF-8. +%%% - 16#FFFE or 16#FFFF character in string. +char_to_utf8(Ch) when is_integer(Ch), Ch >= 0 -> + if Ch < 128 -> + %% 0yyyyyyy + [Ch]; + Ch < 16#800 -> + %% 110xxxxy 10yyyyyy + [16#C0 + (Ch bsr 6), + 128+(Ch band 16#3F)]; + Ch < 16#10000 -> + %% 1110xxxx 10xyyyyy 10yyyyyy + if Ch < 16#D800; Ch > 16#DFFF, Ch < 16#FFFE -> + [16#E0 + (Ch bsr 12), + 128+((Ch bsr 6) band 16#3F), + 128+(Ch band 16#3F)] + end; + Ch < 16#200000 -> + %% 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy + [16#F0+(Ch bsr 18), + 128+((Ch bsr 12) band 16#3F), + 128+((Ch bsr 6) band 16#3F), + 128+(Ch band 16#3F)]; + Ch < 16#4000000 -> + %% 111110xx 10xxxyyy 10yyyyyy 10yyyyyy 10yyyyyy + [16#F8+(Ch bsr 24), + 128+((Ch bsr 18) band 16#3F), + 128+((Ch bsr 12) band 16#3F), + 128+((Ch bsr 6) band 16#3F), + 128+(Ch band 16#3F)]; + Ch < 16#80000000 -> + %% 1111110x 10xxxxyy 10yyyyyy 10yyyyyy 10yyyyyy 10yyyyyy + [16#FC+(Ch bsr 30), + 128+((Ch bsr 24) band 16#3F), + 128+((Ch bsr 18) band 16#3F), + 128+((Ch bsr 12) band 16#3F), + 128+((Ch bsr 6) band 16#3F), + 128+(Ch band 16#3F)] + end. + + + + +%% expand_utf8([Byte]) -> {[UnicodeChar],NumberOfBadBytes} +%% Expand UTF8 byte sequences to ISO 10646/Unicode +%% charactes. Any illegal bytes are removed and the number of +%% bad bytes are returned. +%% +%% Reference: +%% RFC 3629: "UTF-8, a transformation format of ISO 10646". + +expand_utf8(Str) -> + expand_utf8_1(Str, [], 0). + +expand_utf8_1([C|Cs], Acc, Bad) when C < 16#80 -> + %% Plain Ascii character. + expand_utf8_1(Cs, [C|Acc], Bad); +expand_utf8_1([C1,C2|Cs], Acc, Bad) when C1 band 16#E0 =:= 16#C0, + C2 band 16#C0 =:= 16#80 -> + case ((C1 band 16#1F) bsl 6) bor (C2 band 16#3F) of + C when 16#80 =< C -> + expand_utf8_1(Cs, [C|Acc], Bad); + _ -> + %% Bad range. + expand_utf8_1(Cs, Acc, Bad+1) + end; +expand_utf8_1([C1,C2,C3|Cs], Acc, Bad) when C1 band 16#F0 =:= 16#E0, + C2 band 16#C0 =:= 16#80, + C3 band 16#C0 =:= 16#80 -> + case ((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor + (C3 band 16#3F) of + C when 16#800 =< C -> + expand_utf8_1(Cs, [C|Acc], Bad); + _ -> + %% Bad range. + expand_utf8_1(Cs, Acc, Bad+1) + end; +expand_utf8_1([C1,C2,C3,C4|Cs], Acc, Bad) when C1 band 16#F8 =:= 16#F0, + C2 band 16#C0 =:= 16#80, + C3 band 16#C0 =:= 16#80, + C4 band 16#C0 =:= 16#80 -> + case ((((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor + (C3 band 16#3F)) bsl 6) bor (C4 band 16#3F) of + C when 16#10000 =< C -> + expand_utf8_1(Cs, [C|Acc], Bad); + _ -> + %% Bad range. + expand_utf8_1(Cs, Acc, Bad+1) + end; +expand_utf8_1([_|Cs], Acc, Bad) -> + %% Ignore bad character. + expand_utf8_1(Cs, Acc, Bad+1); +expand_utf8_1([], Acc, Bad) -> {lists:reverse(Acc),Bad}. + + + +%%% ---------------------------------------------------------------------------- +%%% Translation to/from any IANA defined character set, given that a mapping +%%% exists. Don't care about validating valid subsets of Unicode +to_unicode(Input,Cs) when Cs=='ansi_x3.4-1968';Cs=='iso-ir-6'; + Cs=='ansi_x3.4-1986';Cs=='iso_646.irv:1991'; + Cs=='ascii';Cs=='iso646-us';Cs=='us-ascii';Cs=='us'; + Cs=='ibm367';Cs=='cp367';Cs=='csascii' -> % US-ASCII + Input; +to_unicode(Input,Cs) when Cs=='iso-10646-utf-1';Cs=='csiso10646utf1' -> + Input; +to_unicode(Input,Cs) when Cs=='iso_646.basic:1983';Cs=='ref'; + Cs=='csiso646basic1983' -> + Input; +to_unicode(Input,Cs) when Cs=='iso_8859-1:1987';Cs=='iso-ir-100'; + Cs=='iso_8859-1';Cs=='iso-8859-1';Cs=='latin1'; + Cs=='l1';Cs=='ibm819'; + Cs=='cp819';Cs=='csisolatin1' -> + Input; +% to_unicode(Input,Cs) when Cs=='mnemonic';Cs=='"mnemonic+ascii+38'; +% Cs=='mnem';Cs=='"mnemonic+ascii+8200' -> +% from_mnemonic(Input); +to_unicode(Input,Cs) when Cs=='iso-10646-ucs-2';Cs=='csunicode' -> + from_ucs2be(Input); % Guess byteorder +to_unicode(Input,Cs) when Cs=='iso-10646-ucs-4';Cs=='csucs4' -> + from_ucs4be(Input); % Guess byteorder +to_unicode(Input,Cs) when Cs=='utf-16be';Cs=='utf-16' -> + from_utf16be(Input); +to_unicode(Input,'utf-16le') -> + from_utf16le(Input); +to_unicode(Input,'utf-8') -> + from_utf8(Input); +to_unicode(Input,Charset) -> + exit({bad_character_code,Input,Charset}). + %ucs_data:to_unicode(Input,Charset). + + + + +%%% Tests if Char is in Charset. +%%% Do this by trying to convert it into unicode, if possible a mapping was +%%% found and we are ok. +is_incharset(In,Cs) when Cs=='ansi_x3.4-1968';Cs=='iso-ir-6'; + Cs=='ansi_x3.4-1986';Cs=='iso_646.irv:1991'; + Cs=='ascii';Cs=='iso646-us';Cs=='us-ascii';Cs=='us'; + Cs=='ibm367';Cs=='cp367';Cs=='csascii' -> % US-ASCII + if + is_integer(In) -> is_ascii(In); + is_list(In) -> test_charset(fun is_ascii/1,In) + end; +is_incharset(In,Cs) when Cs=='iso-10646-utf-1';Cs=='csiso10646utf1' -> + if + is_integer(In) -> is_unicode(In); + is_list(In) -> test_charset(fun is_unicode/1, In) + end; +is_incharset(In,Cs) when Cs=='iso_646.basic:1983';Cs=='ref'; + Cs=='csiso646basic1983' -> + if + is_integer(In) -> is_iso646_basic(In); + is_list(In) -> test_charset(fun is_iso646_basic/1, In) + end; +is_incharset(In,Cs) when Cs=='iso_8859-1:1987';Cs=='iso-ir-100'; + Cs=='iso_8859-1';Cs=='iso-8859-1'; + Cs=='latin1';Cs=='l1';Cs=='ibm819'; + Cs=='cp819';Cs=='csisolatin1' -> + if + is_integer(In) -> is_latin1(In); + is_list(In) -> test_charset(fun is_latin1/1, In) + end; +is_incharset(In,Charset) when is_integer(In) -> + case to_unicode([In],Charset) of + {error,unsupported_charset} -> + {error,unsupported_charset}; + {error,_} -> + false; + [Int] when is_integer(Int) -> + true + end; +is_incharset(In,Charset) when is_list(In) -> + case to_unicode(In,Charset) of + {error,unsupported_charset} -> + {error,unsupported_charset}; + {error,_} -> + false; + [Int] when is_integer(Int) -> + true + end. + + +test_charset(Fun,Input) -> + case lists:all(Fun, Input) of + true -> + true; + _ -> + false + end. + diff --git a/lib/xmerl/src/xmerl_uri.erl b/lib/xmerl/src/xmerl_uri.erl new file mode 100644 index 0000000000..d8edb2e6e1 --- /dev/null +++ b/lib/xmerl/src/xmerl_uri.erl @@ -0,0 +1,478 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2005-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + + +-module(xmerl_uri). + + +-export([parse/1,resolve/2 +% scan_abspath/1 + ]). + + +%%% Parse URI and return {Scheme,Path} +%%% Note that Scheme specific parsing/validation is not handled here! +resolve(_Root,_Rel) -> + ok. + +%%% See "http://www.isi.edu/in-notes/iana/assignments/url-schemes" for a list of +%%% defined URL schemes and references to its sources. + +parse(URI) -> + case parse_scheme(URI) of + {http,Cont} -> parse_http(Cont,http); + {https,Cont} -> parse_http(Cont,https); + {ftp,Cont} -> parse_ftp(Cont,ftp); + {sip,Cont} -> parse_sip(Cont,sip); + {sips,Cont} -> parse_sip(Cont,sips); + {sms,Cont} -> parse_sms(Cont,sms); % Note: From old draft + {error,Error} -> {error,Error}; + {Scheme,Cont} -> {Scheme,Cont} + end. + + +%%% Parse the scheme. +parse_scheme(URI) -> + parse_scheme(URI,[]). + +parse_scheme([H|URI],Acc) when $a=<H,H=<$z; $A=<H,H=<$Z -> + parse_scheme2(URI,[H|Acc]); +parse_scheme(_,_) -> + {error,no_scheme}. + +parse_scheme2([H|URI],Acc) + when $a=<H,H=<$z; $A=<H,H=<$Z; $0=<H,H=<$9; H==$-;H==$+;H==$. -> + parse_scheme2(URI,[H|Acc]); +parse_scheme2([$:|URI],Acc) -> + {list_to_atom(lists:reverse(Acc)),URI}; +parse_scheme2(_,_) -> + {error,no_scheme}. + + +%%% ............................................................................ +-define(HTTP_DEFAULT_PORT, 80). +-define(HTTPS_DEFAULT_PORT, 443). + +%%% HTTP (Source RFC 2396, RFC 2616) +%%% http_URL = "*" | absoluteURI | abs_path [ "?" query ] | authority + +%%% http_URL = "http:" "//" host [ ":" port ] [ abs_path [ "?" query ]] +%%% Returns a tuple {http,Host,Port,Path,Query} where +%%% Host = string() Host value +%%% Port = string() Port value +%%% Path = string() Absolute path +%%% Query = string() Query or Fragment value +parse_http("//"++C0,Scheme) -> + case scan_hostport(C0,Scheme) of + {C1,Host,Port} -> + case scan_pathquery(C1) of + {error,Error} -> + {error,Error}; + {Path,Query} -> + {Scheme,Host,Port,Path,Query} + end; + {error,Error} -> + {error,Error} + end; +parse_http(_,_) -> + {error,invalid_url}. + +scan_pathquery(C0) -> + case scan_abspath(C0) of + {error,Error} -> + {error,Error}; + {[],[]} -> % Add implicit path + {"/",""}; + {"?"++C1,Path} -> + case scan_query(C1,[]) of + {error,Error} -> + {error,Error}; + Query -> + {Path,"?"++Query} + end; + {"#"++C1,Path} -> % 'query' and 'fragment' are both defined as '*uric' + case scan_query(C1,[]) of + {error,Error} -> + {error,Error}; + Fragment -> + {Path,"#"++Fragment} + end; + {[],Path} -> + {Path,""} + end. + + +%%% ............................................................................ +-define(FTP_DEFAULT_PORT, 21). + +%%% FTP (Source RFC 2396, RFC 1738, RFC 959) +%%% Encoded :, @, or / characters appearing within the username or +%%% password fields (as required by RFC 1738) are not handled. +%%% +%%% Note: This BNF has been modified to better fit with RFC 2396 +%%% ftp_URL = "ftp:" "//" [ ftp_userinfo "@"] host [ ":" port ] ftp_abs_path +%%% ftp_userinfo = ftp_user [ ":" ftp_password ] +%%% ftp_abs_path = "/" ftp_path_segments [ ";type=" ftp_type ] +%%% ftp_path_segments = ftp_segment *( "/" ftp_segment) +%%% ftp_segment = *[ ftp_uchar | "?" | ":" | "@" | "&" | "=" ] +%%% ftp_type = "A" | "I" | "D" | "a" | "i" | "d" +%%% ftp_user = *[ ftp_uchar | ";" | "?" | "&" | "=" ] +%%% ftp_password = *[ ftp_uchar | ";" | "?" | "&" | "=" ] +%%% ftp_uchar = ftp_unreserved | escaped +%%% ftp_unreserved = alphanum | mark | "$" | "+" | "," +parse_ftp("//"++C0,Scheme) -> + case ftp_userinfo(C0) of + {error, Error} -> + {error,Error}; + {C1,Creds} -> + case scan_hostport(C1,Scheme) of + {C2,Host,Port} -> + case scan_abspath(C2) of + {error,Error} -> + {error,Error}; + {[],[]} -> % Add implicit path + {Scheme,Creds,Host,Port,"/"}; + {[],Path} -> + {Scheme,Creds,Host,Port,Path} + end; + {error,Error} -> + {error,Error} + end + end. + +ftp_userinfo(C0) -> + ftp_userinfo(C0, []). + +ftp_userinfo([], Acc) -> + {lists:reverse(Acc), {"",""}}; +ftp_userinfo(C0=[$/ |_], Acc) -> + {lists:reverse(Acc)++C0, {"",""}}; +ftp_userinfo([$@ |C0], Acc) -> + {C0, ftp_userinfo_1(lists:reverse(Acc), 0, "", "")}; +ftp_userinfo([C |C0], Acc) -> + ftp_userinfo(C0, [C |Acc]). + + +ftp_userinfo_1([], 0, Acc, []) -> + { lists:reverse(Acc), ""}; +ftp_userinfo_1([], 1, Acc, User) -> + {User, lists:reverse(Acc)}; +ftp_userinfo_1([$:|_], 0, [], []) -> + {error,no_user}; +ftp_userinfo_1([$:|C0], 0, Acc,[]) -> + ftp_userinfo_1(C0, 1, [], lists:reverse(Acc)); + +ftp_userinfo_1([C|C0],Stage, Acc, User) -> + ftp_userinfo_1(C0,Stage, [C|Acc], User). + + +%%% ......................................................................... +-define(SIP_DEFAULT_PORT, 5060). +-define(SIPTLS_DEFAULT_PORT, 5061). + +%%% SIP (Source RFC 2396, RFC 3261) +%%% sip_URL = "sip:" [ sip_userinfo "@" ] host [ ":" port ] +%%% sip_uri-parameters [ sip_headers ] +%%% sip_userinfo = (sip_user | sip_telephone-subscriber) +%%% [ ":" sip_password ] +%%% sip_user = *( unreserved | escaped | +%%% "&" | "=" | "+" | "$" | "," | ";" | "?" | "/") +%%% sip_telephone-subscriber = See RFC2806 +%%% sip_password = *( unreserved | escaped | +%%% "&" | "=" | "+" | "$" | "," ) +%%% sip_uri-parameters = *( ";" sip_uri-parameter ) +%%% sip_uri-parameter = sip_transport-param | sip_user-param | +%%% sip_method-param | sip_ttl-param | +%%% sip_maddr-param | sip_lr-param | sip_other-param +%%% sip_transport-param = "transport=" ( "udp" | "tcp" | "sctp" | "tls" | token) +%%% sip_user-param = "user=" ( "phone" | "ip" | token) +%%% sip_method-param = "method=" sip_Method +%%% sip_ttl-param = "ttl=" sip_ttl +%%% sip_maddr-param = "maddr=" host +%%% sip_lr-param = "lr" +%%% sip_other-param = 1*sip_paramchar [ "=" 1*sip_paramchar ] +%%% sip_Method = "INVITE" | "ACK" | "OPTIONS" | "BYE" | +%%% "CANCEL" | "REGISTER" | token +%%% sip_ttl = 1*3DIGIT ; 0 to 255 +%%% sip_paramchar = sip_param-unreserved | unreserved | escaped +%%% sip_param-unreserved = "[" | "]" | "/" | ":" | "&" | "+" | "$" +%%% sip_headers = "?" sip_header *( "&" sip_header ) +%%% sip_header = sip_hname "=" sip_hvalue +%%% sip_hname = 1*( sip_hnv-unreserved | unreserved | escaped ) +%%% sip_hvalue = *( sip_hnv-unreserved / unreserved / escaped ) +%%% sip_hnv-unreserved = "[" | "]" | "/" | "?" | ":" | "+" | "$" + +%%% Note: +%%% - FIXME: Headers not parsed +parse_sip(C0,Scheme) -> + case string:tokens(C0,"@") of + [Userinfo,Hostport] -> + {User,Pass}=sip_userinfo(Userinfo), + {C1,Host,Port}=scan_hostport(Hostport,Scheme), + {C2,Parameters}=scan_parameters(C1), + Headers=scan_headers(C2), + {Scheme,User,Pass,Host,Port,Parameters,Headers}; + [Hostport] -> + {C1,Host,Port}=scan_hostport(Hostport,Scheme), + {C2,Parameters}=scan_parameters(C1), + Headers=scan_headers(C2), + {Scheme,none,none,Host,Port,Parameters,Headers} + end. + +%%% FIXME! User can be telephone subscriber +sip_userinfo(Userinfo) -> + case string:tokens(Userinfo,":") of + [User,Pass] -> {User,Pass}; + [User] -> {User,none} + end. + +scan_parameters(C1) -> + ParList=string:tokens(C1,";"), + scan_parameters2(ParList,[], []). + +%% Is Foo the way to go? This code needs further investigation. (As +%% does most of this module.) If we decide to keep it! +scan_parameters2([],Out, Foo) -> + {lists:reverse(Foo), lists:reverse(Out)}; +scan_parameters2(["transport"++Val|Rest],Out, Foo) -> + scan_parameters2(Rest,[{transport,Val}|Out], Foo); +scan_parameters2(["user"++Val|Rest],Out, Foo) -> + scan_parameters2(Rest,[{user,Val}|Out], Foo); +scan_parameters2(["method"++Val|Rest],Out, Foo) -> + scan_parameters2(Rest,[{method,Val}|Out], Foo); +scan_parameters2(["ttl"++Val|Rest],Out, Foo) -> + scan_parameters2(Rest,[{ttl,Val}|Out], Foo); +scan_parameters2(["maddr"++Val|Rest],Out, Foo) -> + scan_parameters2(Rest,[{maddr,Val}|Out], Foo); +scan_parameters2(["lr"|Rest],Out, Foo) -> + scan_parameters2(Rest,[{lr,""}|Out], Foo); +scan_parameters2([Other|Rest],Out, Foo) -> + scan_parameters2(Rest,[Out], [Other |Foo]). + +%%% FIXME! +scan_headers(C2) -> + C2. + +%%% ............................................................................ +%%% SMS (Source draft-wilde-sms-uri-01, January 24 2002 and +%%% draft-allocchio-gstn-01, November 2001) +%%% The syntax definition for "gstn-phone" is taken from +%%% [draft-allocchio-gstn-01], allowing global as well as local telephone +%%% numbers. +%%% Note: This BNF has been modified to better fit with RFC 2396 +%%% sms_URI = sms ":" 1*( sms-recipient ) [ sms-body ] +%%% sms-recipient = gstn-phone sms-qualifier +%%% [ "," sms-recipient ] +%%% sms-qualifier = *( smsc-qualifier / pid-qualifier ) +%%% smsc-qualifier = ";smsc=" SMSC-sub-addr +%%% pid-qualifier = ";pid=" PID-sub-addr +%%% sms-body = ";body=" *urlc +%%% gstn-phone = ( global-phone / local-phone ) +%%% global-phone = "+" 1*( DIGIT / written-sep ) +%%% local-phone = [ exit-code ] dial-number / exit-code [ dial-number ] +%%% exit-code = phone-string +%%% dial-number = phone-string +%%% subaddr-string = phone-string +%%% post-dial = phone-string +%%% phone-string = 1*( DTMF / pause / tonewait / written-sep ) +%%% DTMF = ( DIGIT / "#" / "*" / "A" / "B" / "C" / "D" ) +%%% written-sep = ( "-" / "." ) +%%% pause = "p" +%%% tonewait = "w" +parse_sms(Cont,Scheme) -> + {Scheme,Cont}. + + +%%% ========================================================================== +%%% Generic URI parsing. BNF rules from RFC 2396 + +%%% hostport = host [ ":" port ] +scan_hostport(C0,Scheme) -> + case scan_host(C0) of + {error,Error} -> + {error,Error}; + {":"++C1,Host} -> + {C2,Port}=scan_port(C1,[]), + {C2,Host,list_to_integer(Port)}; + {C1,Host} when Scheme==http -> + {C1,Host,?HTTP_DEFAULT_PORT}; + {C1,Host} when Scheme==https -> + {C1,Host,?HTTPS_DEFAULT_PORT}; + {C1,Host} when Scheme==ftp -> + {C1,Host,?FTP_DEFAULT_PORT}; + {C1,Host} when Scheme==sip -> + {C1,Host,?SIP_DEFAULT_PORT} + end. + + +%%% host = hostname | IPv4address | IPv6reference +%%% hostname = *( domainlabel "." ) toplabel [ "." ] +%%% domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum +%%% toplabel = alpha | alpha *( alphanum | "-" ) alphanum +%%% IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT +%%% ipv6reference = "[" IPv6address "]" +%%% IPv6address = hexpart [ ":" IPv4address ] +%%% hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ] +%%% hexseq = hex4 *( ":" hex4) +%%% hex4 = 1*4HEXDIG + +%%% Note: +%%% Bitfields are set as follows: +%%% Bit 0 = 0-9 +%%% Bit 1 = a-f +%%% Bit 2 = g-z +-define(BIT1, 1). +-define(BIT2, 2). +-define(BIT3, 4). + +%%% 1 = DIGIT are only digits +%%% 3 = HEX are DIGITS + a-f +%%% 6 = ALPHA are HEX - DIGITS + g-z +-define(DIGIT, 1). +-define(HEX, 3). +-define(ALPHA, 6). + + +scan_host(C0) -> + case scan_host2(C0,[],0,[],[]) of + {C1,IPv4address,[?DIGIT,?DIGIT,?DIGIT,?DIGIT]} -> + {C1,lists:reverse(lists:append(IPv4address))}; +%% {C1,IPv6address,[$[,Hex1,Hex2,Hex3,Hex4,$]]} when Hex1=<?HEX; +%% Hex2=<?HEX; +%% Hex3=<?HEX; +%% Hex4=<?HEX -> +%% {C1,lists:reverse(lists:append(IPv6address))}; + {C1,Hostname,[A|_HostF]} -> + {C1,lists:reverse(lists:append(Hostname))}; + _ -> + {error,no_host} + end. + +scan_host2([H|C0],Acc,CurF,Host,HostF) when $0=<H,H=<$9 -> + scan_host2(C0,[H|Acc],CurF bor ?BIT1,Host,HostF); +scan_host2([H|C0],Acc,CurF,Host,HostF) when $a=<H,H=<$z; $A=<H,H=<$Z -> + scan_host2(C0,[H|Acc],CurF bor ?ALPHA,Host,HostF); +scan_host2([$-|C0],Acc,CurF,Host,HostF) when CurF=/=0 -> + scan_host2(C0,[$-|Acc],CurF,Host,HostF); +scan_host2([$.|C0],Acc,CurF,Host,HostF) when CurF=/=0 -> + scan_host2(C0,[],0,[".",Acc|Host],[CurF|HostF]); +scan_host2(C0,Acc,CurF,Host,HostF) -> + {C0,[Acc|Host],[CurF|HostF]}. + + +%%% port = *digit +scan_port([H|C0],Acc) when $0=<H,H=<$9 -> + scan_port(C0,[H|Acc]); +scan_port(C0,Acc) -> + {C0,lists:reverse(Acc)}. + +%%% abs_path = "/" path_segments +scan_abspath([]) -> + {[],[]}; +scan_abspath("/"++C0) -> + scan_pathsegments(C0,["/"]); +scan_abspath(_) -> + {error,no_abspath}. + +%%% path_segments = segment *( "/" segment ) +scan_pathsegments(C0,Acc) -> + case scan_segment(C0,[]) of + {"/"++C1,Segment} -> + scan_pathsegments(C1,["/",Segment|Acc]); + {C1,Segment} -> + {C1,lists:reverse(lists:append([Segment|Acc]))} + end. + + +%%% segment = *pchar *( ";" param ) +%%% param = *pchar +scan_segment(";"++C0,Acc) -> + {C1,ParamAcc}=scan_pchars(C0,";"++Acc), + scan_segment(C1,ParamAcc); +scan_segment(C0,Acc) -> + case scan_pchars(C0,Acc) of + {";"++C1,Segment} -> + {C2,ParamAcc}=scan_pchars(C1,";"++Segment), + scan_segment(C2,ParamAcc); + {C1,Segment} -> + {C1,Segment} + end. + +%%% query = *uric +%%% uric = reserved | unreserved | escaped +%%% reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | +%%% "$" | "," | "[" | "]" +%%% unreserved = alphanum | mark +%%% mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | +%%% "(" | ")" +%%% escaped = "%" hex hex +scan_query([],Acc) -> + lists:reverse(Acc); +scan_query([$%,H1,H2|C0],Acc) -> % escaped + scan_query([hex2dec(H1)*16+hex2dec(H2)|C0],Acc); +scan_query([H|C0],Acc) when $a=<H,H=<$z;$A=<H,H=<$Z;$0=<H,H=<$9 -> % alphanum + scan_query(C0,[H|Acc]); +scan_query([H|C0],Acc) when H==$;; H==$/; H==$?; H==$:; H==$@; H==$[; H==$]; + H==$&; H==$=; H==$+; H==$$; H==$, -> % reserved + scan_query(C0,[H|Acc]); +scan_query([H|C0],Acc) when H==$-; H==$_; H==$.; H==$!; H==$~; + H==$*; H==$'; H==$(; H==$) -> % mark + scan_query(C0,[H|Acc]); +scan_query([H|C0],Acc) when 0=<H,H=<127 -> % US ASCII + {H1,H2}=dec2hex(H), + scan_query(C0,[H2,H1,$%|Acc]); +scan_query([_H|_C0],_Acc) -> + {error,no_query}. + + +%%% pchar = unreserved | escaped | +%%% ":" | "@" | "&" | "=" | "+" | "$" | "," +scan_pchars([],Acc) -> + {[],Acc}; +scan_pchars([$%,H1,H2|C0],Acc) -> % escaped + scan_pchars([hex2dec(H1)*16+hex2dec(H2)|C0],Acc); +scan_pchars([H|C0],Acc) when $a=<H,H=<$z;$A=<H,H=<$Z;$0=<H,H=<$9 -> % alphanum + scan_pchars(C0,[H|Acc]); +scan_pchars([H|C0],Acc) when H==$-; H==$_; H==$.; H==$!; H==$~; + H==$*; H==$'; H==$(; H==$) -> % mark + scan_pchars(C0,[H|Acc]); +scan_pchars([H|C0],Acc) when H==$:; H==$@; H==$&; H==$=; H==$+; H==$$; H==$, -> + scan_pchars(C0,[H|Acc]); +scan_pchars([H|C0],Acc) when 0=<H,H=<127, % US ASCII + H=/=$?,H=/=$;,H=/=$/,H=/=$# -> + {H1,H2}=dec2hex(H), + scan_pchars(C0,[H2,H1,$%|Acc]); +scan_pchars(C0,Acc) -> + {C0,Acc}. + +hex2dec(X) when X>=$0,X=<$9 -> X-$0; +hex2dec(X) when X>=$A,X=<$F -> X-$A+10; +hex2dec(X) when X>=$a,X=<$f -> X-$a+10. + +dec2hex(H) when H<256 -> + <<H1:4,H2:4>> = <<H>>, + {nibble2hex(H1),nibble2hex(H2)}. + +nibble2hex(X) when 0=<X,X=<9 -> X+$0; +nibble2hex(10) -> $a; +nibble2hex(11) -> $b; +nibble2hex(12) -> $c; +nibble2hex(13) -> $d; +nibble2hex(14) -> $e; +nibble2hex(15) -> $f. diff --git a/lib/xmerl/src/xmerl_validate.erl b/lib/xmerl/src/xmerl_validate.erl new file mode 100644 index 0000000000..893e23ca34 --- /dev/null +++ b/lib/xmerl/src/xmerl_validate.erl @@ -0,0 +1,663 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +-module(xmerl_validate). + +-export([validate/2]). + + +-include("xmerl.hrl"). % record def, macros + + + +%% +type validate(xmerl_scanner(),xmlElement())-> +%% xmlElment() | {error,tuple()}. +validate(#xmerl_scanner{doctype_name=DTName,doctype_DTD=OpProv}, + #xmlElement{name=Name}) + when DTName=/=Name,OpProv=/=option_provided-> + {error, {mismatched_root_element,Name,DTName}}; +validate(#xmerl_scanner{rules=Rules}=S, + XML=#xmlElement{name=Name})-> + catch do_validation(read_rules(Rules,Name),XML,Rules,S); +validate(_, XML) -> + {error, {no_xml_element, XML}}. + + + +%% +type validate(rules(),xmlElement())-> +%% {ok,xmlElement()} | {error,tuple()}. +do_validation(undefined,#xmlElement{name=Name}, _Rules,_S) -> + {error,{unknown_element,Name}}; +do_validation(El_Rule,XML,Rules,S)-> + case catch valid_attributes(El_Rule#xmlElement.attributes, + XML#xmlElement.attributes,S) of + {'EXIT',Reason} -> + {error,Reason}; + {error,Reason} -> + {error,Reason}; + Attr_2-> +% XML_=XML#xmlElement{attributes=Attr_2}, + El_Rule_Cont = El_Rule#xmlElement.content, + WSActionMode = ws_action_mode(El_Rule#xmlElement.elementdef, + El_Rule_Cont,S), + XML_Cont = XML#xmlElement.content, + check_direct_ws_SDD(XML_Cont,WSActionMode), + case valid_contents(El_Rule_Cont, + XML_Cont,Rules,S,WSActionMode) of + {error,Reason}-> + {error,Reason}; + {error,Reason,N}-> + {error,Reason,N}; + XMLS -> + XML#xmlElement{attributes=Attr_2,content=XMLS} + end + end. + +check_direct_ws_SDD(XML,always_preserve) -> + case XML of + [#xmlText{}|_Rest] -> + exit({error,{illegal_whitespace_standalone_doc,XML}}); + _ -> ok + end, + case lists:reverse(XML) of + [#xmlText{}|_Rest2] -> + exit({error,{illegal_whitespace_standalone_doc,XML}}); + _ -> ok + end; +check_direct_ws_SDD(_,_) -> ok. + +ws_action_mode({external,_},Content,#xmerl_scanner{standalone=yes}) -> + case element_content(Content) of + children -> + always_preserve; + _ -> + preserve + end; +ws_action_mode(_,_,_) -> + preserve. + +element_content(A) when is_atom(A),A /= any, A /= empty -> + children; +element_content({choice,L}) when is_list(L) -> + element_content(L); +element_content({seq,L}) when is_list(L) -> + element_content(L); +element_content(['#PCDATA'|_T]) -> + mixed; +element_content('#PCDATA') -> + mixed; +element_content({'*',Rest}) -> + element_content(Rest); +element_content(_) -> children. + +%% +type read_rules(DTD::atom(),Element_Name::atom())-> +%% undefined | xmlElement(). +read_rules(_, pcdata) -> + pcdata; +read_rules(T, Name) -> + case ets:lookup(T, {elem_def, Name}) of + [] -> + undefined; + [{_K, V}] -> + V + end. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%% Attributes Validation %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% +deftype attribute_rule() = {Attr_Name::atom(),attribute_type(), +%% attribute_priority()}. + +%% +type valid_attributes([attribute_rule()],[xmlAttribute()])-> +%% [xmlAttribute()] | {error,attribute_unknow}. +valid_attributes(All_Attr,[#xmlAttribute{}|_T]=Attr,S)-> + single_ID_definition(All_Attr), + vc_Name_Token_IDREFS(All_Attr,Attr), + lists:foreach(fun(#xmlAttribute{name=Name})-> + case is_attribute_exist(Name,All_Attr) of + true -> + ok; + false -> + exit({error,{attribute_unknown,Name}}) + end + end, + Attr), + lists:flatten(lists:foldl(fun({Name,DataType,IF,DefDecl,Env},Attr_2)-> + Attr_2++ + [valid_attribute(Name,DataType,IF, + DefDecl,Attr,Env,S)] + end,[],All_Attr)); +valid_attributes([],[],_) -> + []; +valid_attributes(All_Attr,[],S) -> + single_ID_definition(All_Attr), + lists:flatten(lists:foldl(fun({Name,DataType,IF,DefDecl,Env},Attr_2)-> + Attr_2++[valid_attribute(Name, + DataType,IF, + DefDecl, + [], + Env,S)] + end,[],All_Attr)). + +%%%% [60] DefaultDecl::= +%%%% '#REQUIRED' | '#IMPLIED' +%%%% | (('#FIXED' S)? AttValue) +%% +deftype attribute_priority = '#REQUIRED'|'#FIXED'|'#IMPLIED'. + +%% +type valid_attribute(Name::atom(),DataType::attribute_value(), +%% IF::attribute_priority(),[xmlAttribute()])-> +%% [xmlAttribute()] | exit(). +valid_attribute(Name,DataType,IF,DefaultDecl,List_of_Attributes,Env,S)-> + SA = S#xmerl_scanner.standalone, + Attr=search_attr(Name,List_of_Attributes), + check_SDD_validity(SA,Env,Attr,IF), + case {DefaultDecl,IF,Attr} of + {'#REQUIRED',_,no_attribute}-> + exit({error,{Name,is_required}}); + {'#IMPLIED',_,no_attribute}-> + []; %% and no default value + {'#FIXED',DefVal,#xmlAttribute{value=DefVal}=Attr} -> + Attr; + {'#FIXED',A,no_attribute} -> + #xmlAttribute{name=Name,value=A}; % FIXED declare value becomes default. + {'#FIXED',A,B} -> + exit({error,{fixed_default_value_missmatch,A,B}}); + {_,Value,no_attribute} when is_list(Value)-> + #xmlAttribute{name=Name,value=Value}; + {_,_,#xmlAttribute{}=Attr}-> + %% do test data value, and default_value + test_attribute_value(DataType,Attr,IF,S); + {DefDecl,Else,XML} -> + exit({error,{unknow_attribute_type,DefDecl,Else,XML}}) + end. + +vc_Name_Token_IDREFS([{Name,Type,_,_,_}|Rest],Attrs) + when Type=='NMTOKEN';Type=='NMTOKENS'-> + case lists:keysearch(Name,#xmlAttribute.name,Attrs) of + {value,A} -> + valid_nmtoken_value(A#xmlAttribute.value,Type); + _ -> ok + end, + vc_Name_Token_IDREFS(Rest,Attrs); +vc_Name_Token_IDREFS([{Name,Type,_,_,_}|Rest],Attrs) + when Type=='IDREFS'-> + case lists:keysearch(Name,#xmlAttribute.name,Attrs) of + {value,A} -> + valid_IDREFS(A#xmlAttribute.value,Type); + _ -> ok + end, + vc_Name_Token_IDREFS(Rest,Attrs); +vc_Name_Token_IDREFS([_H|Rest],Attrs) -> + vc_Name_Token_IDREFS(Rest,Attrs); +vc_Name_Token_IDREFS([],_) -> ok. + +valid_nmtoken_value([],'NMTOKENS') -> + exit({error,{at_least_one_Nmtoken_required}}); +% valid_nmtoken_value([H|_T] = L,'NMTOKENS') when is_list(H) -> +% ValidChar = +% fun(X) -> +% case xmerl_lib:is_namechar(X) of +% false -> +% exit({error,{invalid_character_in_Nmtoken,X}}); +% _ -> ok +% end +% end, +% ValidCharList = +% fun([Nmtok|T],F) -> +% lists:foreach(ValidChar,Nmtok), +% F(T,F); +% ([],_) -> ok +% end, +% ValidCharList(L,ValidChar); +valid_nmtoken_value(Nmtok,_) -> + ValidChar = + fun(X) when ?whitespace(X),Nmtok=='NMTOKENS' -> + ok; + (X) -> + case xmerl_lib:is_namechar(X) of + false -> + exit({error,{invalid_character_in_Nmtoken,X}}); + _ -> ok + end + end, + lists:foreach(ValidChar,Nmtok). + +valid_IDREFS([],'IDREFS') -> + exit({error,{at_least_one_IDREF_Name_required}}); +valid_IDREFS(_Str,'IDREFS') -> + ok. + +single_ID_definition([{_,'ID',_,_,_}=Att1|Rest]) -> + case lists:keysearch('ID',2,Rest) of + {value,Att2} -> + exit({error,{just_one_ID_definition_allowed,Att1,Att2}}); + _ -> ok + end; +single_ID_definition([_H|T]) -> + single_ID_definition(T); +single_ID_definition([]) -> + ok. + +check_SDD_validity(yes,{external,_},#xmlAttribute{name=Name,normalized=true},_) -> + exit({error,{externally_defed_attribute_normalized_in_standalone_doc,Name}}); +check_SDD_validity(yes,{external,_},no_attribute,V) when V /= no_value-> + exit({error,{externally_defed_attribute_with_default_value_missing_in_standalone_doc}}); +check_SDD_validity(_,_,_,_) -> + ok. + +search_attr(Name,[#xmlAttribute{name=Name}=H|_T])-> + H; +search_attr(Name,[#xmlAttribute{}|T])-> + search_attr(Name,T); +search_attr(_Name,_T) -> + no_attribute. + +is_attribute_exist(Name,[{Name,_,_,_,_}|_T])-> + true; +is_attribute_exist(Name,[{_Attr,_,_,_,_}|T]) -> + is_attribute_exist(Name,T); +is_attribute_exist(_Name,[]) -> + false. + +%%%%[54] AttType::= StringType | TokenizedType | EnumeratedType +%%%%[55] StringType::= 'CDATA' +%%%%[56] TokenizedType::= 'ID'|'IDREF'| 'IDREFS'|'ENTITY'| 'ENTITIES' +%%%% | 'NMTOKEN'| 'NMTOKENS' +%%%%[57] EnumeratedType::= NotationType | Enumeration +%%%%[58] NotationType::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' +%%%%[59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' + +%% +deftype attribute_type()-> 'CDATA' | 'ID'|'IDREF'| 'IDREFS'|'ENTITY'| +%% 'ENTITIES'| 'NMTOKEN'| 'NMTOKENS' +%% {enumeration,[List_of_value::atom()]}. + +%% +type test_attribute_value(attribute_type(),xmlAttribute())-> +%% xmlAttribute()| exit. +%%%% test the constraint validity of Attribute value. +test_attribute_value('CDATA',#xmlAttribute{}=Attr,_,_) -> + Attr; +test_attribute_value('NMTOKEN',#xmlAttribute{name=Name,value=V}=Attr, + Default,_S) -> + Fun = + fun (X)-> + case xmerl_lib:is_namechar(X) of + true-> + ok; + false-> + %%io:format("Warning*** nmtoken,value_incorrect: ~p~n",[V]), + exit({error,{invalid_value_nmtoken,Name,V}}) + end + end, + lists:foreach(Fun,V), + if + is_list(Default) -> + lists:foreach(Fun,Default); + true -> ok + end, + Attr; +test_attribute_value('NMTOKENS',#xmlAttribute{name=Name,value=V}=Attr, + Default,_S) -> + Fun = + fun (X)-> + case xmerl_lib:is_namechar(X) of + true-> + ok; + false when ?whitespace(X)-> + ok; + false -> + exit({error,{invalid_value_nmtokens,Name,V}}) + end + end, + lists:foreach(Fun,V), + if + is_list(Default) -> + lists:foreach(Fun,Default); + true -> ok + end, + Attr; +test_attribute_value(Ent,#xmlAttribute{name=_Name,value=V}=Attr,_Default, + S=#xmerl_scanner{rules_read_fun=Read}) + when Ent == 'ENTITY'; Ent == 'ENTITIES'-> + %% The default value is already checked + NameListFun = + fun([],Acc,_) -> + lists:reverse(Acc); + (Str,Acc,Fun) -> + {N,Str2} = scan_name(Str,[]), + Fun(Str2,[N|Acc],Fun) + end, + NameList = NameListFun(V,[],NameListFun), + VC_Entity_Name = + fun(X) -> + case Read(entity,X,S) of + {_,external,{_,{ndata,_}}} -> + ok; + _ -> exit({error,{vc_Entity_Name,X,V}}) + end + end, + lists:foreach(VC_Entity_Name,NameList), + Attr; +test_attribute_value({Type,L},#xmlAttribute{value=Value}=Attr,Default,_S) + when Type == enumeration; Type == notation -> + ValidDefault = + if + is_atom(Default) -> true; + true -> lists:member(list_to_atom(Default),L) + end, + NoDuplicatesFun = + fun(_,_,notation) -> true; + ([],_,_) -> true; + ([H|T],F,Enum) -> + case lists:member(H,T) of + true -> false; + _ -> F(T,F,Enum) + end + end, + NoDuplicates = NoDuplicatesFun(L,NoDuplicatesFun,Type), + case {lists:member(list_to_atom(Value),L),ValidDefault,NoDuplicates} of + {true,true,true}-> + Attr; + {false,_,_} -> + exit({error,{attribute_value_unknow,Value,{list,L}}}); + {_,false,_} -> + exit({error,{attribute_default_value_unknow,Default,{list,L}}}); + {_,_,false} -> + exit({error,{duplicate_tokens_not_allowed,{list,L}}}) + end; +test_attribute_value(_Rule,Attr,_,_) -> +% io:format("Attr Value*****~nRule~p~nValue~p~n",[Rule,Attr]), + Attr. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%% Contents Validation %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%Element-content Models +%%%%[47] children::= (choice | seq) ('?' | '*' | '+')? +%%%%[48] cp::= (Name | choice | seq) ('?' | '*' | '+')? +%%%%[49] choice::= '(' S? cp ( S? '|' S? cp )+ S? ')' +%%%%[50] seq::= '(' S? cp ( S? ',' S? cp )* S? ')' +%%%%[51] Mixed::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' +%%%% | '(' S? '#PCDATA' S? ')' + + +%% +type valid_contents([rule()],[xmlElement()])-> +%% [xmlElement() | {error,???}. +valid_contents(Rule,XMLS,Rules,S,WSActionMode)-> + case parse(Rule,XMLS,Rules,WSActionMode,S) of + {XML_N,[]}-> + lists:flatten(XML_N); + {_,[#xmlElement{name=Name}|_T]} -> + exit({error,{element,Name,isnt_comprise_in_the_rule,Rule}}); + {_,[#xmlText{}=Txt|_T]} -> + exit({error,{element,text,Txt,isnt_comprise_in_the_rule,Rule}}); + {error,Reason} -> + {error,Reason}; + {error,Reason,N} -> + {error,Reason,N} + end. + +parse({'*',SubRule},XMLS,Rules,WSaction,S)-> + star(SubRule,XMLS,Rules,WSaction,[],S); +parse({'+',SubRule},XMLS,Rules,WSaction,S) -> + plus(SubRule,XMLS,Rules,WSaction,S); +parse({choice,CHOICE},XMLS,Rules,WSaction,S)-> +% case XMLS of +% [] -> +% io:format("~p~n",[{choice,CHOICE,[]}]); +% [#xmlElement{name=Name,pos=Pos}|_] -> +% io:format("~p~n",[{choice,CHOICE,{Name,Pos}}]); +% [#xmlText{value=V}|_] -> +% io:format("~p~n",[{choice,CHOICE,{text,V}}]) +% end, + choice(CHOICE,XMLS,Rules,WSaction,S); +parse(empty,[],_Rules,_WSaction,_S) -> + {[],[]}; +parse({'?',SubRule},XMLS,Rules,_WSaction,S)-> + question(SubRule,XMLS,Rules,S); +parse({seq,List},XMLS,Rules,WSaction,S) -> + seq(List,XMLS,Rules,WSaction,S); +parse(El_Name,[#xmlElement{name=El_Name}=XML|T],Rules,_WSaction,S) + when is_atom(El_Name)-> + case do_validation(read_rules(Rules,El_Name),XML,Rules,S) of + {error,R} -> +% {error,R}; + exit(R); + {error,R,_N}-> +% {error,R,N}; + exit(R); + XML_-> + {[XML_],T} + end; +parse(any,Cont,Rules,_WSaction,S) -> + case catch parse_any(Cont,Rules,S) of + Err = {error,_} -> Err; + ValidContents -> {ValidContents,[]} + end; +parse(El_Name,[#xmlElement{name=Name}|_T]=S,_Rules,_WSa,_S) when is_atom(El_Name)-> + {error, + {element_seq_not_conform,{wait,El_Name},{is,Name}}, + {{next,S},{act,[]}} }; +parse(_El_Name,[#xmlPI{}=H|T],_Rules,_WSa,_S) -> + {[H],T}; +parse('#PCDATA',XML,_Rules,_WSa,_S)-> + %%% PCDATA it is 0 , 1 or more #xmlText{}. + parse_pcdata(XML); +parse(El_Name,[#xmlText{}|_T]=S,_Rules,_WSa,_S)-> + {error, + {text_in_place_of,El_Name}, + {{next,S},{act,[]}}}; +parse([],_,_,_,_) -> + {error,no_rule}; +parse(Rule,[],_,_,_) -> + {error,{no_xml_element,Rule}}. + +parse_any([],_Rules,_S) -> + []; +parse_any([H|T],Rules,S) -> + case parse_any(H,Rules,S) of + [Cont] -> + [Cont|parse_any(T,Rules,S)]; + Err -> throw(Err) + end; +parse_any(#xmlElement{}=XML,Rules,S) -> + case do_validation(read_rules(Rules,el_name(XML)),XML,Rules,S) of + {error,R} -> + {error,R}; + {error,R,N}-> + {error,R,N}; + XML_-> + [XML_] + end; +parse_any(El,_Rules,_S) -> + [El]. + + + +%% XXX remove first function clause +% choice(_Choice,[#xmlText{}=T|R],_Rules) -> +% {[T],R}; +choice([CH|CHS],[_XML|_T]=XMLS,Rules,WSaction,S)-> + {WS,XMLS1} = whitespace_action(XMLS,ws_action(WSaction,remove)), + case parse(CH,XMLS1,Rules,ws_action(WSaction,remove),S) of + {error,_R} -> + choice(CHS,XMLS,Rules,WSaction,S); + {error,_R,_N} -> + choice(CHS,XMLS,Rules,WSaction,S); %% XXX add a case {[],XML} + {[],XMLS1} -> %% Maybe a sequence with * or ? elements that + %% didn't match + case CHS of + [] -> % choice has succeded but without matching XMLS1 + {[],XMLS1}; + _ -> % there are more choice alternatives to try with + choice(CHS,XMLS1,Rules,WSaction,S) + end; +%% choice(CHS,XMLS1,Rules,WSaction,S); + {Tree,XMLS2}-> + {WS2,XMLS3} = whitespace_action(XMLS2,ws_action(WSaction,remove)), + {WS2++[Tree]++WS,XMLS3} + end; +choice([],XMLS,_,WSaction,_S)-> + case whitespace_action(XMLS,ws_action(WSaction,remove)) of + Res={_,[]} -> Res; + _ -> + {error,element_unauthorize_in_choice,{{next,XMLS},{act,[]}}} + end; +choice(_,[],_,_,_S) -> + {[],[]}. + +plus(Rule,XMLS,Rules,WSaction,S) -> + %% 1 or more + {WS,XMLS1}=whitespace_action(XMLS,WSaction), + case parse(Rule,XMLS1,Rules,WSaction,S) of + {error, Reason,_XML} -> + {error, Reason}; + {error, X} -> + {error, X}; + {Tree, XMLS2} -> + case star(Rule, XMLS2,Rules,WSaction,[],S) of + {[], _} -> + {WS++[Tree], XMLS2}; + {Tree_1, XMLS3} -> + {WS++[Tree]++Tree_1, XMLS3} + end + end. + +star(_Rule,XML,_Rules,_WSa,Tree,_S) when length(XML)==0-> + {[Tree],[]}; +star(Rule,XMLS,Rules,WSaction,Tree,S) -> + {WS,XMLS1} = whitespace_action(XMLS,WSaction), + case parse(Rule,XMLS1,Rules,WSaction,S) of + {error, _E, {{next,N},{act,A}}}-> + %%io:format("Error~p~n",[_E]), + {WS++Tree++A,N}; + {error, _E}-> + %%io:format("Error~p~n",[_E]), +% {WS++[Tree],[]}; + case whitespace_action(XMLS,ws_action(WSaction,remove)) of + {[],_} -> + {WS++[Tree],XMLS}; + {WS2,XMLS2} -> + {WS2++[Tree],XMLS2} + end; + {Tree1,XMLS2}-> + star(Rule,XMLS2,Rules,WSaction,Tree++WS++[Tree1],S) + end. + +question(_Rule, [],_Rules,_S) -> + {[],[]}; +question(Rule, Toks,Rules,S) -> + %% 0 or 1 + case parse(Rule, Toks,Rules,preserve,S) of + {error, _E, _Next}-> + {[],Toks}; + {error, _E} -> + {[], Toks}; + {T,Toks1} -> + {T, Toks1} + end. + +seq(H,Toks,Rules,WSaction,S)-> + case seq2(H,Toks,Rules,[],WSaction,S) of + {error,E}-> + {error,E}; + {error,R,N}-> + {error,R,N}; + {Tree,Toks2}-> + {Tree,Toks2} + end. + +seq2([],[],_,Tree,_WSa,_S)-> + {Tree,[]}; +% seq2([],[#xmlElement{name=Name}|_T]=XMLS,_,Tree,_WSa,_S)-> +% {error,{sequence_finish,Name,isnt_in_the_right_place}, +% {{next,XMLS},{act,Tree}}}; +seq2([],[#xmlText{}]=XML,_,Tree,_WSa,_S)-> + case whitespace_action(XML,remove) of + {[],_} -> + {error,sequence_finish,{{next,XML},{act,Tree}}}; + {WS,Rest} -> + {WS++Tree,Rest} + end; +seq2([],Rest,_,Tree,_WSa,_S) -> + {WS,Rest2}=whitespace_action(Rest,remove), + {WS++Tree,Rest2}; +seq2([H|T],Toks,Rules,Tree,WSaction,S) -> + {WS,Toks1} = whitespace_action(Toks,ws_action(WSaction,remove)), + case parse(H,Toks1,Rules,remove,S) of %% H maybe only match parts of Toks + {error,Reason,_XML}-> + {error,Reason}; + {error,E}-> + {error,E}; + {[],Toks2}-> + seq2(T,Toks2,Rules,Tree,WSaction,S); + {Tree1,Toks2} when is_list(Tree1)-> + seq2(T,Toks2,Rules,Tree++WS++Tree1,WSaction,S); + {Tree1,Toks2}-> + seq2(T,Toks2,Rules,Tree++WS++[Tree1],WSaction,S) + end. + +el_name(#xmlElement{name=Name})-> + Name. + +parse_pcdata([#xmlText{}=H|T])-> + parse_pcdata(T,[H]); +parse_pcdata(H) -> + {[],H}. + +parse_pcdata([#xmlText{}=H|T],Acc)-> + parse_pcdata(T,Acc++[H]); +parse_pcdata(H,Acc) -> + {Acc,H}. + +whitespace([]) -> + true; +whitespace([H|T]) when ?whitespace(H) -> + whitespace(T); +whitespace(_) -> + false. + +whitespace_action(XML,remove) -> + whitespace_remove(XML,[]); +whitespace_action(XML,_) -> + {[],XML}. + +whitespace_remove([#xmlText{value=V,type=text}=T|R]=L,Acc) -> + case whitespace(V) of + true -> + whitespace_remove(R,[T|Acc]); + _ -> + {lists:reverse(Acc),L} + end; +whitespace_remove(L,Acc) -> + {lists:reverse(Acc),L}. + +ws_action(always_preserve=A,_) -> + A; +ws_action(_,B) -> + B. + +scan_name(N,_) when is_atom(N) -> + N; +scan_name([$\s|T],Acc) -> + {list_to_atom(lists:reverse(Acc)),T}; +scan_name([H|T],Acc) -> + scan_name(T,[H|Acc]); +scan_name("",Acc) -> + {list_to_atom(lists:reverse(Acc)),[]}. diff --git a/lib/xmerl/src/xmerl_xlate.erl b/lib/xmerl/src/xmerl_xlate.erl new file mode 100644 index 0000000000..5c4d9d2540 --- /dev/null +++ b/lib/xmerl/src/xmerl_xlate.erl @@ -0,0 +1,50 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%%% Description : Wrapper to xmerl_scan, which reads an XML document +%%% from disk (alt. consumes a string), processes it, and +%%% exports is using the specified Callback module. +-module(xmerl_xlate). + + +-export([file/3, + string/3]). + + +-include("xmerl.hrl"). + +file(F, Title, Callback) -> + case file:read_file(F) of + {ok, Bin} -> + string(binary_to_list(Bin), Title, Callback); + Error -> + Error + end. + +string(Str, Title, Callback) -> + xmerl_scan:string(Str, [{hook_fun, fun hook/2, {Title, Callback}}]). + + +hook(E = #xmlElement{parents = []}, S) -> + {Title, Callback} = xmerl_scan:hook_state(S), + Data = xmerl:export([E], Callback, [{title, Title}]), + {Data, S}; +hook(X, S) -> + {X, S}. + diff --git a/lib/xmerl/src/xmerl_xml.erl b/lib/xmerl/src/xmerl_xml.erl new file mode 100644 index 0000000000..702a654629 --- /dev/null +++ b/lib/xmerl/src/xmerl_xml.erl @@ -0,0 +1,62 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% Description : Callback module for exporting +%% complete or simple forms to XML. + +-module(xmerl_xml). + +-export(['#xml-inheritance#'/0]). + +-export(['#root#'/4, + '#element#'/5, + '#text#'/1]). + +-import(xmerl_lib, [markup/3, empty_tag/2, export_text/1]). + +-include("xmerl.hrl"). + + +'#xml-inheritance#'() -> []. + + +%% The '#text#' function is called for every text segment. + +'#text#'(Text) -> +%io:format("Text=~p~n",[Text]), + export_text(Text). + + +%% The '#root#' tag is called when the entire structure has been +%% exported. It does not appear in the structure itself. + +'#root#'(Data, [#xmlAttribute{name=prolog,value=V}], [], _E) -> + [V,Data]; +'#root#'(Data, _Attrs, [], _E) -> + ["<?xml version=\"1.0\"?>", Data]. + + +%% The '#element#' function is the default handler for XML elements. + +'#element#'(Tag, [], Attrs, _Parents, _E) -> +%io:format("Empty Tag=~p~n",[Tag]), + empty_tag(Tag, Attrs); +'#element#'(Tag, Data, Attrs, _Parents, _E) -> +%io:format("Tag=~p~n",[Tag]), + markup(Tag, Attrs, Data). diff --git a/lib/xmerl/src/xmerl_xpath.erl b/lib/xmerl/src/xmerl_xpath.erl new file mode 100644 index 0000000000..182a186d2c --- /dev/null +++ b/lib/xmerl/src/xmerl_xpath.erl @@ -0,0 +1,776 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% Description : Implements a search engine based on XPath + +%% @doc The xmerl_xpath module handles the entire XPath 1.0 spec +%% XPath expressions typically occurs in XML attributes and are used to addres +%% parts of an XML document. +% The grammar is defined in <code>xmerl_xpath_parse.yrl</code>. +% The core functions are defined in <code>xmerl_xpath_pred.erl</code>. +% +% <p>Some useful shell commands for debugging the XPath parser</p> +% <pre> +% c(xmerl_xpath_scan). +% yecc:yecc("xmerl_xpath_parse.yrl", "xmerl_xpath_parse", true, []). +% c(xmerl_xpath_parse). +% +% xmerl_xpath_parse:parse(xmerl_xpath_scan:tokens("position() > -1")). +% xmerl_xpath_parse:parse(xmerl_xpath_scan:tokens("5 * 6 div 2")). +% xmerl_xpath_parse:parse(xmerl_xpath_scan:tokens("5 + 6 mod 2")). +% xmerl_xpath_parse:parse(xmerl_xpath_scan:tokens("5 * 6")). +% xmerl_xpath_parse:parse(xmerl_xpath_scan:tokens("-----6")). +% xmerl_xpath_parse:parse(xmerl_xpath_scan:tokens("parent::node()")). +% xmerl_xpath_parse:parse(xmerl_xpath_scan:tokens("descendant-or-self::node()")). +% xmerl_xpath_parse:parse(xmerl_xpath_scan:tokens("parent::processing-instruction('foo')")). +%% </pre> +%% +%% @type docEntity() = +%% xmlElement() +%% | xmlAttribute() +%% | xmlText() +%% | xmlPI() +%% | xmlComment() +%% @type nodeEntity() = +%% xmlElement() +%% | xmlAttribute() +%% | xmlText() +%% | xmlPI() +%% | xmlNamespace() +%% | xmlDocument() +%% @type option_list(). <p>Options allows to customize the behaviour of the +%% XPath scanner. +%% </p> +%% Possible options are: +%% <dl> +%% <dt><code>{namespace, #xmlNamespace}</code></dt> +%% <dd>Set namespace nodes, from XmlNamspace, in xmlContext</dd> +%% <dt><code>{namespace, Nodes}</code></dt> +%% <dd>Set namespace nodes in xmlContext.</dd> +%% </dl> + +%% <dt><code>{bindings, Bs}</code></dt> +%% <dd></dd> +%% <dt><code>{functions, Fs}</code></dt> +%% <dd></dd> +-module(xmerl_xpath). + + +%% main API +-export([string/2, + string/3, + string/5]). + +%% exported helper functions, internal for the XPath support +-export([eval_path/3, + axis/3, axis/4]). + +%% debug function +-export([write_node/1]). + + +-include("xmerl.hrl"). +-include("xmerl_internal.hrl"). + + +-record(state, {context = #xmlContext{}, + acc = []}). + + +-define(nodeset(NS), #state{context = #xmlContext{nodeset = NS}}). +-define(context(C), #state{context = C}). + + + + +%% @spec string(Str, Doc) -> [docEntity()] | Scalar +%% @equiv string(Str,Doc, []) +string(Str, Doc) -> + string(Str, Doc, []). + +%% @spec string(Str,Doc,Options) -> +%% [docEntity()] | Scalar +%% @equiv string(Str,Doc, [],Doc,Options) +string(Str, Doc, Options) -> + string(Str, Doc, [], Doc, Options). + +%% @spec string(Str,Node,Parents,Doc,Options) -> +%% [docEntity()] | Scalar +%% Str = xPathString() +%% Node = nodeEntity() +%% Parents = parentList() +%% Doc = nodeEntity() +%% Options = option_list() +%% Scalar = xmlObj +%% @doc Extracts the nodes from the parsed XML tree according to XPath. +%% xmlObj is a record with fields type and value, +%% where type is boolean | number | string +string(Str, Node, Parents, Doc, Options) -> +%% record with fields type and value, +%% where type is boolean | number | string + FullParents = + case Parents of + [] -> + []; + [{H, P}|_] when is_atom(H), is_integer(P) -> + full_parents(Parents, Doc) + end, +%io:format("string FullParents=~p~n",[FullParents]), + ContextNode=#xmlNode{type = node_type(Node), + node = Node, + parents = FullParents}, +%io:format("string ContextNode=~p~n",[ContextNode]), + WholeDoc = whole_document(Doc), +%io:format("string WholeDoc=~p~n",[WholeDoc]), + Context=(new_context(Options))#xmlContext{context_node = ContextNode, + whole_document = WholeDoc}, +%io:format("string Context=~p~n",[Context]), + #state{context = NewContext} = match(Str, #state{context = Context}), +%io:format("string NewContext=~p~n",[NewContext]), + case NewContext#xmlContext.nodeset of + ScalObj = #xmlObj{type=Scalar} + when Scalar == boolean; Scalar == number; Scalar == string -> + ScalObj; + #xmlObj{type=nodeset,value=NodeSet} -> + NodeSet; + _ -> + [N || #xmlNode{node = N} <- NewContext#xmlContext.nodeset] + end. + + +whole_document(#xmlDocument{} = Doc) -> + #xmlNode{type = root_node, + node = Doc, + parents = []}; +whole_document(Other) -> + #xmlNode{type = root_node, + node = #xmlDocument{content = Other}, + parents = []}. + + +new_context(Options) -> + new_context(Options, #xmlContext{}). + +new_context([{namespace, #xmlNamespace{nodes = Nodes}}|T], C) -> + new_context(T, C#xmlContext{namespace = ns_nodes(Nodes)}); +new_context([{namespace, Nodes}|T], C) -> + new_context(T, C#xmlContext{namespace = ns_nodes(Nodes)}); +new_context([{bindings, Bs}|T], C) -> + new_context(T, C#xmlContext{bindings = Bs}); +new_context([{functions, Fs}|T], C) -> + new_context(T, C#xmlContext{functions = Fs}); +new_context([], C) -> + C. + + +ns_nodes([{Prefix, URI}|T]) -> + [{to_string(Prefix), to_atom(URI)}|ns_nodes(T)]; +ns_nodes([]) -> + []. + +full_parents(Ps, Doc) -> + full_parents1(lists:reverse(Ps), [Doc], []). + +full_parents1([{Name, Pos}|Ns], Content, Parents) -> + E = locate_element(Name, Pos, Content), + PN = #xmlNode{type = element, + node = E, + parents = Parents}, + full_parents1(Ns, get_content(E), [PN|Parents]); +full_parents1([], _E, Parents) -> + Parents. + + +locate_element(Name, Pos, [E = #xmlElement{name = Name, pos = Pos}|_]) -> + E; +locate_element(_Name, Pos, [#xmlElement{pos = P}|_]) when P >= Pos -> + %% we've passed Pos (P > Pos) or the name is wrong (P == Pos) + exit(invalid_parents); +locate_element(_Name, _Pos, []) -> + exit(invalid_parents); +locate_element(Name, Pos, [_|T]) -> + locate_element(Name, Pos, T). + + +match(Str, S = #state{}) -> + Tokens = xmerl_xpath_scan:tokens(Str), + case xmerl_xpath_parse:parse(Tokens) of + {ok, Expr} -> + match_expr(Expr, S); + Error -> + Error + end. + + +match_expr({path, Type, Arg}, S) -> + eval_path(Type, Arg, S#state.context); +%% PrimaryExpr +match_expr(PrimExpr,S) -> + eval_primary_expr(PrimExpr,S). + + + + + +path_expr({refine, StepExpr1, StepExpr2}, S) -> + ?dbg("StepExpr1=~p StepExpr2=~p~n", [StepExpr1,StepExpr2]), + ?dbg("length(nodeset) = ~p~n", + [length((S#state.context)#xmlContext.nodeset)]), + S1 = path_expr(StepExpr1, S), + ?dbg("length(nodeset1) = ~p~n", + [length((S1#state.context)#xmlContext.nodeset)]), + path_expr(StepExpr2, S1); +path_expr({step, {Axis, NodeTest, PredExpr}}, S = #state{context = C, + acc = Acc}) -> + ?dbg("PredExpr = ~p~n", [PredExpr]), + NewContext = axis(Axis, NodeTest, C, Acc), + pred_expr(PredExpr, S#state{context = NewContext}); +path_expr('/', S) -> + S. + + +pred_expr([], S) -> + S; +pred_expr([{pred, Pred}|Preds], S = #state{}) -> + ?dbg("Pred = ~p~n", [Pred]), + NewS = eval_pred(Pred, S), + pred_expr(Preds, NewS). + +%% simple case: the predicate is a number, e.g. para[5]. +%% No need to iterate over all nodes in the nodeset; we know what to do. +%% +eval_pred({number, N0}, + S = #state{context = C = #xmlContext{nodeset = NS, + axis_type = AxisType}}) -> + Len = length(NS), + case Len>=N0 of + true -> + N = case AxisType of + forward -> + N0; + reverse -> + Len + 1 - N0 + end, + NewNodeSet = [lists:nth(N, NS)], + NewContext = C#xmlContext{nodeset = NewNodeSet}, + S#state{context = NewContext}; + false -> S#state{context = C#xmlContext{nodeset = []}} + end; +eval_pred(Predicate, S = #state{context = C = + #xmlContext{nodeset = NodeSet}}) -> + NewNodeSet = + lists:filter( + fun(Node) -> + %io:format("current node: ~p~n", [write_node(Node)]), + ThisContext = C#xmlContext{context_node = Node}, + xmerl_xpath_pred:eval(Predicate, ThisContext) + end, NodeSet), + NewContext = C#xmlContext{nodeset = NewNodeSet}, + S#state{context = NewContext}. + + + +%% write_node(Node::xmlNode()) -> {Type,Pos,Name,Parents} +%% Helper function to access essential information from the xmlNode record. +%% @hidden +write_node(#xmlNode{pos = Pos, + node = #xmlAttribute{name = Name, + parents = Ps}}) -> + {attribute, Pos, Name, Ps}; +write_node(#xmlNode{pos = Pos, + node = #xmlElement{name = Name, + parents = Ps}}) -> + {element, Pos, Name, Ps}; +write_node(#xmlNode{pos = Pos, + node = #xmlText{value = Txt, + parents = Ps}}) -> + {text, Pos, Txt, Ps}; +write_node(_) -> + other. + + +%% eval_path(Type,Arg,S::state()) -> state() +%% Eval path +%% @hidden +eval_path(union, {PathExpr1, PathExpr2}, C = #xmlContext{}) -> + S = #state{context = C}, + S1 = match_expr(PathExpr1, S), +%% NewNodeSet = (S1#state.context)#xmlContext.nodeset, + S2 = match_expr(PathExpr2, S1#state{context=C}), + NodeSet1 = (S1#state.context)#xmlContext.nodeset, + NodeSet2 = (S2#state.context)#xmlContext.nodeset, + NewNodeSet = ordsets:to_list(ordsets:union(ordsets:from_list(NodeSet1), + ordsets:from_list(NodeSet2))), + S2#state{context=(S2#state.context)#xmlContext{nodeset=NewNodeSet}}; +eval_path(abs, PathExpr, C = #xmlContext{}) -> + NodeSet = [C#xmlContext.whole_document], + Context = C#xmlContext{nodeset = NodeSet}, + S = #state{context = Context}, + path_expr(PathExpr, S); +eval_path(rel, PathExpr, C = #xmlContext{}) -> + NodeSet = [C#xmlContext.context_node], + Context = C#xmlContext{nodeset = NodeSet}, + S = #state{context = Context}, + path_expr(PathExpr, S); +eval_path(filter, {PathExpr, PredExpr}, C = #xmlContext{}) -> + S = #state{context = C}, + S1 = path_expr(PathExpr, S), + pred_expr(PredExpr, S1). + +eval_primary_expr(FC = {function_call,_,_},S = #state{context = Context}) -> +%% NewNodeSet = xmerl_xpath_pred:eval(FC, Context), + NewNodeSet = xmerl_xpath_lib:eval(primary_expr, FC, Context), + NewContext = Context#xmlContext{nodeset = NewNodeSet}, + S#state{context = NewContext}; +eval_primary_expr(PrimExpr,_S) -> + exit({primary_expression,{not_implemented, PrimExpr}}). + + +%% axis(Axis,NodeTest,Context::xmlContext()) -> xmlContext() +%% axis(Axis,NodeTest,Context,[]) +%% @hidden +axis(Axis, NodeTest, Context) -> + axis(Axis, NodeTest, Context, []). + + +%% axis(Axis,NodeTest,Context::xmlContext(),Acc) -> xmlContext() +%% +%% An axis specifies the tree relationship between the nodes selected by +%% the location step and the context node. +%% @hidden +axis(Axis, NodeTest, Context = #xmlContext{nodeset = NS0}, Acc) -> + NewNodeSet=lists:foldr( + fun(N, AccX) -> + axis1(Axis, NodeTest, N, AccX, Context) + end, Acc, NS0), + update_nodeset(fwd_or_reverse(Axis, Context), NewNodeSet). + + +axis1(self, Tok, N, Acc, Context) -> + match_self(Tok, N, Acc, Context); +axis1(descendant, Tok, N, Acc, Context) -> + match_descendant(Tok, N, Acc, Context); +axis1(child, Tok, N, Acc, Context) -> + match_child(Tok, N, Acc, Context); +axis1(parent, Tok, N, Acc, Context) -> + match_parent(Tok, N, Acc, Context); +axis1(ancestor, Tok, N, Acc, Context) -> + match_ancestor(Tok, N, Acc, Context); +axis1(following_sibling, Tok, N, Acc, Context) -> + match_following_sibling(Tok, N, Acc, Context); +axis1(preceding_sibling, Tok, N, Acc, Context) -> + match_preceding_sibling(Tok, N, Acc, Context); +axis1(following, Tok, N, Acc, Context) -> + match_following(Tok, N, Acc, Context); +axis1(preceding, Tok, N, Acc, Context) -> + match_preceding(Tok, N, Acc, Context); +axis1(attribute, Tok, N, Acc, Context) -> + match_attribute(Tok, N, Acc, Context); +%axis1(namespace, Tok, N, Acc, Context) -> +% match_namespace(Tok, N, Acc, Context); +axis1(ancestor_or_self, Tok, N, Acc, Context) -> + match_ancestor_or_self(Tok, N, Acc, Context); +axis1(descendant_or_self, Tok, N, Acc, Context) -> + match_descendant_or_self(Tok, N, Acc, Context). + + +fwd_or_reverse(ancestor, Context) -> + reverse_axis(Context); +fwd_or_reverse(ancestor_or_self, Context) -> + reverse_axis(Context); +fwd_or_reverse(preceding_sibling, Context) -> + reverse_axis(Context); +fwd_or_reverse(preceding, Context) -> + reverse_axis(Context); +fwd_or_reverse(_, Context) -> + forward_axis(Context). + +reverse_axis(Context) -> + Context#xmlContext{axis_type = reverse}. +forward_axis(Context) -> + Context#xmlContext{axis_type = forward}. + + + +match_self(Tok, N, Acc, Context) -> + case node_test(Tok, N, Context) of + true -> + [N|Acc]; + false -> + Acc + end. + + +match_descendant(Tok, N, Acc, Context) -> + #xmlNode{parents = Ps, node = Node, type = Type} = N, + case Type of + El when El == element; El == root_node -> + NewPs = [N|Ps], + match_desc(get_content(Node), NewPs, Tok, Acc, Context); + _Other -> + Acc + end. + + +match_desc([E = #xmlElement{}|T], Parents, Tok, Acc, Context) -> + Acc1 = match_desc(T, Parents, Tok, Acc, Context), + N = #xmlNode{type = node_type(E), + node = E, + parents = Parents}, + NewParents = [N|Parents], + Acc2 = match_desc(get_content(E), NewParents, Tok, Acc1, Context), + match_self(Tok, N, Acc2, Context); +match_desc([E|T], Parents, Tok, Acc, Context) -> + Acc1 = match_desc(T, Parents, Tok, Acc, Context), + N = #xmlNode{node = E, + type = node_type(E), + parents = Parents}, + match_self(Tok, N, Acc1, Context); +match_desc([], _Parents, _Tok, Acc, _Context) -> + Acc. + + + +%% "The 'descendant-or-self' axis contains the context node and the +%% descendants of the context node." +match_descendant_or_self(Tok, N, Acc, Context) -> + Acc1 = match_descendant(Tok, N, Acc, Context), + match_self(Tok, N, Acc1, Context). + + +match_child(Tok, N, Acc, Context) -> + %io:format("match_child(~p)~n", [write_node(N)]), + #xmlNode{parents = Ps, node = Node, type = Type} = N, + case Type of + El when El == element; El == root_node -> + NewPs = [N|Ps], + lists:foldr( + fun(E, AccX) -> + ThisN = #xmlNode{type = node_type(E), + node = E, + parents = NewPs}, + match_self(Tok, ThisN, AccX, Context) + end, Acc, get_content(Node)); + _Other -> + Acc + end. + + +%% "The 'parent' axis contains the parent of the context node, +%% if there is one." +match_parent(Tok, N, Acc, Context) -> + case N#xmlNode.parents of + [] -> + Acc; + [PN|_] -> + match_self(Tok, PN, Acc, Context) + end. + + +%% "The 'ancestor' axis contains the ancestors of the context node; +%% the ancestors of the context node consists of the parent of the context +%% node and the parent's parent and so on; thus, the ancestor axis will +%% always include the root node, unless the context node is the root node." +match_ancestor(Tok, N, Acc, Context) -> + Parents = N#xmlNode.parents, + lists:foldl( + fun(PN, AccX) -> + match_self(Tok, PN, AccX, Context) + end, Acc, Parents). + + + + +%% "The 'ancestor-or-self' axis contains the context node and the ancestors +%% of the context node; thus, the acestor axis will always include the +%% root node." +match_ancestor_or_self(Tok, N, Acc, Context) -> + Acc1 = match_self(Tok, N, Acc, Context), + match_ancestor(Tok, N, Acc1, Context). + + +match_following_sibling(_Tok, #xmlAttribute{}, Acc, _Context) -> + Acc; +match_following_sibling(_Tok, #xmlNamespace{}, Acc, _Context) -> + Acc; + +match_following_sibling(Tok, N, Acc, Context) -> + #xmlNode{parents = Ps, node = Node} = N, + case Ps of + [#xmlNode{type = element, + node = #xmlElement{} = PNode}|_] -> + FollowingSiblings = lists:nthtail(get_position(Node), + get_content(PNode)), + lists:foldr( + fun(E, AccX) -> + ThisN = #xmlNode{type = node_type(E), + node = E, + parents = Ps}, + match_self(Tok, ThisN, AccX, Context) + end, Acc, FollowingSiblings); + _Other -> + Acc + end. + + +%% "The 'following' axis contains all nodes in the same document as the +%% context node that are after the context node in document order, excluding +%% any descendants and excluding attribute nodes and namespace nodes." +match_following(Tok, N, Acc, Context) -> + #xmlNode{parents = Ps, node = Node} = N, + case Ps of + [#xmlNode{type = element, + node = #xmlElement{} = PNode} = P|_] -> + FollowingSiblings = lists:nthtail(get_position(Node), + get_content(PNode)), + Acc0 = match_following(Tok, P, Acc, Context), + lists:foldr( + fun(E, AccX) -> + ThisN = #xmlNode{type = node_type(E), + node = E, + parents = Ps}, + match_descendant_or_self(Tok, ThisN, AccX, Context) + end, Acc0, FollowingSiblings); + _Other -> + Acc + end. + + +%% "The preceding-sibling axis contains all the preceding siblings of the +%% context node; if the context node is an attribute node or namespace node, +%% the preceding-sibling axis is empty." +match_preceding_sibling(_Tok, #xmlAttribute{}, Acc, _Context) -> + Acc; +match_preceding_sibling(_Tok, #xmlNamespace{}, Acc, _Context) -> + Acc; + +match_preceding_sibling(Tok, N, Acc, Context) -> + #xmlNode{parents = Ps, node = Node} = N, + case Ps of + [#xmlNode{type = element, + node = #xmlElement{} = PNode}|_] -> + PrecedingSiblings = lists:sublist(get_content(PNode), 1, + get_position(Node) - 1), + lists:foldr( + fun(E, AccX) -> + ThisN = #xmlNode{type = node_type(E), + node = E, + parents = Ps}, + match_self(Tok, ThisN, AccX, Context) + end, Acc, PrecedingSiblings); + _Other -> + Acc + end. + + +%% "The 'preceding' axis contains all nodes in the same document as the context +%% node that are before the context node in document order, exluding any +%% ancestors and excluding attribute nodes and namespace nodes." +match_preceding(Tok, N, Acc, Context) -> + #xmlNode{parents = Ps, node = Node} = N, + case Ps of + [#xmlNode{type = element, + node = #xmlElement{} = PNode} = P|_] -> + PrecedingSiblings = lists:sublist(get_content(PNode), 1, + get_position(Node) - 1), + Acc0 = lists:foldr( + fun(E, AccX) -> + ThisN = #xmlNode{type = node_type(E), + node = E, + parents = Ps}, + match_descendant_or_self(Tok, ThisN, + AccX, Context) + end, Acc, PrecedingSiblings), + match_preceding(Tok, P, Acc0, Context); + _Other -> + Acc + end. + + +%% "The 'attribute' axis contains the attributes of the context node; the +%% axis will be empty unless the context node is an element." +match_attribute(Tok, N, Acc, Context) -> + case N#xmlNode.type of + element -> + #xmlNode{parents = Ps, node = E} = N, + lists:foldr( + fun(A, AccX) -> + ThisN = #xmlNode{type = attribute, + node = A, + parents = [N|Ps]}, + match_self(Tok, ThisN, AccX, Context) + end, Acc, E#xmlElement.attributes); + _Other -> + %%[] + Acc + end. + +node_type(#xmlAttribute{}) -> attribute; +node_type(#xmlElement{}) -> element; +node_type(#xmlText{}) -> text; +node_type(#xmlPI{}) -> processing_instruction; +node_type(#xmlNamespace{}) -> namespace; +node_type(#xmlDocument{}) -> root_node. + +%% "The namespace axis contains the namespace nodes of the context node; +%% the axis will be empty unless the context node is an element." +%match_namespace(_Tok, _N, _Acc, _Context) -> + %% TODO: IMPLEMENT NAMESPACE AXIS +% erlang:fault(not_yet_implemented). + + +update_nodeset(Context = #xmlContext{axis_type = AxisType}, NodeSet) -> + MapFold = + case AxisType of + forward -> + mapfoldl; + reverse -> + mapfoldr + end, + {Result, _N} = + lists:MapFold(fun(Node, N) -> + {Node#xmlNode{pos = N}, N + 1} + end, 1, NodeSet), + Context#xmlContext{nodeset = Result}. + + + +node_test(F, N, Context) when is_function(F) -> + F(N, Context); +node_test({wildcard, _}, #xmlNode{type=ElAt}, _Context) + when ElAt==element; ElAt==attribute -> + true; +node_test({prefix_test, Prefix}, #xmlNode{node = N}, _Context) -> + case N of + #xmlElement{nsinfo = {Prefix, _}} -> true; + #xmlAttribute{nsinfo = {Prefix, _}} -> true; + _ -> + false + end; +node_test({name, {Tag, _Prefix, _Local}}, + #xmlNode{node = #xmlElement{name = Tag}}=_N, _Context) -> + %io:format("node_test({tag, ~p}, ~p) -> true.~n", [Tag, write_node(_N)]), + true; +node_test({name, {Tag, Prefix, Local}}, + #xmlNode{node = #xmlElement{name = Name, + expanded_name = EExpName, + nsinfo = {_Prefix1, _} + }}, Context) -> + case expanded_name(Prefix, Local, Context) of + [] -> + Res = (Tag == Name), + ?dbg("node_test(~p, ~p) -> ~p.~n", + [{Tag, Prefix, Local}, write_node(Name), Res]), + Res; + ExpName -> + Res = (ExpName == EExpName), + ?dbg("node_test(~p, ~p) -> ~p.~n", + [{Tag, Prefix, Local}, write_node(Name), Res]), + Res + end; +node_test({name, {_Tag, Prefix, Local}}, + #xmlNode{node = #xmlElement{name = Name, + expanded_name = _EExpName, + namespace = NS + }}, Context) -> + case expanded_name(Prefix, Local, Context) of + [] -> + ?dbg("node_test(~p, ~p) -> ~p.~n", + [{_Tag, Prefix, Local}, write_node(Name), false]), + false; + ExpName -> + Res = (ExpName == {NS#xmlNamespace.default,Name}), + ?dbg("node_test(~p, ~p) -> ~p.~n", + [{_Tag, Prefix, Local}, write_node(Name), Res]), + Res + end; +node_test({name, {Tag,_Prefix,_Local}}, + #xmlNode{node = #xmlAttribute{name = Tag}}, _Context) -> + true; +node_test({name, {_Tag, Prefix, Local}}, + #xmlNode{node = #xmlAttribute{expanded_name = {URI, Local}, + nsinfo = {_Prefix1, _}, + namespace = NS}}, _Context) -> + NSNodes = NS#xmlNamespace.nodes, + case lists:keysearch(Prefix, 1, NSNodes) of + {value, {_, URI}} -> + ?dbg("node_test(~, ~p) -> true.~n", + [{_Tag, Prefix, Local}, write_node(NSNodes)]), + true; + false -> + ?dbg("node_test(~, ~p) -> false.~n", + [{_Tag, Prefix, Local}, write_node(NSNodes)]), + false + end; +node_test({node_type, NT}, #xmlNode{node = N}, _Context) -> + case {NT, N} of + {text, #xmlText{}} -> + true; + {node, _} -> + true; + {attribute, #xmlAttribute{}} -> + true; + {namespace, #xmlNamespace{}} -> + true; + _ -> + false + end; +node_test({processing_instruction, {literal, _, Name}}, + #xmlNode{node = {processing_instruction, Name, _Data}}, _Context) -> + true; +node_test(_Other, _N, _Context) -> + %io:format("node_test(~p, ~p) -> false.~n", [_Other, write_node(_N)]), + false. + + +expanded_name(Prefix, Local, #xmlContext{namespace = NS}) -> + case lists:keysearch(Prefix, 1, NS) of + {value, {_, URI}} -> + {URI, list_to_atom(Local)}; + false -> + [] + end. + + +to_atom(A) when is_atom(A) -> A; +to_atom(S) when is_list(S) -> list_to_atom(S). + +to_string(A) when is_atom(A) -> atom_to_list(A); +to_string(S) when is_list(S) -> S. + + +get_content(#xmlElement{content = C}) when is_list(C) -> + C; +get_content(#xmlElement{content = F} = E) when is_function(F) -> + case F() of + C when is_list(C) -> + C; + _Other -> + exit({bad_content, E}) + end; +get_content(#xmlDocument{content = C}) when is_list(C) -> + C; +get_content(#xmlDocument{content = C}) -> + [C]. + + +get_position(#xmlElement{pos = N}) -> + N; +get_position(#xmlText{pos = N}) -> + N. diff --git a/lib/xmerl/src/xmerl_xpath_lib.erl b/lib/xmerl/src/xmerl_xpath_lib.erl new file mode 100644 index 0000000000..cfd0e36667 --- /dev/null +++ b/lib/xmerl/src/xmerl_xpath_lib.erl @@ -0,0 +1,53 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2008-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% +-module(xmerl_xpath_lib). + +-include("xmerl.hrl"). + +-export([eval/3]). + +-define(string(X), #xmlObj{type = string, + value = X}). +-define(number(X), #xmlObj{type = number, + value = X}). + + +eval(primary_expr,PrimExpr,C) -> + primary_expr(PrimExpr, C); +eval(predicate,Pred,C) -> + xmerl_xpath_pred:eval(Pred,C). + +primary_expr({number, N}, _C) -> + ?number(N); +primary_expr({literal, S}, _C) -> + ?string(S); +primary_expr({function_call, F, Args}, C) -> + case xmerl_xpath_pred:core_function(F) of + {true, F1} -> + xmerl_xpath_pred:F1(C, Args); + true -> + xmerl_xpath_pred:F(C, Args); + false -> + %% here, we should look up the function in the context provided + %% by the caller, but we haven't figured this out yet. + exit({not_a_core_function, F}) + end. + diff --git a/lib/xmerl/src/xmerl_xpath_parse.yrl b/lib/xmerl/src/xmerl_xpath_parse.yrl new file mode 100644 index 0000000000..37576b9e61 --- /dev/null +++ b/lib/xmerl/src/xmerl_xpath_parse.yrl @@ -0,0 +1,311 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% Description : Yecc spec for XPATH grammar +%% This version of the parser is based on the XPATH spec: +%% http://www.w3.org/TR/1999/REC-xpath-19991116 (XPATH version 1.0) + + +Nonterminals + 'LocationPath' + 'AbsoluteLocationPath' + 'RelativeLocationPath' + 'Step' +%% 'AxisSpecifier' + 'NodeTest' + 'Predicate' + 'PredicateExpr' + 'AbbreviatedAbsoluteLocationPath' + 'AbbreviatedRelativeLocationPath' + 'AbbreviatedStep' +%% 'AbbreviatedAxisSpecifier' + 'Expr' + 'PrimaryExpr' + 'FunctionCall' + 'Argument' + 'UnionExpr' + 'PathExpr' + 'FilterExpr' + 'OrExpr' + 'AndExpr' + 'EqualityExpr' + 'RelationalExpr' + 'AdditiveExpr' + 'MultiplicativeExpr' + 'UnaryExpr' +%% 'Operator' +%% 'OperatorName' + 'MultiplyOperator' + 'NameTest' + '<PredicateList>' + '<PredicateMember>' + '<ArgumentList>' + '<ArgumentMember>' + . + +Terminals + 'number' + 'axis' + 'node_type' + 'literal' + 'prefix_test' + 'var_reference' + 'function_name' + 'name' + 'processing-instruction' + 'wildcard' + '(' ')' '[' ']' '.' '..' '@' ',' '::' + 'and' 'or' 'mod' 'div' + '/' '//' '|' '+' '-' '=' '!=' '<' '<=' '>' '>=' + '*' + . + +Rootsymbol 'Expr'. + +Endsymbol '$end' . + +Left 100 'or' . +Left 200 'and' . +Left 300 '=' . +Left 300 '!=' . +Left 400 '<' . +Left 400 '>=' . +Left 400 '>' . +Left 400 '<=' . +Unary 500 '-' . + +Expect 2. + +%%------------------------------------------------------------ +%% Clauses +%% + +%% [1] +'LocationPath' -> 'RelativeLocationPath' : {path, rel, '$1'} . +'LocationPath' -> 'AbsoluteLocationPath' : {path, abs, '$1'}. + +%% [2] +'AbsoluteLocationPath' -> '/' 'RelativeLocationPath' : '$2' . +'AbsoluteLocationPath' -> '/' : '/' . + +%% [3] +'RelativeLocationPath' -> 'AbbreviatedAbsoluteLocationPath' : '$1' . +'RelativeLocationPath' -> 'Step' : '$1' . +'RelativeLocationPath' -> 'RelativeLocationPath' '/' 'Step' : + {refine, '$1', '$3'} . +'RelativeLocationPath' -> 'AbbreviatedRelativeLocationPath' : '$1' . + +%% [4] +'Step' -> 'axis' '::' 'NodeTest' '<PredicateList>' + : {step, {value('$1'), '$3', '$4'}} . +'Step' -> 'axis' '::' 'NodeTest' + : {step, {value('$1'), '$3', []}} . +'Step' -> '@' 'name' '<PredicateList>' + : {step, {value('$1'), '$2', '$3'}} . +'Step' -> '@' 'name' + : {step, {'attribute', '$2', []}} . +'Step' -> 'NodeTest' '<PredicateList>' + : {step, {'child', '$1', '$2'}} . +'Step' -> 'NodeTest' + : {step, {'child', '$1', []}} . +'Step' -> 'AbbreviatedStep' + : {abbrev_step, '$1'} . + + +'<PredicateList>' -> '<PredicateMember>' : lists:reverse('$1') . + + +'<PredicateMember>' -> '<PredicateMember>' 'Predicate' + : ['$2'|'$1'] . +'<PredicateMember>' -> 'Predicate' : ['$1'] . + + +%% [5] +%% 'AxisSpecifier' -> 'axis' '::' : '$1' . +%% 'AxisSpecifier' -> 'AbbreviatedAxisSpecifier' : '$1' . + + +%% [7] +'NodeTest' -> 'NameTest' : '$1' . +'NodeTest' -> 'node_type' '(' ')' : {node_type, value('$1')} . +'NodeTest' -> 'processing-instruction' '(' 'literal' ')' + : {processing_instruction, value('$3')} . + + +%% [8] +'Predicate' -> '[' 'PredicateExpr' ']' : {pred, '$2'} . + +%% [9] +'PredicateExpr' -> 'Expr' : '$1' . + +%% [10] +'AbbreviatedAbsoluteLocationPath' -> '//' 'RelativeLocationPath' + : {'//', '$2'} . + +%% [11] +'AbbreviatedRelativeLocationPath' -> 'RelativeLocationPath' '//' 'Step' + : {'$1', '//', '$3'} . + +%% [12] +'AbbreviatedStep' -> '.' : '$1' . +'AbbreviatedStep' -> '..' : '$1' . + +%% [13] +%% 'AbbreviatedAxisSpecifier' -> '$empty' : 'child' . +%% 'AbbreviatedAxisSpecifier' -> '@' : '$1' . + +%% [14] +'Expr' -> 'OrExpr' : '$1' . + +%% [15] +'PrimaryExpr' -> 'var_reference' : {variable_reference, value('$1')} . +'PrimaryExpr' -> '(' Expr ')' : '$2' . +'PrimaryExpr' -> 'literal' : {literal, value('$1')} . +'PrimaryExpr' -> 'number' : {number, value('$1')} . +'PrimaryExpr' -> 'FunctionCall' : '$1' . + + +%% [16] +'FunctionCall' -> 'function_name' '(' ')' : {function_call, value('$1'), []} . +'FunctionCall' -> 'function_name' '(' '<ArgumentList>' ')' + : {function_call, value('$1'), '$3'} . + +'<ArgumentList>' -> '<ArgumentMember>' : lists:reverse('$1') . + +'<ArgumentMember>' -> '<ArgumentMember>' ',' 'Argument' + : ['$3'|'$1'] . +'<ArgumentMember>' -> 'Argument' : ['$1'] . + + +%% [17] +'Argument' -> 'Expr' : '$1' . + + +%% [18] +'UnionExpr' -> 'PathExpr' : '$1' . +'UnionExpr' -> 'UnionExpr' '|' 'PathExpr' : {path, union, {'$1', '$3'}} . + + +%% [19] +'PathExpr' -> 'LocationPath' : '$1' . +'PathExpr' -> 'FilterExpr' : '$1' . +'PathExpr' -> 'FilterExpr' '/' 'RelativeLocationPath' : {refine, '$1', '$3'} . +'PathExpr' -> 'FilterExpr' '//' 'RelativeLocationPath' : {'$1', '//', '$3'} . + +%% [20] +'FilterExpr' -> 'PrimaryExpr' : '$1' . +'FilterExpr' -> 'FilterExpr' 'Predicate' : {path, filter, {'$1', '$2'}} . + + +%% [21] +'OrExpr' -> 'AndExpr' : '$1' . +'OrExpr' -> 'OrExpr' 'or' 'AndExpr' + : {bool, 'or', '$1', '$3'} . + + +%% [22] +'AndExpr' -> 'EqualityExpr' : '$1' . +'AndExpr' -> 'AndExpr' 'and' 'EqualityExpr' + : {bool, 'and', '$1', '$3'} . + +%% [23] +'EqualityExpr' -> 'RelationalExpr' : '$1' . +'EqualityExpr' -> 'EqualityExpr' '=' 'RelationalExpr' + : {comp, '=', '$1', '$3'} . +'EqualityExpr' -> 'EqualityExpr' '!=' 'RelationalExpr' + : {comp, '!=', '$1', '$3'} . + +%%[24] +'RelationalExpr' -> 'AdditiveExpr' : '$1' . +'RelationalExpr' -> 'RelationalExpr' '<' 'AdditiveExpr' + : {comp, '<', '$1', '$3'} . +'RelationalExpr' -> 'RelationalExpr' '>' 'AdditiveExpr' + : {comp, '>', '$1', '$3'} . +'RelationalExpr' -> 'RelationalExpr' '<=' 'AdditiveExpr' + : {comp, '<=', '$1', '$3'} . +'RelationalExpr' -> 'RelationalExpr' '>=' 'AdditiveExpr' + : {comp, '>=', '$1', '$3'} . + + +%% [25] +'AdditiveExpr' -> 'MultiplicativeExpr' : '$1' . +'AdditiveExpr' -> 'AdditiveExpr' '+' 'MultiplicativeExpr' + : {arith, '+', '$1', '$3'} . +'AdditiveExpr' -> 'AdditiveExpr' '-' 'MultiplicativeExpr' + : {arith, '-', '$1', '$3'} . + + +%% [26] +'MultiplicativeExpr' -> 'UnaryExpr' : '$1' . +'MultiplicativeExpr' -> 'MultiplicativeExpr' 'MultiplyOperator' 'UnaryExpr' + : {arith, '$2', '$1', '$3'} . +'MultiplicativeExpr' -> 'MultiplicativeExpr' 'div' 'UnaryExpr' + : {arith, 'div', '$1', '$3'} . +'MultiplicativeExpr' -> 'MultiplicativeExpr' 'mod' 'UnaryExpr' + : {arith, 'mod', '$1', '$3'} . + + +%% [27] +'UnaryExpr' -> 'UnionExpr' : '$1' . +'UnaryExpr' -> '-' UnaryExpr : {'negative', '$2'} . + + + +%% [32] +%% 'Operator' -> 'OperatorName' : '$1' . +%% 'Operator' -> 'MultiplyOperator' : '$1' . +%% 'Operator' -> '/' : '$1' . +%% 'Operator' -> '//' : '$1' . +%% 'Operator' -> '|' : '$1' . +%% 'Operator' -> '+' : '$1' . +%% 'Operator' -> '-' : '$1' . +%% 'Operator' -> '=' : '$1' . +%% 'Operator' -> '!=' : '$1' . +%% 'Operator' -> '<' : '$1' . +%% 'Operator' -> '<=' : '$1' . +%% 'Operator' -> '>' : '$1' . +%% 'Operator' -> '>=' : '$1' . + +%% [33] +%% 'OperatorName' -> 'and' : '$1' . +%% 'OperatorName' -> 'mod' : '$1' . +%% 'OperatorName' -> 'div' : '$1' . + +%% [34] +'MultiplyOperator' -> '*' : '*' . + + +%% [37] +'NameTest' -> 'wildcard' : {wildcard, value('$1')} . +'NameTest' -> 'prefix_test' : {prefix_test, value('$1')} . +'NameTest' -> 'name' : {name, value('$1')} . + + + +Erlang code. + +% token({Token, _Line}) -> +% Token; +% token({Token, _Line, _Value}) -> +% Token. + +value({Token, _Line}) -> + Token; +value({_Token, _Line, Value}) -> + Value. diff --git a/lib/xmerl/src/xmerl_xpath_pred.erl b/lib/xmerl/src/xmerl_xpath_pred.erl new file mode 100644 index 0000000000..451a09bee3 --- /dev/null +++ b/lib/xmerl/src/xmerl_xpath_pred.erl @@ -0,0 +1,808 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% Description : Helper module to xmerl_xpath: XPATH predicates. + +-module(xmerl_xpath_pred). + +%% API +-export([eval/2]). + + +%% internal functions (called via apply/3) +-export([boolean/1, boolean/2, + ceiling/2, + concat/2, + contains/2, + count/2, + floor/2, + fn_false/2, + fn_not/2, + fn_true/2, + id/2, + lang/2, + last/2, + 'local-name'/2, + 'namespace-uri'/2, + name/2, + string/2, + nodeset/1, + 'normalize-space'/2, + number/1, number/2, + position/2, + round/2, + 'starts-with'/2, + string/1, + 'string-length'/2, + substring/2, + 'substring-after'/2, + 'substring-before'/2, + sum/2, + translate/2]). +-export([core_function/1]). + +-include("xmerl.hrl"). +-include("xmerl_xpath.hrl"). + +%% -record(obj, {type, +%% value}). + + +-define(string(X), #xmlObj{type = string, + value = X}). +-define(nodeset(X), #xmlObj{type = nodeset, + value = X}). +-define(number(X), #xmlObj{type = number, + value = X}). +-define(boolean(X), #xmlObj{type = boolean, + value = X}). + + + + +eval(Expr, C = #xmlContext{context_node = #xmlNode{pos = Pos}}) -> + Obj = expr(Expr, C), + Res = case Obj#xmlObj.type of + number when Obj#xmlObj.value == Pos -> + true; + number -> + false; + boolean -> + Obj#xmlObj.value; + _ -> + mk_boolean(C, Obj) + end, +% io:format("eval(~p, ~p) -> ~p~n", [Expr, Pos, Res]), + Res. + + +string(X) -> + ?string(X). + +nodeset(X) -> + ?nodeset(X). + +number(X) -> + ?number(X). + +boolean(X) -> + ?boolean(X). + + +expr({arith, Op, E1, E2}, C) -> + arith_expr(Op, E1, E2, C); +expr({comp, Op, E1, E2}, C) -> + comp_expr(Op, E1, E2, C); +expr({bool, Op, E1, E2}, C) -> + bool_expr(Op, E1, E2, C); +expr({'negative', E}, C) -> + N = mk_number(C, E), + - N; +expr({number, N}, _C) -> + ?number(N); +expr({literal, S}, _C) -> + ?string(S); +expr({function_call, F, Args}, C) -> + case core_function(F) of + {true, F1} -> + ?MODULE:F1(C, Args); + true -> + ?MODULE:F(C, Args); + false -> + %% here, we should look up the function in the context provided + %% by the caller, but we haven't figured this out yet. + exit({not_a_core_function, F}) + end; +expr({path, Type, PathExpr}, C) -> + #state{context=#xmlContext{nodeset = NS}} = + xmerl_xpath:eval_path(Type, PathExpr, C), + ?nodeset(NS); +expr(Expr, _C) -> + exit({unknown_expr, Expr}). + + +arith_expr('+', E1, E2, C) -> + ?number(mk_number(C, E1) + mk_number(C, E2)); +arith_expr('-', E1, E2, C) -> + ?number(mk_number(C, E1) - mk_number(C, E2)); +arith_expr('*', E1, E2, C) -> + ?number(mk_number(C, E1) * mk_number(C, E2)); +arith_expr('div', E1, E2, C) -> + ?number(mk_number(C, E1) / mk_number(C, E2)); +arith_expr('mod', E1, E2, C) -> + ?number(mk_number(C, E1) rem mk_number(C, E2)). + +comp_expr('>', E1, E2, C) -> + N1 = expr(E1,C), + N2 = expr(E2,C), + ?boolean(compare_ineq_format(N1,N2,C) > compare_ineq_format(N2,N1,C)); +comp_expr('<', E1, E2, C) -> + N1 = expr(E1,C), + N2 = expr(E2,C), + ?boolean(compare_ineq_format(N1,N2,C) > compare_ineq_format(N2,N1,C)); +comp_expr('>=', E1, E2, C) -> + N1 = expr(E1,C), + N2 = expr(E2,C), + ?boolean(compare_ineq_format(N1,N2,C) > compare_ineq_format(N2,N1,C)); +comp_expr('<=', E1, E2, C) -> + N1 = expr(E1,C), + N2 = expr(E2,C), + ?boolean(compare_ineq_format(N1,N2,C) > compare_ineq_format(N2,N1,C)); +comp_expr('=', E1, E2, C) -> + N1 = expr(E1,C), + N2 = expr(E2,C), + ?boolean(compare_eq_format(N1,N2,C) == compare_eq_format(N2,N1,C)); +comp_expr('!=', E1, E2, C) -> + N1 = expr(E1,C), + N2 = expr(E2,C), + ?boolean(compare_eq_format(N1,N2,C) /= compare_eq_format(N2,N1,C)). + +bool_expr('or', E1, E2, C) -> + ?boolean(mk_boolean(C, E1) or mk_boolean(C, E2)); +bool_expr('and', E1, E2, C) -> + ?boolean(mk_boolean(C, E1) and mk_boolean(C, E2)). + +%% According to chapter 3.4 in XML Path Language ver 1.0 the format of +%% the compared objects are depending on the type of the other +%% object. +%% 1. Comparisons involving node-sets is treated equally despite +%% of which comparancy operand is used. In this case: +%% - node-set comp node-set: string values are used +%% - node-set comp number : ((node-set string value) -> number) +%% - node-set comp boolean : (node-set string value) -> boolean +%% 2. Comparisons when neither object is a node-set and the operand +%% is = or != the following transformation is done before comparison: +%% - if one object is a boolean the other is converted to a boolean. +%% - if one object is a number the other is converted to a number. +%% - otherwise convert both to the string value. +%% 3. Comparisons when neither object is a node-set and the operand is +%% <=, <, >= or > both objects are converted to a number. +compare_eq_format(N1=#xmlObj{type=T1},N2=#xmlObj{type=T2},C) when T1==nodeset; + T2==nodeset -> + compare_nseq_format(N1,N2,C); +compare_eq_format(N1=#xmlObj{type=T1},#xmlObj{type=T2},C) when T1==boolean; + T2==boolean -> + mk_boolean(C,N1); +compare_eq_format(N1=#xmlObj{type=T1},#xmlObj{type=T2},C) when T1==number; + T2==number -> + mk_number(C,N1); +compare_eq_format(N1,_,C) -> + mk_string(C,string_value(N1)). + +compare_ineq_format(N1=#xmlObj{type=T1}, + N2=#xmlObj{type=T2},C) when T1==nodeset; + T2==nodeset -> + compare_nseq_format(N1,N2,C); +compare_ineq_format(N1,_N2,C) -> + mk_number(C,N1). + +compare_nseq_format(N1=#xmlObj{type = number},_N2,C) -> + mk_number(C,N1); +compare_nseq_format(N1=#xmlObj{type = boolean},_N2,C) -> + mk_boolean(C,N1); +compare_nseq_format(N1=#xmlObj{type = string},_N2,C) -> + mk_string(C,N1); +compare_nseq_format(N1=#xmlObj{type = nodeset},_N2=#xmlObj{type=number},C) -> + %% transform nodeset value to its string-value + mk_number(C,string_value(N1)); +compare_nseq_format(N1=#xmlObj{type = nodeset},_N2=#xmlObj{type=boolean},C) -> + mk_boolean(C,N1); +compare_nseq_format(N1=#xmlObj{type = nodeset},_N2,C) -> + mk_string(C,string_value(N1)). + + +core_function('last') -> true; +core_function('position') -> true; +core_function('count') -> true; +core_function('id') -> true; +core_function('local-name') -> true; +core_function('namespace-uri') -> true; +core_function('name') -> true; +core_function('string') -> true; +core_function('concat') -> true; +core_function('starts-with') -> true; +core_function('contains') -> true; +core_function('substring-before') -> true; +core_function('substring-after') -> true; +core_function('string-length') -> true; +core_function('normalize-space') -> true; +core_function('translate') -> true; +core_function('boolean') -> true; +core_function('not') -> {true, fn_not}; +core_function('true') -> {true, fn_true}; +core_function('false') -> {true, fn_false}; +core_function('lang') -> true; +core_function('number') -> true; +core_function('sum') -> true; +core_function('floor') -> true; +core_function('ceiling') -> true; +core_function('round') -> true; +core_function(_) -> + false. + + +%%% node set functions + +%% number: last() +last(#xmlContext{nodeset = Set}, []) -> + ?number(length(Set)). + +%% number: position() +position(#xmlContext{context_node = #xmlNode{pos = Pos}}, []) -> + ?number(Pos). + +%% number: count(node-set) +count(C, [Arg]) -> + ?number(length(mk_nodeset(C, Arg))). + +%% node-set: id(object) +id(C, [Arg]) -> + WD = C#xmlContext.whole_document, + NS0 = [WD], + Obj = mk_object(C,Arg), + case Obj#xmlObj.type of + nodeset -> + NodeSet = Obj#xmlObj.value, + IdTokens = + lists:foldl( + fun(N, AccX) -> + StrVal = string_value(N), + TokensX = id_tokens(StrVal), + TokensX ++ AccX + end, [], NodeSet), + NewNodeSet = + xmerl_xpath:axis(descendant_or_self, + fun(Node) -> + attribute_test(Node, id, IdTokens) + end, C#xmlContext{nodeset = NS0}), + ?nodeset(NewNodeSet); + _ -> + StrVal = string_value(Obj#xmlObj.value), + IdTokens = id_tokens(StrVal), + NodeSet = [(WD#xmlNode.node)#xmlDocument.content], + NewNodeSet = lists:foldl( + fun(Tok, AccX) -> + select_on_attribute(NodeSet, id, Tok, AccX) + end, [], IdTokens), + ?nodeset(NewNodeSet) + + end. + +id_tokens(Str=#xmlObj{type=string}) -> + string:tokens(Str#xmlObj.value, " \t\n\r"). +%%id_tokens(Str) when list(Str) -> +%% string:tokens(Str, " \t\n\r"). + +attribute_test(#xmlNode{node = #xmlElement{attributes = Attrs}}, + Key, Vals) -> + case lists:keysearch(Key, #xmlAttribute.name, Attrs) of + {value, #xmlAttribute{value = V}} -> + lists:member(V, Vals); + _ -> + false + end; +attribute_test(_Node, _Key, _Vals) -> + false. + +%%% CONTINUE HERE!!!! + +%% string: local-name(node-set?) +'local-name'(C, []) -> + local_name1(default_nodeset(C)); + +'local-name'(C, [Arg]) -> + local_name1(mk_nodeset(C, Arg)). + +local_name1([]) -> + ?string([]); +local_name1([#xmlNode{type=element,node=El}|_]) -> + #xmlElement{name=Name,nsinfo=NSI} = El, + local_name2(Name,NSI); +local_name1([#xmlNode{type=attribute,node=Att}|_]) -> + #xmlAttribute{name=Name,nsinfo=NSI} = Att, + local_name2(Name,NSI); +local_name1([#xmlElement{name = Name, nsinfo = NSI}|_]) -> + local_name2(Name,NSI). +local_name2(Name, NSI) -> + case NSI of + {_Prefix, Local} -> + ?string(Local); + [] -> + ?string(atom_to_list(Name)) + end. + +%% string: namespace-uri(node-set?) +'namespace-uri'(C, []) -> + ns_uri(default_nodeset(C)); + +'namespace-uri'(C, [Arg]) -> + ns_uri(mk_nodeset(C, Arg)). + + +ns_uri([]) -> + ?string([]); +ns_uri([#xmlElement{nsinfo = NSI, namespace = NS}|_]) -> + ns_uri2(NSI,NS); +ns_uri([#xmlNode{type=element,node=El}|_]) -> + #xmlElement{nsinfo=NSI, namespace = NS} = El, + ns_uri2(NSI,NS); +ns_uri([#xmlNode{type=attribute,node=Att}|_]) -> + #xmlAttribute{nsinfo=NSI, namespace = NS} = Att, + ns_uri2(NSI,NS); +ns_uri(_) -> + ?string([]). + +ns_uri2(NSI,NS) -> + case NSI of + {Prefix, _} -> + case lists:keysearch(Prefix, 1, NS#xmlNamespace.nodes) of + false -> + ?string([]); + {value, {_K, V}} -> + string_value(V) + end; + [] -> + ?string([]) + end. + +%% name(node-set) -> xmlObj{type=string} +%% The name function returns a string containing the QName of the node +%% first in document order. The representation of the QName is not +%% standardized and applications have their own format. At +%% http://xml.coverpages.org/clarkNS-980804.html (the author of XPath) +%% adopts the format "namespace URI"+"local-name" but according to +%% other sources it is more common to use the format: +%% '{'"namespace URI"'}'"local-name". This function also uses this +%% latter form. +name(C,[]) -> + name1(default_nodeset(C)); +name(C, [Arg]) -> + name1(mk_nodeset(C, Arg)). +name1([]) -> + ?string([]); +name1(NodeSet) -> + NSVal = + case ns_uri(NodeSet) of + #xmlObj{value=NSStr} when NSStr =/= [] -> + "{"++NSStr++"}"; + _ -> + "" + end, + #xmlObj{value=LocalName} = local_name1(NodeSet), + ?string(NSVal++LocalName). + + + +%%% String functions + +%% string: string(object?) +string(C, []) -> + ns_string(default_nodeset(C)); +string(C, [Arg]) -> + string_value(mk_object(C, Arg)). + +ns_string([Obj|_]) -> + string_value(Obj). + +string_value(#xmlObj{type=nodeset,value=[]}) -> + ?string(""); +string_value(N=#xmlObj{type=nodeset}) -> + string_value(hd(N#xmlObj.value)); +string_value(N=#xmlObj{}) -> + string_value(N#xmlObj.value); +%% Needed also string_value for root_nodes, elements (concatenation of +%% al decsendant text nodes) and attribute nodes (normalized value). +string_value(A=#xmlNode{type=attribute}) -> + #xmlAttribute{value=AttVal}=A#xmlNode.node, + ?string(AttVal); +string_value(El=#xmlNode{type=element}) -> + #xmlElement{content=C} = El#xmlNode.node, + TextValue = fun(#xmlText{value=T},_Fun) -> T; + (#xmlElement{content=Cont},Fun) -> Fun(Cont,Fun); + (_,_) -> [] + end, + TextDecendants=fun(X) -> TextValue(X,TextValue) end, + ?string(lists:flatten(lists:map(TextDecendants,C))); +string_value(T=#xmlNode{type=text}) -> + #xmlText{value=Txt} = T#xmlNode.node, + ?string(Txt); +string_value(infinity) -> ?string("Infinity"); +string_value(neg_infinity) -> ?string("-Infinity"); +string_value(A) when is_atom(A) -> + ?string(atom_to_list(A)); +string_value(N) when is_integer(N) -> + ?string(integer_to_list(N)); +string_value(N) when is_float(N) -> + N1 = round(N * 10000000000000000), + ?string(strip_zeroes(integer_to_list(N1))); +string_value(Str) when is_list(Str) -> + ?string(Str). + +strip_zeroes(Str) -> + strip_zs(lists:reverse(Str), 15). + +strip_zs([H|T], 0) -> + lists:reverse(T) ++ [$., H]; +strip_zs("0" ++ T, N) -> + strip_zs(T, N-1); +strip_zs([H|T], N) -> + strip_zs(T, N-1, [H]). + +strip_zs([H|T], 0, Acc) -> + lists:reverse(T) ++ [$.,H|Acc]; +strip_zs([H|T], N, Acc) -> + strip_zs(T, N-1, [H|Acc]). + + +%% string: concat(string, string, string*) +concat(C, Args = [_, _|_]) -> + Strings = [mk_string(C, A) || A <- Args], + ?string(lists:concat(Strings)). + +%% boolean: starts-with(string, string) +'starts-with'(C, [A1, A2]) -> + ?boolean(lists:prefix(mk_string(C, A2), mk_string(C, A1))). + +%% boolean: contains(string, string) +contains(C, [A1, A2]) -> + Pos = string:str(mk_string(C, A1), mk_string(C, A2)), + ?boolean(Pos > 0). + +%% string: substring-before(string, string) +'substring-before'(C, [A1, A2]) -> + S1 = mk_string(C, A1), + S2 = mk_string(C, A2), + Pos = string:str(S1, S2), + ?string(string:substr(S1, 1, Pos)). + +%% string: substring-after(string, string) +'substring-after'(C, [A1, A2]) -> + S1 = mk_string(C, A1), + S2 = mk_string(C, A2), + case string:str(S1, S2) of + 0 -> + ?string([]); + Pos -> + ?string(string:substr(S1, Pos)) + end. + +%% string: substring(string, number, number?) +substring(C, [A1, A2]) -> + S = mk_string(C, A1), + Pos = mk_integer(C, A2), + ?string(string:substr(S, Pos)); +substring(C, [A1, A2, A3]) -> + S = mk_string(C, A1), + Pos = mk_integer(C, A2), + Length = mk_integer(C, A3), + ?string(string:substr(S, Pos, Length)). + + +%% number: string-length(string?) +'string-length'(C = #xmlContext{context_node = N}, []) -> + length(mk_string(C, string_value(N))); + +'string-length'(C, [A]) -> + length(mk_string(C, A)). + + +%% string: normalize-space(string?) +'normalize-space'(C = #xmlContext{context_node = N}, []) -> + normalize(mk_string(C, string_value(N))); + +'normalize-space'(C, [A]) -> + normalize(mk_string(C, A)). + + +%% string: translate(string, string, string) +translate(C, [A1, A2, A3]) -> + S1 = mk_string(C, A1), + S2 = mk_string(C, A2), + S3 = mk_string(C, A3), + ?string(translate1(S1, translations(S2, S3))). + +translate1([H|T], Xls) -> + case lists:keysearch(H, 1, Xls) of + {value, {_, remove}} -> + translate1(T, Xls); + {value, {_, replace, H1}} -> + [H1|translate1(T, Xls)]; + false -> + [H|translate1(T, Xls)] + end; +translate1([], _) -> + []. + +translations([H|T], [H1|T1]) -> + [{H, replace, H1}|translations(T, T1)]; +translations(Rest, []) -> + [{X, remove} || X <- Rest]; +translations([], _Rest) -> + []. + + + +%% boolean: boolean(object) +boolean(C, [Arg]) -> + ?boolean(mk_boolean(C, Arg)). + +%% boolean: not(boolean) -> +fn_not(C, [Arg]) -> + ?boolean(not(mk_boolean(C, Arg))). + +%% boolean: true() -> +fn_true(_C, []) -> + ?boolean(true). + +%% boolean: false() -> +fn_false(_C, []) -> + ?boolean(false). + +%% boolean: lang(string) -> +lang(C = #xmlContext{context_node = N}, [Arg]) -> + S = mk_string(C, Arg), + Lang = + case N of + #xmlElement{language = L} -> L; + #xmlAttribute{language = L} -> L; + #xmlText{language = L} -> L; + #xmlComment{language = L} -> L; + _ -> [] + end, + case Lang of + [] -> + ?boolean(false); + _ -> + ?boolean(match_lang(upcase(S), upcase(Lang))) + end. + + +upcase([H|T]) when H >= $a, H =< $z -> + [H+($A-$a)|upcase(T)]; +upcase([H|T]) -> + [H|upcase(T)]; +upcase([]) -> + []. + +match_lang([H|T], [H|T1]) -> + match_lang(T, T1); +match_lang([], "-" ++ _) -> + true; +match_lang([], []) -> + true; +match_lang(_, _) -> + false. + + + +%% number: number(object) +number(C = #xmlContext{context_node = N}, []) -> + ?number(mk_number(C, string(C, N))); +number(C, [Arg]) -> + ?number(mk_number(C, Arg)). + + +sum(C, [Arg]) -> + NS = mk_nodeset(C, Arg), + lists:foldl( + fun(N, Sum) -> + Sum + mk_number(C, string(C, N)) + end, 0, NS). + +floor(C, [Arg]) -> + Num = mk_number(C, Arg), + case trunc(Num) of + Num1 when Num1 > Num -> + ?number(Num1-1); + Num1 -> + ?number(Num1) + end. + +ceiling(C, [Arg]) -> + Num = mk_number(C, Arg), + case trunc(Num) of + Num1 when Num1 < Num -> + ?number(Num1+1); + Num1 -> + ?number(Num1) + end. + + +round(C, [Arg]) -> + case mk_number(C, Arg) of + A when is_atom(A) -> + A; + N when is_integer(N) -> + N; + F when is_float(F) -> + round(F) + end. + + +select_on_attribute([E = #xmlElement{attributes = Attrs}|T], K, V, Acc) -> + case lists:keysearch(K, #xmlAttribute.name, Attrs) of + {value, #xmlAttribute{value = V}} -> + Acc2 = select_on_attribute(E#xmlElement.content,K,V,[E|Acc]), + select_on_attribute(T, K, V, Acc2); + _ -> + Acc2 = select_on_attribute(E#xmlElement.content,K,V,Acc), + select_on_attribute(T, K, V, Acc2) + end; +select_on_attribute([H|T], K, V, Acc) when is_record(H,xmlText) -> + select_on_attribute(T, K, V, Acc); +select_on_attribute([], _K, _V, Acc) -> + Acc. + + +%%%% + +mk_nodeset(_C0, #xmlContext{nodeset = NS}) -> + NS; +mk_nodeset(_C0, #xmlObj{type = nodeset, value = NS}) -> + NS; +mk_nodeset(C0, Expr) -> + case expr(Expr, C0) of + #xmlObj{type = nodeset, value = NS} -> + NS; + Other -> + exit({expected_nodeset, Other}) + end. + + +default_nodeset(#xmlContext{context_node = N}) -> + [N]. + + +mk_object(_C0, Obj = #xmlObj{}) -> + Obj; +mk_object(C0, Expr) -> + expr(Expr, C0). + + +mk_string(_C0, #xmlObj{type = string, value = V}) -> + V; +mk_string(C0, Obj = #xmlObj{}) -> + mk_string(C0,string_value(Obj)); +mk_string(C0, Expr) -> + mk_string(C0, expr(Expr, C0)). + + + +mk_integer(_C0, #xmlObj{type = number, value = V}) when is_float(V) -> + round(V); +mk_integer(_C0, #xmlObj{type = number, value = V}) when is_integer(V) -> + V; +mk_integer(C, Expr) -> + mk_integer(C, expr(Expr, C)). + + +mk_number(_C, #xmlObj{type = string, value = V}) -> + scan_number(V); +mk_number(_C, #xmlObj{type = number, value = V}) -> + V; +mk_number(C, N=#xmlObj{type = nodeset}) -> + mk_number(C,string_value(N)); +mk_number(_C, #xmlObj{type = boolean, value = false}) -> + 0; +mk_number(_C, #xmlObj{type = boolean, value = true}) -> + 1; +mk_number(C, Expr) -> + mk_number(C, expr(Expr, C)). + + +mk_boolean(_C, #xmlObj{type = boolean, value = V}) -> + V; +mk_boolean(_C, #xmlObj{type = number, value = 0}) -> + false; +mk_boolean(_C, #xmlObj{type = number, value = V}) when is_float(V) ; is_integer(V) -> + true; +mk_boolean(_C, #xmlObj{type = nodeset, value = []}) -> + false; +mk_boolean(_C, #xmlObj{type = nodeset, value = _V}) -> + true; +mk_boolean(_C, #xmlObj{type = string, value = []}) -> + false; +mk_boolean(_C, #xmlObj{type = string, value = _V}) -> + true; +mk_boolean(C, Expr) -> + mk_boolean(C, expr(Expr, C)). + + +normalize([H|T]) when ?whitespace(H) -> + normalize(T); +normalize(Str) -> + ContF = fun(_ContF, RetF, _S) -> + RetF() + end, + normalize(Str, + #xmerl_scanner{acc_fun = fun() -> exit(acc_fun) end, + event_fun = fun() -> exit(event_fun) end, + hook_fun = fun() -> exit(hook_fun) end, + continuation_fun = ContF}, + []). + + +normalize(Str = [H|_], S, Acc) when ?whitespace(H) -> + case xmerl_scan:accumulate_whitespace(Str, S, preserve, Acc) of + {" " ++ Acc1, [], _S1} -> + lists:reverse(Acc1); + {Acc1, [], _S1} -> + lists:reverse(Acc1); + {Acc1, T1, S1} -> + normalize(T1, S1, Acc1) + end; +normalize([H|T], S, Acc) -> + normalize(T, S, [H|Acc]); +normalize([], _S, Acc) -> + lists:reverse(Acc). + + +scan_number([H|T]) when ?whitespace(H) -> + scan_number(T); +scan_number("-" ++ T) -> + case catch xmerl_xpath_scan:scan_number(T) of + {{number, N}, Tail} -> + case is_all_white(Tail) of + true -> + N; + false -> + 'NaN' + end; + _Other -> + 'NaN' + end; +scan_number(T) -> + case catch xmerl_xpath_scan:scan_number(T) of + {{number, N}, Tail} -> + case is_all_white(Tail) of + true -> + N; + false -> + 'NaN' + end; + _Other -> + 'NaN' + end. + +is_all_white([H|T]) when ?whitespace(H) -> + is_all_white(T); +is_all_white([_H|_T]) -> + false; +is_all_white([]) -> + true. diff --git a/lib/xmerl/src/xmerl_xpath_scan.erl b/lib/xmerl/src/xmerl_xpath_scan.erl new file mode 100644 index 0000000000..10e2756e74 --- /dev/null +++ b/lib/xmerl/src/xmerl_xpath_scan.erl @@ -0,0 +1,308 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% Description : Token scanner for XPATH grammar + +%%%---------------------------------------------------------------------- +%%% +%%% The XPATH grammar is a bit tricky, due to operator overloading. +%%% This version of the scanner is based on the XPATH spec: +%%% http://www.w3.org/TR/1999/REC-xpath-19991116 (XPATH version 1.0) +%%% +%%% Quote from the spec: +%%% +%%% "The following special tokenization rules must be applied in the order +%%% specified to disambiguate the ExprToken grammar: +%%% +%%% o If there is a preceding token and the preceding token is not one of +%%% @, ::. (, [, or an Operator, then a * must be recognized as a +%%% MultiplyOperator and an NCName must be recognized as an OperatorName +%%% o If the character following an NCName (possible after intervening +%%% ExprWhiteSpace) is (, then the token must be recognized as a NodeType +%%% or a FunctionName. +%%% o If the two characters following an NCName (possible after intervening +%%% ExprWhiteSpace) are ::, then the token must be recognized as an +%%% AxisName. +%%% o Otherwise, the token must not be recognized as a MultiplyOperator, an +%%% OperatorName, a NodeType, a FunctionName, or an AxisName." +%%%---------------------------------------------------------------------- + +-module(xmerl_xpath_scan). + + +%% main API +-export([tokens/1]). + +%% exported helper functions +-export([scan_number/1]). + +-include("xmerl.hrl"). + +-define(L, 1). + + +tokens(Str) -> + tokens(strip_ws(Str), []). + +tokens([], Acc) -> + lists:reverse([{'$end', ?L, '$end'}|Acc]); +tokens(Str, Acc) -> + case scan_token(Str, Acc) of + {rescan, NewStr} -> + tokens(NewStr, Acc); + {Token, T} -> + tokens(strip_ws(T), [Token|Acc]) + end. + +%% Expr Tokens +scan_token("(" ++ T, _A) -> {{'(', ?L, '('}, T}; +scan_token(")" ++ T, _A) -> {{')', ?L, ')'}, T}; +scan_token("[" ++ T, _A) -> {{'[', ?L, '['}, T}; +scan_token("]" ++ T, _A) -> {{']', ?L, ']'}, T}; +scan_token(".." ++ T, _A) -> {rescan,"parent::node()" ++ T} ; + % {{'..',?L,'..'}, T}; +scan_token("@" ++ T, _A) -> {rescan,"attribute::" ++ T}; + % {{'@',?L,'@'},T}; +scan_token("," ++ T, _A) -> {{',', ?L, ','}, T}; +scan_token("::" ++ T, _A) -> {{'::', ?L, '::'}, T}; + +%% operators +scan_token("//" ++ T, _A) -> {rescan,"/descendant-or-self::node()/" ++ T}; + % {{'//',?L,'//'},T}; +scan_token("/" ++ T, _A) -> {{'/', ?L, '/'}, T}; +scan_token("|" ++ T, _A) -> {{'|', ?L, '|'}, T}; +scan_token("+" ++ T, _A) -> {{'+', ?L, '+'}, T}; +scan_token("-" ++ T, _A) -> {{'-', ?L, '-'}, T}; +scan_token("=" ++ T, _A) -> {{'=', ?L, '='}, T}; +scan_token("!=" ++ T, _A) -> {{'!=', ?L, '!='}, T}; +scan_token("<=" ++ T, _A) -> {{'<=', ?L, '<='}, T}; +scan_token("<" ++ T, _A) -> {{'<', ?L, '<'}, T}; +scan_token(">=" ++ T, _A) -> {{'>=', ?L, '>='}, T}; +scan_token(">" ++ T, _A) -> {{'>', ?L, '>'}, T}; + +scan_token("*" ++ T, A) -> + Tok = + case A of + [{X,_,_}|_] -> + case special_token(X) of + false -> + {'*', ?L, '*'}; + true -> + {'wildcard', ?L, 'wildcard'} + end; + _ -> + {'wildcard', ?L, 'wildcard'} + end, + {Tok, T}; + +%% numbers +scan_token(Str = [H|_], _A) when H >= $0, H =< $9 -> + scan_number(Str); +scan_token(Str = [$., H|_], A) when H >= $0, H =< $9 -> + scan_number(Str, A); +scan_token("." ++ T, _A) -> +% {{'.', ?L, '.'}, T}; + {rescan, "self::node()" ++ T}; + +%% Variable Reference +scan_token([$$|T], _A) -> + {{Prefix, Local}, T1} = scan_name(T), + case Prefix of + [] -> + {{var_reference, ?L, list_to_atom(Local)}, T1}; + _ -> + {{var_reference, ?L, list_to_atom(Prefix++":"++Local)}, T1} + end; + +scan_token([H|T], _A) when H == $" ; H == $' -> + {Literal, T1} = scan_literal(T, H, []), + {{literal, ?L, Literal}, T1}; + +scan_token(T, A) -> + {{Prefix, Local}, T1} = scan_name(T), + case A of + [{X,_,_}|_] -> + case special_token(X) of + false -> + operator_name(Prefix, Local, T1); + true -> + other_name(Prefix, Local, strip_ws(T1)) + end; + _ -> + other_name(Prefix, Local, T1) + end. + +operator_name([], "and", T) -> {{'and', ?L, 'and'}, T}; +operator_name([], "or", T) -> {{'or', ?L, 'or'}, T}; +operator_name([], "mod", T) -> {{'mod', ?L, 'mod'}, T}; +operator_name([], "div", T) -> {{'div', ?L, 'div'}, T}. + + +other_name(Prefix, [], "*" ++ T) -> + %% [37] NameTest ::= '*' | NCName ':' '*' | QName + {{prefix_test, ?L, Prefix}, T}; +other_name(Prefix, Local, T = "(" ++ _) -> + node_type_or_function_name(Prefix, Local, T); +other_name(Prefix, Local, T = "::" ++ _) -> + axis(Prefix, Local, T); +other_name([], Local, T) -> + {{name, ?L, {list_to_atom(Local), [], Local}}, T}; +other_name(Prefix, Local, T) -> + {{name, ?L, {list_to_atom(Prefix++":"++Local), Prefix, Local}}, T}. + + + +%% node types +node_type_or_function_name([], "comment", T) -> + {{node_type, ?L, comment}, T}; +node_type_or_function_name([], "text", T) -> + {{node_type, ?L, text}, T}; +node_type_or_function_name([], "processing-instruction", T) -> + {{'processing-instruction', ?L, 'processing-instruction'}, T}; +node_type_or_function_name([], "node", T) -> + {{node_type, ?L, node}, T}; +node_type_or_function_name(Prefix, Local, T) -> + {{function_name, ?L, list_to_atom(Prefix ++ Local)}, T}. + + +%% axis names +axis([], "ancestor-or-self", T) -> {{axis, ?L, ancestor_or_self}, T}; +axis([], "ancestor", T) -> {{axis, ?L, ancestor}, T}; +axis([], "attribute", T) -> {{axis, ?L, attribute}, T}; +axis([], "child", T) -> {{axis, ?L, child}, T}; +axis([], "descendant-or-self", T) -> {{axis, ?L, descendant_or_self}, T}; +axis([], "descendant", T) -> {{axis, ?L, descendant}, T}; +axis([], "following-sibling", T) -> {{axis, ?L, following_sibling}, T}; +axis([], "following", T) -> {{axis, ?L, following}, T}; +axis([], "namespace", T) -> {{axis, ?L, namespace}, T}; +axis([], "parent", T) -> {{axis, ?L, parent}, T}; +axis([], "preceding-sibling", T) -> {{axis, ?L, preceding_sibling}, T}; +axis([], "preceding", T) -> {{axis, ?L, preceding}, T}; +axis([], "self", T) -> {{axis, ?L, self}, T}. + + + + +scan_literal([H|T], H, Acc) -> + {lists:reverse(Acc), T}; +scan_literal([H|T], Delim, Acc) -> + scan_literal(T, Delim, [H|Acc]). + + +scan_name([H1, H2 | T]) when H1 == $: ; H1 == $_ -> + if ?whitespace(H2) -> + exit({invalid_name, [H1, H2, '...']}); + true -> + scan_prefix(T, [H2, H1]) + end; +scan_name([H|T]) -> + case xmerl_lib:is_letter(H) of + true -> + scan_prefix(T, [H]); + false -> + exit({invalid_name, lists:sublist([H|T], 1, 6)}) + end; +scan_name(Str) -> + exit({invalid_name, lists:sublist(Str, 1, 6)}). + +scan_prefix([], Acc) -> + {{[], lists:reverse(Acc)}, []}; +scan_prefix(Str = [H|_], Acc) when ?whitespace(H) -> + {{[], lists:reverse(Acc)}, Str}; +scan_prefix(T = "::" ++ _, Acc) -> + %% This is the next token + {{[], lists:reverse(Acc)}, T}; +scan_prefix(":" ++ T, Acc) -> + {LocalPart, T1} = scan_local_part(T, []), + Prefix = lists:reverse(Acc), + {{Prefix, LocalPart}, T1}; +scan_prefix(Str = [H|T], Acc) -> + case xmerl_lib:is_namechar(H) of + true -> + scan_prefix(T, [H|Acc]); + false -> + {{[], lists:reverse(Acc)}, Str} + end. + +scan_local_part([], Acc) -> + {lists:reverse(Acc), []}; +scan_local_part(Str = [H|_], Acc) when ?whitespace(H) -> + {lists:reverse(Acc), Str}; +scan_local_part(Str = [H|T], Acc) -> + case xmerl_lib:is_namechar(H) of + true -> + scan_local_part(T, [H|Acc]); + false -> + {lists:reverse(Acc), Str} + end. + + +scan_number(T) -> + scan_number(T, []). + +scan_number([], Acc) -> + {{number, ?L, list_to_integer(lists:reverse(Acc))}, []}; +scan_number("." ++ T, []) -> + {Digits, T1} = scan_digits(T, ".0"), + Number = list_to_float(Digits), + {{number, ?L, Number}, T1}; +scan_number("." ++ T, Acc) -> + {Digits, T1} = scan_digits(T, "." ++ Acc), + Number = list_to_float(Digits), + {{number, ?L, Number}, T1}; +scan_number([H|T], Acc) when H >= $0, H =< $9 -> + scan_number(T, [H|Acc]); +scan_number(T, Acc) -> + {{number, ?L, list_to_integer(lists:reverse(Acc))}, T}. + +scan_digits([], Acc) -> + {lists:reverse(Acc), []}; +scan_digits([H|T], Acc) when H >= $0, H =< $9 -> + scan_digits(T, [H|Acc]); +scan_digits(T, Acc) -> + {lists:reverse(Acc), T}. + + +strip_ws([H|T]) when ?whitespace(H) -> + strip_ws(T); +strip_ws(T) -> + T. + + +special_token('@') -> true; +special_token('::') -> true; +special_token('(') -> true; +special_token('[') -> true; +special_token('/') -> true; +special_token('//') -> true; +special_token('|') -> true; +special_token('+') -> true; +special_token('-') -> true; +special_token('=') -> true; +special_token('!=') -> true; +special_token('<') -> true; +special_token('<=') -> true; +special_token('>') -> true; +special_token('>=') -> true; +special_token('and') -> true; +special_token('or') -> true; +special_token('mod') -> true; +special_token('div') -> true; +special_token(_) -> + false. diff --git a/lib/xmerl/src/xmerl_xs.erl b/lib/xmerl/src/xmerl_xs.erl new file mode 100644 index 0000000000..f42a470a43 --- /dev/null +++ b/lib/xmerl/src/xmerl_xs.erl @@ -0,0 +1,123 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2003-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% Description : Implements XSLT like transformations in Erlang + +%% @doc +% Erlang has similarities to XSLT since both languages +% have a functional programming approach. Using <code>xmerl_xpath</code> +% it is possible to write XSLT like transforms in Erlang. +% +% <p>XSLT stylesheets are often used when transforming XML +% documents, to other XML documents or (X)HTML for presentation. +% XSLT contains quite many +% functions and learning them all may take some effort. +% This document assumes a basic level of +% understanding of XSLT. +% </p> +% <p>Since XSLT is based on a functional programming approach +% with pattern matching and recursion it is possible to write +% similar style sheets in Erlang. At least for basic +% transforms. This +% document describes how to use the XPath implementation together +% with Erlangs pattern matching and a couple of functions to write +% XSLT like transforms.</p> +% <p>This approach is probably easier for an Erlanger but +% if you need to use real XSLT stylesheets in order to "comply to +% the standard" there is an adapter available to the Sablotron +% XSLT package which is written i C++. +% See also the <a href="xmerl_xs_examples.html">Tutorial</a>. +% </p> + +-module(xmerl_xs). + +-export([xslapply/2, value_of/1, select/2, built_in_rules/2 ]). +-include("xmerl.hrl"). + + +%% @spec xslapply(Function, EList::list()) -> List +%% Function = () -> list() +%% @doc xslapply is a wrapper to make things look similar to +%% xsl:apply-templates. +%% +%% <p>Example, original XSLT:</p><br/><pre> +%% <xsl:template match="doc/title"> +%% <h1> +%% <xsl:apply-templates/> +%% </h1> +%% </xsl:template> +%% </pre> +%% +%% <p>becomes in Erlang:</p><br/><pre> +%% template(E = #xmlElement{ parents=[{'doc',_}|_], name='title'}) -> +%% ["<h1>", +%% xslapply(fun template/1, E), +%% "</h1>"]; +%% </pre> + +xslapply(Fun, EList) when is_list(EList) -> + lists:map( Fun, EList); +xslapply(Fun, E = #xmlElement{})-> + lists:map( Fun, E#xmlElement.content). + + +%% @spec value_of(E) -> List +%% E = unknown() +%% +%% @doc Concatenates all text nodes within the tree. +%% +%% <p>Example:</p><br/><pre> +%% <xsl:template match="title"> +%% <div align="center"> +%% <h1><xsl:value-of select="." /></h1> +%% </div> +%% </xsl:template> +%% </pre> +%% +%% <p>becomes:</p><br/> <pre> +%% template(E = #xmlElement{name='title'}) -> +%% ["<div align="center"><h1>", +%% value_of(select(".", E)), "</h1></div>"] +%% </pre> +value_of(E)-> + lists:reverse(xmerl_lib:foldxml(fun value_of1/2, [], E)). + +value_of1(#xmlText{}=T1, Accu)-> + [xmerl_lib:export_text(T1#xmlText.value)|Accu]; +value_of1(_, Accu) -> + Accu. + +%% @spec select(String::string(),E)-> E +%% +%% @doc Extracts the nodes from the xml tree according to XPath. +%% @see value_of/1 +select(Str,E)-> + xmerl_xpath:string(Str,E). + +%% @spec built_in_rules(Fun, E) -> List +%% +%% @doc The default fallback behaviour. Template funs should end with: +%% <br/><code>template(E) -> built_in_rules(fun template/1, E)</code>. +built_in_rules(Fun, E = #xmlElement{})-> + lists:map(Fun, E#xmlElement.content); +built_in_rules(_Fun, E = #xmlText{}) -> + xmerl_lib:export_text(E#xmlText.value); +built_in_rules(_Fun, E = #xmlAttribute{}) -> + E#xmlAttribute.value; +built_in_rules(_Fun, _E) ->[]. diff --git a/lib/xmerl/src/xmerl_xsd.erl b/lib/xmerl/src/xmerl_xsd.erl new file mode 100644 index 0000000000..c7bca86205 --- /dev/null +++ b/lib/xmerl/src/xmerl_xsd.erl @@ -0,0 +1,5710 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2006-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +%% @doc Interface module for XML Schema vlidation. +%% It handles the W3.org +%% <a href="http://www.w3.org/XML/Schema#dev">specifications</a> +%% of XML Schema second edition 28 october 2004. For an introduction to +%% XML Schema study <a href="http://www.w3.org/TR/xmlschema-0/">part 0.</a> +%% An XML structure is validated by xmerl_xsd:validate/[2,3]. +%% @type global_state(). <p>The global state of the validator. It is +%% representated by the <code>#xsd_state{}</code> record. +%% </p> +%% @type option_list(). <p>Options allow to customize the behaviour of the +%% validation. +%% </p> +%% Possible options are : +%% <dl> +%% <dt><code>{tab2file,boolean()}</code></dt> +%% <dd>Enables saving of abstract structure on file for debugging +%% purpose.</dd> +%% <dt><code>{xsdbase,filename()}</code></dt> +%% <dd>XSD Base directory.</dd> +%% <dt><code>{fetch_fun,FetchFun}</code></dt> +%% <dd>Call back function to fetch an external resource.</dd> +%% <dt><code>{fetch_path,PathList}</code></dt> +%% <dd>PathList is a list of directories to search when fetching files. +%% If the file in question is not in the fetch_path, the URI will +%% be used as a file name.</dd> +%% <dt><code>{state,State}</code></dt> +%% <dd>It is possible by this option to provide a state with process +%% information from an earlier validation.</dd> +%% </dl> +%%%------------------------------------------------------------------- +-module(xmerl_xsd). + +%%---------------------------------------------------------------------- +%% Include files +%%---------------------------------------------------------------------- +-include("xmerl.hrl"). +-include("xmerl_internal.hrl"). +-include("xmerl_xsd.hrl"). +-include_lib("kernel/include/file.hrl"). + +%%---------------------------------------------------------------------- +%% External exports +%%---------------------------------------------------------------------- +-export([ + validate/2,validate/3,process_validate/2,process_validate/3, + process_schema/1,process_schema/2, + process_schemas/1,process_schemas/2, + state2file/1,state2file/2,file2state/1,format_error/1 + ]). + +%%---------------------------------------------------------------------- +%% Internal exports +%%---------------------------------------------------------------------- +-export([print_table/1]). +%%-export([whitespace/1]). + +%%---------------------------------------------------------------------- +%% Imports +%%---------------------------------------------------------------------- +-import(xmerl_lib,[is_facet/1, is_builtin_simple_type/1, is_xsd_string/1]). +-import(xmerl_xsd_type,[facet_fun/2]). +-import(lists,[reverse/1,reverse/2,foldl/3,member/2,filter/2,flatten/1,map/2, + splitwith/2,mapfoldl/3,keysearch/3,keymember/3, + keyreplace/4,keydelete/3]). + + + +%%====================================================================== +%% Functions +%%====================================================================== + +%% @spec validate(Element,State) -> Result +%% @equiv validate(Element,State,[]) +validate(Xml,State) -> + validate(Xml,State,[]). + +%% @spec validate(Element,State,Options) -> Result +%% Element = XmlElement +%% Options = option_list() +%% Result = {ValidElement,global_state()} | {error,Reasons} +%% ValidElement = XmlElement +%% State = global_state() +%% Reasons = [ErrorReason] | ErrorReason +%% @doc Validates a parsed well-formed XML element (Element). +%% <p>A call to validate/2 or validate/3 must provide a well formed +%% parsed XML element <code>#xmlElement{}</code> and a State, +%% <code>global_state()</code>, which holds necessary information from +%% an already processed schema. +%% Thus validate enables reuse of the schema information and +%% therefore if one shall validate several times towards the same +%% schema it reduces time consumption.</p> +%% <p>The result, ValidElement, is the valid element that conforms to the +%% post-schema-validation infoset. When the validator finds an error it +%% tries to continue and reports a list of all errors found. In those cases +%% an unexpected error is found it may cause a single error reason. +%% </p> +%% <p> Usage example:</p> +%% <p> +%% <code>1>{E,_} = xmerl_scan:file("my_XML_document.xml").</code><br/> +%% <code>2>{ok,S} = xmerl_xsd:process_schema("my_XML_Schema.xsd").</code><br/> +%% <code>3>{E2,_} = xmerl_xsd:validate(E,S).</code> +%% </p> +%% <p> Observe that E2 may differ from E if for instance there are default +%% values defined in <code>my_XML_Schema.xsd</code>.</p> +validate(Xml,State,Opts) when is_record(State,xsd_state) -> + S2 = initiate_state2(State,Opts), + S3 = validation_options(S2,Opts), + validate3(S3#xsd_state.schema_name,Xml,S3). + +%% @spec state2file(State) -> ok | {error,Reason} +%% @doc Same as state2file(State,SchemaName) +%% +%% The name of the saved file is the same as the name of the +%% schema, but with <code>.xss</code> extension. +state2file(S=#xsd_state{schema_name=SN}) -> + state2file(S,filename:rootname(SN)). + +%% @spec state2file(State,FileName) -> ok | {error,Reason} +%% State = global_state() +%% FileName = filename() +%% @doc Saves the schema state with all information of the processed +%% schema in a file. You can provide the file name for the saved +%% state. FileName is saved with the <code>.xss</code> extension +%% added. +state2file(S,FileName) when is_record(S,xsd_state) -> + save_xsd_state(S), + case catch ets:tab2file(S#xsd_state.table,lists:append(FileName,".xss")) of + {'EXIT',Reason} -> + {error,{[],?MODULE,Reason}}; + Ret -> Ret + end. + +%% @spec file2state(FileName) -> {ok,State} | {error,Reason} +%% State = global_state() +%% FileName = filename() +%% @doc Reads the schema state with all information of the processed +%% schema from a file created with <code>state2file/[1,2]</code>. The +%% format of this file is internal. The state can then be used +%% validating an XML document. +file2state(FileName) -> + case catch ets:file2tab(FileName) of + {ok,Tab} -> + case load_xsd_state(Tab) of + [{state,S}] when is_record(S,xsd_state) -> + xmerl_xsd_vsn_check(S); +%% {ok,S}; + Other -> + {error,{[],?MODULE,{incomplete_file,FileName,Other}}} + end; + {error,Reason} -> + {error,{[],?MODULE,Reason}}; + Other -> + {error,{[],?MODULE,Other}} + end. + +save_xsd_state(S) -> + catch ets:insert(S#xsd_state.table,{state,S}). +load_xsd_state(Table) -> + catch ets:lookup(Table,state). + +xmerl_xsd_vsn() -> + case lists:keysearch(vsn,1,xmerl_xsd:module_info(attributes)) of + {value,{_,MD5_VSN}} -> + MD5_VSN; + _ -> + undefined + end. +xmerl_xsd_vsn_check(S=#xsd_state{vsn=MD5_VSN}) -> + case [V||{vsn,V}<-xmerl_xsd:module_info(attributes)] of + [MD5_VSN] -> + {ok,S}; + _ -> + {error,{[],?MODULE,{different_version_of_xmerl_xsd_module_used, + state_not_reliable}}} + end. + + + +%% @spec process_validate(Schema,Element) -> Result +%% @equiv process_validate(Schema,Xml,[]) +process_validate(Schema,Xml) -> + process_validate(Schema,Xml,[]). +%% @spec process_validate(Schema,Element,Options) -> Result +%% Schema = filename() +%% Element = XmlElement +%% Options = option_list() +%% Result = {ValidXmlElement,State} | {error,Reason} +%% Reason = [ErrorReason] | ErrorReason +%% @doc Validates a parsed well-formed XML element towards an XML +%% schema. <p> Validates in two steps. First it processes the schema, +%% saves the type and structure info in an ets table and then +%% validates the element towards the schema.</p> +%% <p> Usage example:</p> +%% <p> +%% <code>1>{E,_} = xmerl_scan:file("my_XML_document.xml").</code><br/> +%% <code>2>{E2,_} = xmerl_xsd:validate("my_XML_Schema.xsd",E).</code> +%% </p> +%% <p> Observe that E2 may differ from E if for instance there are default +%% values defined in <code>my_XML_Schema.xsd</code>.</p> +process_validate(Schema,Xml,Opts) -> + TargetNamespace = target_namespace(Xml), + case Schema of + [H|_] when is_list(H); is_tuple(H) -> + case process_schemas(Schema, + [{target_namespace,TargetNamespace}|Opts]) of + {ok,S} -> + S2 = validation_options(S,Opts), + validate3(S2#xsd_state.schema_name,Xml,S2); + Err -> + Err + end; + _ -> + process_validate2(xmerl_scan:file(Schema),Schema,Xml,Opts) + end. + +process_validate2(Err={error,_},_,_,_) -> + Err; +process_validate2({SE,_},Schema,Xml,Opts) -> + S = initiate_state(Opts,Schema), + S1 = validate_schema(SE,S), + S2 = validate_schema_ph2(S1), + S3 = schema_concistence_checks(S2), + S4 = validation_options(S3,Opts), + validate3(Schema,Xml,S4). + +validate3(Schema,Xml,S=#xsd_state{errors=[]}) -> + Ret = {_,S2} = + case catch validate_xml(Xml,S) of + {[XML2],[],Sx} -> + {XML2,Sx}; + {XML2,[],Sx} -> + {XML2,Sx}; + {_,UnValidated,Sx} -> + {Xml,acc_errs(Sx,{error_path(UnValidated,Xml#xmlElement.name),?MODULE, + {unvalidated_rest,UnValidated}})}; + _Err = {error,Reason} -> + {Xml,acc_errs(S,Reason)}; + {'EXIT',Reason} -> + {Xml,acc_errs(S,{error_path(Xml,Xml#xmlElement.name),?MODULE, + {undefined,{internal_error,Reason}}})} + end, + save_to_file(S2,filename:rootname(Schema)++".tab2"), + case S2#xsd_state.errors of + [] -> + Ret; + L -> + %%delete_table(S2), + return_error(L) + end; +validate3(_,_,S) -> + return_schema_error(S#xsd_state.errors). + +%% @spec process_schema(Schema) -> Result +%% @equiv process_schema(Schema,[]) +process_schema(Schema) -> + process_schema(Schema,[]). +%% @spec process_schema(Schema,Options) -> Result +%% Schema = filename() +%% Result = {ok,State} | {error,Reason} +%% State = global_state() +%% Reason = [ErrorReason] | ErrorReason +%% Options = option_list() +%% @doc Reads the referenced XML schema and checks that it is valid. +%% Returns the <code>global_state()</code> with schema info or an +%% error reason. The error reason may be a list of several errors +%% or a single error encountered during the processing. +process_schema(Schema,Options) when is_list(Options) -> + S = initiate_state(Options,Schema), + process_schema2(xmerl_scan:file(filename:join(S#xsd_state.xsd_base, Schema)),S,Schema); +process_schema(Schema,State) when is_record(State,xsd_state) -> + process_schema2(xmerl_scan:file(filename:join(State#xsd_state.xsd_base, Schema)),State,Schema). + +process_schema2(Err={error,_},_,_) -> + Err; +process_schema2({SE,_},State,_Schema) -> + S1 = validate_schema(SE,State), + S2 = validate_schema_ph2(S1), + case schema_concistence_checks(S2) of + S3 = #xsd_state{errors=[]} -> + {ok,S3}; + S3 -> + delete_table(S3), + return_error(S3#xsd_state.errors) + end. + +%% @spec process_schemas(Schemas) -> Result +%% @equiv process_schema(Schemas,[]) +process_schemas(Schemas) -> + process_schemas(Schemas,[]). +%% @spec process_schemas(Schemas,Options) -> Result +%% Schemas = [{NameSpace,filename()}|Schemas] | [] +%% Result = {ok,State} | {error,Reason} +%% Reason = [ErrorReason] | ErrorReason +%% Options = option_list() +%% @doc Reads the referenced XML schemas and controls they are valid. +%% Returns the <code>global_state()</code> with schema info or an +%% error reason. The error reason may be a list of several errors +%% or a single error encountered during the processing. +process_schemas(Schemas=[{_,Schema}|_],Options) when is_list(Options) -> + process_schemas(Schemas,initiate_state(Options,Schema)); +process_schemas([{_NS,Schema}|Rest],State=#xsd_state{fetch_fun=Fetch}) -> +%% case process_external_schema_once(Schema,if_list_to_atom(NS),State) of +%% S when is_record(S,xsd_state) -> +%% case process_schema(filename:join([State#xsd_state.xsd_base,Schema]),State) of +%% {ok,S} -> + Res= + case Fetch(Schema,State) of + {ok,{file,File},_} -> + process_schema2(xmerl_scan:file(File),State,Schema); + {ok,{string,Str},_} -> + process_schema2(xmerl_scan:string(Str),State,Schema); + {ok,[],_} -> + {ok,State}; + Err -> + Err + end, + case Res of + {ok,S2} -> + process_schemas(Rest,S2); + _ -> + Res + end; +process_schemas([],S) when is_record(S,xsd_state) -> + {ok,S}. + + +initiate_state(Opts,Schema) -> + XSDBase = filename:dirname(Schema), + {{state,S},RestOpts}=new_state(Opts), + S2 = create_tables(S), + initiate_state2(S2#xsd_state{schema_name = Schema, + xsd_base = XSDBase, + fetch_fun = fun fetch/2},RestOpts). +initiate_state2(S,[]) -> + S; +initiate_state2(S,[{tab2file,Bool}|T]) -> + initiate_state2(S#xsd_state{tab2file=Bool},T); +initiate_state2(S,[{xsdbase,XSDBase}|T]) -> + initiate_state2(S#xsd_state{xsd_base=XSDBase},T); +initiate_state2(S,[{fetch_fun,FetchFun}|T]) -> + initiate_state2(S#xsd_state{fetch_fun=FetchFun},T); +initiate_state2(S,[{fetch_path,FetchPath}|T]) -> + initiate_state2(S#xsd_state{fetch_path=FetchPath},T); +initiate_state2(S,[{schema_preprocessed,Bool}|T]) -> + initiate_state2(S#xsd_state{schema_preprocessed=Bool},T); +initiate_state2(S,[{target_namespace,_NS}|T]) -> +%% initiate_state2(S#xsd_state{targetNamespace=if_list_to_atom(NS)},T); + initiate_state2(S,T); %% used in validation phase +initiate_state2(S,[H|T]) -> + error_msg("Invalid option: ~p~n",[H]), + initiate_state2(S,T). + +validation_options(S,[{target_namespace,NS}|T]) -> + validation_options(S#xsd_state{targetNamespace=if_list_to_atom(NS)},T); +validation_options(S,[_H|T]) -> + validation_options(S,T); +validation_options(S,[]) -> + S. + +new_state(Opts) -> + XSD_VSN = xmerl_xsd_vsn(), + keysearch_delete(state,1,Opts,{state,#xsd_state{vsn=XSD_VSN}}). + + +%% validate_schema/2 traverses the shema element to save necessary +%% information as defined elements and types. +validate_schema(E=#xmlElement{}, + S) -> + %% namespace is always a xmlNamespace record, attributs a list of + %% #xmlAttributes and content a list of #xmlElements|#xmlText|... + + %% Have to save namespace nodes. Use of namespace in paths for + %% unique,key and keyref are used after the schema is processed. + + S1 = S#xsd_state{targetNamespace=target_namespace(E)}, + case is_already_processed(S1#xsd_state.targetNamespace,S1) of + true -> + save_namespace_definition(S1#xsd_state.targetNamespace,S1); + _ -> + S2 = S1,%save_namespace_definition(S1#xsd_state.targetNamespace,S1), + {CM,S3} = traverse_content(E,S2), + save_schema_element(CM,S3), + S3 + end. + +validate_schema_ph2(S=#xsd_state{derived_types=[]}) -> + S; +validate_schema_ph2(S=#xsd_state{derived_types=DT}) -> + deduce_derived_types(DT,S). + +%% traverse_content/2 creates the content model of the schema. +%% content model depends on (1) the type: +%% complex type: +%% sequence, choice, all +%% simple type: no content other than characters +%% (2) minOccurs/maxOccurs attributes. +%% The schema for schemas content model is: +%% schema: ((include | import | redefine | annotation)*, +%% (((simpleType | complexType | group | attributeGroup) +%% | element | attribute | notation), annotation*)*) +%% attribute: (annotation?, simpleType?) +%% element: (annotation?, ((simpleType | complexType)?, (unique | +%% key | keyref)*)) +%% complexType: (annotation?, (simpleContent | complexContent | +%% ((group | all | choice | sequence)?, +%% ((attribute | attributeGroup)*,anyAttribute?)))) +%% attributeGroup:(annotation?, +%% ((attribute | attributeGroup)*, anyAttribute?)) +%% group: (annotation?, (all | choice | sequence)?) +%% all: (annotation?, element*) +%% sequence: (annotation?, +%% (element | group | choice | sequence | any)*) +%% choice: (annotation?, (element | group | choice | sequence | +%% any)*) +%% any: (annotation?) any wellformed xml inside "any" +%% unique: (annotation?, (selector, field+)) +%% key: (annotation?, (selector, field+)) +%% keyref: (annotation?, (selector, field+)) +%% selector: (annotation?) +%% field: (annotation?) +%% notation: (annotation?) +%% annotation: (appinfo | documentation)* +%% appinfo: ({any})* +%% documentation: ({any})* +%% simpleType: (annotation?, (restriction | list | union)) +%% restriction: (annotation?, (simpleType?, (minExclusive | +%% minInclusive | maxExclusive | maxInclusive | +%% totalDigits | fractionDigits | length | minLength | +%% maxLength | enumeration | whiteSpace | pattern)*)) +%% list: (annotation?, simpleType?) +%% union: (annotation?, simpleType*) +%% include: (annotation?) +%% import: (annotation?) +%% redefine: (annotation | (simpleType | complexType | group | +%% attributeGroup))* +traverse_content(E=#xmlElement{name=Name},S) -> + case local_name(Name) of + schema -> + Content = E#xmlElement.content, +%% S1 = S#xsd_state{targetNamespace=target_namespace(E)}, + ThisNS = {"#this#",S#xsd_state.schema_name, + S#xsd_state.targetNamespace}, + S2 = S#xsd_state{checked_namespace_nodes= + add_once(ThisNS,S#xsd_state.checked_namespace_nodes)}, + S3 = namespace_nodes(E,S2), + S4 = element_form_default(E,S3), + S5 = attribute_form_default(E,S4), + S6 = substitution_default(finalDefault,E,S5), + S7 = substitution_default(blockDefault,E,S6), + traverse_content2(Content,S7,[]); + Err -> + exit({error,{[],?MODULE,{schema_error,Err}}}) + end. + + +traverse_content2([],S,Acc) -> + {reverse(remove_annotation(Acc)),reset_scope(S)}; +traverse_content2([El|Els],S,Acc) when is_record(El,xmlElement) -> + %% element declaration: save name, type, scope. + {Object,S2} = element_content(kind(El,S),El,S#xsd_state.scope),%% Object={Kind,Obj} + traverse_content2(Els,S2,[Object|Acc]); +traverse_content2([_T|Els],S,Acc) -> %% xmlText,xmlPI ... + traverse_content2(Els,S,Acc). + +target_namespace(E) -> + case get_attribute_value(targetNamespace,E,undefined) of + URI when is_list(URI) -> + list_to_atom(URI); + URI -> + URI + end. + +%% namespace_nodes/2 -> +%% NS. +namespace_nodes(#xmlElement{namespace=#xmlNamespace{nodes=NS}}, + S=#xsd_state{namespace_nodes=NSN, + global_namespace_nodes=GNSN}) -> + S2 =S#xsd_state{namespace_nodes=foldl(fun add_once/2,NSN,NS)}, + S2#xsd_state{global_namespace_nodes= + add_key_once(S#xsd_state.targetNamespace,1, + {S#xsd_state.targetNamespace,NS}, + GNSN)}. + +attribute_form_default(#xmlElement{attributes=Atts},S)-> + Def=form_default(attributeFormDefault,Atts,S), + S#xsd_state{attributeFormDefault=Def}. +element_form_default(#xmlElement{attributes=Atts},S) -> + Def=form_default(elementFormDefault,Atts,S), + S#xsd_state{elementFormDefault=Def}. +form_default(Key,Atts,_S) -> + case keyNsearch(Key,#xmlAttribute.name,Atts,unqualified) of + #xmlAttribute{value=V} when is_list(V) -> list_to_atom(V); + #xmlAttribute{value=V} ->V; + _-> unqualified + end. + +substitution_default(Subst = finalDefault,El,S) -> + S#xsd_state{finalDefault = substitution(Subst,El,S)}; +substitution_default(Subst = blockDefault,El,S) -> + S#xsd_state{blockDefault = substitution(Subst,El,S)}. +substitution(Subst,El,_S) -> + split_by_whitespace(get_attribute_value(Subst,El,[]),[]). + + +%% element_content may be one of: annotation, type def(simple or +%% complex), import, unique, key, keyref, attribute def, attribute +%% group, all, group, complexContent, simpleContent, choice, sequence +element_content({attribute,S=#xsd_state{scope=Scope}},Att,Env) -> + case qualify_NCName(Att,S) of + no_name -> + Ref = attribute_ref(Att), + AttRef = + {attribute,get_QName(Ref,Att#xmlElement.namespace, %%QQQ + reset_scope(S))}, + {AttRef,add_ref(S,AttRef)}; + Name -> + {AttrType,S2} = attribute_type(Att,[Name|Env],S), + S3 = check_cm(attribute,allowed_content(attribute,Env),AttrType,S2), + {Attr,S4} = attribute_properties(Att#xmlElement.attributes, + #schema_attribute{type=AttrType},S3), + Object = {attribute, + Attr#schema_attribute{name=Name,scope=Scope}}, + S5 = save_object(Object,S4), + {{attribute,Name},S5} + end; +element_content({element,S},El,Env) -> + %% The type of an element may be a simple or complex type (named + %% or anonymous), a referenced name or member of a substitution group. + case qualify_NCName(El,S) of + no_name -> + Ref = particle_ref(El), + {Occ,S2} = occurance(El,{1,1},S), + %% 3.3.3 bullet 2.2 + S3 = element_forbidden_properties(El,S2), + S4 = element_forbidden_content(El#xmlElement.content,S3), + ElRef = + {element, + {get_QName(Ref,El#xmlElement.namespace,reset_scope(S)), + Occ}}, + {ElRef,add_ref(S4,ElRef)}; + Name -> + {Type,S2} = element_type(El,[Name|Env],S), + S3 = check_cm(element,allowed_content(element,Env),Type,S2), + Type2 = remove_annotation(Type), + Unique = [X||X={unique,_} <- Type2], + Key = [X||X={K,_} <- Type2,K == key orelse K==keyref], + {Occur,S4} = occurance(El,{1,1},S3), + {SE,S5} = element_properties(El#xmlElement.attributes, + #schema_element{},El,S4), + CM = remove_attributes([X||X={Y,_}<-Type2, + unique=/=Y,key=/=Y, + keyref=/=Y,annotation=/=Y]), + %% take care of key/keyref later + SE2 = SE#schema_element{name=Name,type=CM,uniqueness=Unique, + key=Key, occurance=Occur, + scope=S5#xsd_state.scope}, + S6 = insert_substitutionGroup(SE2,S5), + S7 = save_object({element,SE2},S6), + {{element,{Name,Occur}},S7} + end; +element_content({complexType,S},CT,Env) -> + %% complex type definition without a name is returnd and added to + %% the content model at this level. A complex type may also contain + %% attributes or attribute group references in the end of its content. + %%?debug("complexType content: ~p~nenv: ~p~n",[CT,Env]), + {SCT,S1} = c_t_properties(CT,#schema_complex_type{},S), + {Mixed,S2} = mixed(CT,S1), + Complexity = complexity(CT#xmlElement.content), + {Object,Name,S7} = + case qualify_NCName(CT,S2) of + no_name -> + {CM,S3} = type(CT#xmlElement.content, + in_scope(anonymous,S2),[complexType|Env]), + S4 = check_cm(complexType,allowed_content(complexType,Env),CM,S3), + Name1 = get_QName('_xmerl_no_name_',CT#xmlElement.namespace,S4), + S5 = set_scope(S#xsd_state.scope,S4), + {Content,Attributes}=split_content(remove_annotation(CM)), + SCT2 = base_type(Content,SCT), + CTObj = + {complexType, + SCT2#schema_complex_type{name=Name1, + scope=S5#xsd_state.scope, + attributes=Attributes, + complexity=Complexity, + content=mixify(Mixed,Content)}}, + {CTObj,Name1,S5}; + Name2 -> + S3 = in_scope(Name2,S2), + S3a = push_circularity_mark({typeDef,Name2},S3), + {CM,S4} = type(CT#xmlElement.content,S3a, + [complexType|Env]), + S4a = pop_circularity_mark({typeDef,Name2},S4), + S5 = check_cm(complexType,allowed_content(complexType,Env), + CM,S4a), + S6 = set_scope(S#xsd_state.scope,S5), + {Content,Attributes}=split_content(remove_annotation(CM)), + SCT2 = base_type(Content,SCT), + {{complexType, + SCT2#schema_complex_type{name=Name2, + scope=S6#xsd_state.scope, + attributes=Attributes, + complexity=Complexity, + content=mixify(Mixed,Content)}}, + Name2,S6} + end, + S8 = save_object(Object,S7), + S9 = derived_type(Object,S8), + {{complexType,Name},S9}; +element_content({attributeGroup,S},AG,Env) -> + %% an attribute group always have a name or a ref, the content is + %% (annotation?,(attribute | attributGroup)*, anyAttribute?). + case qualify_NCName(AG,S) of + no_name -> + %% an attribute group ref inside complex type def or attr + %% group def ( XSD1:3.6.2). + Ref = attributeGroup_ref(AG), + AGRef = + {attributeGroup,get_QName(Ref,AG#xmlElement.namespace,%%QQQ + reset_scope(S))}, + {AGRef,add_ref(S,AGRef)}; + Name -> + %% must occur on top level of schema( XSD1:3.6.2). The + %% only thing needed in content are the names of all + %% attributes or referenced attribute groups. + {CM,S2} = type(AG#xmlElement.content,in_scope(Name,S), + [attributeGroup|Env]), + S2_1 = out_scope(Name,S2), + S3 = check_cm(attributeGroup,allowed_content(attributeGroup,Env),CM,S2_1), + S4 = save_object({attributeGroup, + #schema_attribute_group{name=Name, + content=keep_attributes(CM)}},S3), + {{attributeGroup,Name},S4} + end; +element_content({group,S},G,Env) -> + %% a model group associates a name with a content model. It can be + %% a reference or a definition. + %% content is one of all, choice or sequence. + case qualify_NCName(G,S) of + no_name -> % reference. + %% If reference is a recursive ref to a group with the + %% same name as this group points at the redefined valid + %% schema group. See XMLSchema part 1, section 4.2.2 + %% "Schema Representation Constraint: Individual Component + %% Redefinition" + Ref = particle_ref(G), + {Occur,S2} = occurance(G,{1,1},S), + GRef = + {group, + {get_QName(Ref,G#xmlElement.namespace,reset_scope(S2)),%%QQQ + Occur}}, + {GRef,add_ref(S2,GRef)}; + Name -> % definition, always schema or redefine as parent + {CM,S2} = type(G#xmlElement.content,in_scope(Name,S),[group|Env]), + CM2 = recursive_redefine(Name,CM,S2), + S2_1 = out_scope(Name,S2), + S3 = check_cm(group,allowed_content(group,Env),CM2,S2_1), + S4 = save_object({group,#schema_group{name=Name, + content=remove_annotation(CM2)}},S3), + {{group,Name},S4} + end; +element_content({all,S},All,Env) -> + %% each element occurs 0 or 1 times in any order + %% {all,[{element_name,occurance}]} +%% CM = content_model(Seq#xmlElement.content,S,[all|Env]), + {Occur,S1} = occurance(All,{1,1},S), + {CM,S2} = type(All#xmlElement.content,S1,[all|Env]), + S3 = check_cm(all,allowed_content(all,Env),CM,S2), + {{all,{[X||X = {element,_} <- CM],Occur}},S3}; +element_content({sequence,S},Seq,Env) -> + %% {sequence,[{element_name,occurance}]} +%% CM = content_model(Seq#xmlElement.content,S,[sequence|Env]), + {Occur,S1} = occurance(Seq,{1,1},S), + {CM,S2} = type(Seq#xmlElement.content,S1,[sequence|Env]), + S3 = check_cm(sequence,allowed_content(sequence,Env),CM,S2), + {{sequence,{remove_annotation(CM),Occur}},S3}; +element_content({choice,S},Choice,Env) -> + %% allowed content: (annotation?, + %% (element | group | choice | sequence | any)*) + %% returns: {choice,[element_name]} +%% CM = content_model(Choice#xmlElement.content,S,[choice|Env]), + {Occur,S1} = occurance(Choice,{1,1},S), + {CM,S2} = type(Choice#xmlElement.content,S1,[choice|Env]), + S3 = check_cm(choice,allowed_content(choice,Env),CM,S2), + {{choice,{remove_annotation(CM),Occur}},S3}; +element_content({any,S},Any,_Env) -> + {Occur,S1} = occurance(Any,{1,1},S), + NameSpace = wildcard_namespace(Any,S1), + PC = processor_contents(Any), + ?debug("element_content, any: Any content:~p~n",[Any#xmlElement.content]), + Pred = fun(E=#xmlElement{}) -> case kind(E) of + annotation -> false; + _ -> true + end; + (_) -> + false + end, + S2 = case filter(Pred,Any#xmlElement.content) of + [] -> S1; + Err -> %% report error + acc_errs(S1,{error_path(Any,Any#xmlElement.name),?MODULE, + {unexpected_content_in_any,Err}}) + end, + {{any,{NameSpace,Occur,PC}},S2}; +element_content({IDC,S},El,Env) + when IDC==unique;IDC==key;IDC==keyref-> + QName = qualify_NCName(El,reset_scope(S)), + Ref = keyrefer(IDC,El,S), + {SelField,S2} = type(El#xmlElement.content,S,[IDC|Env]), + case {[X||X={selector,_} <- SelField],[X||X={field,_} <- SelField]} of + {[Sel],Fields=[_H|_T]} -> + IDConstr = #id_constraint{category=IDC,name=QName,refer=Ref, + selector=Sel,fields=Fields}, + S3=save_idc(IDC,IDConstr,S2), + {{IDC,IDConstr},S3}; + Err -> + S3 = acc_errs(S2,{error_path(El,El#xmlElement.name),?MODULE, + {erronous_content_in_identity_constraint,IDC,Err}}), + {{IDC,[]},S3} + end; +element_content({selector,S},Sel,_Env) -> + case get_attribute_value(xpath,Sel,error) of + error -> + S2 = acc_errs(S,{error_path(Sel,Sel#xmlElement.name),?MODULE, + {missing_xpath_attribute,selector}}), + {{selector,[]},S2}; + XPath -> + {{selector,XPath},S} + end; +element_content({field,S},F,_Env) -> + case get_attribute_value(xpath,F,error) of + error -> + S2 = acc_errs(S,{error_path(F,F#xmlElement.name),?MODULE, + {missing_xpath_attribute,field}}), + {{field,[]},S2}; + XPath -> + {{field,XPath},S} + end; +element_content({notation,S},_N,_Env) -> + {{notation,[]},S}; +element_content({annotation,S},_Ann,_Env) -> + {{annotation,[]},S}; +element_content({appinfo,S},_AI,_Env) -> + {{appinfo,[]},S}; +element_content({documentation,S},_D,_Env) -> + {{documentation,[]},S}; +element_content({simpleType,S},ST,Env) -> + Name = case qualify_NCName(ST,S) of + no_name -> + get_QName('_xmerl_no_name_',ST#xmlElement.namespace, + in_scope('_xmerl_no_name_',S));%%--- + QName -> + QName + end, + {Type,S2} = type(ST#xmlElement.content, + push_circularity_mark({typeDef,Name},in_scope(Name,S)), + [simpleType|Env]), + S2_1 = pop_circularity_mark({typeDef,Name},S2), + S3 = set_scope(S#xsd_state.scope,S2_1), + S4 = check_cm(simpleType,allowed_content(simpleType,Env),Type,S3), + {BaseType,Facets} = facets(Type,S4), + Variety = variety(Type), + Final = simpleType_final(ST,S4), + Object = {simpleType,#schema_simple_type{name=Name, + base_type=BaseType, + final=Final, + facets=Facets, + variety=Variety, + content=remove_annotation(Type), + scope=S4#xsd_state.scope}}, + S5 = save_object(Object,S4), + S6 = derived_type(Object,S5), + {{simpleType,Name},S6}; +element_content({restriction,S},R,Env) -> + %% If complexContent, all element definitions of base type must be + %% repeated. However, attributes are inherited. + %% possible parents are simpleType or complexType (grand parent) + %% If parent is simpleType the base type is either the attribute + %% base (resolved by base_type/1) or the type defined in content. + {CM,S2} = type(R#xmlElement.content,S,[restriction|Env]), + S3 = check_cm(restriction,allowed_content(restriction,Env),CM,S2), + + {BaseTypeName,CM2,S4} = restriction_base_type(R,CM,S3), %% a QName +%% S5 = add_circularity_mark(BaseTypeName,S4), + BaseTypeType = base_type_type(Env), + {{restriction,{BaseTypeName,remove_annotation(CM2)}}, + add_ref(S4,{BaseTypeType,BaseTypeName})}; %% Does not return name but content model +element_content({list,S=#xsd_state{scope=Scope}},L,Env) -> + {Type,S2} = list_type(L,S,[list|Env]), + S3 = check_cm(list,allowed_content(list,Scope),Type,S2), + {{list,remove_annotation(Type)},S3}; +element_content({union,S=#xsd_state{scope=Scope}},U,Env) -> + {Types,S2} = union_types(U,S,[union|Env]), + S3 = check_cm(union,allowed_content(union,Scope),Types,S2), + {{union,Types},S3}; +element_content({include,S=#xsd_state{schema_name=ThisSchema, + targetNamespace=TNS}},I,_Env) -> + S2 = process_external_schema_once(I,S#xsd_state.targetNamespace,S), + {{include,[]},S2#xsd_state{schema_name=ThisSchema,targetNamespace=TNS}}; +element_content({import,S=#xsd_state{schema_name=ThisSchema, + targetNamespace=ThisNameS}},I,_Env) -> + %% import unlike include and redefine may include definitions from + %% other namespaces than the target namespace of the including + %% schema. + + %% namespace and schemaLocation + Namespace = + case get_attribute_value(namespace,I,undefined) of + L when is_list(L) -> + list_to_atom(L); + A -> A + end, + %% If Namespace is absent, then the import allows unqualified + %% reference to components with no target namespace. + + SchemaLocation = get_attribute_value(schemaLocation,I,absent), + %% If SchemaLocation is absent, the identification of that schema + %% is leaved to the instance, application or user, via the + %% mechanisms described ��4.3 in XML Schema Part 1. + + S2 = process_external_schema_once(SchemaLocation,Namespace,S), + {{import,[]},S2#xsd_state{schema_name=ThisSchema, + targetNamespace=ThisNameS}}; +element_content({redefine,S=#xsd_state{schema_name=ThisSchema}},RD,Env) -> + %% Must be a child of "schema" element + %% redefine of simple and complex types, groups and attribute + %% groups obtained from external files. + %% Brings in all definitions of external schema and redefines one. + %% External schema must be in same namespace as current schema or + %% no namespace. + S2 = process_external_schema_once(RD,S#xsd_state.targetNamespace, + S#xsd_state{errors=[]}), + case S2#xsd_state.errors of + [] -> +%% RedefSource = S2#xsd_state.schema_name, + S3 = S2#xsd_state{schema_name=ThisSchema, +%% global_element_source=add_once({ThisSchema,RedefSource},GES), + errors=S#xsd_state.errors}, + {CM,S4} = type(RD#xmlElement.content, + S3#xsd_state{redefine=true},[redefine|Env]), + S5 = S4#xsd_state{redefine=false}, + S6 = check_cm(redefine,allowed_content(redefine,Env),CM,S5), + S7 = redefine(CM,S6), + {{redefine,[]},S7}; + Errs -> + S3 = S2#xsd_state{schema_name=ThisSchema, + errors=Errs++S#xsd_state.errors}, + {{redefine,[]},S3} + end; +element_content({anyAttribute,S},AA,_Env) -> + %% has attributes processContents = (lax | skip | strict) : strict + %% namespace = ((##any | ##other) | + %% List of (anyURI | (##targetNamespace | ##local)) ) : ##any + + NameSpace = wildcard_namespace(AA,S), + PC = processor_contents(AA), + Pred = fun(E=#xmlElement{}) -> case kind(E) of + annotation -> false; + _ -> true + end; + (_) -> false + end, + S2 = + case filter(Pred,AA#xmlElement.content) of + [] -> S; + Err -> %% report error + acc_errs(S,{error_path(AA,AA#xmlElement.name),?MODULE, + {content_in_anyAttribute,Err}}) + end, + {{anyAttribute,{NameSpace,PC}},S2}; +element_content({simpleContent,S},SC,Env) -> + %% only as child of complexType. + %% allowed content: (annotation?, (restriction | extension)) + S2 = pre_check_cm(simpleContent,SC#xmlElement.content,mk_name(S#xsd_state.scope),S), + case filter(fun(X=#xmlElement{}) -> + case kind(X) of + restriction -> true; + extension -> true; + _ -> false + end; + (_) -> false + end, + SC#xmlElement.content) of + [E] -> + element_content(kind(E,S2),E,[simpleContent|Env]); + Err -> + {[],acc_errs(S2,{error_path(SC,SC#xmlElement.name),?MODULE, + {content_in_simpleContent,Err}})} + end; +element_content({complexContent,S},CC,Env) -> + S2 = pre_check_cm(complexContent,CC#xmlElement.content, + mk_name(S#xsd_state.scope),S), + %% the mixed attribute was fetched in the complexType element that + %% held this complexContent + case filter(fun(X=#xmlElement{}) -> case kind(X) of + restriction -> true; + extension -> true; + _ -> false + end; + (_) -> false + end,CC#xmlElement.content) of + [E] -> + element_content(kind(E,S2),E,[complexContent|Env]); + Err -> + {[],acc_errs(S2,{error_path(CC,CC#xmlElement.name),?MODULE, + {complexContent_content_failure,Err}})} + end; +element_content({extension,S},Ext,Env) -> + %% may be used in both simple and complex content with different + %% content allowed. + %% this should be returned and checked for allowed content in + %% parent, but we don't know if base type is a forward reference. + BaseType = base_type(Ext), + {CM,S2} = type(Ext#xmlElement.content,S,[extension|Env]), + S3 = check_cm(extension,allowed_content(extension,S#xsd_state.scope),CM,S2), + BaseTypeName = get_QName(BaseType,Ext#xmlElement.namespace,reset_scope(S)),%%QQQ + BaseTypeType = base_type_type(Env), + {{extension,{BaseTypeName,CM}},add_ref(S3,{BaseTypeType,BaseTypeName})}; +%% The following are facets +element_content({minExclusive,S},CF,_Env) -> + Value = get_value(CF), + {{minExclusive,Value},S}; +element_content({minInclusive,S},CF,_Env) -> + Value = get_value(CF), + {{minInclusive,Value},S}; +element_content({maxExclusive,S},CF,_Env) -> + Value = get_value(CF), + {{maxExclusive,Value},S}; +element_content({maxInclusive,S},CF,_Env) -> + Value = get_value(CF), + {{maxInclusive,Value},S}; +element_content({totalDigits,S},CF,_Env) -> + Value = get_value(CF), + {{totalDigits,Value},S}; +element_content({fractionDigits,S},CF,_Env) -> + Value = get_value(CF), + {{fractionDigits,Value},S}; +element_content({length,S},CF,_Env) -> + Value = get_value(CF), + {{length,Value},S}; +element_content({minLength,S},CF,_Env) -> + Value = get_value(CF), + {{minLength,Value},S}; +element_content({maxLength,S},CF,_Env) -> + Value = get_value(CF), + {{maxLength,Value},S}; +element_content({enumeration,S},CF,_Env) -> + Value = get_value(CF), + {{enumeration,Value},S}; +element_content({whiteSpace,S},CF,_Env) -> + Value = get_value(CF), + {{whiteSpace,Value},S}; +element_content({pattern,S},CF,_Env) -> + Value = get_value(CF), + {{pattern,Value},S}; +element_content({Other,S=#xsd_state{errors=Errs}},C,_Env) -> + case Errs of + [] -> + {[],acc_errs(S,{error_path(C,C#xmlElement.name),?MODULE, + {unknown_content,Other}})}; + _ -> + {[],S} + end. + + +type(C,S,Env) -> + type(C,S,Env,[]). +type([E=#xmlElement{}|Els],S,Env,Acc) -> + {CM,S2} = element_content(kind(E,S),E,Env), + type(Els,set_scope(S#xsd_state.scope,S2), + Env,[CM|Acc]); +type([_H|Els],S,Env,Acc) -> + type(Els,S,Env,Acc); +type([],S,_Env,Acc) -> + {flatten(reverse(Acc)),S}. + +simpleUrType() -> + {anySimpleType,[]}. +%% simpleUrTypeRef() -> +%% {anySimpleType,[],'http://www.w3.org/2001/XMLSchema'}. +urType() -> + {anyType,[]}. + + +attribute_type(Att,Env=[Name|_],S) -> + %% The attribute type may be referenced by the type attribute or + %% explicitly defined as a simpleType inside the attribute + %% element. In the latter case the type must be saved with the + %% unique name of the scope and name attribute combined. + {CM,S2} = type(Att#xmlElement.content,in_scope(Name,S),Env), + case remove_annotation(CM) of + [] -> + case keyNsearch(type,#xmlAttribute.name, + Att#xmlElement.attributes,[]) of + #xmlAttribute{value=SimpleTypeName} -> %% a QName as string + %% This name may be a forward reference to a simple type. + TypeRef = {simpleType,get_QName(SimpleTypeName, %%QQQ + Att#xmlElement.namespace, + reset_scope(S))}, + {[TypeRef], + set_scope(S#xsd_state.scope,add_ref(S2,TypeRef))}; + _ -> {[{simpleType,simpleUrType()}], + set_scope(S#xsd_state.scope,S2)} + end; + Type -> + {Type,set_scope(S#xsd_state.scope,S2)} + end. + +element_type(El,Env=[Name|_],S) -> + %% In the top environment of the schema there may exist: global + %% element declarations, substitution group members. + %% Other element declarations are local + {CM,S2} = type(El#xmlElement.content,in_scope(Name,S),Env), + case remove_annotation(CM) of + [] -> %% no simple or complex type definition + case {get_attribute_value(type,El,no_name), + get_attribute_value(substitutionGroup,El,undefined)} of + {no_name,SGName} when is_list(SGName) -> + QN = get_QName(SGName,El#xmlElement.namespace,reset_scope(S)),%%QQQ + case is_simple_type(QN,S2) of + true -> + exit(this_can_never_happen), + %% A substitutionGroup is an element, and + %% the type of this element is the + %% resolved type of the referenced + %% element. + TRef = {simpleType,QN}, + {[TRef], + add_ref(set_scope(S#xsd_state.scope,S2),TRef)}; + _ -> + {[{substitutionGroup,QN}], + set_scope(S#xsd_state.scope,S2)} + end; + {TName,_} when is_list(TName) -> + QN = get_QName(TName,El#xmlElement.namespace,reset_scope(S2)),%%QQQ + case is_simple_type(QN,S2) of + true -> + TRef={simpleType,QN}, + {[TRef], + add_ref(set_scope(S#xsd_state.scope,S2),TRef)}; + _ -> + TRef = {simple_or_complex_Type,QN}, + {[TRef], + add_ref(set_scope(S#xsd_state.scope,S2),TRef)} + end; + _ -> + case {get_attribute_value(ref,El,no_name), + is_global_env(Env)} of + {Ref,false} when is_list(Ref) -> + %% a ref attribute references an element + {[{element, + get_QName(Ref,El#xmlElement.namespace,%%QQQ + reset_scope(S))}], + set_scope(S#xsd_state.scope,S2)}; + _ -> + {[urType()], + set_scope(S#xsd_state.scope,S2)} + end + end; +%% Type -> +%% {Type,set_scope(S#xsd_state.scope,S2)} + _Type -> + {CM,set_scope(S#xsd_state.scope,S2)} + end. + +%% list_type/3 -> list() | name() +list_type(L,S,Env) -> + case keyNsearch(itemType,#xmlAttribute.name,L#xmlElement.attributes,[]) of + [] -> +%% {element(1,type(L#xmlElement.content,S,Env)),S}; + type(L#xmlElement.content,S,Env); + #xmlAttribute{value=V} -> + %% this type should be preliminary saved and checked after + %% the parsing of the schema. + TypeRef ={simpleType, + get_QName(V,L#xmlElement.namespace,reset_scope(S))}, + {[TypeRef],add_ref(S,TypeRef)} + end. +union_types(U,S,Env) -> + {MemberTypes,S2} = + case keyNsearch(memberTypes,#xmlAttribute.name,U#xmlElement.attributes,[]) of + [] -> + {[],S}; + #xmlAttribute{value = NameString} -> + Names = namestring2namelist(NameString), + UTypeRefs = + [{simpleType,get_QName(X,U#xmlElement.namespace, + reset_scope(S))}||X<-Names], + {UTypeRefs,foldl(fun(X,S_in) -> add_ref(S_in,X) end,S,UTypeRefs)} + end, + {DefinedTypes,S3} = union_types1(U#xmlElement.content,S2,Env), + {MemberTypes++DefinedTypes,S3}. + +union_types1(C,S,Env) -> + union_types1(C,S,Env,[]). +union_types1([],S,_Env,Acc) -> + {Acc,S}; +union_types1([C=#xmlElement{}|Cs],S,Env,Acc) -> + case element_content(kind(C,S),C,Env) of + {ST={simpleType,_},S2} -> + union_types1(Cs,S2,Env,[ST|Acc]); + {{annotation,_},S2} -> + union_types1(Cs,S2,Env,Acc); + {IllegalType,S2} -> + Err = {error_path(C,C#xmlElement.name),?MODULE, + {union_member_type_not_simpleType,IllegalType}}, + union_types1(Cs,acc_errs(S2,Err),Env,Acc) + end; +union_types1([_H|T],S,Env,Acc) -> + union_types1(T,S,Env,Acc). + +%% If a group in a redefine refer to itself the reference is to the +%% "old" definition of the group. See XMLSchema part 1, section 4.2.2 +%% "Schema Representation Constraint: Individual Component +%% Redefinition" +recursive_redefine(Name,CM,S=#xsd_state{redefine=true}) -> + case remove_annotation(CM) of + [{MG,{C,Occ}}] -> + [{MG,{recursive_redefine2(Name,C,S),Occ}}]; + _ -> + CM + end; +recursive_redefine(_,CM,_) -> + CM. +recursive_redefine2(Name,[{group,{Name,Occ}}|T],S) -> + %% Rename old group definition + case rename_redef_group(Name,S) of + failed -> + [{group,{Name,Occ}}|T]; + NewName -> + [{group,{NewName,Occ}}|T] + end; +recursive_redefine2(Name,[{MG,{C,Occ}}|T],S) + when MG =:= sequence; MG =:= choice; MG=:= all; MG=:= group -> + C2 = recursive_redefine2(Name,C,S), + [{MG,{C2,Occ}}|recursive_redefine2(Name,T,S)]; +recursive_redefine2(Name,[H|T],S) -> + [H|recursive_redefine2(Name,T,S)]; +recursive_redefine2(_,[],_) -> + []. + +rename_redef_group(Name={LN,Scope,NS},S) -> + %% Scope must be [] + NewName = {LN,['#redefine'|Scope],NS}, + case resolve({group,NewName},S) of + {SG=#schema_group{name=Name},_} -> + save_object({group,SG#schema_group{name=NewName}},S), + NewName; + _ -> + failed + end. + + +add_ref(S=#xsd_state{unchecked_references=UR},STRef={simpleType,Ref}) -> + case {is_builtin_simple_type(Ref),Ref} of + {true,_} -> + S; + {_,{'',_,_}} -> + S; + _ -> + S2 = S#xsd_state{unchecked_references=add_once(STRef,UR)}, + add_circularity_ref(STRef,S2) + end; +add_ref(S=#xsd_state{unchecked_references=UR},STRef={simple_or_complex_Type,Ref}) -> + case {is_builtin_simple_type(Ref),Ref} of + {true,_} -> + S; + {_,{'',_,_}} -> + S; + {_,{anyType,_,?XSD_NAMESPACE}} -> + S; + {_,{anySimpleType,_,?XSD_NAMESPACE}} -> + S; + _ -> + S2 = S#xsd_state{unchecked_references=add_once(STRef,UR)}, + add_circularity_ref(STRef,S2) + end; +add_ref(S,{complexType,{anyType,_,?XSD_NAMESPACE}}) -> + S; +add_ref(S=#xsd_state{unchecked_references=UR},Ref) -> + S2 = S#xsd_state{unchecked_references=add_once(Ref,UR)}, + add_circularity_ref(Ref,S2). +%% add_ref(S=#xsd_state{unchecked_references=UR},Ref) -> +%% S#xsd_state{unchecked_references=add_once(Ref,UR)}. + +%% Name of simpleType/complexType is unique within the whole +%% environment, which is checked elsewhere, so ignore the kind of type +%% for simplicity. +add_circularity_ref(Ref={Kind,To},S=#xsd_state{circularity_disallowed=CD, + redefine=false}) + when Kind==simpleType;Kind==simple_or_complex_Type;Kind==complexType -> + case get_circularity_mark(Ref,S) of + [] -> + S; + From -> %% This is the node from which the graph reaches Ref + S#xsd_state{circularity_disallowed=add_once({From,{typeDef,To}},CD)} + end; +add_circularity_ref(_,S) -> + S. +get_circularity_mark({TD,_},S) + when TD==simpleType;TD==complexType;TD==simple_or_complex_Type -> + case S#xsd_state.circularity_stack of + [From={typeDef,_}|_] -> + From; + _ -> [] + end; +get_circularity_mark(_,_S) -> + []. + +push_circularity_mark(Mark,S=#xsd_state{circularity_stack=CS, + redefine=false}) -> + S#xsd_state{circularity_stack=[Mark|CS]}; +push_circularity_mark(_,S) -> + S. +pop_circularity_mark(Mark,S=#xsd_state{redefine=false}) -> + case S#xsd_state.circularity_stack of + [Mark|Rest] -> + S#xsd_state{circularity_stack=Rest}; + _ -> + S + end; +pop_circularity_mark(_,S) -> + S. + +derived_type({complexType,#schema_complex_type{name=Name,content=C}}, + S=#xsd_state{derived_types=DT}) -> + case {keymember(restriction,1,C),keymember(extension,1,C)} of + {false,false} -> + S; + _ -> + S#xsd_state{derived_types=[{complexType,Name}|DT]} + end; +derived_type({simpleType,#schema_simple_type{name=Name,content=C}}, + S=#xsd_state{derived_types=DT}) -> + case keymember(restriction,1,C) of + true -> + S#xsd_state{derived_types=[{simpleType,Name}|DT]}; + _ -> + S + end. + +facets([{annotation,_}|Rest],S) -> + facets(Rest,S); +facets([{restriction,{BaseType,CM}}],_S) -> + Facets = [X||X={F,_} <- CM,is_facet(F)], + GroupFacets = group_facets(Facets), + {BaseType,GroupFacets}; +facets(_,_S) -> + {undefined,[]}. + +group_facets(Facets) -> + group_facets(Facets,[]). +group_facets(L=[{enumeration,_}|_Rest],Acc) -> + {Enums,Rest} = splitwith(fun({enumeration,_}) -> true; + (_) -> false + end, + L), + group_facets(Rest,[{enumeration,[X||{enumeration,X}<-Enums]}|Acc]); +group_facets([H|T],Acc) -> + group_facets(T,[H|Acc]); +group_facets([],Acc) -> + reverse(Acc). + +simpleType_final(ST,_S) -> + Final = get_attribute_value(final,ST,[]), + split_by_whitespace(Final,[]). + +%% A redefine may contain (simpleType | complexType | group | +%% attributeGroup)* +%%{simpleType,Name},{complexType,Name},{group,Name},{attributeGroup,Name} +redefine([CM|Rest],S) -> + S2=redefine(CM,S), + redefine(Rest,S2); +redefine(ST={Type,_Name},S) + when Type==simpleType ; Type==complexType -> + %% Get the original definition + {OriginalType,S2} = resolve(ST,S), + %% unnecessary to delete saved object, it will be overwritten. + {RedefinedType,S3} = load_redefine_object(ST,S2), + {_MergedType,S4} = merge_derived_types(OriginalType,RedefinedType,redefine,S3), + S4; +redefine(_,S) -> + %% attributeGroup and group redefines are already redefined + S. + +keyrefer(keyref,El,S) -> + Ref=get_attribute_value(refer,El,undefined), + get_QName(Ref,El#xmlElement.namespace,reset_scope(S)); +keyrefer(_,_,_) -> + undefined. + +remove_annotation(CM) when is_list(CM) -> + [X||X = {K,_} <- CM, K=/=annotation]. +remove_attributes(CM) when is_list(CM) -> + [X||X = {K,_} <- CM, K=/=attribute,K=/=anyAttribute,K=/=attributeGroup]. +keep_attributes(CM) when is_list(CM) -> + [X||X = {K,_} <- CM, K==attribute orelse K==anyAttribute orelse K==attributeGroup]. +split_content([{restriction,{BaseT,CM}}]) -> + {[{restriction,{BaseT,remove_attributes(CM)}}],keep_attributes(CM)}; +split_content([{extension,{BaseT,CM}}]) -> + {[{extension,{BaseT,remove_attributes(remove_annotation(CM))}}], + keep_attributes(CM)}; +split_content(CM) -> + {remove_attributes(CM),keep_attributes(CM)}. + +restriction_base_type(R,CM,S) -> + case base_type(R) of + [] -> + case [X||X={simpleType,_}<-CM] of + [{simpleType,TypeName}] -> + {TypeName,keydelete(simpleType,1,CM),S}; + Other -> + Err = {error_path(R,R#xmlElement.name),?MODULE, + {missing_base_type,restriction,Other}}, + {{[],[],[]},CM,acc_errs(S,Err)} + end; + BT -> + {get_QName(BT,R#xmlElement.namespace,reset_scope(S)),CM,S} + end. + +base_type([{restriction,{BaseT,_}}],SCT) -> + SCT#schema_complex_type{base_type=BaseT}; +base_type([{extension,{BaseT,_}}],SCT) -> + SCT#schema_complex_type{base_type=BaseT}; +base_type(_,SCT) -> + SCT. + +variety([{list,_ItemType}]) -> + list; +variety([{union,_ItemType}]) -> + union; +variety(_) -> + atomic. + +%% pre_check_cm/2 is for now only for simpleContent | complexContent +%% which allow content: (annotation?, (restriction | extension)) +pre_check_cm(Kind,Cs=[C=#xmlElement{}|RestC],Name,S) -> + case kind(C,S) of + {annotation,_} -> + pre_check_cm2(Kind,RestC,Name,C,S,0); + {_,S2} -> + pre_check_cm2(Kind,Cs,Name,C,S2,0) + end; +pre_check_cm(Kind,[_C|Cs],Name,S) -> + pre_check_cm(Kind,Cs,Name,S); +pre_check_cm(Kind,[],Name,S) -> + Err = {[],?MODULE,{content_failure,Kind,[],Name}}, + acc_errs(S,Err). + +pre_check_cm2(Kind,[C=#xmlElement{}|Cs],Name,_El,S,N) -> + S2 = + case kind(C,S) of + {restriction,_} -> + S; + {extension,_} -> + S; + {Other,S1} -> + Err = {error_path(C,C#xmlElement.name),?MODULE, + {illegal_element,Kind,Other,Name}}, + acc_errs(S1,Err) + end, + pre_check_cm2(Kind,Cs,Name,C,S2,N+1); +pre_check_cm2(Kind,[_H|T],Name,El,S,N) -> + pre_check_cm2(Kind,T,Name,El,S,N); +pre_check_cm2(_,[],_,_,S,N) when N==1 -> + S; +pre_check_cm2(Kind,[],Name,El,S,N) -> + Err = + case N of + 0 -> + {error_path(El,El#xmlElement.name),?MODULE, + {content_failure_expected_restriction_or_extension, + Kind,Name}}; + _ -> + {error_path(El,El#xmlElement.name),?MODULE, + {content_failure_only_one_restriction_or_extension_allowed, + Kind,Name}} + end, + acc_errs(S,Err). + + +%% check_cm(Arg1,Arg2,Arg3) +%% Arg1 - The allowed content for this type according to schema for schemas +%% Arg2 - The content model of this particular schema +check_cm(Kind,S4SCM,ContentModel,S) -> + case check_cm2(Kind,S4SCM,ContentModel,S) of + {[],_S} -> + S; + {[_,[]|_],_S} -> + S; + {_CM,S2} -> + S2; + Err -> + exit({error,{[],?MODULE,{internal_error,Err}}}) + end. + +check_cm2(Kind,#chain{content=S4SCM,occurance=Occ}, + ContentModel,S) -> + case occurance_loop(Occ,fun check_chain/1, + [S4SCM,ContentModel,Kind,S],0) of + {ok,[]} -> + {[],S}; + {ok,[S4SCMRest,CMRest|_]} -> + case all_optional(S4SCMRest) of + true -> + {CMRest,S}; + _ -> + Err = {[],?MODULE, + {mandatory_component_missing,S4SCMRest,Kind}}, + acc_errs(S,Err) + end; + {error,{_,_,Reason}} -> + Err = {[],?MODULE,{illegal_content,Reason,Kind}}, + {ContentModel,acc_errs(S,Err)} + end; +check_cm2(Kind,#alternative{content=S4SCM,occurance=Occ}, + ContentModel,S) -> + case occurance_loop(Occ,fun check_alternative/1, + [S4SCM,ContentModel,Kind,S],0) of + {ok,[]} -> + {[],S}; + {ok,[_,CMRest|_]} -> + {CMRest,S}; + {error,Reason} -> + {ContentModel,acc_errs(S,Reason)} + end; +check_cm2(_,{Kind,Occ},CM,S) -> + case occurance_loop(Occ,fun check_simple_cm/1,[Kind,CM],0) of + {ok,[]} -> + {[],S}; + {ok,[_,CMRest|_]} -> + {CMRest,S}; + {error,Reason} -> + {CM,acc_errs(S,Reason)}; + Err -> + {CM,acc_errs(S,Err)} + end. + +%% check_simple_cm +check_simple_cm([Kind,CM]) -> + check_simple_cm(Kind,CM). + + +check_simple_cm(Kind,[]) -> + {error,{[],?MODULE,{no_match,{Kind,[]}}}}; +check_simple_cm(Kind,[{Kind,_}|Rest]) -> + {ok,[Kind,Rest]}; +check_simple_cm(Kind,[{Other,_}|Rest]) + when Kind==simpleType;Kind==complexType -> + case Other of + simple_or_complex_Type -> {ok,[Kind,Rest]}; + _ -> {error,{[],?MODULE,{no_match,Other}}} + end; +check_simple_cm(_Kind,[{Other,_}|_]) -> + {error,{[],?MODULE,{no_match,Other}}}. + + +check_chain([S4SCM,ContentModel,Kind,S]) -> + check_chain(Kind,S4SCM,ContentModel,S). +check_chain(Kind,[S4SC|S4SCs],ChainCM=[_H|_T], + S=#xsd_state{errors=Errs}) -> + NewKind = + case S4SC of + {NK,_} -> NK; + _ -> Kind + end, + case check_cm2(NewKind,S4SC,ChainCM,S) of + {ChainCMRest,#xsd_state{errors=Errs}} -> + check_chain(Kind,S4SCs,ChainCMRest,S); + {_ChainCMRest,_S2} -> + case optional(S4SC) of + true -> + check_chain(Kind,S4SCs,ChainCM,S); + _ -> + {error,{[],?MODULE,{unmatched_mandatory_object,Kind,S4SC}}} + end + end; +check_chain(Kind,[],CM,S) -> + {ok,[[],CM,Kind,S]}; +check_chain(Kind,Rest,CM,S) -> + case all_optional(Rest) of + true -> + {ok,[Rest,CM,Kind,S]}; %% or {ok,[[],CM,Kind,S]} + _ -> + {error,{[],?MODULE,{bad_match,Rest,CM}}} + + end. + + +check_alternative([S4SC,CM,Kind,S]) -> + check_alternative(Kind,S4SC,CM,S). +check_alternative(Kind,[S4SC|S4SCs],AltCM = [_H|_T], + S=#xsd_state{errors=Err}) -> + NewKind = + case S4SC of + {NK,_} -> NK; + _ -> Kind + end, + case check_cm2(NewKind,S4SC,AltCM,S) of + {AltCMRest,#xsd_state{errors=Err}} -> + {ok,[[S4SC],AltCMRest,Kind,S]}; + {AltCMRest,_S2} -> + check_alternative(Kind,S4SCs,AltCMRest,S) + end; +check_alternative(Kind,[],_AltCM,_S) -> + {error,{[],?MODULE,{no_match,Kind}}}. + + +%% occurance_loop keeps track of the right number of elements +%% Third argument is a list: [S4SContent,ContentModel] +%% returns {ok,Rest} where Rest is the next unmatched abstract +%% structure. +occurance_loop({Min,Max},_CheckFun,[_,[]|_Rest],N) + when Min =< N, Max >= N -> + {ok,[]}; +occurance_loop(Occ={Min,Max},CheckFun,Args,N) -> + Nplus1 = N+1, + case CheckFun(Args) of + {error,{_,_,{no_match,_}}} when Min =< N, Max >= N -> + {ok,Args}; + Err = {error,_} -> + Err; + {ok,Args} -> + {error,{[],?MODULE,{no_match,occurance_kind(Args)}}}; + {ok,NewArgs} when Nplus1 < Max -> + occurance_loop(Occ,CheckFun,NewArgs,Nplus1); + Ret = {ok,_NewArgs} -> + Ret + end. + +occurance_kind([Kind,_]) -> + Kind; +occurance_kind([_,_,Kind,_]) -> + Kind; +occurance_kind(_) -> + []. +%% if_simple_hd(S4SCM,ConstrCM) +%% when is_record(S4SCM,chain);is_record(S4SCM,alternative);is_list(S4SCM) -> +%% ConstrCM; +%% if_simple_hd(_,[H|_Tl]) -> +%% H. + +%% if_simple_tl(S4SCM,_ConstrCM) +%% when is_record(S4SCM,chain);is_record(S4SCM,alternative);is_list(S4SCM) -> +%% []; +%% if_simple_tl(_,[_|Tl]) -> +%% Tl. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +count_occur({Min,Max}) -> +% {decrease(Min),decrease(Max)}; + {decrease(Min),Max}; +count_occur(Other) -> + Other. + +decrease(I) when is_integer(I), I > 0 -> + I-1; +decrease(I) -> + I. + +decrease_occurance({K,{ID,Occ}}) -> + {K,{ID,count_occur(Occ)}}; +decrease_occurance(Other) -> + Other. + +get_occur({_,{_,Occ={Min,_}}}) when is_integer(Min) -> + Occ; +get_occur({_,{_,Occ={Min,_},_}}) when is_integer(Min) -> + Occ; +get_occur(Other) -> + Other. + +%% remove_whitespace(L=[T=#xmlText{}|Rest]) -> +%% case is_whitespace(T) of +%% true -> +%% remove_whitespace(Rest); +%% _ -> L +%% end; +%% remove_whitespace(L) -> +%% L. + +optional(optional_text) -> + true; +optional({_,{0,_}}) -> + true; +optional({_,{_,{0,_}}}) -> + true; %% sequence, all or choice +optional({any,{_,{0,_},_}}) -> + true; +optional(#chain{occurance={0,_}}) -> + true; +optional(#alternative{occurance={0,_}}) -> + true; +optional(#chain{content=Content}) -> + catch is_optional_content(Content); +optional(#alternative{content=Content}) -> + catch is_optional_content(Content); +optional({all,{Content,_}}) -> + catch is_optional_content(Content); +optional(_) -> + false. + +is_optional_content([H|T]) -> + case optional(H) of + true -> + is_optional_content(T); + false -> + throw(false) + end; +is_optional_content([]) -> + true. + +not_optional(X) -> + case optional(X) of + true -> + false; + _ -> + true + end. + +all_optional([]) -> + true; +all_optional(L) -> + case filter(fun not_optional/1,L) of + [] -> + true; + _ -> + false + end. + + +%% allowed_content/2 returns a representation of the allowed content +%% model for that object +allowed_content(element,_Parents) -> + #chain{content= + [{annotation,{0,1}}, + #chain{content= + [#alternative{content= + [{simpleType,{1,1}},{complexType,{1,1}}], + occurance={0,1}}, + #alternative{content= + [{unique,{1,1}},{key,{1,1}},{keyref,{1,1}}], + occurance={0,unbounded}}] + }] + }; +allowed_content(attribute,_Parents) -> + #chain{content=[{annotation,{0,1}},{simpleType,{0,1}}]}; +allowed_content(complexType,Parents) -> + #chain{content= + [{annotation,{0,1}}, + #alternative{content= + [set_occurance(allowed_content(simpleContent,Parents),{1,1}), + set_occurance(allowed_content(complexContent,Parents),{1,1}), + #chain{content= + [#alternative{content= + [{group,{1,1}},{all,{1,1}}, + {choice,{1,1}},{sequence,{1,1}}], + occurance={0,1}}, + #chain{content= + [#alternative{content= + [{attribute,{1,1}}, + {attributeGroup,{1,1}}], + occurance={0,unbounded}}, + {anyAttribute,{0,1}}] + } + ] + } + ] + } + ] + }; +allowed_content(attributeGroup,Parents) -> + case member(simpleContent,Parents) of + true -> + {annotation,{0,1}}; + _ -> + #chain{content= + [{annotation,{0,1}}, + #chain{content= + [#alternative{content= + [{attribute,{1,1}}, + {attributeGroup,{1,1}}], + occurance={0,unbounded}}, + {anyAttribute,{0,1}}]}]} + end; +allowed_content(group,_Parents) -> + #chain{content= + [{annotation,{0,1}}, + #alternative{content= + [{all,{1,1}},{choice,{1,1}},{sequence,{1,1}}], + occurance={0,1}}]}; +allowed_content(all,_Parents) -> + #chain{content=[{annotation,{0,1}},{element,{0,unbounded}}]}; +allowed_content(SorC,_Parents) when SorC==sequence;SorC==choice -> + #chain{content= + [{annotation,{0,1}}, + #alternative{content= + [{element,{1,1}},{group,{1,1}}, + {choice,{1,1}},{sequence,{1,1}}, + {any,{1,1}}], + occurance={0,unbounded}}]}; +allowed_content(E,_Parents) + when E==any;E==selector;E==field;E==notation;E==include;E==import; + E==anyAttribute -> + {annotation,{0,1}}; +allowed_content(UKK,_Parents) when UKK==unique;UKK==key;UKK==keyref-> + #chain{content= + [{annotation,{0,1}}, + #chain{content= + [{selector,{1,1}},{selector,{1,unbounded}}]}]}; +allowed_content(annotation,_Parents) -> + #alternative{content=[{appinfo,{1,1}},{documentation,{1,1}}], + occurance={0,unbounded}}; +allowed_content(E,_Parents) when E==appinfo;E==documentation -> + {any,{0,unbounded}}; +allowed_content(simpleType,_Parents) -> + #chain{content= + [{annotation,{0,1}}, + #alternative{content=[{restriction,{1,1}},{list,{1,1}}, + {union,{1,1}}]}]}; +allowed_content(restriction,Parents) -> + case member(simpleType,Parents) of + true -> + allowed_content2(restriction,simpleType); + _ -> + case member(simpleContent,Parents) of + true -> + allowed_content2(restriction,simpleContent); + _ -> + allowed_content2(restriction,complexContent) + end + end; +allowed_content(LU,_Parent) when LU==list;LU==union -> + #chain{content=[{annotation,{0,1}},{simpleType,{0,1}}]}; +allowed_content(schema,_) -> + #chain{content= + [#alternative{content= + [{include,{1,1}},{import,{1,1}}, + {redefine,{1,1}},{annotation,{1,1}}], + occurance={0,1}}, + #chain{content= + [#alternative{content= + [#alternative{content= + [{simpleType,{1,1}},{complexType,{1,1}}, + {group,{1,1}},{attributeGroup,{1,1}}]}, + {element,{1,1}}, + {attribute,{1,1}}, + {notation,{1,1}}]}, + {annotation,{0,unbounded}}], + occurance={0,unbounded}}]}; +allowed_content(redefine,_Parents) -> + #alternative{content= + [{annotation,{1,1}}, + #alternative{content= + [{simpleType,{1,1}},{complexType,{1,1}}, + {group,{1,1}},{attributeGroup,{1,1}}]}], + occurance={0,unbounded}}; +allowed_content(E,_Parents) when E==simpleContent; + E==complexContent -> + #chain{content= + [{annotation,{0,1}}, + #alternative{content= + [{restriction,{1,1}},{extension,{1,1}}]}]}; +allowed_content(extension,Parents) -> + case member(simpleContent,Parents) of + true -> + allowed_content2(extension,simpleContent); + _ -> + allowed_content2(extension,complexContent) + end; +allowed_content(minExclusive,_Parents) -> + []; +allowed_content(minInclusive,_Parents) -> + []; +allowed_content(maxExclusive,_Parents) -> + []; +allowed_content(maxInclusive,_Parents) -> + []; +allowed_content(totalDigits,_Parents) -> + []; +allowed_content(fractionDigits,_Parents) -> + []; +allowed_content(length,_Parents) -> + []; +allowed_content(minLength,_Parents) -> + []; +allowed_content(maxLength,_Parents) -> + []; +allowed_content(enumeration,_Parents) -> + []; +allowed_content(whiteSpace,_Parents) -> + []; +allowed_content(pattern,_Parents) -> + []. + + + + +allowed_content2(restriction,simpleType) -> + #chain{content= + [{annotation,{0,1}}, + #chain{content= + [{simpleType,{0,1}}, + #alternative{content= + [{minExclusive,{1,1}},{minInclusive,{1,1}}, + {maxExclusive,{1,1}},{maxInclusive,{1,1}}, + {totalDigits,{1,1}},{fractionDigits,{1,1}}, + {length,{1,1}},{minLength,{1,1}}, + {maxLength,{1,1}},{enumeration,{1,1}}, + {whiteSpace,{1,1}},{pattern,{1,1}}], + occurance={0,unbounded}}]}]}; +allowed_content2(restriction,simpleContent) -> + #chain{content= + [{annotation,{0,1}}, + #chain{content= + [{simpleType,{0,1}}, + #alternative{content= + [{minExclusive,{1,1}},{minInclusive,{1,1}}, + {maxExclusive,{1,1}},{maxInclusive,{1,1}}, + {totalDigits,{1,1}},{fractionDigits,{1,1}}, + {length,{1,1}},{minLength,{1,1}}, + {maxLength,{1,1}},{enumeration,{1,1}}, + {whiteSpace,{1,1}},{pattern,{1,1}}], + occurance={0,unbounded}}], + occurance={0,1}}, + #chain{content= + [#alternative{content= + [{attribute,{1,1}},{attributeGroup,{1,1}}], + occurance={0,unbounded}}, + {anyAttribute,{0,1}}]}]}; +allowed_content2(restriction,complexContent) -> + #chain{content= + [{annotation,{0,1}}, + #alternative{content= + [{group,{1,1}},{all,{1,1}},{choice,{1,1}}, + {sequence,{1,1}}], + occurance={0,1}}, + #chain{content= + [#alternative{content= + [{attribute,{1,1}},{attributeGroup,{1,1}}], + occurance={0,unbounded}}, + {anyAttribute,{0,1}}]}]}; +allowed_content2(extension,simpleContent) -> + #chain{content= + [{annotation,{0,1}}, + #chain{content= + [#alternative{content= + [{attribute,{1,1}},{attributeGroup,{1,1}}], + occurance={0,unbounded}}, + {anyAttribute,{0,1}}]}]}; +allowed_content2(extension,complexContent) -> + #chain{content= + [{annotation,{0,1}}, + #chain{content= + [#alternative{content= + [{group,{1,1}},{all,{1,1}},{choice,{1,1}}, + {sequence,{1,1}}], + occurance={0,1}}, + #chain{content= + [#alternative{content= + [{attribute,{1,1}}, + {attributeGroup,{1,1}}], + occurance={0,1}}, + {anyAttribute,{0,1}}]}]}]}. + + +set_occurance(Ch = #chain{},Occ) -> + Ch#chain{occurance=Occ}; +set_occurance(Alt = #alternative{},Occ) -> + Alt#alternative{occurance=Occ}; +set_occurance({Name,_},Occ) when is_atom(Name) -> + {Name,Occ}; +set_occurance(CM,_) -> + CM. + + +process_external_schema_once(E,Namespace,S) when is_record(E,xmlElement) -> + case get_attribute_value(schemaLocation,E,[]) of + [] -> + Err = {missing_schemalocation_attribute,E#xmlElement.name}, + acc_errs(S,Err); + Path -> + process_external_schema_once(Path,Namespace,S) + end; +process_external_schema_once(SchemaLocation,Namespace,S) -> + case fetch_external_schema(SchemaLocation,S) of + {E=#xmlElement{},S2} -> + case is_already_processed(Namespace,S2) of + true -> + save_namespace_definition(Namespace,S2); + _ -> + S3 = save_namespace_definition(Namespace,S2), + traverse_ext_schema(E,S3#xsd_state{targetNamespace=Namespace}) + end; + {_,S2} -> + S2 + end. + +%% process_external_schema/2 returns: +%% {ok,some_result()} | {error,reason()} +process_external_schema(Path,S) when is_list(Path) -> + case fetch_external_schema(Path,S) of + {E=#xmlElement{},S2} -> + traverse_ext_schema(E,S2); + {_,S2} -> + S2 + end; +process_external_schema(absent,S) -> + S. + +fetch_external_schema(Path,S) when is_list(Path) -> + FetchFun = S#xsd_state.fetch_fun, + %% {ExtXSD,S2} = + case FetchFun(Path,S) of + {ok,{file,File},_} -> + ?debug("scanning file: ~p~n",[File]), + case xmerl_scan:file(File,S#xsd_state.xml_options) of + {error,Reason} -> + {error,acc_errs(S,{[],?MODULE,{parsing_external_schema_failed,File,Reason}})}; + {EXSD,_} -> + {EXSD,S#xsd_state{schema_name=File}} + end; + {_,{string,String},_} -> %% this is for a user defined fetch fun that returns an xml document on string format. + ?debug("scanning string: ~p~n",[File]), + case xmerl_scan:string(String,S#xsd_state.xml_options) of + {error,Reason} -> + {error,acc_errs(S,{[],?MODULE,{parsing_external_schema_failed,Path,Reason}})}; + {EXSD,_} -> + {EXSD,S#xsd_state{schema_name=Path}} + end; + {ok,[],_} -> + {ok,S}; + {_,Other,_} -> + {error,acc_errs(S,{[],?MODULE,{fetch_fun_failed,Other}})} + end; +fetch_external_schema(absent,S) -> + {ok,S}. + + +%% The schema name is also important here because a schema may import +%% and must include from the same namespace as the target namespace of +%% the including schema. +is_already_processed(NameSpace,#xsd_state{schema_name=SchemaName, + checked_namespace_nodes=CNS}) -> +%% case {keymember(SchemaName,2,CNS),keymember(NameSpace,3,CNS)} of +%% {true,true} -> + case keysearch(SchemaName,2,CNS) of + {_,{_,_,NameSpace}} -> + true; + _ -> + false + end. + +%% +save_namespace_definition(NameSpace, + S=#xsd_state{targetNamespace=TNS, + global_namespace_nodes=GNS, + checked_namespace_nodes=CNS}) -> + %% 1) Have to find a matching namespace in the global list for + %% this schema, and get the associated prefix. 2) Then check + %% whether a schema with this prefix - namespace combinaton + %% already is checked, if so do nothing. 3a) If this namespace is + %% checked but with another prefix only add the prefix - namespace + %% pair to the checked namespace list. 3b) Otherwise add the + %% prefix - namespace pair. + {Prefix,S2} = + case keysearch(TNS,1,GNS) of + {value,{_,ImportedNodes}} -> + case keysearch(NameSpace,2,ImportedNodes) of + {value,{_P,_}} -> {_P,S}; + _ -> {none,S} + end; + _ -> + Err = {[],?MODULE,{imported_namespace_wo_namespace_definition,NameSpace}}, + {none,acc_errs(S,Err)} + end, + %% Instead of 2, 3a and 3b just add_once + case Prefix of + none -> + S2; + _ -> + S#xsd_state{checked_namespace_nodes= + add_once({Prefix,S#xsd_state.schema_name,NameSpace},CNS)} + end. + +%% prefix_namespace_2global + +%% adds {Prefix,Namespace} to the global namespace nodes list for the +%% targetnamespace. Prefix is the right one found in Nodes. +prefix_namespace_2global(Namespace, + #xmlNamespace{nodes=Nodes}, + S=#xsd_state{targetNamespace=TNS, + global_namespace_nodes=GNS}) -> + case keysearch(Namespace,2,Nodes) of + {value,{Prefix,_}} -> + case keysearch(TNS,1,GNS) of + {value,{_,DefinedNamespaces}} -> + S#xsd_state{global_namespace_nodes= + keyreplace(TNS,1,GNS, + {TNS,add_once({Prefix,Namespace}, + DefinedNamespaces)})}; + _ -> + S#xsd_state{global_namespace_nodes= + [{TNS,[{Prefix,Namespace}|default_namespace_by_convention()]}]} + end; + _ -> + S + end; +prefix_namespace_2global(_,_,S) -> + S. + + +traverse_ext_schema(E,S) -> + TargetNS = target_namespace(E), + case {TargetNS,S#xsd_state.targetNamespace} of + {undefined,_} -> + traverse_ext_schema2(E,S); + {TNS,TNS} -> + traverse_ext_schema2(E,S); + _ -> + Err = {error_path(E,schema),?MODULE,{illegal_target_namespace_external_schema,E#xmlElement.name}}, + acc_errs(S,Err) + end. +traverse_ext_schema2(E,S) -> + + S1 = namespace_nodes(E,S), + S2 = element_form_default(E,S1), + S3 = attribute_form_default(E,S2), + S4 = substitution_default(finalDefault,E,S3), + S5 = substitution_default(blockDefault,E,S4), + {CM,S6} = traverse_content2(E#xmlElement.content,S5,[]), +%% ?debug("External schema S6:~n~p~n",[S6]), + save_schema_element(CM,S6), + S6. + + +attribute_properties([#xmlAttribute{name=default,value=Default}|Rest], + Attr,S) -> + attribute_properties(Rest,Attr#schema_attribute{default=Default},S); +attribute_properties([#xmlAttribute{name=fixed,value=Fixed}|Rest],Attr,S) -> + attribute_properties(Rest,Attr#schema_attribute{fixed=Fixed},S); +attribute_properties([#xmlAttribute{name=use,value=Use}|Rest],Attr,S) -> + {Use2,S2} = attribute_use(Use,S), + attribute_properties(Rest,Attr#schema_attribute{use=Use2},S2); +attribute_properties([#xmlAttribute{name=form,value=Form}|Rest],Attr,S) -> + {Form2,S2} = attribute_form(Form,S), + attribute_properties(Rest,Attr#schema_attribute{form=Form2},S2); +attribute_properties([#xmlAttribute{name=id,value=ID}|Rest],Attr,S) -> + S2 = check_and_save_ID(ID,S), + attribute_properties(Rest,Attr#schema_attribute{id=ID},S2); +attribute_properties([_H|Rest],Attr,S) -> + attribute_properties(Rest,Attr,S); +attribute_properties([],Attr,S) -> + {Attr,S}. +attribute_use(Use,S) when Use=="optional";Use=="prohibited";Use=="required" -> + {list_to_atom(Use),S}; +attribute_use(Use,S) -> + {Use,acc_errs(S,{[],?MODULE,{illegal_use_value,Use}})}. +attribute_form(Form,S) when Form=="qualified";Form=="unqualified" -> + {list_to_atom(Form),S}; +attribute_form(Form,S) -> + {Form,acc_errs(S,{[],?MODULE,{illegal_form_value,Form}})}. + +element_properties([#xmlAttribute{name=default,value=Default}|Rest],SE,El,S) -> + case SE#schema_element.value_constraint of + {fixed,_} -> + Err = {error_path(El,schema),?MODULE,{"only one of final/default attributes allowed", + El#xmlElement.name}}, + element_properties(Rest,SE,El,acc_errs(S,Err)); + _ -> + element_properties(Rest,SE#schema_element{value_constraint= + {default,Default}},El,S) + end; +element_properties([#xmlAttribute{name=fixed,value=Fixed}|Rest],SE,El,S) -> + case SE#schema_element.value_constraint of + {default,_} -> + Err = {error_path(El,schema),?MODULE, + {"only one of final/default attributes allowed", + El#xmlElement.name}}, + element_properties(Rest,SE,El,acc_errs(S,Err)); + _ -> + element_properties(Rest,SE#schema_element{value_constraint= + {fixed,Fixed}},El,S) + end; +element_properties([#xmlAttribute{name=substitutionGroup,value=SG}|Rest], + SE,El,S) -> + SGName = get_QName(SG,El#xmlElement.namespace,reset_scope(S)), + element_properties(Rest,SE#schema_element{substitutionGroup=SGName},El, + add_ref(S,{element,SGName})); +element_properties([#xmlAttribute{name=form,value=F}|Rest],SE,El,S) -> + {Form,S2} = attribute_form(F,S), + element_properties(Rest,SE#schema_element{form=Form},El,S2); +element_properties([#xmlAttribute{name=id,value=ID}|Rest],SE,El,S) -> + S2 = check_and_save_ID(ID,S), + element_properties(Rest,SE#schema_element{id=ID},El,S2); +element_properties([#xmlAttribute{name=nillable,value=N}|Rest],SE,El,S) -> + case boolean_to_atom(N) of + error -> + element_properties(Rest,SE,El, + acc_errs(S,{error_path(El,schema),?MODULE,{illegal_nillable_value,N}})); + N_atom -> + element_properties(Rest,SE#schema_element{nillable=N_atom},El,S) + end; +element_properties([#xmlAttribute{name=abstract,value=A}|Rest],SE,El,S) -> + case boolean_to_atom(A) of + error -> + element_properties(Rest,SE,El, + acc_errs(S,{error_path(El,schema),?MODULE,{illegal_abstract_value,A}})); + A_atom -> + element_properties(Rest,SE#schema_element{abstract=A_atom},El,S) + end; +element_properties([#xmlAttribute{name=block,value=B}|Rest],SE,El,S) -> + BlockValues = split_by_whitespace(B,[]), + case legal_block_values(element,BlockValues) of + {error,Reason} -> + element_properties(Rest,SE,El, + acc_errs(S,{error_path(El,schema),?MODULE,{illegal_block_values,Reason}})); + _ -> + element_properties(Rest,SE#schema_element{block=BlockValues},El,S) + end; +element_properties([#xmlAttribute{name=final,value=F}|Rest],SE,El,S) -> + FinalValues = split_by_whitespace(F,[]), + case legal_final_values(element,FinalValues) of + {error,Reason} -> + element_properties(Rest,SE,El, + acc_errs(S,{error_path(El,schema),?MODULE,{illegal_final_values,Reason}})); + _ -> + element_properties(Rest,SE#schema_element{final=FinalValues},El,S) + end; +element_properties([_H|T],SE,El,S) -> + element_properties(T,SE,El,S); +element_properties([],SE,_El,S) -> + {SE,S}. + +%% 3.3.3 bullet 2.2 +%% nillable, default, fixed, form, block and type properties must be +%% absent in element with ref. +element_forbidden_properties(El,S) -> + element_forbidden_properties(El#xmlElement.attributes,El,S). +element_forbidden_properties([#xmlAttribute{name=nillable,value=V}|Atts],El,S) -> + element_forbidden_properties(Atts,El,acc_errs(S,{error_path(El,schema),?MODULE,{forbidden_property,nillable,V}})); +element_forbidden_properties([#xmlAttribute{name=default,value=V}|Atts],El,S) -> + element_forbidden_properties(Atts,El,acc_errs(S,{error_path(El,schema),?MODULE,{forbidden_property,default,V}})); +element_forbidden_properties([#xmlAttribute{name=fixed,value=V}|Atts],El,S) -> + element_forbidden_properties(Atts,El,acc_errs(S,{error_path(El,schema),?MODULE,{forbidden_property,fixed,V}})); +element_forbidden_properties([#xmlAttribute{name=form,value=V}|Atts],El,S) -> + element_forbidden_properties(Atts,El,acc_errs(S,{error_path(El,schema),?MODULE,{forbidden_property,form,V}})); +element_forbidden_properties([#xmlAttribute{name=block,value=V}|Atts],El,S) -> + element_forbidden_properties(Atts,El,acc_errs(S,{error_path(El,schema),?MODULE,{forbidden_property,block,V}})); +element_forbidden_properties([#xmlAttribute{name=type,value=V}|Atts],El,S) -> + element_forbidden_properties(Atts,El,acc_errs(S,{error_path(El,schema),?MODULE,{forbidden_property,type,V}})); +element_forbidden_properties([#xmlAttribute{}|Atts],El,S) -> + element_forbidden_properties(Atts,El,S); +element_forbidden_properties([],_,S) -> + S. + +%% 3.3.3 bullet 2.2 +%% complexType, simpleType, key, keyref and unique must be absent in +%% element with ref. +element_forbidden_content([],S) -> + S; +element_forbidden_content([El=#xmlElement{}|Els],S) -> + case kind(El) of + K when K==complexType;K==simpleType;K==key;K==keyref;K==unique -> + acc_errs(S,{error_path(El,schema),?MODULE,{element_content_must_not_contain,K,El}}); + annotation -> + element_forbidden_content(Els,S); + Other -> + acc_errs(S,{error_path(El,schema),?MODULE,{illegal_element_content,Other}}) + end; +element_forbidden_content([T=#xmlText{}|Rest],S) -> + case is_whitespace(T) of + true -> + element_forbidden_content(Rest,S); + _ -> + acc_errs(S,{error_path(T,schema),?MODULE,{illegal_element_content,T}}) + end. + +c_t_properties(El,CT,S) -> + c_t_properties(El#xmlElement.attributes,El,CT,S). +c_t_properties([#xmlAttribute{name=final,value=V}|Rest],El,CT,S) -> + FinalValues = split_by_whitespace(V,[]), + case legal_final_values(complexType,FinalValues) of + {error,Reason} -> + Err = {error_path(El,schema),?MODULE,{illegal_final_values,Reason}}, + c_t_properties(Rest,El,CT,acc_errs(S,Err)); + _ -> + c_t_properties(Rest,El, + CT#schema_complex_type{final=FinalValues},S) + end; +c_t_properties([#xmlAttribute{name=block,value=V}|Rest],El,CT,S) -> + BlockValues = split_by_whitespace(V,[]), + case legal_block_values(complexType,BlockValues) of + {error,Reason} -> + Err = {error_path(El,schema),?MODULE, + {illegal_block_values,Reason}}, + c_t_properties(Rest,El,CT,acc_errs(S,Err)); + _ -> + c_t_properties(Rest,El,CT#schema_complex_type{block=BlockValues},S) + end; +c_t_properties([#xmlAttribute{name=abstract,value=V}|Rest],El,CT,S) -> + case boolean_to_atom(V) of + error -> + Err = {error_path(El,schema),?MODULE, + {illegal_abstract_value,V}}, + c_t_properties(Rest,El,CT,acc_errs(S,Err)); + V_atom -> + c_t_properties(Rest,El,CT#schema_complex_type{abstract=V_atom},S) + end; +c_t_properties([_H|T],El,CT,S) -> + c_t_properties(T,El,CT,S); +c_t_properties([],_,CT,S) -> + {CT,S}. + + +legal_block_values(_,['#all']) -> + true; +legal_block_values(element,BlockValues) -> + list_members(BlockValues,[extension,restriction,substitution]); +legal_block_values(complexType,BlockValues) -> + list_members(BlockValues,[extension,restriction]). + +legal_final_values(_,['#all']) -> + true; +legal_final_values(_,FinalValues) -> + list_members(FinalValues,[extension,restriction]). + +boolean_to_atom(B) when B=="1";B=="true" -> + true; +boolean_to_atom(B) when B=="0";B=="false" -> + false; +boolean_to_atom(_) -> + error. + + +count_num_el(S=#xsd_state{num_el=N}) -> + S#xsd_state{num_el=N+1}. +set_num_el(S=#xsd_state{},I) when is_integer(I) -> + S#xsd_state{num_el=I}; +set_num_el(S=#xsd_state{},#xsd_state{num_el=I}) -> + S#xsd_state{num_el=I}. + + +occurance(El=#xmlElement{attributes=Atts},{Min,Max},S) -> + AttVal=fun(#xmlAttribute{value=V},Sin) -> + case catch mk_int_or_atom(V) of + {'EXIT',_} -> + Err = {error_path(El,schema),?MODULE, + {illegal_occurance_value,V}}, + {V,acc_errs(Sin,Err)}; + IAV -> {IAV,Sin} + end; + (V1,Sin) -> {V1,Sin} + end, + {MinVal,S2} = AttVal(keyNsearch(minOccurs,#xmlAttribute.name, + Atts,Min),S), + {MaxVal,S3} = AttVal(keyNsearch(maxOccurs,#xmlAttribute.name, + Atts,Max),S2), + {{MinVal,MaxVal},S3}. + +mk_int_or_atom(V="unbounded") -> + list_to_atom(V); +mk_int_or_atom(V) when is_list(V) -> + list_to_integer(V); +mk_int_or_atom(V) -> + V. + +%% E is a complexType, possible kind of content is A)simpleContent, B) +%% complexContent or C) one or zero of 1)group,2)all,3)choice or +%% 4)sequence, followed by any number of attribute or attributeGroup +%% and finally one optional anyAttribute +mixed(E=#xmlElement{content=C},S) -> + case {get_attribute_value(mixed,E,undefined), + [Y||Y=#xmlElement{}<-C,kind(Y)==simpleContent]} of + {_,[_SCE]} -> + {false,S}; %% mixed is false in simpleContent + {undefined,_} -> + case [X||X=#xmlElement{}<-C, + kind(X)==complexContent] of + [E2] -> +%% {get_attribute_value(mixed,E2,false),S}; + mixed(E2,S); + _ -> + {false,S} + end; + {M,_} when M=="1";M=="true" -> {true,S}; + {M,_} when M=="0";M=="false" -> {false,S}; + {M,_} -> + Err = {error_path(E,schema),?MODULE,{invalid_mixed_value,M}}, + {false,acc_errs(S,Err)} + end. + +mixify(false,CM) -> + CM; +mixify(true,CM) -> + mixify2(CM,[optional_text]). +mixify2([],Acc) -> + reverse(Acc); +mixify2([H|T],Acc) -> + mixify2(T,[optional_text,H|Acc]). + +complexity([]) -> + undefined; +complexity([#xmlText{}|T]) -> + complexity(T); +complexity([H|T]) -> + case kind(H) of + simpleContent -> + simple; + complexContent -> + complex; + _ -> + complexity(T) + end. + +%% Validation takes care of the following: +%% 1) a) Check that targetNamespace attribute in schema matches +%% namespace URI if the element. +%% b) If schema don't have a targetNamespace the instance element +%% must not be namespace-qualified + +%% 2) a) Examine type of the element according to the schema and block +%% attributes in the element decl. +%% b) Default values and other infoset contributions are applied. + +%% 3) Check the immediate attributes and contents of the element +%% comparing these against the attributes and contents +%% permitted. +%% a) simple type: +%% -verify there are no attributes or elements. +%% -verify character content matches rules for type. +%% b) complex type: +%% -verify attributes present and values ok. +%% -check subelements according to content model. +%% validate_xml/2 +validate_xml(El = #xmlElement{name=Name}, + S=#xsd_state{table=Tab,schemaLocations=SchemaLocations}) -> + ElQName = {_,_,Namespace} = mk_EII_QName(Name,El,S), + SchemaCM = get_schema_cm(Tab,Namespace), + case [X||X={element,{QName,Occ}} <- SchemaCM#schema.content, + cmp_name(ElQName,QName,S), + at_least_one(Occ)] of + [Obj] -> + {Object,S2} = load_object(Obj,S), + validate_xml(El,Object,S2); + _ -> + %% In case the namespace in El is not processed even + %% though it is present. + case is_already_processed(Namespace,S) of + true -> %% nothing more to do + {error,{error_path(El,Name),?MODULE, + {element_not_in_schema,[Name,ElQName,SchemaCM]}}}; + _ -> + case keysearch(if_atom_to_list(Namespace),1,SchemaLocations) of + {value,{_,Location}} -> + %% namespace present by schemaLocation + %% attribute in instance. + S1 = prefix_namespace_2global(Namespace,El#xmlElement.namespace,S), + S2 = save_namespace_definition(Namespace,S1), + S3 = process_external_schema(Location,S2#xsd_state{targetNamespace=Namespace}), + validate_xml(El,S3); + _ -> %% namespace not imported in schema or instance. + {error,{error_path(El,Name),?MODULE, + {element_not_in_schema,[Name,ElQName,SchemaCM]}}} + end + end + end. +%% validate_xml/3 +validate_xml(XMLEl=#xmlElement{},SEl=#schema_element{},S) -> + %% check that targetNamespace of schema matches URI of the element. + case check_target_namespace(XMLEl,S) of + ok -> + %% Extract the schemaLocation links in the instance, + %% examine type of the element according to the schema and + %% the block attributes in the element declaration + S2 = schemaLocations(XMLEl,S), + ?debug("schemaLocations: ~p~n",[S2#xsd_state.schemaLocations]), + #schema_element{name=_Name,type=_Type,block=Bl} = SEl, + Block = blocking(Bl,S2#xsd_state.blockDefault), %% complex types, elements + Ret = check_element_type([XMLEl],SEl,[],Block,S2,[]), + case Ret of + {ValXML,UnvalRest,S3} -> +%% S4 = schema_concistence_checks(S3), + {ValXML,UnvalRest,S3}; + _ -> + Ret + end; + _ -> + Err = {error_path(XMLEl,XMLEl#xmlElement.name),?MODULE, + {target_namespace_missmatch}}, + {XMLEl,[],acc_errs(S,Err)} + end. + +%% check_element_type/3 +%% examine type according to schema including info of block +%% attributes. If complex type do test recursively +%% 2 often +check_element_type(XML=[XMLTxt=#xmlText{}|Rest],CM=[CMEl|CMRest],Env, + Block,S,Checked) -> + %% XMLTxt ��r det f��rsta av content i element, + %% CMEl ��r den till��tna typen enligt schemat + case is_whitespace(XMLTxt) of + true -> %% Ignore XMLEl + check_element_type(Rest,CM,Env,Block,S,[XMLTxt|Checked]); + _ -> %% CMEl allows optional_text or is an absent optional element + {ResolvedT,S2} = resolve(CMEl,S), + case check_text_type(XML,ResolvedT,S2) of + {error,Reason} -> + case is_optional(CMEl,S) of + true -> + check_element_type(XML,CMRest,Env,Block,S,Checked); + _ -> + check_element_type(Rest,CM,Env,Block, + acc_errs(S,Reason),Checked) + end; + {Ret,Rest2,S3} -> + check_element_type(Rest2,CMRest,Env,Block,S3,reverse(Ret,Checked)) + end + end; +%% If CMEl is a sequence more than the first element of the XML list +%% may match. +check_element_type(XML=[#xmlElement{}|_],[{sequence,{CM,Occ}}|_CMRest], + Env,_Block,S,Checked) -> + ?debug("calling sequence/6~n",[]), + check_sequence(XML,CM,Occ,Env,set_num_el(S,0),Checked); +check_element_type(XML=[#xmlElement{}|_],[{choice,{CM,Occ}}|_CMRest], + Env,_Block,S,Checked) -> + ?debug("calling choice/6~n",[]), + check_choice(XML,CM,Occ,Env,set_num_el(S,0),Checked); +check_element_type(XML=[#xmlElement{}|_],[{all,{CM,Occ}}|_CMRest], + Env,_Block,S,Checked) -> + ?debug("calling choice/6~n",[]), + check_all(XML,CM,Occ,Env,S,Checked,XML); +%% 3 often. CMEL may be ((simpleType | complexType)?, (unique | key | keyref)*)) +check_element_type(XML=[XMLEl=#xmlElement{}|_],[CMEl|CMRest],Env, + Block,S,Checked) -> + %% Three possible releations between XMLEl - CMEl: + %% (1) XMLEl matches CMEl. + %% (2) XMLEl don't matches CMEl and CMEl is optional. + %% (3) XMLEl don't matches CMEl, CMEl mandatory, - error. + %% On the other side may CMEl also match more elements in + %% Rest. This should come down to 2) next function call. + + {ResolvedT,S2} = resolve(CMEl,S), + case check_element_type(XML,ResolvedT,Env,Block,S2,[]) of + {error,Reason} -> % 3 + check_element_type(tl(XML),CMRest,Env,Block, + acc_errs(S,Reason),[XMLEl|Checked]); + {[],_,_} -> % 2 + check_element_type(XML,CMRest,Env,Block,S,Checked); + {XMLEl2,RestXML,S3} -> % 1 This return value does not conform to the others + check_element_type(RestXML,[decrease_occurance(CMEl)|CMRest],Env, + Block,S3,XMLEl2++Checked) + end; +check_element_type([],[],_Env,_Block,S,Checked) -> + {Checked,[],S}; +check_element_type([],[CMEl|CMRest],Env,Block,S,Checked) -> + case is_optional(CMEl,S) of + true -> + check_element_type([],CMRest,Env,Block,S,Checked); + _ -> + Err = {error_path(Checked,undefined),?MODULE, + {missing_mandatory_element,CMEl}}, + {Checked,[],acc_errs(S,Err)} + end; +check_element_type(_XML=[], + #schema_complex_type{name=_Name,base_type=BT, + complexity=simple, + content=_C} = CT, + _Env,_Block,S,Checked) -> + + %% maybe check attributes here as well. + {ResolvedType,_} = resolve({simple_or_complex_Type,BT},S), + case ResolvedType of + #schema_simple_type{} -> + {NewVal,S2} = check_type(ResolvedType,[],unapplied,S), + {NewVal,[],S2}; + {simpleType,_} -> + {NewVal,S2} = check_type(ResolvedType,[],unapplied,S), + {NewVal,[],S2}; + _ -> + {error,{error_path(Checked,undefined),?MODULE, + {empty_content_not_allowed,CT}}} + end; +check_element_type([],#schema_complex_type{name=_Name,block=_Bl,content=C}, + _Env,_Block,S,Checked) -> + %% This type must have an empty content to be valid + case allow_empty_content(C) of + true -> {[],[],S}; + false -> + {error,{error_path(Checked,undefined),?MODULE, + {empty_content_not_allowed,C}}} + end; +check_element_type(C,{anyType,_},_Env,_Block,S,_Checked) -> + %% permitt anything + {C,[],S}; + +check_element_type(XML=[#xmlText{}|_],Type=#schema_simple_type{}, + _Env,_Block,S,_Checked) -> + check_text_type(XML,Type,S); +check_element_type(XML=[#xmlText{}|_],Type={simpleType,_NameNS}, + _Env,_Block,S,_Checked) -> + check_text_type(XML,Type,S); + +check_element_type(XML=[#xmlText{}|_], + #schema_complex_type{name=_Name,base_type=BT, + complexity=simple, + content=_C},Env,Block,S,Checked) -> + + %% maybe check attributes here as well. + {ResolvedType,_} = resolve({simple_or_complex_Type,BT},S), + check_element_type(XML,ResolvedType,Env,Block,S,Checked); + +%% single schema object +check_element_type(XML=[_H|_], + #schema_complex_type{name=Name,block=Bl,content=C}, + Env,_Block,S,Checked) -> + EnvName = case Name of + {LN,_Scope,_NS} -> LN; + _ -> anonymous + end, + Block = blocking(Bl,S#xsd_state.blockDefault), + check_element_type(XML,C,[EnvName|Env],Block,name_scope(Name,S),Checked); + +%% 1 +check_element_type(XML=[XMLEl=#xmlElement{name=Name}|RestXML], + CMEl=#schema_element{name=CMName,type=Type}, + Env,Block,S,Checked) -> + ElName = mk_EII_QName(Name,XMLEl,S#xsd_state{scope=element(2,CMName)}), + {Min,Max} = CMEl#schema_element.occurance, + case cmp_name(ElName,CMName,S) of %% substitutionGroup + true when S#xsd_state.num_el =< Max -> + S1 = id_constraints(CMEl,XMLEl,S), + %% If CMEl element has a substitutionGroup we have to + %% switch to the rigth element and type here. + {CMEl2,Type2,S2} = + if + ElName =:= CMName -> + {CMEl,Type,S1}; + true -> + case resolve({element,ElName},S1) of + {SESub=#schema_element{type=SubType},Ssub} -> + {SESub,SubType,Ssub}; + {_,Ssub} -> + {CMEl,Type,Ssub} + end + end, + + {ResolvedType,S3} = resolve(Type2,XMLEl,S2), + %% What's the value of Resolve?: It must be a simpleType, + %% complexType or an identity-constraint object + XsiFactors = xsi_factors(CMEl2), + {XMLEl2,S4} = check_attributes(XMLEl,ResolvedType, + XsiFactors,S3), + S5 = check_abstract(ElName,XMLEl,CMEl,S4), + S6 = check_form(ElName,Name,XMLEl, + actual_form_value(CMEl#schema_element.form, + S5#xsd_state.elementFormDefault), + S5), + %Step into content of XML element. + {Content,_,S7} = + case + check_element_type(XMLEl2#xmlElement.content, + ResolvedType,Env, + Block,S6,Checked) of + {error,Reason} -> + {XMLEl2#xmlElement.content,[],acc_errs(S6,Reason)}; + Result ={_,[],_} -> Result; + {_,UnexpectedRest,_} -> + Err = {error_path(XMLEl,Name),?MODULE, + {unexpected_rest,UnexpectedRest}}, + {XMLEl2#xmlElement.content,[], + acc_errs(S6,Err)} + end, + {[XMLEl2#xmlElement{content=reverse(Content)}], + RestXML, + set_scope(S5#xsd_state.scope,set_num_el(S7,S6))}; + true -> + {error,{error_path(XMLEl,Name),?MODULE, + {element_not_suitable_with_schema,ElName,S}}}; + _ when S#xsd_state.num_el >= Min -> + %% it may be a match error or an optional element not + %% present + {[],XML,S#xsd_state{num_el=0}}; + _ -> + {error,{error_path(XMLEl,Name),?MODULE, + {element_not_suitable_with_schema,ElName,CMName,CMEl,S}}} + end; +check_element_type(XML,#schema_group{content=[CM]},Env,Block,S,Checked) -> + %% content may contain one of all | choice | sequence or empty + check_element_type(XML,CM,Env,Block,S,Checked); +check_element_type(XML,#schema_group{content=[]},_Env,_Block,_S,_Checked) -> + {error,{error_path(XML,undefined),?MODULE,{no_element_expected_in_group,XML}}}; +check_element_type(XML=[#xmlElement{content=_Content}|_Rest], + {sequence,{Els,Occ}},Env,_Block,S,Checked) -> + ?debug("calling sequence/6~n",[]), + case check_sequence(XML,Els,Occ,Env,S#xsd_state{num_el=0},Checked) of + Err = {error,_} -> + Err; + {ValidContent,Rest2,S2} -> + %% The sequence may consume more than one element + %%{ValidContent,Rest,acc_errs(S2,{sequence_unexpected_rest_objects,UnexpRest})} + {ValidContent,Rest2,S2} + end; +check_element_type(XML=[#xmlElement{}|_Rest], + {choice,{Els,Occ}},Env,_Block,S,Checked) -> + ?debug("calling choice/6~n",[]), + + case check_choice(XML,Els,Occ,Env,S#xsd_state{num_el=0},Checked) of + Err = {error,_} -> + Err; + {ValidContent,Rest2,S2} -> + %% The choice may consume more than one element + {ValidContent,Rest2,S2} + end; +check_element_type(XML=[E=#xmlElement{name=Name}|Rest], + Any={any,{Namespace,_Occ={Min,_},ProcessorContents}},Env, + _Block,S,_Checked) -> + ?debug("check any: {any,{~p,~p,~p}}~n",[Namespace,Occ,ProcessorContents]), + %% ProcessorContents any of lax | strict | skip + %% lax: may validate if schema is found + %% strict: must validate + ElName = mk_EII_QName(Name,E,S), + case cmp_any_namespace(ElName,Namespace,S) of + true -> + case ProcessorContents of + skip -> + {[E],Rest,S}; + lax -> + {[E],Rest,S}; +%% strict when Namespace==['##local'] -> + strict -> + case member(absent,Namespace) of + true -> + %% unqualified well-formed xml is required. The + %% xml is well-formed, check that it is + %% unqualified. + Traverse = + fun(#xmlElement{nsinfo=[], + attributes=Atts, + content=C}, + Sin,Fun) -> + Sin2 = Fun(Atts,Sin,Fun), + Fun(C,Sin2,Fun); + (#xmlAttribute{namespace=[]},Sin,_Fun) -> + Sin; + (#xmlText{},Sin,_Fun) -> Sin; + ([H|T],Sin,Fun) -> + Sin2 = Fun(H,Sin,Fun), + Fun(T,Sin2,Fun); + ([],Sin,_Fun) -> + Sin; + (El,Sin,_Fun) -> + Err = {error_path(E,Name),?MODULE, + {illegal_component_in_any,El}}, + acc_errs(Sin,Err) + end, + S2 = Traverse(E,S,Traverse), + {[E],Rest,S2}; + _ -> + {Result,S2}=check_any(E,Any,Env,S), + {[Result],Rest,S2} + end + end; + false when S#xsd_state.num_el >= Min -> + {[],XML,S}; + _ -> + {error,{error_path(E,Name),?MODULE,{element_bad_match,E,Any,Env}}} + end; +check_element_type([],CM,_Env,_Block,_S,Checked) -> + %% #schema_complex_type, any, #schema_group, anyType and lists are + %% catched above. + case CM of + {simpleType,_} -> + {error,{error_path(Checked,undefined),?MODULE, + {empty_content_not_allowed,CM}}}; + _ -> + {error,{error_path(Checked,undefined),?MODULE, + {empty_content_not_allowed,CM}}} + end; +check_element_type(XML,CM,_Env,_Block,S,_Checked) -> + {error,{error_path(XML,undefined),?MODULE,{match_failure,XML,CM,S}}}. +%% single xml content object and single schema object +check_text_type(XML=[#xmlText{}|_],optional_text,S) -> +% {XMLTxt,optional_text}; + {XMLText,Rest} = split_xmlText(XML), + {XMLText,Rest,S}; +check_text_type(XML=[Txt=#xmlText{}|_],Type={simpleType,_},S) -> + {XMLText,Rest} = split_xmlText(XML), + {NewVal,S2}=check_type(Type,flatten([X||#xmlText{value=X}<-XMLText]),unapplied,S), + {[Txt#xmlText{value=NewVal}],Rest,S2}; +check_text_type(XML=[Txt=#xmlText{}|_],Type=#schema_simple_type{},S) -> + {XMLText,Rest} = split_xmlText(XML), + {NewVal,S2}=check_type(Type,flatten([X||#xmlText{value=X}<-XMLText]),unapplied,S), + {[Txt#xmlText{value=NewVal}],Rest,S2}; +check_text_type([XMLTxt=#xmlText{}|_],CMEl,_S) -> + {error,{error_path(XMLTxt,undefined),?MODULE, + {cannot_contain_text,XMLTxt,CMEl}}}. + +split_xmlText(XML) -> + splitwith(fun(#xmlText{}) -> true;(_) -> false end,XML). + +%% Sequence +check_sequence([T=#xmlText{}|Rest],Els,Occ,Env,S,Checked) -> + check_sequence(Rest,Els,Occ,Env,S,[T|Checked]); +check_sequence(Seq=[_InstEl=#xmlElement{}|_],[El|Els],Occ={_Min,_Max},Env,S,Checked) -> + %% El any of (element | group | choice | sequence | any)* + + {ResolvedT,S2} = resolve(El,S), + case check_element_type(Seq,ResolvedT,Env,[],count_num_el(S2),[]) of + {[],_,S3} -> %% An optional element not present or maybe content == []. + case is_optional(El,S3) of + true -> + check_sequence(Seq,Els,Occ,Env,set_num_el(S3,0),Checked); + _ -> + {error,{error_path(Checked,undefined),?MODULE, + {missing_mandatory_elements,El}}} + end; + Err={error,_Reason} -> + case {is_optional(El,S),S#xsd_state.num_el,get_occur(El)} of + {true,_,_} -> + check_sequence(Seq,Els,Occ,Env,set_num_el(S,0),Checked); + {_,N,{_Min2,Max}} when N>=Max -> + check_sequence(Seq,Els,Occ,Env,set_num_el(S,0),Checked); + _ -> + Err + end; +%% {error,_Reason} when Min==0 -> %% optional element +%% {[],Seq,S}; %% {Checked,Seq,S} +%% {error,_Reason} when S#xsd_state.num_el >= Max -> +%% %% This failure because of number limit +%% {Checked,Seq,S}; +%% Err = {error,_Reason} -> +%% %% Even though this match failed +%% Err; + {Ret,UnValRest,S3} -> + %% must also take care of more elements of same name + %% decrease occurance in El for the optional measurements + %% when Seq is empty. + check_sequence(UnValRest,[decrease_occurance(El)|Els],Occ,Env, + count_num_el(set_num_el(S3,S2)), + Ret++Checked) + end; +check_sequence(Rest,[],_Occ,_Env,S,Checked) -> + {Checked,Rest,set_num_el(S,0)}; +check_sequence([],Els,_Occ,_Env,S,Checked) -> + case [X||X={_,Y={_,_}} <- Els,optional(Y)==false] of + [] -> + {Checked,[],set_num_el(S,0)}; + MandatoryEls -> + {error,{error_path(Checked,undefined),?MODULE, + {missing_mandatory_elements,MandatoryEls}}} + end. +%%check_sequence(Seq,[],_Occ,_Env,_S,_Checked) -> + %%{error,{unmatched_elements,Seq}}. + + +%% Choice one alternative must occur unless all alternatives are +%% optional or the entire choice is optional. +check_choice([T=#xmlText{}|Rest],Els,Occ,Env,S,Checked) -> + case is_whitespace(T) of + true -> + check_choice(Rest,Els,Occ,Env,S,[T|Checked]); + _ -> + {error,{error_path(T,undefined),?MODULE, + {choice_missmatch,T,Els}}} + end; +check_choice(Ch=[#xmlElement{}|_],[El|Els],Occ,Env,S,Checked) -> + {ResolvedT,S2} = resolve(El,S), + case check_element_type(Ch,ResolvedT,Env,[],count_num_el(S2),[]) of + {[],_,_S3} -> %% not matched optional element + check_choice(Ch,Els,Occ,Env,S2,Checked); + {error,_Reason} -> %% This may happen but not for the + %% last alternative element unless the + %% entire choice is optional. So, just + %% continue. + case [X||X=#xmlElement{}<-Checked] of + [] -> + check_choice(Ch,Els,Occ,Env,S2,Checked); + _ -> + {Checked,Ch,set_num_el(S,0)} + end; + {Result,UnValRest,S3} -> %% in this case only more elements of + %% El may be allowed + check_choice(UnValRest,[El],Occ,Env, + count_num_el(set_num_el(S3,S)),Result++Checked) + end; +check_choice([],_,_,_,S,Checked) -> + {Checked,[],set_num_el(S,0)}; +check_choice(XML,[],{0,_},_,S,Checked) -> + %% Choice is optional + {Checked,XML,set_num_el(S,0)}; +check_choice(XML,[],_,_,S,Checked) -> + %% Choice has already matched something, the rest is for somthing + %% else to match. + case S#xsd_state.num_el > 0 of + true -> + {Checked,XML,set_num_el(S,0)}; + _ -> + {error,{error_path(XML,undefined),?MODULE, + {no_element_matching_choice,XML}}} + end. + +check_all([T=#xmlText{}|RestXML],CM,Occ,Env,S,Checked,XML) -> + case is_whitespace(T) of + true -> + check_all(RestXML,CM,Occ,Env,S,[T|Checked],XML); + _ -> + {error,{error_path(T,undefined),?MODULE,{all_missmatch,T,CM}}} + end; +check_all(XML=[E=#xmlElement{name=Name}|RestXML],CM,Occ,Env,S, + Checked,PrevXML) -> + ElName = mk_EII_QName(Name,E,S), + case search_delete_all_el(ElName,CM,S) of + {CMEl={element,_},RestCM} -> + {ResolvedT,S2} = resolve(CMEl,S), + case check_element_type(XML,ResolvedT,Env,[],S2,[]) of + {[],_,_S3} -> + Err = {error_path(E,Name),?MODULE, + {validation_error_all,ElName,CM}}, + check_all(RestXML,CM,Occ,Env,acc_errs(S,Err), + Checked,PrevXML); + {error,_} when element(1,Occ)==0 -> + {[],PrevXML,S}; + {error,Reason} -> + check_all(RestXML,RestCM,Occ,Env, + acc_errs(S,Reason),[E|Checked],PrevXML); + {Result,UnValRest,S3} -> + check_all(UnValRest,RestCM,Occ,Env, + S3#xsd_state{scope=S#xsd_state.scope}, + Result++Checked,PrevXML) + end; + _ when element(1,Occ) == 0 -> + {[],PrevXML,S}; + _ -> + Err = {error_path(E,Name),?MODULE, + {element_not_in_all,ElName,E,CM}}, + check_all(RestXML,CM,Occ,Env,acc_errs(S,Err),[E|Checked],PrevXML) + end; +check_all(XML,[],_,_,S,Checked,_) -> + {Checked,XML,S}; +check_all([],CM,_Occ,_,S,Checked,_PrevXML) -> + case [X||X={_,Y={_,_}} <- CM,optional(Y)==false] of + [] -> + {Checked,[],set_num_el(S,0)}; + MandatoryEls -> + {error,{error_path(Checked,undefined),?MODULE, + {missing_mandatory_elements_in_all,MandatoryEls}}} + end. + +check_any(E,Any,_Env,S) -> + case catch validate_xml(E,S#xsd_state{scope=[]}) of + {[Result],[],S2} -> + {Result,S2#xsd_state{scope=S#xsd_state.scope}}; + {Result,[],S2} -> + {Result,S2#xsd_state{scope=S#xsd_state.scope}}; + {_,_Unvalidated,S2} -> + Err = {error_path(E,undefined),?MODULE,{failed_validating,E,Any}}, + {E,acc_errs(S2#xsd_state{scope=S#xsd_state.scope},Err)}; + {error,Reason} -> + {E,acc_errs(S,Reason)}; + {'EXIT',Reason} -> +%% {E,acc_errs(S,format_error({internal_error,Reason},E,Any,Env))} + Err = {error_path(E,undefined),?MODULE,{internal_error,Reason}}, + {E,acc_errs(S,Err)} + end. + +check_target_namespace(XMLEl,S) -> + case {S#xsd_state.targetNamespace,XMLEl#xmlElement.nsinfo} of + {undefined,[]} -> + ok; + {URI,{Prefix,_}} -> + NS = XMLEl#xmlElement.namespace, + case namespace(Prefix,NS,NS#xmlNamespace.default) of + URI -> + ok; + _ -> + failed + end; + {URI,_} -> + case (XMLEl#xmlElement.namespace)#xmlNamespace.default of + URI -> + ok; + _ -> + failed + end + end. + +schemaLocations(El=#xmlElement{attributes=Atts},S) -> + Pred = fun(#xmlAttribute{name=schemaLocation}) -> false; + (#xmlAttribute{namespace={_,"schemaLocation"}}) -> false; + (_) -> true + end, + case lists:dropwhile(Pred,Atts) of + [] -> + S; + [#xmlAttribute{value=Paths}|_] -> + case string:tokens(Paths," ") of + L when length(L) > 0 -> + case length(L) rem 2 of + 0 -> + PairList = + fun([],_Fun) -> + []; + ([SLNS,SLLoc|Rest],Fun) -> + [{SLNS,SLLoc}|Fun(Rest,Fun)] + end, + S#xsd_state{schemaLocations=PairList(L,PairList)}; + _ -> + Err = {error_path(El,El#xmlElement.name),?MODULE, + {schemaLocation_list_failure,Paths}}, + acc_errs(S,Err) + end; + _ -> + S + end; + _ -> + S + end. + +blocking([],BlockDefault) -> + BlockDefault; +blocking(Block,_) -> + Block. + +allow_empty_content([]) -> + true; +allow_empty_content([{restriction,{_BT,_CM=[]}}]) -> + true; +allow_empty_content([{extension,{_BT,_CM=[]}}]) -> + true; +allow_empty_content([{_,{_,{0,_}}}|Rest]) -> + allow_empty_content(Rest); +allow_empty_content([{_,{Content,_}}|Rest]) -> + case allow_empty_content(Content) of + true -> + allow_empty_content(Rest); + _ -> false + end; +allow_empty_content(_) -> + false. + +empty_xml_content([]) -> + true; +empty_xml_content([H|T]) -> + case is_whitespace(H) of + true -> + empty_xml_content(T); + _ -> + false + end; +empty_xml_content(_) -> + false. + +xsi_factors(#schema_element{nillable=N}) -> + [{nillable,N}]. +check_xsi_factors({nil,_,?XSD_INSTANCE_NAMESPACE}, + #xmlAttribute{value="true"},XsiFactors,XMLEl,S) -> + case key1search(nillable,XsiFactors,false) of + {_,true} -> + case empty_xml_content(XMLEl#xmlElement.content) of + true -> + S; + _ -> + Err = {error_path(XMLEl,XMLEl#xmlElement.name),?MODULE, + {element_content_not_nil,XMLEl}}, + acc_errs(S,Err) + end; + _ -> + S + end; +check_xsi_factors(_,_,_,_,S) -> + S. + +check_attributes(XMLEl=#xmlElement{attributes=Atts}, + #schema_complex_type{name=Name,attributes=SchemaAtts}, + XsiFactors,S) -> + %% For each att in Atts check that it is allowed, and has right type. + %% For each att in CT that is required check that it exists. Apply + %% none present atts that have default values. + OldScope = S#xsd_state.scope, + SchemaAtts2 = resolve_attributeGroups(SchemaAtts,XMLEl,S), + {XMLEl2,S2}=check_attributes(Atts,SchemaAtts2,XMLEl,XsiFactors, + name_scope(Name,S),[]), + {XMLEl2,S2#xsd_state{scope=OldScope}}; +check_attributes(XMLEl=#xmlElement{attributes=[]},_,_,S) -> + {XMLEl,S}; +check_attributes(XMLEl=#xmlElement{name=N,attributes=Atts},_,XsiFactors,S) -> + Fun = + fun(AttX,S_in) -> + case reserved_attribute(AttX,XMLEl#xmlElement.namespace) of + true -> + AttQName = + mk_EII_QName(AttX#xmlAttribute.name,XMLEl,S_in), + check_xsi_factors(AttQName,AttX,XsiFactors,XMLEl,S_in); + _ -> + Err = {error_path(XMLEl,N),?MODULE, + {attribute_in_simpleType,XMLEl,AttX}}, + acc_errs(S_in,Err) + end + end, + {XMLEl,foldl(Fun,S,Atts)}. + +check_attributes([],[SA|SchemaAtts],XMLEl,XsiFactors,S,CheckedAtts) -> + case resolve(SA,S) of + {#schema_attribute{name=Name,use=Use,default=Def,fixed=Fix},S2} -> + case {Use,Def,Fix} of + {required,_,_} -> + Err = {error_path(XMLEl,XMLEl#xmlElement.name),?MODULE, + {required_attribute_missed,XMLEl,Name}}, + check_attributes([],SchemaAtts,XMLEl,XsiFactors, + acc_errs(S2,Err),CheckedAtts); + {optional,undefined,undefined} -> + check_attributes([],SchemaAtts,XMLEl,XsiFactors, + S2,CheckedAtts); + {optional,Default,undefined} -> + NewAtt = create_attribute(Name,Default), + check_attributes([],SchemaAtts,XMLEl,XsiFactors,S2, + [NewAtt|CheckedAtts]); + {optional,undefined,Fix} -> + NewAtt = create_attribute(Name,Def), + check_attributes([],SchemaAtts,XMLEl,XsiFactors,S2, + [NewAtt|CheckedAtts]); + {optional,Default,Fix} -> + Err = {error_path(XMLEl,XMLEl#xmlElement.name),?MODULE, + {default_and_fixed_attributes_mutual_exclusive, + Name,Default,Fix}}, + check_attributes([],SchemaAtts,XMLEl,XsiFactors, + acc_errs(S2,Err),CheckedAtts); + _ -> + check_attributes([],SchemaAtts,XMLEl,XsiFactors, + S2,CheckedAtts) + end; + {{anyAttribute,{_Namespaces,_PC}},S2} -> + check_attributes([],SchemaAtts,XMLEl,XsiFactors, + S2,CheckedAtts); + Err -> + ErrMsg={error_path(XMLEl,XMLEl#xmlElement.name),?MODULE, + {schema_error,unexpected_object,SA,Err}}, + check_attributes([],SchemaAtts,XMLEl,XsiFactors, + acc_errs(S,ErrMsg),CheckedAtts) + end; +check_attributes([],[],XMLEl,_XsiFactors,S,CheckedAtts) -> + {XMLEl#xmlElement{attributes=reverse(CheckedAtts)},S}; +check_attributes([Att|Atts],SchemaAtts,XMLEl,XsiFactors, + S,CheckedAtts) -> +%% AttQName = mk_EII_QName(Att#xmlAttribute.name,XMLEl,S), + {IsQ,AttQName} = mk_EII_Att_QName(Att#xmlAttribute.name,XMLEl,S), + case search_attribute(IsQ,AttQName,SchemaAtts) of + {AttObj={attribute,_},SchemaAtts2} -> + {SA,S2} = load_object(AttObj,S), + #schema_attribute{type=[AttType]} = SA, + {Val,S4} = check_type(AttType, + Att#xmlAttribute.value, unapplied,S2), + check_attributes(Atts,SchemaAtts2,XMLEl,XsiFactors,S4, + [Att#xmlAttribute{value=Val}|CheckedAtts]); + {undefined,SchemaAtts2} -> + %% check for reserved attributes or anyAttribute + case reserved_attribute(Att,XMLEl#xmlElement.namespace) of + true -> + S2 = check_xsi_factors(AttQName,Att,XsiFactors,XMLEl,S), + check_attributes(Atts,SchemaAtts2,XMLEl,XsiFactors, + S2,[Att|CheckedAtts]); + _ -> + case check_anyAttribute(Att,SchemaAtts2,XMLEl,S) of + {error,Reason} -> + check_attributes(Atts,SchemaAtts2,XMLEl,XsiFactors, + acc_errs(S,Reason),CheckedAtts); + {Att2,S2} -> + check_attributes(Atts,SchemaAtts2,XMLEl,XsiFactors, + S2,[Att2|CheckedAtts]) + end + end; + Other -> + Err = {[],?MODULE,{internal_error,Other}}, + check_attributes(Atts,SchemaAtts,XMLEl,XsiFactors, + acc_errs(S,Err),CheckedAtts) + end. + +check_anyAttribute(Att,SchemaAtts,El=#xmlElement{name=Name,namespace=NS},S) -> + case [Any||Any={anyAttribute,_}<-SchemaAtts] of + [] -> + {error,{error_path(El,Name),?MODULE, + {attribute_not_defined_in_schema, + Att#xmlAttribute.name}}}; + [{_,{Namespace,PC}}|_] -> + case check_anyAttribute_namespace(Namespace,NS) of + ok -> + check_anyAttribute2(Namespace,PC,Att,NS,S); + _ -> + {error,{error_path(El,Name),?MODULE, + {disallowed_namespace,Namespace, + NS,Att#xmlAttribute.name}}} + end + end. +check_anyAttribute2(_,PC,Att,_,S) when PC==skip;PC==lax -> + {Att,S}; +check_anyAttribute2(_Namespace,_,Att,_NS,S) -> + %% PC == strict + {Att,S}. + +check_anyAttribute_namespace(['##any'|_],_NS) -> + ok; +check_anyAttribute_namespace([absent],_NS) -> + ok; +check_anyAttribute_namespace([NS|_],NS) -> + ok; +check_anyAttribute_namespace([{'not',NS}|_],NS) -> + false; +check_anyAttribute_namespace([_H|T],NS) -> + check_anyAttribute_namespace2(T,NS). +check_anyAttribute_namespace2([NS|_],NS) -> + ok; +check_anyAttribute_namespace2([_H|T],NS) -> + check_anyAttribute_namespace2(T,NS); +check_anyAttribute_namespace2([],_NS) -> + false. + +resolve_attributeGroups(SchemaAtts,El,S) -> + resolve_attributeGroups(SchemaAtts,El,S,[],[]). +resolve_attributeGroups([AG={attributeGroup,_}|SchemaAtts],El,S,Parents,Acc) -> + case resolve(AG,S) of + {#schema_attribute_group{name=Name,content=AGC},_S2} -> + case {member(Name,Parents),S#xsd_state.redefine} of + {true,false} -> + Err = {error_path(El,El#xmlElement.name),?MODULE, + {cirkular_attributeGroup_reference,Name}}, + resolve_attributeGroups(SchemaAtts,El,acc_errs(S,Err), + Parents,Acc); + {true,_} -> + resolve_attributeGroups(SchemaAtts,El,S,Parents,Acc); + _ -> + resolve_attributeGroups(AGC++[marker|SchemaAtts], + El,S,[Name|Parents],Acc) + end; + Err -> + ErrMsg={error_path(El,El#xmlElement.name),?MODULE, + {schema_error,unexpected_object,AG,Err}}, + resolve_attributeGroups(SchemaAtts,El,acc_errs(S,ErrMsg), + Parents,Acc) + end; +resolve_attributeGroups([marker|T],El,S,[_P|Ps],Acc) -> + resolve_attributeGroups(T,El,S,Ps,Acc); +resolve_attributeGroups([H|T],El,S,Parents,Acc) -> + resolve_attributeGroups(T,El,S,Parents,[H|Acc]); +resolve_attributeGroups([],_,_,_,Acc) -> + Acc. + +check_type(Type=#schema_simple_type{},Value,FacetS,S) -> + check_simpleType(Type,Value,FacetS,S); +check_type({simpleType,{anySimpleType,_}},Value, _FacetS,S) -> + {Value,S}; +check_type({union,Types},Value,_FacetS,S) -> + check_union_types(Types,Value,S); +check_type(ST={simpleType,QName={Name,_Scope,_NS}},Value, FacetS,S) -> + case is_builtin_simple_type(QName) of + true -> + {ConstrainedValue,S2} = + constrained(QName,default_facets(FacetS,Name),Value,S), + case xmerl_xsd_type:check_simpleType(Name,ConstrainedValue,S2) of + {ok,_} when Name=='IDREF';Name=='IDREFS' -> + %% do something more + {ConstrainedValue,S2}; + {ok,_} -> + {ConstrainedValue,S2}; + {error,Reason} -> + ?debug("Error validating type: ~p~nwith value: ~p~n",[Name,Value]), + {Value,acc_errs(S2,Reason)} + end; + _ -> + case resolve(ST,S) of + {[],S2} -> + Err = {[],?MODULE,{could_not_resolve_type,ST}}, + {Value,acc_errs(S2,Err)}; + {RefedST,S2} -> + check_type(RefedST,Value, unapplied,S2) + end + end; +check_type(Type,Value, _FacetS,S) -> + Err = {[],?MODULE,{could_not_check_value_for_type,Type}}, + ?debug("ERROR: not implemented: ~p~nfor value: ~p~n",[Type,Value]), + {Value,acc_errs(S,Err)}. + +check_simpleType(#schema_simple_type{base_type=BT,final=_Final, + facets=Facets,content=Type}, + Value,FacetS,S) -> + case {BT,Type} of + {{_ST,_,_},_} -> + case is_builtin_simple_type(BT) of + true -> + {ConstrainedValue,S2} = + constrained(BT,merge_facets(default_facets(FacetS,BT),Facets),Value,S), + {_,_S3} = check_type({simpleType,BT},ConstrainedValue,applied,S2); + _ -> + case resolve({simpleType,BT},S) of + {BaseST=#schema_simple_type{facets=Facets2},_} -> + check_simpleType(BaseST#schema_simple_type{facets=Facets++Facets2},Value,unapplied,S); + _ -> + Err = {[],?MODULE,{unknown_simpleType,BT}}, + {Value,acc_errs(S,Err)} + end + end; + {_,[CT]} -> + {_,_S2} = check_type(CT,Value,unapplied,S) + end. + +check_union_types(Types,Value,S) -> + check_union_types(Types,Types,Value,S). +check_union_types([],UT,Value,S) -> + acc_errs(S,{[],?MODULE,{value_not_valid,Value,UT}}); +check_union_types([T|Ts],UT,Value,S = #xsd_state{errors=Errs}) -> + case check_type(T,Value,unapplied,S) of + {Val,S2=#xsd_state{errors=Errs}} -> + {Val,S2}; + {_,_} -> + check_union_types(Ts,UT,Value,S) + end. + +reserved_attribute({RA,_,?XSD_INSTANCE_NAMESPACE},_) + when RA==type;RA==nil;RA==schemaLocation;RA==noNamespaceSchemaLocation -> + true; +reserved_attribute(#xmlAttribute{name=Name},#xmlNamespace{nodes=NSNodes}) -> + NameStr = if + is_atom(Name) -> atom_to_list(Name); + true -> Name + end, + case string:tokens(NameStr,":") of + ["xmlns"|_] -> + true; + [Prefix,InstAtt] when InstAtt=="type"; + InstAtt=="nil"; + InstAtt=="schemaLocation"; + InstAtt=="noNamespaceSchemaLocation" -> + case keyNsearch(?XSD_INSTANCE_NAMESPACE,2,NSNodes,[]) of + {Prefix,_} -> + true; + _ -> + false + end; + _ -> + false + end; +reserved_attribute(_,_) -> + false. + + +default_facets(applied,_) -> + []; +default_facets(_,Type) -> + default_facets(Type). +default_facets({Name,_,_}) when is_list(Name) -> + %% Type already proven to be a built in simple type + default_facets(list_to_atom(Name)); +default_facets({Name,_,_}) -> + default_facets(Name); +default_facets(TypeName) -> + case is_xsd_string(TypeName) of + false -> + [{whiteSpace,"collapse"}]; + _ -> + [] + end. + +merge_facets([],DefinedF) -> + DefinedF; +merge_facets([F={Name,_}|Rest],DefinedF) -> + %% At this moment only F has the allowed value + merge_facets(Rest,keyreplace(Name,1,DefinedF,F)). + +constrained({T,_,_},Facets,Value,S) -> + FacetFuns = [facet_fun(T,F)||F<-Facets], + constrained2(FacetFuns,Value,S). +constrained2([],Value,S) -> + {Value,S}; +constrained2([Facet|RestFacets],Value,S) -> + case Facet(Value) of + {error,Reason} -> + constrained2(RestFacets,Value,acc_errs(S,Reason)); + {ok,NewValue} -> + constrained2(RestFacets,NewValue,S) + end. + +id_constraints(CMEl,XMLEl,S) -> + S1 = check_uniqueness(CMEl#schema_element.uniqueness, + XMLEl,S), + S2 = check_keys([X||{key,X}<-CMEl#schema_element.key],XMLEl,S1), + prepare_keyrefs([X||{keyref,X}<-CMEl#schema_element.key],XMLEl,S2). + +check_abstract(ElName,El,#schema_element{name=ElName,abstract=true},S) -> + acc_errs(S,{error_path(El,El#xmlElement.name),?MODULE, + {abstract_element_instance,ElName}}); +check_abstract(ElName,_El,#schema_element{name=ElName},S) -> + S; +check_abstract(ElName,El,#schema_element{},S) -> + {XMLEl,_S2} = load_object({element,ElName},S), + check_abstract(ElName,El,XMLEl,S). + +%% Check of form compliance. +%% Globally declared elements may be qualified even though +%% elementformdefault = "unqualified". +%% If ActualFormValue = "qualified" locally defined names must be +%% explicitly or implicitly qualified. +%% check_form({LocalName,Scope,Namespace},LocalName, +%% InstanceNamespace,ActualFormDefault,S) -> NewS +check_form({LocalName,_,Namespace},LocalName, + El=#xmlElement{name=Name,namespace=NS},qualified,S) -> + case NS#xmlNamespace.default of + Namespace -> + S; + _ -> + acc_errs(S,{error_path(El,Name),?MODULE, + {qualified_name_required,LocalName}}) + end; +check_form({LocalName,_,_},LocalName,_El,_ActualFormDefault,S) -> + S; +check_form({_LocalName,[],_},_QualifiedName,_El,_ActualFormDefault,S) -> + S; +check_form({_LocalName,_,_},QualifiedName,El,unqualified,S) -> + acc_errs(S,{error_path(El,El#xmlElement.name),?MODULE, + {unqualified_name_required,QualifiedName}}); +check_form({_LocalName,_,_},_QualifiedName,_El,_ActualFormDefault,S) -> + S. + +actual_form_value(undefined,GlobalForm) -> + GlobalForm; +actual_form_value(LocalForm,_) -> + LocalForm. + + +check_uniqueness(undefined,_,S) -> + S; +check_uniqueness(Unique,XMLEl,S) -> + case Unique of + [{unique,#id_constraint{selector={selector,SelectorPath}, + fields=Fields}}] -> + TargetNodeSet = target_node_set(SelectorPath,XMLEl,S), + case qualified_node_set(Fields,TargetNodeSet,XMLEl,S) of + {[],S1} -> S1; + {[_E],S1} -> S1; + {L,S1} when is_list(L) -> + key_sequence_uniqueness(L,XMLEl,S1) + end; + _ -> S + end. + +target_node_set(SelectorPath,XMLEl,S) -> + xmerl_xpath:string(SelectorPath,XMLEl, + [{namespace,S#xsd_state.namespace_nodes}]). + +qualified_node_set(Fields,Set,El,S) -> + qualified_node_set([X||{field,X} <- Fields],Set,El,S,[]). + +qualified_node_set([],_Set,_El,S,Acc) -> + {Acc,S}; +qualified_node_set(_,[],_El,S,Acc) -> + {Acc,S}; +qualified_node_set(Paths,[QN|QNs],El,S,Acc) -> + Fun = fun(P,Sx) -> + case apply_field(P,QN,Sx) of + L when length(L) =< 1 -> % Part1:3.11.4.3 + {L,Sx}; + Err -> + RetErr = + {error_path(El,El#xmlElement.name),?MODULE, + {illegal_key_sequence_value,Err}}, + {[],acc_errs(Sx,RetErr)} + end + end, + {KeySequence,S2} = mapfoldl(Fun,S,Paths), + qualified_node_set(Paths,QNs,El,S2,[flatten(KeySequence)|Acc]). + + +apply_field(F,El,S) -> + %% xmerl_xpath:string returns a list + xmerl_xpath:string(F,El,[{namespace,S#xsd_state.namespace_nodes}]). + +check_keys([],_XMLEl,S) -> + S; +check_keys([Key=#id_constraint{selector={selector,SelectorPath}, + fields=Fields}|Keys],XMLEl,S) -> + TargetNodeSet = target_node_set(SelectorPath,XMLEl,S), + S3= + case qualified_node_set(Fields,TargetNodeSet,XMLEl,S) of + {L,S1} when length(L)==length(TargetNodeSet) -> + %% Part1: 3.11.4.4.2.1 + S2 = key_sequence_uniqueness(L,XMLEl,S1), + save_key(Key#id_constraint{key_sequence=L},S2), + S2; + {Err,S1} -> + acc_errs(S1,{error_path(XMLEl,XMLEl#xmlElement.name),?MODULE, + {qualified_node_set_not_correct_for_key,Err}}) + end, + check_keys(Keys,XMLEl,S3). + +%% A reference to a key may occur in another environment than the key +%% was defined. Thus the key must be referenced after the whole +%% document has been processed. At this moment save the info about the +%% keyref and compare it with the key later. +prepare_keyrefs([],_XMLEl,S) -> + S; +prepare_keyrefs([KeyRef=#id_constraint{selector={selector,SelectorPath}, + fields=Fields}|Rest],XMLEl,S) -> + TargetNodeSet = target_node_set(SelectorPath,XMLEl,S), + {L,S1} = qualified_node_set(Fields,TargetNodeSet,XMLEl,S), + save_keyref(KeyRef#id_constraint{key_sequence=L},S1), + prepare_keyrefs(Rest,XMLEl,S1). + + + +%% key_sequence_uniqueness(KeySequence,XMLElement,State) +%% Each element in KeySequence has same length and is a list of one or +%% more elements. key_sequence_uniqueness/2 checks that no two +%% elements has equal values. If it detects two (or more) elements +%% that have equal first subelements it must continue comparing the +%% other subelements of those elements. It returns the state with all +%% detected errors saved. +key_sequence_uniqueness([],_,S) -> + S; +key_sequence_uniqueness([_H],_,S) -> + S; +key_sequence_uniqueness([KS=[F1|FRest]|KSs],El,S) -> + case is_key_sequence_equal(F1,KSs) of + {true,TailOfEquals} -> + S1 = + case k_s_u(FRest,TailOfEquals,S) of + true -> + acc_errs(S,{error_path(El,El#xmlElement.name),?MODULE, + {key_value_not_unique,KS}}); + _ -> + S + end, + key_sequence_uniqueness(KSs,El,S1); + false -> + key_sequence_uniqueness(KSs,El,S) + end. + +k_s_u([],_,_) -> + true; +k_s_u([F|Fs],KSs,S) -> + case is_key_sequence_equal(F,KSs) of + {true,TailOfEquals} -> + k_s_u(Fs,TailOfEquals,S); + _ -> + false + end. + +is_key_sequence_equal(F,KSs) -> + is_key_sequence_equal(F,KSs,[]). +is_key_sequence_equal(_F,[],[]) -> + false; +is_key_sequence_equal(_F,[],Acc) -> + {true,reverse(Acc)}; +is_key_sequence_equal(F,[[F1|TlF1]|Rest],Acc) -> + case is_key_el_equal(F,F1) of + true -> + is_key_sequence_equal(F,Rest,[TlF1|Acc]); + false -> + is_key_sequence_equal(F,Rest,Acc) + end. + +%% This test must be more elaborated considering the equal facet +is_key_el_equal(#xmlElement{content=C1},#xmlElement{content=C2}) -> + %% content must be empty or text since elements must be of + %% simpleType + is_equal_content(C1,C2); +is_key_el_equal(#xmlAttribute{value=V1},#xmlAttribute{value=V1}) -> + true; +is_key_el_equal(_,_) -> + false. + +is_equal_content([T1|Rest1],[T2|Rest2]) + when is_record(T1,xmlText),is_record(T2,xmlText) -> + case is_whitespace(T1) of + true -> + case is_whitespace(T2) of + true -> + is_equal_content(Rest1,Rest2); + _ -> + is_equal_content(Rest1,[T2|Rest2]) + end; + _ -> + case T1#xmlText.value==T2#xmlText.value of + true -> + is_equal_content(Rest1,Rest2); + _ -> + false + end + end; +is_equal_content([],[]) -> + true; +is_equal_content(_,_) -> + false. + +schema_concistence_checks(S) -> + S2 = check_keyrefs(S), + S3 = check_references(S2), + S4 = check_substitutionGroups(S3#xsd_state.substitutionGroups,S3), + S5 = check_cyclic_defs(S4), + reset_state(S5). + +reset_state(S) -> + S#xsd_state{keyrefs=[], + 'IDs'=[], + unchecked_references=[], + substitutionGroups=[], + derived_types=[], + circularity_stack=[], + circularity_disallowed=[]}. + +check_keyrefs(S) -> + KeyRefs = S#xsd_state.keyrefs, + %% check that a key exists with same name as each keyref + KeyExist = + fun({keyref,Name,Refer},S_in) -> + case load_key(Refer,S_in) of + Key=#id_constraint{} -> + check_keyref_cardinality(Name, + load_keyref(Name,S_in), + Key,S_in); +% S_in; + _ -> + acc_errs(S_in,{[],?MODULE, + {keyref_missed_matching_key,Refer}}) + end; + (Other,S_in) -> + acc_errs(S_in,{[],?MODULE, + {keyref_unexpected_object,Other}}) + end, + foldl(KeyExist, S, KeyRefs). +check_keyref_cardinality(_,KR=#id_constraint{category=keyref,fields=KeyRefFs}, + K=#id_constraint{fields=KeyFs},S) -> + case length(KeyRefFs) == length(KeyFs) of + true -> + S; + _ -> + acc_errs(S,{[],?MODULE, + {cardinality_of_fields_not_equal,KR,K}}) + end; +check_keyref_cardinality(Name,_,_,S) -> + acc_errs(S,{[],?MODULE,{could_not_load_keyref,Name}}). + +check_references(S) when is_record(S,xsd_state) -> + check_references(S#xsd_state.unchecked_references,S). +check_references([],S) -> + S; +check_references([H|T],S) -> + check_references(T,check_reference(H,S)). +check_reference(Ref={attribute,_},S) -> + case load_object(Ref,S) of + {#schema_attribute{},S2} -> + S2; + _ -> + acc_errs(S,{[],?MODULE,{reference_undeclared,attribute,Ref}}) + end; +check_reference(Ref={element,_},S) -> + case load_object(Ref,S) of + {#schema_element{},S2} -> + S2; + _ -> + acc_errs(S,{[],?MODULE,{reference_undeclared,element,Ref}}) + end; +check_reference(Ref={attributeGroup,_},S) -> + case load_object(Ref,S) of + {#schema_attribute_group{},S2} -> + S2; + _ -> + acc_errs(S,{[],?MODULE,{reference_undeclared,attributeGroup,Ref}}) + end; +check_reference(Ref={group,_},S) -> + case load_object(Ref,S) of + {#schema_group{},S2} -> S2; + _ -> acc_errs(S,{[],?MODULE,{reference_undeclared,group,Ref}}) + end; +check_reference(Ref={simpleType,_},S) -> + case load_object(Ref,S) of + {#schema_simple_type{},S2} -> S2; + _ -> acc_errs(S,{[],?MODULE,{reference_undeclared,simpleType,Ref}}) + end; +check_reference(Ref={complexType,_},S) -> + case load_object(Ref,S) of + {#schema_complex_type{},S2} -> S2; + _ -> acc_errs(S,{[],?MODULE,{reference_undeclared,complexType,Ref}}) + end; +check_reference({simple_or_complex_Type,Ref},S=#xsd_state{errors=Errs}) -> + %% complex or simple type + case check_reference({complexType,Ref},S) of + S2=#xsd_state{errors=Errs} -> S2; + _ -> check_reference({simpleType,Ref},S) + end; +check_reference(Ref,S) -> + acc_errs(S,{[],?MODULE,{internal_error,unknown_reference,Ref}}). + +%% Substitution groups should be checked for cirkular references +%% (invalid), that reference structure and type structure are +%% concistent. +check_substitutionGroups([],S) -> + S; +check_substitutionGroups(SGs,S) -> + S2 = check_substGr_acyclic(SGs,S), + S3 = check_substGr_type_structure(SGs,S2), + save_substitutionGroup(SGs,S3). +check_substGr_acyclic(SGs,S) -> + Set = sofs:family(SGs), + case catch sofs:family_to_digraph(Set, [acyclic]) of + {'EXIT',{cyclic,_}} -> + acc_errs(S,{[],?MODULE,{cyclic_substitutionGroup,SGs}}); + DG -> + digraph:delete(DG), + S + end. +check_substGr_type_structure([SG|SGs],S) -> + check_substGr_type_structure(SGs,check_substGr_type_structure2(SG,S)); +check_substGr_type_structure([],S) -> + S. +check_substGr_type_structure2({Head,SGMembers},S) -> + TypeCheck = + fun(SG,S_in) -> + case catch cmp_substGr_types(Head,SG,S_in) of + {'EXIT',_} -> + acc_errs(S_in,{[],?MODULE, + {substitutionGroup_error,Head,SG}}); + S_out -> S_out + end + end, + foldl(TypeCheck,S,SGMembers). +cmp_substGr_types(Head,SG,S) -> + {HeadElement,S2} = load_object({element,Head},S), + {MemberElement,S3} = load_object({element,SG},S2), + case catch derived_or_equal(MemberElement#schema_element.type, + HeadElement#schema_element.type, + [],S3) of + S4=#xsd_state{} -> + S4; + _ -> + acc_errs(S3,{[],?MODULE,{internal_error,derived_or_equal, + MemberElement#schema_element.type, + HeadElement#schema_element.type}}) + end. +check_cyclic_defs(S=#xsd_state{circularity_disallowed=CA}) -> + Set = sofs:relation_to_family(sofs:relation(CA)), + case catch sofs:family_to_digraph(Set, [acyclic]) of + {'EXIT',{cyclic,_}} -> + acc_errs(S,{[],?MODULE,{cyclic_definition,CA}}); + DG -> + digraph:delete(DG), + S + end. + + + +derived_or_equal(Type,Type,_Block,S) -> + S; +derived_or_equal([MemberTypeRef],[HeadTypeRef],Block,S) -> + %% HeadType has to be a + {HeadType,_} = resolve(HeadTypeRef,S), + {MemberType,_} = resolve(MemberTypeRef,S), + derived_or_equal_types(MemberType,HeadType,schema,Block,S). +derived_or_equal_types(MemT,{anyType,_},Env,Block,S) -> + case MemT of + #schema_simple_type{content=Cntnt} -> + is_derivation_blocked(Env,Block,Cntnt,S); + #schema_complex_type{content=Cntnt} -> + is_derivation_blocked(Env,Block,Cntnt,S); + _ -> S + end; +derived_or_equal_types(MemT=#schema_simple_type{name=Mem,base_type=MemBase}, + #schema_simple_type{name=Head},Env,Block,S) + when Mem==Head;MemBase==Head -> + is_derivation_blocked(Env,Block,MemT#schema_simple_type.content,S); +derived_or_equal_types({simpleType,Name}, + {simpleType,Name},_Env,_Block,S) -> + S; +derived_or_equal_types(#schema_simple_type{base_type=Name,content=Content}, + {simpleType,Name},Env,Block,S) -> + is_derivation_blocked(Env,Block,Content,S); +derived_or_equal_types(#schema_simple_type{content=[{LoU,[Content]}]}, + SimpleType,Env,Block,S) when LoU==list;LoU==union -> + {NewMemType,S2}=resolve(Content,S), + derived_or_equal_types(NewMemType,SimpleType,Env,Block,S2); +derived_or_equal_types(MemT=#schema_complex_type{name=Mem,base_type=MemBase}, + #schema_complex_type{name=Head},Env,Block,S) + when Mem==Head;MemBase==Head -> + is_derivation_blocked(Env,Block,MemT#schema_complex_type.content,S); +derived_or_equal_types(MemT,HeadT,_Env,_Block,S) -> + acc_errs(S,{[],?MODULE,{type_of_element_not_derived,MemT,HeadT}}). + +is_derivation_blocked(schema,_,_,S) -> + S; +is_derivation_blocked(instance,['#all'],Derivation,S) -> + acc_errs(S,{derivation_blocked,'#all',Derivation}); +is_derivation_blocked(instance,[],_,S) -> + S; +is_derivation_blocked(instance,Block,C=[{Derivation,_}],S) -> + case member(Derivation,Block) of + true -> + acc_errs(S,{[],?MODULE,{derivation_blocked,Derivation,C}}); + _ -> + S + end; +is_derivation_blocked(instance,_Block,_,S) -> + S. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +create_attribute(QName,Value) -> + {Name,_Scope,NSName} = QName, + #xmlAttribute{name=Name,namespace={Name,NSName},value=Value}. + +%% mk_name(L), L must be a list in reversed order +mk_name(L) -> + mk_name(L,[]). +mk_name([],_Acc) -> + []; +mk_name([H],[]) -> + H; +mk_name([H],Acc) -> + list_to_atom(lists:concat([H,'_'|Acc])); +mk_name([H|T],[]) -> + mk_name(T,[H]); +mk_name([H1|T],Acc) -> + mk_name(T,[H1,'_'|Acc]). + +cmp_name({LName,Scope,NS},{LName,Scope,NS},_S) -> + true; +%% substitutionGroup allows different names +cmp_name(XMLName={_,Scope,NS},CMName={_,Scope,NS},S) -> + {El,_S2} = load_object({element,XMLName},S), + cmp_SG_name(El,CMName,S); +cmp_name(_,_,_) -> + false. + +cmp_SG_name(#schema_element{substitutionGroup=Name},Name,_S) -> + true; +cmp_SG_name(#schema_element{substitutionGroup=SGName},CMName,S) -> + cmp_name(SGName,CMName,S); +cmp_SG_name(_,_,_) -> + false. + +%% Namespace: [{not,NS} | NS] +%% +cmp_any_namespace({_,_,EIINS},Namespace,_S) -> + case member(EIINS,Namespace) of + true -> + true; + _ -> + case keysearch(EIINS,2,Namespace) of + {value,{'not',EIINS}} -> + false; + _ -> + true + end + end. + +at_least_one({_Min,Max}) when Max > 0 -> + true; +at_least_one(_) -> + false. + +is_optional({element,{_,{0,_}}},_S) -> + true; +is_optional({any,{_,{0,_},_}},_S) -> + true; +is_optional({MG,{_CM,{0,_}}},_S) + when MG =:= all; MG =:= sequence; MG =:= choice -> + true; +is_optional({MG,{CM,_Occ}},S) + when MG =:= all; MG =:= sequence; MG =:= choice -> + case member(false,[is_optional(Y,S)||Y<-CM]) of + true -> + false; + _ -> true + end; +is_optional({group,{_,{0,_}}},_S) -> + true; +is_optional(G={group,_},S) -> + case resolve(G,S) of + {#schema_group{content=[]},_} -> + true; + {#schema_group{content=[CM]},_} -> + is_optional(CM,S) + end; +is_optional(_,_) -> + false. + + + +acc_errs(S=#xsd_state{errors=Errs},ErrMsg) -> + S#xsd_state{errors=[ErrMsg|Errs]}. + +%% invoked with an element/XML-node and a name of the +error_path([H|_T],Top) when H==#xmlElement{};H==#xmlText{} -> + error_path(H,Top); +error_path([_H|T],Top) -> + error_path(T,Top); +error_path(#xmlElement{parents=Ps,pos=Pos},Top) -> + error_path(Ps,Pos,Top); +error_path(#xmlAttribute{parents=Ps,pos=Pos},Top) -> + error_path(Ps,Pos,Top); +error_path(#xmlText{parents=Ps,pos=Pos},Top) -> + error_path(Ps,Pos,Top); +error_path(_,_) -> + []. +error_path([],Pos,Top) when is_integer(Pos) -> + mk_xpath_path([{Top,Pos}]); +error_path([],_,Top) -> + Top; +error_path(Nodes,_,_) -> + mk_xpath_path(Nodes). + +mk_xpath_path(Nodes) -> + Slash = + fun([H1,H2|T],Fun,Acc) -> Fun([H2|T],Fun,["/",H1|Acc]); + ([H1],_,Acc) -> [H1|Acc]; + ([],_,Acc) -> Acc + end, + flatten(Slash([lists:concat([A,"[",B,"]"])||{A,B}<-Nodes],Slash,[])). + +resolve(XSDType,InstanceEl,S) -> + explicit_type(XSDType,InstanceEl,S). + +resolve([H],S) -> + resolve(H,S); +resolve(Any={any,_},S) -> + {Any,S}; +resolve(Any={anyAttribute,_},S) -> + {Any,S}; +resolve(Any={anyType,_},S) -> + {Any,S}; +resolve(Seq={sequence,_},S) -> + {Seq,S}; +resolve(Choice={choice,_},S) -> + {Choice,S}; +resolve({simple_or_complex_Type,QN},S) -> +%% case load_object({simpleType,QN},S) of + case resolve({simpleType,QN},S) of + Res={#schema_simple_type{},_S1} -> + Res; + {[],_S} -> + case load_object({complexType,QN},S) of + {[],_} -> + ?debug("could not load object ~p~n", + [{simple_or_complex_Type,QN}]), + {[],S}; + T -> + T + end; + T -> + T + end; +resolve({complexType,{anyType,_,_}},S) -> + {{anyType,[]},S}; +resolve({simpleType,{anyType,_,_}},S) -> + {{anyType,[]},S}; +resolve(ST={simpleType,NameNS={_,_,_}},S) -> + case load_object(ST,S) of + {[],_S} -> case is_builtin_simple_type(NameNS) of + true -> + {ST,S}; + _ -> + {[],S} + end; + Obj -> + %resolve(Obj,S) + Obj + end; +resolve({substitutionGroup,QName},S) -> + %% This shall resolve to the type of the element QName + case load_object({element,QName},S) of + Ret = {[],_S} -> Ret; + {#schema_element{type=[Type]},S2} -> + case Type of + {simple_or_complex_Type,_} -> + resolve(Type,S2); + _ -> + {Type,S2} + end; + {#schema_element{type=Type},S2} -> + {Type,S2} + end; +resolve({extension,{BaseType,CM}},S) -> + case is_builtin_simple_type(BaseType) of + true -> + {{simpleType,BaseType},S}; + _ -> + case resolve({simple_or_complex_Type,BaseType},S) of + {ST = #schema_simple_type{},_} -> + {ST,S}; %% any attributes in CM are already + %% propagated to the outer complex type. + {CT = #schema_complex_type{content=C},_} -> + {NewC,S2} = extend_type(C,CM,S), + {CT#schema_complex_type{content=NewC},S2}; + T -> T + end + end; +resolve({restriction,{BaseType,CM}},S) -> + case is_builtin_simple_type(BaseType) of + true -> + {{simpleType,BaseType},S}; + _ -> + case resolve({simple_or_complex_Type,BaseType},S) of + {ST = #schema_simple_type{content=C},_} -> + {NewContent,S2} = restrict_simple_type(C,CM,BaseType,S), + {ST#schema_simple_type{content=NewContent},S2}; + %% the outer complex type. + {CT = #schema_complex_type{content=C},_} -> + {NewContent,S2} = restrict_type(C,CM,BaseType,S), + {CT#schema_complex_type{content=NewContent},S2}; + T -> T + end + end; +resolve(optional_text,S) -> + {optional_text,S}; +resolve(E,S) -> + ?debug("resolve(~p, S)~n",[E]), + load_object(E,S). + +%% explicit_type checks whether the instance element is of an explicit +%% type pointed out by xsi:type. A type refernced by xsi:type must be +%% the same as, or derived from the instance element's type. Concluded +%% from 3.4.6 section "Schema Component Constraint: Type Derivation OK +%% (Complex)". +explicit_type(XSDType,InstanceEl=#xmlElement{namespace=NS,attributes=Atts},S) -> + case get_instance_type(NS,Atts) of + false -> + resolve(XSDType,S); + {ok,Name} -> + %% Create a {name,scope,namespace}, what is scope? + %% assume scope always is at top for the referenced type. + QName = mk_EII_QName(Name,InstanceEl,S#xsd_state{scope=[]}), + %% The type referenced by "xsi:type" attribute must be a + %% legal substitution for InstanceEl: "xsi:type" is the + %% same as or a derivation from InstanceEl's type. + + {XsiType,S2} = resolve({simple_or_complex_Type,QName},S), + {_Blocks,S3} = legal_substitution(InstanceEl,XsiType,S2), +%% {ResXSDType,S4} = resolve(XSDType,S3), + {XsiType,S3} +%% merge_derived_types(ResXSDType,XsiType,Blocks,xsitype,S4) + end. + +get_instance_type(#xmlNamespace{nodes=Nodes},Atts) -> + case keyNsearch(?XSD_INSTANCE_NAMESPACE,2,Nodes,[]) of + {Prefix,_} -> + TypeAtt = list_to_atom(Prefix++":type"), + case keyNsearch(TypeAtt,#xmlAttribute.name,Atts,[]) of + #xmlAttribute{value=Value} -> + {ok,Value}; + _ -> false + end; + _ -> + false + end. + +merge_derived_types(Type1,Type2,Mode,S) -> + merge_derived_types(Type1,Type2,[],Mode,S). +merge_derived_types(Type,Type,_Blocks,_Mode,S) -> + {Type,S}; +merge_derived_types(XSDType,InstType,Blocks,Mode,S) -> + case catch merge_derived_types2(XSDType,InstType,Blocks,Mode,S) of + {'EXIT',Reason} -> + {InstType,acc_errs(S,{[],?MODULE,{internal_error,merge_derived_types,Reason}})}; + {error,S2} -> + {InstType,S2}; + {MergedType,S2} -> + save_merged_type(MergedType,S2), + {MergedType,S2} + end. + +merge_derived_types2(XSDType=#schema_complex_type{}, + InstType=#schema_complex_type{},Blocks,Mode,S) -> + %% InstType is the type of the instance element that may reference + %% a type that is an extension/restriction of the XSDType. + %% Alternatively XSDType is the base type and InstType the derived + %% type or XSDType is the original type that is redefined into + %% InstType. + %% + %% complexType can turn into: + %% simpleContent | complexContent + %% simpleContent -> restriction + %% complexContent -> restriction | extension + %% of course also one of: + %% ((group | all | choice | sequence)?, + %% ((attribute | attributeGroup)*,anyAttribute?)))) + %% but then it shouldn't be any difference between XSDType + %% and InstType + case InstType#schema_complex_type.content of + [{extension,{BaseTypeName,CM}}] -> + {ExtendedAtts,S2} = + extend_attributes(XSDType#schema_complex_type.attributes, + InstType#schema_complex_type.attributes, + BaseTypeName,CM,Mode, + allowed_derivation(extension,Blocks,S)), + case compare_base_types(BaseTypeName,XSDType,S2) of + ok -> + {NewContent,S3} = + extend_type(XSDType#schema_complex_type.content,CM,S2), + {InstType#schema_complex_type{attributes=ExtendedAtts, + content=NewContent},S3}; + Err -> + {error,acc_errs(S2,Err)} + end; + [{restriction,{BaseTypeName,CM}}] -> + {RestrictedAtts,S2} = + restrict_attributes(XSDType#schema_complex_type.attributes, + InstType#schema_complex_type.attributes, + allowed_derivation(restriction,Blocks,S)), + case compare_base_types(BaseTypeName,XSDType,S2) of + ok -> + {NewContent,S3}= + case InstType#schema_complex_type.complexity of + simple -> + restrict_simple_type(XSDType#schema_complex_type.content,CM,BaseTypeName,S2); + _ -> + restrict_type(XSDType#schema_complex_type.content,CM,BaseTypeName,S2) + end, + {InstType#schema_complex_type{attributes=RestrictedAtts, + content=NewContent},S3}; + Err -> + {error,acc_errs(S,Err)} + end; + Other -> + {error,acc_errs(S,{[],?MODULE,{unexpected_type,Other}})} + end; +merge_derived_types2(XSDType=#schema_simple_type{}, + InstType=#schema_simple_type{},Blocks,_Mode,S) -> + case InstType#schema_simple_type.content of + [{restriction,{BaseTypeName,CM}}] -> + case compare_base_types(BaseTypeName,XSDType,S) of + ok -> + + {NewContent,S2}= + restrict_simple_type(XSDType#schema_simple_type.content,CM, + BaseTypeName,S), + {InstType#schema_simple_type{content=NewContent}, + allowed_derivation(restriction,Blocks,S2)}; + Err -> + {error,allowed_derivation(restriction,Blocks, + acc_errs(S,Err))} + end; + Other -> + {error,acc_errs(S,{unexpected_type,Other})} + end; +merge_derived_types2(XSDType=#schema_simple_type{content=XSDContent}, + InstType=#schema_complex_type{},Blocks,_Mode,S) -> + %% This is the way to add attributes to a simpleType + case InstType#schema_complex_type.content of + [{extension,{BaseTypeName,CM}}] -> + case compare_base_types(BaseTypeName,XSDType,S) of + ok -> + {NewContent,S2} = + if CM==[] -> {XSDContent,S}; + true -> extend_type(XSDContent,CM,S) + end, + {InstType#schema_complex_type{content=NewContent}, + allowed_derivation(extension,Blocks,S2)}; + Err -> + {error,allowed_derivation(extension,Blocks, + acc_errs(S,Err))} + end; + [{restriction,{BaseTypeName,_CM}}] + when InstType#schema_complex_type.complexity == simple -> + case compare_base_types(BaseTypeName,XSDType,S) of + ok -> + {InstType, + allowed_derivation(restriction,Blocks,S)}; + Err -> + {error,allowed_derivation(extension,Blocks, + acc_errs(S,Err))} + end; + Other -> + {error,acc_errs(S,{[],?MODULE,{unexpected_type,Other}})} + end; +merge_derived_types2(_XSDType={simpleType,BuiltInType}, + InstType=#schema_complex_type{content=Content}, + Blocks,_Mode,S) -> + case Content of + [{extension,{BuiltInType,CM}}] -> + {NewContent,S2} = extend_type([],CM,S), + {InstType#schema_complex_type{base_type=BuiltInType, + content=NewContent}, + allowed_derivation(extension,Blocks,S2)}; + [{restriction,{BuiltInType,CM}}] -> + {NewContent,S2} = restrict_simple_type([],CM,BuiltInType,S), + {InstType#schema_complex_type{base_type=BuiltInType, + content=NewContent}, + allowed_derivation(restriction,Blocks,S2)}; + Other -> + {error,acc_errs(S,{[],?MODULE,{unexpected_content,Other,InstType}})} + end; +merge_derived_types2(_XSDType={anyType,_},InstType,Blocks,_Mode,S) -> + case type_content(InstType) of + [{restriction,{_BaseTypeName,CM}}] -> + {set_type_content(InstType,CM), + allowed_derivation(restriction,Blocks,S)}; + Other -> + {error,acc_errs(S,{[],?MODULE,{unexpected_content,Other,InstType}})} + end; +merge_derived_types2({simpleType,BuiltInType}, + InstType=#schema_simple_type{content=Content}, + Blocks,_Mode,S) -> + case Content of + [{restriction,{BuiltInType,CM}}] -> + {InstType#schema_simple_type{base_type=BuiltInType, + content=CM}, + allowed_derivation(restriction,Blocks,S)}; + Other -> + {error,acc_errs(S,{[],?MODULE,{unexpected_content,Other,InstType}})} + end; +merge_derived_types2(XSDType,InstType,Blocks,Mode,S) -> + case {variety_type(XSDType,S),variety_type(InstType,S)} of + {XSDType,InstType} -> + {error,acc_errs(S,{[],?MODULE,{unexpected_type,XSDType,InstType}})}; + {_XSDType2,InstType2} -> + case allowed_derivation(substitution,Blocks,S) of + S -> + merge_derived_types2(XSDType,InstType2,Blocks,Mode,S); + S2 -> + {error,S2} + end + end. + +variety_type(#schema_simple_type{variety=list,content=[{list,[Type]}]},S) -> + {VarietyType,_}=resolve(Type,S), + VarietyType; +variety_type(#schema_simple_type{variety=union,content=[{union,Types}]},S) -> + [T||{T,_}<-[resolve(VarietyType,S)||VarietyType<-Types]]; +variety_type(Type,_S) -> + Type. + +allowed_derivation(_Derivation,_Blocks,S) -> +%% case {member(Derivation,Blocks),member('#all',Blocks)} of +%% {true,_} -> +%% acc_errs(S,{[],?MODULE,{derivation_blocked,Blocks,Derivation}}); +%% {_,true} -> +%% acc_errs(S,{[],?MODULE,{derivation_blocked,'#all',Derivation}}); +%% _ -> +%% S +%% end. + S. + +%% El is the instance element that has the xsi:type attribute with +%% XsiType. +legal_substitution(El=#xmlElement{name=ElName},XsiType,S) -> + %% See 3.3.6, Substitution Group OK (Transitive) + %% For ok one of following: 1) same type in El as XsiType, 2) + %% XsiType is a restriction/extension where El's type is the + %% base, 3) XsiType is a member in the substitutionGroup of + %% ElName. + QName = mk_EII_QName(ElName,El,S), + {HeadElement,_} = load_object({element,QName},S), + + legal_substitution2(HeadElement,XsiType,S). +legal_substitution2(#schema_element{type=Type,block=Bl},XsiType,S) -> + {HeadType,_}=resolve(Type,S), + Block = blocking(Bl,S#xsd_state.blockDefault), + S2 = derived_or_equal_types(XsiType,HeadType,instance,Block,S), + {Block,S2}. + +compare_base_types(QName,#schema_complex_type{name=QName},_S) -> + ok; +compare_base_types(QName1,#schema_complex_type{name=QName2},_S) -> + {[],?MODULE,{names_not_equal,QName1,QName2}}; +compare_base_types(QName,#schema_simple_type{name=QName},_S) -> + ok; +compare_base_types(QName1,#schema_simple_type{name=QName2},_S) -> + {[],?MODULE,{names_not_equal,QName1,QName2}}. +%%compare_base_types(QName1,Other,_S) -> +%% {[],?MODULE,{miss_match_base_types,QName1,Other}}. + +extend_type(Base,Extension,S) -> + extend_type(Base,Extension,[],S). +%% Content may be (attribute | attributeGroup)*, anyAttribute? if +%% it is of simpleContent or: +%% (group | all | choice | sequence)?,((attribute | attributeGroup)*, +%% anyAttribute?) if it is of complexContent +extend_type([],[],Acc,S) -> + {reverse(Acc),S}; +extend_type([BaseCM|BaseRest],Ext=[{SeqCho,{Extension,Occ}}|ExtRest],Acc,S) + when SeqCho == sequence; SeqCho == choice -> + case BaseCM of + {SeqCho,{BC,_Occ}} -> + extend_type(BaseRest,ExtRest,[{SeqCho,{BC++Extension,Occ}}|Acc],S); + G = {group,{_Ref,_Occ}} -> + {ResG,S2} = resolve(G,S), + case ResG of + #schema_group{content=GC} -> + case keysearch(SeqCho,1,GC) of + {value,SCC} -> + extend_type([SCC|BaseRest],Ext,Acc,S); + _ -> + S3 = acc_errs(S2,{[],?MODULE,{illegal_content_in_extension,Ext}}), + {reverse(Acc),S3} + end; + _ -> + S3 = acc_errs(S2,{[],?MODULE,{illegal_content_in_extension,ResG}}), + {reverse(Acc),S3} + end; + _ -> + %% BaseCM may be a group that has a sequence + extend_type([BaseCM|BaseRest],ExtRest,[{SeqCho,{Extension,Occ}}|Acc],S) + end; +extend_type(BaseCM,ExtCM,Acc,S) when is_list(BaseCM),is_list(ExtCM) -> + extend_type([],[],reverse(ExtCM)++reverse(BaseCM)++Acc,S). + +restrict_type(Content,CM,BaseTypeName,S) -> + restrict_type(Content,CM,BaseTypeName,[],S). +%% Restriction may appear within a 1) simpleType, 2) simpleContent or +%% 3) complexContent construct. +%% The possible content of restriction in different contexts are: +%% 1) (simpleType?, (Any facet)*) +%% 2) (simpleType?, (Any facet)*),((attribute | attributeGroup)*, anyAttribute?) +%% 3) (group | all | choice | sequence)?, +%% ((attribute | attributeGroup)*, anyAttribute?) +%% A restriction of a simpleType narrows the possible values of the +%% base type by applying facets. +%% A restriction of a complexType (simpleContent / complexContent) must +%% enumerate all elements, including the preserved ones of the base type. +%% Attributes don't have to be enumerated. +restrict_type([],[],_TypeName,Acc,S) -> + {reverse(Acc),S}; +restrict_type([{restriction,{_Type,CM1}}],[],_TypeName,Acc,S) -> + {CM1++reverse(Acc),S}; +restrict_type([{extension,{_Type,CM1}}],[],_TypeName,Acc,S) -> + {CM1++reverse(Acc),S}; +restrict_type(BaseRest,[ST={simpleType,_Name}|RestrRest],TypeName,Acc,S) -> + %% context 1 or 2 + restrict_type(BaseRest,RestrRest,TypeName,[ST|Acc],S); +restrict_type([BaseCM|BaseRest],[{SeqCho,{CM,Occ}}|RestrRest],TypeName,Acc,S) + when SeqCho == sequence; SeqCho == choice -> + %% context 3 + case BaseCM of + {SeqCho,{BCM,_}} -> + case check_element_presence(CM,BCM) of + {error,Reason} -> + {reverse(Acc),acc_errs(S,Reason)}; + ok -> + restrict_type(BaseRest,RestrRest,TypeName, + [{SeqCho,{CM,Occ}}|Acc],S) + end; + Other -> + {reverse(Acc),acc_errs(S,{[],?MODULE,{SeqCho,expected,Other,found}})} + end; +restrict_type(BaseRest,[Facet={F,_Val}|RestrRest],TypeName,Acc,S) -> + case is_facet(F) of + true -> + restrict_type(BaseRest,RestrRest,TypeName,[Facet|Acc],S); + _ -> + {reverse(Acc),acc_errs(S,{[],?MODULE,{does_not_support,Facet,in_restriction}})} + end. + +restrict_simple_type([{restriction,{_Type,BaseCM}}],RestrCM,_TypeName,S) -> + restrict_simple_type(BaseCM,RestrCM,_TypeName,S); +restrict_simple_type(CM=[{extension,{_Type,_BaseCM}}],_RestrCM,TypeName,S) -> + {[],acc_errs(S,{[],?MODULE,{illegal_content_simple_type,CM,TypeName}})}; +restrict_simple_type(BaseCM,RestrCM,TypeName,S) -> + %% all restrictions in base comes first, then check that no one of + %% the facets in the restriction attempts to redefine a fixed + %% facet in the base. Add the facets of the restriction. + {Acc,S2} = + case BaseCM of + [] -> {[],S}; + _ -> + restrict_simple_type([],BaseCM,TypeName,S) + end, + %% Acc = reverse(BaseCM), + Fun = fun(X={simpleType,_},{Acc_in,S_in})-> + {[X|Acc_in],S_in}; + (X={LU,_},{Acc_in,S_in}) when LU==list;LU==union -> + {[X|Acc_in],S_in}; + (X={F,_},{Acc_in,S_in})-> +%% Fun = fun(X={F,_},{Acc_in,S_in})-> + case is_facet(F) of + true -> + {[X|Acc_in],S_in}; + _ -> + {Acc_in,acc_errs(S_in,{[],?MODULE,{illegal_in_restriction_of_simpleType,X}})} + end; + (X,{Acc_in,S_in}) -> + {Acc_in,acc_errs(S_in,{[],?MODULE,{illegal_in_restriction_of_simpleType,X}})} + end, + foldl(Fun,{Acc,S2},RestrCM). + +check_element_presence([],_BCM) -> + ok; +check_element_presence([{element,{Name,_}}|CM],BCM) -> + case check_element_presence2(Name,BCM) of + {ok,BCM2} -> + check_element_presence(CM,BCM2); + _ -> + {error,{[],?MODULE,{element,Name,not_present_in_restriction}}} + end; +check_element_presence([_C|CM],BCM) -> + check_element_presence(CM,BCM). + +check_element_presence2(Name,BCM) -> + check_element_presence2(Name,BCM,[]). +check_element_presence2({LocalName,_,NS},[{element,{{LocalName,_,NS},_}}|BCM],Acc) -> + {ok,reverse(Acc)++BCM}; +check_element_presence2(Name,[E|BCM],Acc) -> + check_element_presence2(Name,BCM,[E|Acc]); +check_element_presence2(_Name,[],_Acc) -> + error. + +%% A check of the extended attribute should take place here. +%% +extend_attributes(BaseAtts,[EA={attribute,Name}|ExtAtts], + BaseTypeName,CM,Mode,S) -> + NewAtts=key_replace_or_insert(Name,2,BaseAtts,EA), + extend_attributes(NewAtts,ExtAtts,BaseTypeName,CM,Mode,S); +%% Extension of wildcards should be handled as described in chapter +%% 3.4.2 and subsection "Complex Type Definition with simple content +%% Schema Component". +extend_attributes(BaseAtts,[LocalWC={anyAttribute,_NS_PC}|ExtAtts], + BaseTypeName,CM,deduce,S) -> + {CompleteWC,S2} = complete_wildcard(LocalWC,CM,S), + BaseWC = base_wildcard(BaseAtts), + {NewWC,S4} = + case BaseWC of + [] -> {CompleteWC,S2}; + _ -> + if CompleteWC==LocalWC -> {BaseWC,S2}; + true -> + {NS,S3} = attribute_wildcard_union(wc_ns(CompleteWC), + wc_ns(BaseWC),S2), + PC = wc_pc(CompleteWC), + {[{anyAttribute,{NS,PC}}],S3} + end + end, + NewBaseAtts = keyreplace(anyAttribute,1,BaseAtts,NewWC), + extend_attributes(NewBaseAtts,ExtAtts,BaseTypeName,CM,deduce,S4); +extend_attributes(Atts,[],_,_,_Mode,S) -> + {reverse(Atts),S}. +%% A check of the restricted attribute should take place here. +restrict_attributes(BaseAtts,[RA|RAtts],S) -> +%% NewAtts = keyreplace(Name,2,BaseAtts,EA), + {NewAtts,S2} = restrict_attribute_replace(BaseAtts,RA,S), + restrict_attributes(NewAtts,RAtts,S2); +restrict_attributes(Atts,[],S) -> + {reverse(Atts),S}. +restrict_attribute_replace(BaseAtts,EA={attribute,Name},S) -> + {keyreplace(Name,2,BaseAtts,EA),S}; +restrict_attribute_replace(BaseAtts,EA={anyAttribute,{NS,_}},S) -> + case key1search(anyAttribute,BaseAtts,false) of + false -> + {BaseAtts,acc_errs(S,{invalid_derivation,EA,BaseAtts})}; + {_,{BaseNS,_}} -> + S2 = wildcard_subset(BaseNS,NS,S), + {keyreplace(anyAttribute,1,BaseAtts,EA),S2} + end. +%% 3.10.6 Constraints on Wildcard Schema Components +%% Schema Component Constraint: Wildcard Subset +%% bullet 1: +wildcard_subset(['##any'],_NS,S) -> + S; +%% bullet 2: +wildcard_subset([{'not',NS}],[{'not',NS}],S) -> + S; +%% bullet 3: +%% if NS has a number of namespaces all of them must be in BaseNS, +%% if BaseNS has {not,Namespaces} neither of Namespaces must be in NS +wildcard_subset(_,[],S) -> + S; +wildcard_subset(BaseNS,NS,S) when is_list(BaseNS),is_list(NS) -> + case [X||X<-NS,member(X,BaseNS)] of + NS -> + S; + _ -> + acc_errs(S,{[],?MODULE,{wildcard_namespace,NS, + not_subset_of_base_namespace,BaseNS}}) + end; +wildcard_subset(BaseNS=[{'not',BNS}],NS,S) when is_list(NS) -> + case [X||X<-BNS,member(X,NS)] of + [] -> + S; + _ -> + acc_errs(S,{[],?MODULE,{wildcard_namespace,NS, + not_subset_of_base_namespace,BaseNS}}) + end; +wildcard_subset(BaseNS,NS,S) -> + acc_errs(S,{[],?MODULE,{wildcard_namespace,NS, + not_subset_of_base_namespace,BaseNS}}). + +base_wildcard(BaseAtts) -> + key1search(anyAttribute,BaseAtts,[]). + +complete_wildcard(LocalWC,CM,S) -> + case keysearch(attributeGroup,1,CM) of + {value,AttG={_,_Name}} -> + case resolve(AttG,S) of + {#schema_attribute_group{content=Atts},_S} -> + case keysearch(anyAttribute,1,Atts) of + {value,AA} -> + {PC,S2} = + attribute_wildcard_intersection(wc_ns(LocalWC), + wc_ns(AA),S), + {{anyAttribute,{wc_pc(LocalWC),PC}},S2}; + _ -> {LocalWC,S} + end; + _ -> {LocalWC,S} + end; + _ -> {LocalWC,S} + end. + +wc_ns({anyAttribute,{NS,_}})-> + NS; +wc_ns(_) -> + []. +wc_pc({anyAttribute,{_,PC}})-> + PC; +wc_pc(_) -> + strict. + +%% Union of wildcard namespace: +%% 3.10.6 Constraints on Wildcard Schema Components +%% Schema Component Constraint: Attribute Wildcard Union +%% bullet 1 +attribute_wildcard_union(NS,NS,S) -> + {NS,S}; +%% bullet 2 +attribute_wildcard_union(NS1,NS2,S) when NS1==['##any'];NS2==['##any'] -> + {['##any'],S}; +attribute_wildcard_union(NS1,NS2,S) -> + case {keysearch('not',1,NS1),keysearch('not',1,NS2)} of + {false,false} -> %% bullet 3 + {NS1 ++ [X||X<-NS2,member(X,NS1)==false],S}; + {{value,{_,Set1}},{value,{_,Set2}}} -> %% bullet 4 or 1 + case {lists:sort(Set1),lists:sort(Set2)} of + {L,L} -> {[{'not',L}],S}; + _ -> {[{'not',[absent]}],S} + end; + _ -> %% either is a {not,NS} + case toggle_ns(NS1,NS2) of + {_O1=[absent],NS3} -> %% bullet 6 + case member(absent,NS3)of + true -> {['##any'],S}; + _ -> {[{'not',[absent]}],S} + end; + {O1=[O1Name],NS4} -> %% bullet 5 + case member(O1Name,NS4) of + true -> + case member(absent,NS4) of + true -> {['##any'],S}; %% 5.1 + _ -> {[{'not',[absent]}],S} %% 5.2 + end; + _ -> + case member(absent,NS4) of + true -> + %% not expressible 5.3 + Err = {[],?MODULE,{wildcard_namespace_union_not_expressible,NS1,NS2}}, + {[],acc_errs(S,Err)}; + _ -> {[{'not',O1}],S} %% 5.4 + end + end + end + end. + +%% Schema Component Constraint: Attribute Wildcard Intersection +%% bullet 1 +attribute_wildcard_intersection(O1,O1,S) -> {O1,S}; +%% bullet 2 +attribute_wildcard_intersection(['##any'],O2,S) -> {O2,S}; +attribute_wildcard_intersection(O1,['##any'],S) -> {O1,S}; +%% bullet 6 +attribute_wildcard_intersection([{'not',[absent]}],O2=[{'not',_}],S) -> {O2,S}; +attribute_wildcard_intersection(O1=[{'not',_}],[{'not',[absent]}],S) -> {O1,S}; +%% bullet 5 +attribute_wildcard_intersection([{'not',NS1}],[{'not',NS2}],S) -> + case [X||X<-NS1,member(X,NS2)] of + [] -> {[],acc_errs(S,{[],?MODULE,{wildcard_namespace_intersection_not_expressible,NS1,NS2}})}; + NS3 -> {[{'not',NS3}],S} + end; +%% bullet 3 +attribute_wildcard_intersection([{'not',NS}],O2,S) -> + {lists:delete(absent,[X||X<-O2,member(X,NS)==false]),S}; +attribute_wildcard_intersection(O1,[{'not',NS}],S) -> + {lists:delete(absent,[X||X<-O1,member(X,NS)==false]),S}; +%% bullet 4 +attribute_wildcard_intersection(O1,O2,S) -> + case [X||X<-O1,member(X,O2)] of + [] -> + {[absent],S}; + L ->{L,S} + end. + +toggle_ns(NS1,NS2=[{'not',_}]) -> + {NS2,NS1}; +toggle_ns(NS1,NS2) -> + {NS1,NS2}. + + +deduce_derived_types([DT|DTs],S) -> + deduce_derived_types(DTs,deduce_derived_type(DT,S,[])); +deduce_derived_types([],S) -> + S. + +%% deduce_derived_type +deduce_derived_type(DT={_Kind,TName},S,RefChain) -> + %% check circular references + case keymember(TName,2,RefChain) of + true -> + acc_errs(S,{[],?MODULE,{circular_reference_of_type,TName}}); + _ -> + deduce_derived_type2(DT,S,[DT|RefChain]) + end. +deduce_derived_type2(DT,S,RefChain) -> + {DerivedType,_} = resolve(DT,S), + case is_unmerged_type(DerivedType) of + true -> + BaseTypeRef = get_base_type(DerivedType), + {BaseType,_} = resolve({simple_or_complex_Type,BaseTypeRef},S), + BaseTypeKind = + fun(#schema_complex_type{}) -> complexType; + (_) -> simpleType + end (BaseType), + case is_unmerged_type(BaseType) of + true -> + %% both derived and not deduced + S2 = deduce_derived_type({BaseTypeKind,BaseTypeRef},S,RefChain), + case S2#xsd_state.errors==S#xsd_state.errors of + true -> deduce_derived_type2(DT,S2,RefChain); + _ -> S2 + end; + _ -> + {_,S2} = merge_derived_types(BaseType,DerivedType,deduce,S), + S2 + end; + _ -> + S + end. +is_unmerged_type(Type) -> + case type_content(Type) of + [{restriction,_}] -> true; + [{extension,_}] -> true; + _ -> false + end. +type_content(#schema_simple_type{content=C}) -> + C; +type_content(#schema_complex_type{content=C}) -> + C; +type_content(_) -> + []. + +set_type_content(Type=#schema_simple_type{},CM) -> + Type#schema_simple_type{content=CM}; +set_type_content(Type=#schema_complex_type{},CM) -> + Type#schema_complex_type{content=CM}. + +get_base_type(#schema_simple_type{base_type=BT}) -> + BT; +get_base_type(#schema_complex_type{base_type=BT}) -> + BT. + +in_scope({Local,_Scope,_NS},S) -> + in_scope(Local,S); +in_scope(Name,S=#xsd_state{scope=Scope}) when is_atom(Name) -> + S#xsd_state{scope=[Name|Scope]}; +in_scope(Name,S=#xsd_state{scope=Scope}) when is_list(Name) -> + S#xsd_state{scope=[atom_if_shortasciilist(Name)|Scope]}. + +out_scope({Local,_,_},S) -> + out_scope(atom_if_shortasciilist(Local),S); +out_scope(Name,S=#xsd_state{scope=[Name|Rest]}) -> + S#xsd_state{scope=Rest}; +out_scope(_Name,S) -> + S. + +name_scope({'_xmerl_no_name_',Scope,_NS},S) -> + S#xsd_state{scope=Scope}; +name_scope({Name,Scope,_NS},S) -> + S#xsd_state{scope=[Name|Scope]}. + +reset_scope(S) -> + S#xsd_state{scope=[]}. + +set_scope(Scope,S) when is_list(Scope) -> + S#xsd_state{scope=Scope}; +set_scope(_,S) -> + S. + +is_global_env([_Env]) -> + true; +is_global_env(_) -> + false. + +kind(#xmlElement{name=Name},S) -> + LocalName=local_name(Name), + is_a(LocalName,S). + +kind(#xmlElement{name=Name}) -> + LocalName=local_name(Name), + element(1,is_a(LocalName,dummy)). + +is_a(element,S) -> {element,S}; +is_a(annotation,S) -> {annotation,S}; +is_a(simpleType,S) -> {simpleType,S}; +is_a(complexType,S) -> {complexType,S}; +is_a(simpleContent,S) -> {simpleContent,S}; +is_a(complexContent,S) -> {complexContent,S}; +is_a(include,S) -> {include,S}; +is_a(import,S) -> {import,S}; +is_a(redefine,S) -> {redefine,S}; +is_a(unique,S) -> {unique,S}; +is_a(key,S) -> {key,S}; +is_a(keyref,S) -> {keyref,S}; +is_a(attribute,S) -> {attribute,S}; +is_a(attributeGroup,S) -> {attributeGroup,S}; +is_a(group,S) -> {group,S}; +is_a(all,S) -> {all,S}; +is_a(sequence,S) -> {sequence,S}; +is_a(choice,S) -> {choice,S}; +is_a(any,S) -> {any,S}; +is_a(anyAttribute,S) -> {anyAttribute,S}; +is_a(selector,S) -> {selector,S}; +is_a(field,S) -> {field,S}; +is_a(notation,S) -> {notation,S}; +is_a(appinfo,S) -> {appinfo,S}; +is_a(documentation,S) -> {documentation,S}; +is_a(restriction,S) -> {restriction,S}; +is_a(extension,S) -> {extension,S}; +is_a(list,S) -> {list,S}; +is_a(union,S) -> {union,S}; +is_a(schema,S) -> {schema,S}; +is_a(minExclusive,S) -> {minExclusive,S}; +is_a(minInclusive,S) -> {minInclusive,S}; +is_a(maxExclusive,S) -> {maxExclusive,S}; +is_a(maxInclusive,S) -> {maxInclusive,S}; +is_a(totalDigits,S) -> {totalDigits,S}; +is_a(fractionDigits,S) -> {fractionDigits,S}; +is_a(length,S) -> {length,S}; +is_a(minLength,S) -> {minLength,S}; +is_a(maxLength,S) -> {maxLength,S}; +is_a(enumeration,S) -> {enumeration,S}; +is_a(whiteSpace,S) -> {whiteSpace,S}; +is_a(pattern,S) -> {pattern,S}; +is_a(Name,S) when is_record(S,xsd_state) -> + {Name,acc_errs(S,{[],?MODULE,{unknown_content,Name}})}; +is_a(Name,_) -> + exit({error,{internal_error,not_implemented,Name}}). + + + + +%% namespace/2 -> [token()] +%% token() -> {not,namespace_name()} | namespace_name() +%% ((##any | ##other) | List of (anyURI | (##targetNamespace | ##local)) ) : ##any +%% The result will be: +%% NSList ::= ['##any'] | [{'not',[TNS]}] | NSURIs +%% TNS ::= URI | absent +%% NSURIs ::= (URI | absent) + +%% URI ::= atomified URI-string +wildcard_namespace(E,S) -> + AttVal = get_attribute_value(namespace,E,"##any"), + ListOfVals = namestring2namelist(AttVal), + Pred = fun('##other') -> + case S#xsd_state.targetNamespace of + undefined -> {'not',[absent]}; + TN -> {'not',TN} + end; + ('##targetNamespace') -> + case S#xsd_state.targetNamespace of + undefined -> absent; + TN -> TN + end; + ('##local') -> absent;%%'##local'; %% any well-formed xml that + %% is not qualified. + (X) -> X + end, + [X||X <- map(Pred,ListOfVals),X=/=[]]. + +processor_contents(Any) -> + case get_attribute_value(processContents,Any,strict) of + V when is_list(V) -> list_to_atom(V); + A -> A + end. + +base_type(E) -> + get_attribute_value(base,E,[]). +base_type_type(Env) -> + case member(simpleType,Env) of + true -> simpleType; + _ -> simple_or_complex_Type + end. + +attribute_ref(A) -> + get_attribute_value(ref,A,[]). + +particle_ref(El) -> + get_attribute_value(ref,El,[]). + +attributeGroup_ref(El) -> + get_attribute_value(ref,El,[]). + +get_value(El) -> + get_attribute_value(value,El,undefined). + +get_attribute_value(Key,#xmlElement{attributes=Atts},Default) -> + case keyNsearch(Key,#xmlAttribute.name,Atts,Default) of + #xmlAttribute{value=V} -> + V; + _ -> Default + end. + +%% qualify_NCName/2 returns a qualified name, QName, that has +%% information of the name attribute and namespace of the XSD object. +%% The object E has a name attribute with a NCName. The Namespace +%% part of the QName is from the targetNamespace attribute of the +%% schema or the empty list. +qualify_NCName(E=#xmlElement{},S) -> + case get_local_name(E) of + [] -> no_name; + LocalName -> + Namespace = + case S#xsd_state.targetNamespace of + undefined -> + []; %%?XSD_NAMESPACE; + TNS -> + TNS + end, + {atom_if_shortasciilist(LocalName),S#xsd_state.scope,Namespace} + end. + + +get_local_name(#xmlElement{attributes=Atts}) -> + case keyNsearch(name,#xmlAttribute.name,Atts,[]) of + #xmlAttribute{value=V} -> + V; + Default -> Default + end. + +local_name(Name) when is_atom(Name) -> + local_name(atom_to_list(Name)); +local_name(Name) when is_list(Name) -> + case splitwith(fun($:) -> false;(_)->true end,Name) of + {_,":"++LocalName} -> list_to_atom(LocalName); + _ -> + list_to_atom(Name) + end. + +%% transforms "a B c" to [a,'B',c] +namestring2namelist(Str) -> + split_by_whitespace(Str,[]). +split_by_whitespace(Str,Acc) when is_list(Str),length(Str) > 0 -> + F = fun($ ) -> + false; + (_) -> + true + end, + {Str1,Rest} = splitwith(F,Str), + split_by_whitespace(string:strip(Rest),[list_to_atom(Str1)|Acc]); +split_by_whitespace(_,Acc) -> + reverse(Acc). + +%% get_QName(Name,S) where Name is a QName in string format, or where +%% a QName is expected according to schema specification. If the name +%% is unqualified it is qualified with the targetNamespace of the schema +%% or with the empty list. +get_QName(Name,NS,S) when is_atom(Name) -> + get_QName(atom_to_list(Name),NS,S); +get_QName(Name,NS,#xsd_state{scope=Scope}) -> + qualified_name(Name,NS,NS#xmlNamespace.default,Scope). + +qualified_name(Name,NS,Default,Scope) -> + case splitwith(fun($:) -> false;(_)->true end,Name) of + {GlobalName,":"++LocalName} -> {atom_if_shortasciilist(LocalName),Scope, + namespace(GlobalName,NS,Default)}; + _ -> + {atom_if_shortasciilist(Name),Scope,Default} + end. + +atom_if_shortasciilist(N) when is_list(N) -> + case catch list_to_atom(N) of + {'EXIT',_Reason} -> + %% Reason may be system_limit if N is very long, it may be + %% badarg ifN is a list of UTF characters. + N; + AN -> AN + end; +atom_if_shortasciilist(N) -> + N. + +namespace("xml",_,_) -> 'http://www.w3.org/XML/1998/namespace'; +namespace(Prefix,NS,Default) -> + case key1search(Prefix,NS#xmlNamespace.nodes,Default) of + {Prefix,Namespace} -> + Namespace; + Namespace -> Namespace + end. + + +%% mk_EII_QName/2 +%% makes a name with qualified info out of an Element Information Item +%% A) If name is qualified get namespace matching prefix. +%% B) If not qualified search parents for a namespace: +%% 1) use default namespace if defined, else. +%% 2) if a parent is qualified use that namespace or +%% 3) no namespace is applied +mk_EII_QName(Name,#xmlElement{name=Me,namespace=NS,parents=P},S) + when is_list(Name) -> + mk_EII_QName(list_to_atom(Name), + #xmlElement{name=Me,namespace=NS,parents=P},S); +mk_EII_QName(Name,#xmlElement{name=Me,namespace=NS,parents=P},S) -> + Scope = S#xsd_state.scope, + NameStr = atom_to_list(Name), + case string:tokens(NameStr,":") of + ["xmlns",PrefixDef] -> %% special case + {'xmlns',Scope,namespace(PrefixDef,NS,[])}; + [Prefix,LocalName] -> %% A + {list_to_atom(LocalName),Scope,namespace(Prefix,NS,[])}; + [_LocalName] -> %% B + {Name,Scope,mk_EII_namespace([{Me,0}|P],NS,S)} + end. +mk_EII_namespace([],#xmlNamespace{default=DefaultNS},_S) -> + DefaultNS; +%%mk_EII_namespace([{PName,_}|GrandPs],NS=#xmlNamespace{default=[]},S) -> +mk_EII_namespace([{PName,_}|GrandPs],NS,S) -> + NameStr = atom_to_list(PName), + case string:tokens(NameStr,":") of + [Prefix,_LocalName] -> + namespace(Prefix,NS,[]); + [_LocalName] -> + mk_EII_namespace(GrandPs,NS,S) + end; +mk_EII_namespace(_,NS,_S) -> + NS#xmlNamespace.default. + +mk_EII_Att_QName(AttName,XMLEl,S) when is_list(AttName) -> + mk_EII_Att_QName(list_to_atom(AttName),XMLEl,S); +mk_EII_Att_QName(AttName,XMLEl,S) -> + NameStr = atom_to_list(AttName), + {member($:,NameStr),mk_EII_QName(AttName,XMLEl,S)}. + + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% table access functions +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +create_tables(S=#xsd_state{table=undefined}) -> + Tid=ets:new(xmerl_schema_tab,[]), + initial_tab_data(Tid), + S#xsd_state{table=Tid}; +create_tables(S) -> + S. + +delete_table(#xsd_state{table=Tab}) -> + catch ets:delete(Tab). + +%% @hidden +print_table(#xsd_state{table=Tab}) -> + case catch ets:tab2list(Tab) of + Res when is_list(Res) -> + Res; + {'EXIT',Reason} -> + {error,{?MODULE,[],Reason}} + end; +print_table(_) -> + ok. + +%save_object({name,_},S) -> +% %% already saved. +% S; +%% only simpleType asn complexType are temporary saved with +%% three-tuple key. They are loaded and merged in redefine/2. +save_object({Kind,Obj},S=#xsd_state{redefine=true}) + when Kind == simpleType; Kind == complexType -> + save_in_table({Kind,redefine,object_name(Obj)},Obj,S); +save_object({Kind,Obj},S=#xsd_state{redefine=true}) + when Kind == group; Kind == attributeGroup -> + save_in_table({Kind,object_name(Obj)},Obj,S); +save_object({Kind,Obj},S) when Kind == simpleType; Kind == complexType -> + save_unique_type({Kind,object_name(Obj)},Obj,S); +save_object({Kind,Obj},S) + when Kind == attributeGroup; Kind == group -> + save_uniquely({Kind,object_name(Obj)},Obj,S); +save_object({Kind,Obj},S) -> + save_in_table({Kind,object_name(Obj)},Obj,S). + +save_unique_type(Key={_,Name},Obj,S) -> + case resolve({simple_or_complex_Type,Name},S) of + {#schema_simple_type{},_} -> + acc_errs(S,{[],?MODULE,{type_not_uniquely_defined_in_schema,Name}}); + {#schema_complex_type{},_} -> + acc_errs(S,{[],?MODULE,{type_not_uniquely_defined_in_schema,Name}}); + _ -> + save_in_table(Key,Obj,S) + end. + +save_uniquely(Key,Obj,S) -> + case load_object(Key,S) of + {[],_} -> + save_in_table(Key,Obj,S); + _ -> + acc_errs(S,{[],?MODULE,{not_uniquely_defined_in_schema,Key}}) + end. + + + +save_schema_element(CM,S=#xsd_state{elementFormDefault = EFD, + attributeFormDefault = AFD, + targetNamespace = TN, + finalDefault = FD, + blockDefault = BD}) -> + ElementList = [X||X = {element,_} <- CM], +%% OtherGlobalEls = other_global_elements(S,ElementList), + Schema = get_schema_cm(S#xsd_state.table,TN), + Schema2 = + case Schema == #schema{} of + true -> + Schema#schema{elementFormDefault = EFD, + attributeFormDefault = AFD, + targetNamespace = TN, + blockDefault = BD, + finalDefault = FD, + content = ElementList}; + _ -> + Content = Schema#schema.content, + Schema#schema{content=[X||X<-Content,member(X,ElementList)==false]++ElementList} + end, + TN2 = case TN of + undefined -> []; + _ -> TN + end, + save_in_table({schema,TN2},Schema2,S), + save_to_file(S). + +%% other_global_elements(S,ElementList) -> +%% Schema = get_schema_cm(S#xsd_state.table,S#xsd_state.targetNamespace), +%% [X||X<-Schema#schema.content, +%% member(X,ElementList) == false]. + +%% other_global_elements(#xsd_state{schema_name=SchemaName, +%% table = Tab, +%% global_element_source=GES},ElementList) -> +%% case [X||{Y,X}<-GES,Y==SchemaName] of +%% [] -> +%% []; +%% L -> %% All other schemas included in redefine +%% NameList = [X||{element,{X,_}}<-ElementList], +%% Contents = +%% flatten([X||#schema{content=X}<-[get_schema_cm(Tab,Y)||Y<-L]]), +%% SortFun = +%% fun({_,{A,_}},{_,{B,_}}) when A =< B -> +%% true; +%% (_,_) -> false end, +%% [X||X={element,{Y,_}}<-lists:sort(SortFun,Contents),member(Y,NameList)==false] +%% end. + +save_to_file(S=#xsd_state{tab2file=true},FileName) -> + save_to_file(S#xsd_state{tab2file=FileName}); +save_to_file(_,_) -> + ok. + +save_to_file(S=#xsd_state{tab2file=TF}) -> + case TF of + true -> + {ok,IO}=file:open(filename:rootname(S#xsd_state.schema_name)++".tab", + [write]), + io:format(IO,"~p~n",[catch ets:tab2list(S#xsd_state.table)]), + file:close(IO); + false -> + ok; + IOFile -> + {ok,IO}=file:open(IOFile,[write]), + io:format(IO,"~p~n",[catch ets:tab2list(S#xsd_state.table)]), + file:close(IO) + end. + +save_merged_type(Type=#schema_simple_type{},S) -> + resave_object({simpleType,Type},S); +save_merged_type(Type=#schema_complex_type{},S) -> + resave_object({complexType,Type},S). +resave_object({Kind,Obj},S) -> + save_in_table({Kind,object_name(Obj)},Obj,S). + +save_in_table(Name,ElDef,S=#xsd_state{table=Tab}) -> + catch ets:insert(Tab,{Name,ElDef}), + S. + +save_idc(key,IDConstr,S) -> + save_key(IDConstr,S); +save_idc(keyref,IDConstr,S) -> + save_keyref(IDConstr,S); +save_idc(unique,IDConstr,S) -> + save_unique(IDConstr,S). + +save_key(Key,S) -> + save_object({key,Key},S), + S. + +save_keyref(KeyRef=#id_constraint{category=keyref},S) -> + S1 = add_keyref(KeyRef,S), + save_object({keyref,KeyRef},S1), + S1; +save_keyref(_,S) -> + S. + +save_unique(Unique,S) -> + save_object({unique,Unique},S), + S. + +save_substitutionGroup([],S) -> + S; +save_substitutionGroup([{Head,Members}|SGs],S) -> + %% save {head,[members]} + save_in_table({substitutionGroup,Head},Members,S), + %% save {member,head}, an element can only be a member in one + %% substitutionGroup + lists:foreach(fun(X)->save_in_table({substitutionGroup_member,X},Head,S) end,Members), + save_substitutionGroup(SGs,S). +substitutionGroup_member(ElName,S) -> + case load_object({substitutionGroup_member,ElName},S) of + {[],_} -> + false; + {Res,_} -> + Res + end. +%% substitutionGroup_head(Head,S) -> +%% case load_object({substitutionGroup,Head},S) of +%% {[],_} -> +%% false; +%% {Res,_} -> +%% Res +%% end. + +add_keyref(#id_constraint{name=Name,refer=Refer}, + S=#xsd_state{keyrefs=KeyRefs}) -> + S#xsd_state{keyrefs=add_once({keyref,Name,Refer},KeyRefs)}. + + +load_redefine_object({Kind,Name},S) -> + load_object({Kind,redefine,Name},S). + +load_object({element,{QN,Occ={Min,_}}},S) when is_integer(Min) -> + case load_object({element,QN},S) of + {SE=#schema_element{},S1} -> {SE#schema_element{occurance=Occ},S1}; + Other -> Other + end; +load_object({group,{QN,_Occ={Min,_}}},S) when is_integer(Min) -> + load_object({group,QN},S); +load_object(Key,S=#xsd_state{table=Tab}) -> + case ets:lookup(Tab,Key) of + [{Key,Value}] -> + {Value,S}; + [] -> + case ets:lookup(Tab,global_def(Key)) of + [{_,Value}] -> {Value,global_scope(S)}; + Other -> {Other,S} + end; + Other -> + {Other,S} + end. + + +load_keyref(Name,S) -> + case load_object({keyref,Name},S) of + {KeyRef=#id_constraint{},_} -> KeyRef; + _ -> + [] + end. +load_key(Name,S) -> + case load_object({key,Name},S) of + {Key=#id_constraint{},_} -> Key; + _ -> + case load_object({unique,Name},S) of + {Key=#id_constraint{},_} -> Key; + _ -> + [] + end + end. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% END table access functions +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +save_ID(ID,S) -> + case member(ID,S#xsd_state.'IDs') of + true -> + acc_errs(S,{'ID_name_not_unique',ID}); + _ -> + S#xsd_state{'IDs'=[ID|S#xsd_state.'IDs']} + end. +check_and_save_ID(ID,S) -> + case xmerl_xsd_type:check_simpleType('ID',ID,S) of + {ok,ID} -> + save_ID(ID,S); + _ -> + acc_errs(S,{illegal_ID_value,ID}) + end. + +insert_substitutionGroup(#schema_element{substitutionGroup=undefined},S) -> + S; +insert_substitutionGroup(#schema_element{name=Name, + substitutionGroup=SG}, + S=#xsd_state{substitutionGroups=SGregister}) -> + case key1search(SG,SGregister,[]) of + {_,SGList} -> + S#xsd_state{substitutionGroups= + keyreplace(SG,1,SGregister,{SG,[Name|SGList]})}; + _ -> + S#xsd_state{substitutionGroups=[{SG,[Name]}|SGregister]} + end. + + +global_scope(S=#xsd_state{}) -> + S#xsd_state{scope=[]}. + +global_def({Kind,{Local,_,NS}}) + when Kind==simpleType; Kind==complexType; Kind==group; + Kind==attributeGroup; Kind==element; Kind==attribute; + Kind==substitutionGroup;Kind==substitutionGroup_member-> + {Kind,{Local,[],NS}}; +global_def(D) -> D. + + +get_schema_cm(Tab,undefined) -> + get_schema_cm(Tab,[]); +get_schema_cm(Tab,[]) -> + get_schema_cm1(Tab,[]); +get_schema_cm(Tab,Namespace) -> + NoNamespaceC=get_no_namespace_content(Tab), + Schema = get_schema_cm1(Tab,Namespace), + NSC = Schema#schema.content, + Schema#schema{content=NSC++[X||X<-NoNamespaceC,member(X,NSC)==false]}. +get_schema_cm1(Tab,Namespace) -> + case catch ets:lookup(Tab,{schema,Namespace}) of + [{_,H}] -> + H; + _ -> + #schema{} + end. +get_no_namespace_content(Tab) -> + case get_schema_cm1(Tab,[]) of + #schema{content=C} -> + C; + _ -> [] + end. + + +%% is_simple_type(Type,S) when is_atom(Type) -> +%% is_simple_type(atom_to_list(Type),S); +is_simple_type({LName,Scope,NS},S) when is_atom(LName) -> + is_simple_type({atom_to_list(LName),Scope,NS},S); +is_simple_type(QName={_,_,_},S) -> + case is_builtin_simple_type(QName) of + true -> + true; + _ -> + is_derived_simple_type(QName,S) + end. + + +is_derived_simple_type(QName,S) -> +%% case resolve({simple_or_complex_Type,QName},S) of + case resolve({simpleType,QName},S) of + {#schema_simple_type{},_} -> true; + _ -> false + end. + + + +object_name(#schema_element{name=N}) -> + N; +object_name(#schema_simple_type{name=N}) -> + N; +object_name(#schema_complex_type{name=N}) -> + N; +object_name(#schema_attribute{name=N}) -> + N; +object_name(#schema_attribute_group{name=N}) -> + N; +object_name(#schema_group{name=N}) -> + N; +object_name(#id_constraint{name=N}) -> + N. + + +is_whitespace(#xmlText{value=V}) -> + case [X|| X <- V, whitespace(X) == false] of + [] -> + true; + _ -> false + end; +is_whitespace(_) -> + false. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% fetch +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +fetch(URI,S) -> + Split = filename:split(URI), + Filename = fun([])->[];(X)->lists:last(X) end (Split), + Fullname = + case Split of %% how about Windows systems? + ["file:"|Name]-> %% absolute path, see RFC2396 sect 3 + %% file:/dtd_name + filename:join(["/"|Name]); + ["/"|Rest] when Rest /= [] -> + %% absolute path name + URI; + ["http:"|_Rest] -> + {http,URI}; + [] -> %% empty systemliteral + []; + _ -> + filename:join(S#xsd_state.xsd_base, URI) + end, + Path = path_locate(S#xsd_state.fetch_path, Filename, Fullname), + ?dbg("fetch(~p) -> {file, ~p}.~n", [URI, Path]), + {ok, Path, S}. + +path_locate(_, _, {http,_}=URI) -> + URI; +path_locate(_, _, []) -> + []; +path_locate([Dir|Dirs], FN, FullName) -> + F = filename:join(Dir, FN), + case file:read_file_info(F) of + {ok, #file_info{type = regular}} -> + {file,F}; + _ -> + path_locate(Dirs, FN, FullName) + end; +path_locate([], _FN, FullName) -> + {file,FullName}. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% return +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +return_error(Errs) -> + {error,reverse(Errs)}. + +return_schema_error(Errs) -> + {error,{schema_failure,reverse(Errs)}}. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% general helper functions +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +if_atom_to_list(A) when is_atom(A) -> + atom_to_list(A); +if_atom_to_list(L) -> + L. +if_list_to_atom(L) when is_list(L) -> + list_to_atom(L); +if_list_to_atom(A) -> + A. + +list_members(Members,CompleteList) -> + case [X||X<-Members,member(X,CompleteList)==false] of + [] -> + true; + L -> + {error,L} + end. + +whitespace(X) when ?whitespace(X) -> + true; +whitespace(_) -> + false. + +key1search(Key,List,Default) -> + case keysearch(Key,1,List) of + {value,V} -> V; + _ -> Default + end. + +keyNsearch(Key,N,L,Default) -> + case keysearch(Key,N,L) of + {value,V} -> V; + _ -> Default + end. + +key_replace_or_insert(Key,N,List,Tuple) -> + case keyreplace(Key,N,List,Tuple) of + List -> + [Tuple|List]; + NewList -> + NewList + end. + +keysearch_delete(Key,N,List,Default) -> + case keysearch(Key,N,List) of + {value,Res} -> + {Res,keydelete(Key,N,List)}; + _ -> + {Default,List} + end. + +search_delete_all_el(ElName,ElList,S) -> + case search_delete_all_el2(ElName,ElList,[]) of + false -> + case substitutionGroup_member(ElName,S) of + false -> + false; + Head -> + case search_delete_all_el(Head,ElList,S) of + {_,Rest} -> + {Name,_,NS} = ElName, + {{element,{Name,[],NS}},Rest}; + _ -> + false + end + end; + Res -> + Res + end. +search_delete_all_el2(_ElName,[],_NoMatch) -> + false; +%% name must match defined (local scope) and referenced (global scope) +%% elements. +search_delete_all_el2({Name,Scope,NS}, + [El={element,{{Name,ScopeCM,NS},_}}|Rest], + NoMatch) + when Scope == ScopeCM; ScopeCM == [] -> + {El,reverse(NoMatch)++Rest}; +search_delete_all_el2(ElName,[H|T],NoMatch) -> + search_delete_all_el2(ElName,T,[H|NoMatch]). + +%% Search attribute should not consider the scope. All attributes +%% allowed in this scope are in SchemaAttList. +search_attribute(true,{Name,_,Namespace},SchemaAtts) -> + case [A||A={_,{N,_,NS}}<-SchemaAtts,N==Name,NS==Namespace] of + [] -> + {undefined,SchemaAtts}; + [Attr] -> + {Attr,lists:delete(Attr,SchemaAtts)} + end; +search_attribute(_,{Name,_,_},SchemaAtts) -> + case [A||A={_,{N,_,_}}<-SchemaAtts,N==Name] of + [] -> + {undefined,SchemaAtts}; + [Attr] -> + {Attr,lists:delete(Attr,SchemaAtts)} + end. + +error_msg(Format,Args) -> + io:format(Format,Args). + + +add_once(El,L) -> + case member(El,L) of + true -> + L; + _ -> + [El|L] + end. + +add_key_once(Key,N,El,L) -> + case keymember(Key,N,L) of + true -> + L; + _ -> + [El|L] + end. + +%% shema_el_pathname({Type,_},Env) -> +%% mk_path(reverse([Type|Env])). +%% xml_el_pathname(#xmlElement{name=Name,parents=Parents,pos=Pos}) -> +%% {element,mk_xml_path(Parents,Name,Pos)}; +%% xml_el_pathname(#xmlAttribute{name=Name,parents=Parents,pos=Pos}) -> +%% {attribute,mk_xml_path(Parents,Name,Pos)}; +%% xml_el_pathname(#xmlText{parents=Parents,pos=Pos}) -> +%% {text,mk_xml_path(Parents,text,Pos)}. + +%% mk_path([]) -> +%% []; +%% mk_path(L) when is_list(L) -> +%% "/"++filename:join(L). + +%% mk_xml_path(Parents,Type,Pos) -> +%% %% io:format("mk_xml_path: Parents = ~p~n",[Parents]), +%% {filename:join([[io_lib:format("/~w(~w)",[X,Y])||{X,Y}<-Parents],Type]),Pos}. + +%% @spec format_error(Errors) -> Result +%% Errors = error_tuple() | [error_tuple()] +%% Result = string() | [string()] +%% @doc Formats error descriptions to human readable strings. +format_error(L) when is_list(L) -> + [format_error(X)||X<-L]; +format_error({unexpected_rest,UR}) -> + io_lib:format("XML: The following content of an element didn't validate by the provided schema, ~n~p.",[UR]); +format_error({unvalidated_rest,UR}) -> + io_lib:format("XML: The following content of an element didn't validate by the provided schema, ~n~p.",[UR]); +format_error({no_schemas_provided}) -> + "Schema: Validator found no schema. A schema must be provided for validation."; +format_error({internal_error,Reason}) -> + io_lib:format("An error occured that was unforeseen, due to ~p.",[Reason]); +format_error({internal_error,Reason,Info}) -> + io_lib:format("An error occured that was unforeseen, due to ~p: ~p.",[Reason,Info]); +format_error({internal_error,Function,Info1,Info2}) -> + io_lib:format("An internal error occured in function ~p with args: ~p,~p.",[Function,Info1,Info2]); +format_error({illegal_content,Reason,Kind}) -> + io_lib:format("Schema: The schema violates the content model allowed for schemas.~nReason: ~p,~nkind of schema element: ~p.",[Reason,Kind]); +format_error({no_match,Kind}) -> + io_lib:format("Schema: The schema violates the content model allowed for schemas.~nKind of schema element: ~p.",[Kind]); +format_error({bad_match,S4SC,CM}) -> + io_lib:format("Schema: The schema missed mandatory elements ~p in ~p.",[S4SC,CM]); +format_error({unmatched_mandatory_object,SequenceEl1,SequenceEl2}) -> + io_lib:format("Schema: The schema should have had an ~p object after the ~p, but it was missing.",[SequenceEl2,SequenceEl1]); +format_error({parsing_external_schema_failed,File,Reason}) -> + io_lib:format("Schema: Parsing the referenced external schema ~p, failed due to ~p.",[File,Reason]); +format_error({fetch_fun_failed,Other}) -> + io_lib:format("Schema: Fetching this kind of external schema is not supported ~p.", + [Other]); +format_error({element_not_in_schema,[EIIName,_ElQName,_CM]}) -> + io_lib:format("XML: The XML element ~p are not present in the schema.", + [EIIName]); +format_error({missing_mandatory_element,CMEl}) -> + io_lib:format("XML: The XML file missed mandatory element(s) ~p defined in schema.",[CMEl]); +format_error({empty_content_not_allowed,C}) -> + io_lib:format("XML: The XML file missed mandatory element(s): ~p defined in schema.",[C]); +format_error({element_not_suitable_with_schema,ElName,_S}) -> + io_lib:format("XML: The XML element: ~p violates the schema, probably to many of same element.",[ElName]); +format_error({element_not_suitable_with_schema,ElName,CMName,_CMEl,_S}) -> + io_lib:format("XML: The XML element: ~p violates the schema. Schema expected element ~p.",[ElName,CMName]); +format_error({no_element_expected_in_group,XML}) -> + io_lib:format("XML: The XML element(s) ~p violates the schema. No element was expected.",[XML]); +format_error({element_bad_match,E,Any,_Env}) -> + io_lib:format("XML: XML element ~p didn't match into the namespace of schema type any ~p.",[E,Any]); +format_error({match_failure,_XML,_CM,_S}) -> + "XML: A combination of XML element(s) and schema definitions that is not known has occured. The implementation doesn't support this structure."; +format_error({cannot_contain_text,_XMLTxt,CMEl}) -> + io_lib:format("XML: The schema structure: ~p doesn't allow text",[CMEl]); +format_error({missing_mandatory_elements,MandatoryEls}) -> + io_lib:format("XML: A schema sequence has mandatory elements ~p, that were unmatched.",[MandatoryEls]); +format_error({choice_missmatch,T,Els}) -> + io_lib:format("XML: A schema choice structure with the alternatives: ~p doesn't allow the text: ~p.",[Els,T]); +format_error({no_element_matching_choice,XML}) -> + io_lib:format("XML: The choice at location: ~p had no alternative that matched the XML structure(s): ~p.",[error_path(XML,undefined),XML]); +format_error({all_missmatch,T,CM}) -> + io_lib:format("XML: The schema expected one of: ~p, but the XML content was text: ~p at the location: ~p.",[CM,T,error_path(T,undefined)]); +format_error({element_not_in_all,ElName,E,_CM}) -> + io_lib:format("XML: The element ~p at location ~p in the XML file was not allowed according to the schema.",[ElName,error_path(E,undefined)]); +format_error({missing_mandatory_elements_in_all,MandatoryEls}) -> + io_lib:format("XML: The schema elements ~p were missed in the XML file.",[MandatoryEls]); +format_error({failed_validating,E,Any}) -> + io_lib:format("XML: The element ~p at location ~p failed validation. It should hav been matched by an any schema element ~p",[E#xmlElement.name,error_path(E,undefined),Any]); +format_error({schemaLocation_list_failure,Paths}) -> + io_lib:format("XML: schemaLocation values consists of one or more pairs of URI references, separated by white space. The first is a namespace name the second a reference to a schema: ~p.",[Paths]); +format_error({element_content_not_nil,XMLEl}) -> + io_lib:format("XML: The element ~p at position ~p has content of text/elements despite the nillable attribute was true.",[XMLEl#xmlElement.name,error_path(XMLEl,undefined)]); +format_error({attribute_in_simpleType,El,Att}) -> + io_lib:format("XML: The element ~p at location ~p must not have attributes like: ~p since it according to the schema has simpleType type.",[El#xmlElement.name,error_path(El,undefined),Att]); +format_error({required_attribute_missed,El,Name}) -> + io_lib:format("XML: The schema required an attribute ~p in element at location ~p that was missing.",[Name,error_path(El,undefined)]); +format_error({default_and_fixed_attributes_mutual_exclusive, + Name,Default,Fix}) -> + io_lib:format("Schema: It is an error in the schema to assign values for both default and fix for an attribute. Attribute: ~p, default: ~p, fix: ~p.",[Name,Default,Fix]); +format_error({schema_error,unexpected_object,_SA,_Err}) -> + "Schema: An unforeseen error case occured, maybee due to an unimplemented feature."; +format_error({attribute_not_defined_in_schema,Name}) -> + io_lib:format("XML: The attribute ~p is not defined in the provided schema.",[Name]); +format_error({disallowed_namespace,Namespace,NS,Name}) -> + io_lib:format("XML: The attribute ~p is not valid because the namespace ~p is forbidden by ~p",[Name,NS,Namespace]); +format_error({cirkular_attributeGroup_reference,Name}) -> + io_lib:format("Schema: Cirkular references to attribute groups are forbidden. One was detected including ~p.",[Name]); +format_error({could_not_resolve_type,ST}) -> + io_lib:format("Schema: The simpleType ~p could not be found among the types defined by the provided schema.",[ST]); +format_error({could_not_check_value_for_type,Type}) -> + io_lib:format("XML: Checking value for type ~p is not implemented.",[Type]); +format_error({unknown_simpleType,BT}) -> + io_lib:format("Schema: The simpleType ~p could not be found among the types defined by the provided schema",[BT]); +format_error({abstract_element_instance,ElName}) -> + io_lib:format("XML: Elements defined as abstract in the schema must not be instantiated in XML: ~p.",[ElName]); +format_error({qualified_name_required,LocalName}) -> + io_lib:format("XML: Element name ~p in XML instance is not qualified, though the schema requires that.",[LocalName]); +format_error({unqualified_name_required,QualifiedName}) -> + io_lib:format("XML: Element name ~p in XML instance must be unqualified, according to schema.",[QualifiedName]); +format_error({illegal_key_sequence_value,Err}) -> + io_lib:format("XML: The 'key-sequence', (se XML-spec 3.11.4), must be a node with at most one member: ~p",[Err]); +format_error({qualified_node_set_not_correct_for_key,_Err}) -> + "Schema: The 'target node set' and 'qualified node set' (se XML-spec 3.11.4.2.1) must be equal."; +format_error({key_value_not_unique,KS}) -> + io_lib:format("Schema: Key values must be unique within the schema. This is not ~p,",[KS]); +format_error({keyref_missed_matching_key,Refer}) -> + io_lib:format("Schema: This keyref had no matching key ~p.",[Refer]); +format_error({keyref_unexpected_object,_Other}) -> + "Schema: An unforeseen error case occured, unknown failure cause."; +format_error({cardinality_of_fields_not_equal,KR,K}) -> + io_lib:format("Schema: keyref and the corresponding key must have same cardinality of their fields. Missmatch in this case keyref: ~p, key: ~p.",[KR,K]); +format_error({could_not_load_keyref,Name}) -> + io_lib:format("Schema: The schema didn't define a keyref with the name ~p.",[Name]); +format_error({reference_undeclared,Kind,Ref}) -> + io_lib:format("Schema: The schema didn't define an ~p with the name ~p.",[Kind,Ref]); +format_error({cyclic_substitutionGroup,SGs}) -> + io_lib:format("Schema: cyclic substitutionGroup was detected, substitutionGroup structure is ~p.",[SGs]); +format_error({substitutionGroup_error,Head,SG}) -> + io_lib:format("Schema: Either of substitutionGroup members ~p or ~p is not defined in the provided schema.",[Head,SG]); +format_error({cyclic_definition,CA}) -> + io_lib:format("Schema: A forbidden cicular definition was detected ~p.",[CA]); +format_error({type_of_element_not_derived,MemT,HeadT}) -> + io_lib:format("Schema: Type in substitutionGroup members should be simpleType or complexType. In this case ~p and ~p were found.",[MemT, HeadT]); +format_error({derivation_blocked,BlockTag,Derivation}) -> + io_lib:format("Derivation by ~p is blocked by the blocking tag ~p.",[Derivation,BlockTag]); +format_error({names_not_equal,QName1,QName2}) -> + io_lib:format("The type ~p seems to be derived from another type than the base type ~p",[QName2,QName1]); +%% format_error({miss_match_base_types,QName1,QName2}) -> +%% io_lib:format("Types and/or names of base type ~p and derived type ~p doesn't fit.",[QName1,QName2]); +format_error({illegal_content_in_extension,Ext}) -> + io_lib:format("The extension content ~p didn't match the content model of the provided schema.",[Ext]); +format_error({SeqCho,expected,Other,found}) + when SeqCho == sequence;SeqCho == choice -> + io_lib:format("Schema: The restriction content ~p didn't match the content model of the provided schema, ~p was expected.",[SeqCho,Other]); +format_error({does_not_support,F,in_restriction}) -> + io_lib:format("Schema: The structure ~p is not supported in the implementation.",[F]); +format_error({illegal_content_simple_type,CM,TypeName}) -> + io_lib:format("Schema: ~p content is not allowed in a simpleType, as in ~p.",[CM,TypeName]); +format_error({illegal_in_restriction_of_simpleType,X}) -> + io_lib:format("Schema: The ~p content is illegal in a simpleType.",[X]); +format_error({element,Name,not_present_in_restriction}) -> + io_lib:format("Schema: In a restriction all element names of the restriction must be one of the elements of the base type. ~p is not.",[Name]); +format_error({invalid_derivation,EA,BaseAtts}) -> + io_lib:format("Schema: An anyAttribute ~p in a restricted derived type must be present among the base type attributes ~p.",[EA,BaseAtts]); +format_error({wildcard_namespace,NS,not_subset_of_base_namespace,BaseNS}) -> + io_lib:format("Schema: See XML spec. section 3.10.6. This wildcard namespace ~p is not allowed by the base namespace restrictions ~p.",[NS,BaseNS]); +format_error({wildcard_namespace_union_not_expressible,NS1,NS2}) -> + io_lib:format("Schema: See XML spec. section 3.10.6. The union of namespaces ~p and ~p is not expressible.",[NS1,NS2]); +format_error({wildcard_namespace_intersection_not_expressible,NS1,NS2}) -> + io_lib:format("Schema: See XML spec. section 3.10.6. The intersection of namespaces ~p and ~p is not expressible.",[NS1,NS2]); +format_error({circular_reference_of_type,TName}) -> + io_lib:format("Schema: An illicit circular reference involving simple/complex type ~p has been detected.",[TName]); +format_error({type_not_uniquely_defined_in_schema,Name}) -> + io_lib:format("Schema: See XML spec. section 3.4.1. Type names whether simple or complex must be unique within the schema. ~p is not.",[Name]); +format_error({not_uniquely_defined_in_schema,Key}) -> + io_lib:format("Schema: All schema objects of the same kind identified by name must be unique within the schema. ~p is not.",[Key]); +format_error({illegal_ID_value,ID}) -> + io_lib:format("The ID value ~p is not allowed as an ID value.",[ID]); +format_error({incomplete_file,_FileName,_Other}) -> + "Schema: The file containing a schema state must be produced by xmerl_xsd:state2file/[1,2]."; +format_error({unexpected_content_in_any,A}) -> + io_lib:format("Schema: The any type is considered to have no content besides annotation. ~p was found.",[A]); +format_error({erronous_content_in_identity_constraint,IDC,Err}) -> + io_lib:format("Schema: An ~p identity constraint must have one selector and one or more field in content. This case ~p",[IDC,Err]); +format_error({missing_xpath_attribute,IDCContent}) -> + io_lib:format("Schema: A ~p in a identity constraint must have a xpath attribute.",[IDCContent]); +format_error({content_in_anyAttribute,Err}) -> + io_lib:format("Schema: ~p is not allowed in anyAttribute. Content cannot be anything else than annotation.",[Err]); +format_error({content_in_simpleContent,Err}) -> + io_lib:format("Schema: Content of simpleContent can only be an optional annotation and one of restriction or extension. In this case ~p.",[Err]); +format_error({complexContent_content_failure,Err}) -> + io_lib:format("Schema: Besides an optional annotation complexContent should have one of restriction or extension. In this case ~p.",[Err]); +format_error({union_member_type_not_simpleType,IllegalType}) -> + io_lib:format("Schema: ~p is not allowed in a union. Content must be any nymber of simpleType.",[IllegalType]); +format_error({missing_base_type,restriction,_Other}) -> + "Schema: A restriction must have a base type, either assigned by the 'base' attribute or as a simpleType defined in content."; +format_error({content_failure_expected_restriction_or_extension,Kind,_}) -> + io_lib:format("Schema: A ~p had no restriction or extension in content.",[Kind]); +format_error({content_failure_only_one_restriction_or_extension_allowed,Kind,_}) -> + io_lib:format("Schema: A ~p can only have one of restriction or extension in content.",[Kind]); +format_error({mandatory_component_missing,S4SCMRest,Kind}) -> + io_lib:format("Schema: After matching a ~p the schema should have had content ~p.",[Kind,S4SCMRest]); +format_error(Err) -> + io_lib:format("~p~n",[Err]). + +%% format_error(ErrMsg,E,SchemaE,Env) -> +%% ?debug("format_error: ~p~n",[ErrMsg]), +%% {ErrMsg,format_error2(E,SchemaE,Env)}. +%% format_error2(E,SchemaE,Env) -> +%% {shema_el_pathname(SchemaE,Env), +%% xml_el_pathname(E)}. + +initial_tab_data(Tab) -> + ets:insert(Tab, + binary_to_term( + <<131,108,0,0,0,9,104,2,104,2,100,0,9,97,116,116,114,105,98,117,116, + 101,104,3,100,0,5,115,112,97,99,101,106,100,0,36,104,116,116,112,58, + 47,47,119,119,119,46,119,51,46,111,114,103,47,88,77,76,47,49,57,57, + 56,47,110,97,109,101,115,112,97,99,101,104,9,100,0,16,115,99,104,101, + 109,97,95,97,116,116,114,105,98,117,116,101,104,3,100,0,5,115,112,97, + 99,101,106,100,0,36,104,116,116,112,58,47,47,119,119,119,46,119,51, + 46,111,114,103,47,88,77,76,47,49,57,57,56,47,110,97,109,101,115,112, + 97,99,101,108,0,0,0,1,104,2,100,0,10,115,105,109,112,108,101,84,121, + 112,101,104,3,100,0,15,95,120,109,101,114,108,95,110,111,95,110,97, + 109,101,95,108,0,0,0,1,100,0,5,115,112,97,99,101,106,106,106,100,0,5, + 102,97,108,115,101,106,100,0,8,111,112,116,105,111,110,97,108,100,0,9, + 117,110,100,101,102,105,110,101,100,100,0,9,117,110,100,101,102,105, + 110,101,100,100,0,9,117,110,100,101,102,105,110,101,100,104,2,104,2, + 100,0,6,115,99,104,101,109,97,107,0,7,120,109,108,46,120,115,100,104, + 7,100,0,6,115,99,104,101,109,97,100,0,11,117,110,113,117,97,108,105, + 102,105,101,100,100,0,11,117,110,113,117,97,108,105,102,105,101,100, + 100,0,36,104,116,116,112,58,47,47,119,119,119,46,119,51,46,111,114, + 103,47,88,77,76,47,49,57,57,56,47,110,97,109,101,115,112,97,99,101, + 106,106,106,104,2,104,2,100,0,9,97,116,116,114,105,98,117,116,101, + 104,3,100,0,4,98,97,115,101,106,100,0,36,104,116,116,112,58,47,47, + 119,119,119,46,119,51,46,111,114,103,47,88,77,76,47,49,57,57,56,47, + 110,97,109,101,115,112,97,99,101,104,9,100,0,16,115,99,104,101,109, + 97,95,97,116,116,114,105,98,117,116,101,104,3,100,0,4,98,97,115,101, + 106,100,0,36,104,116,116,112,58,47,47,119,119,119,46,119,51,46,111, + 114,103,47,88,77,76,47,49,57,57,56,47,110,97,109,101,115,112,97,99, + 101,108,0,0,0,1,104,2,100,0,10,115,105,109,112,108,101,84,121,112,101, + 104,3,100,0,6,97,110,121,85,82,73,106,100,0,32,104,116,116,112,58,47, + 47,119,119,119,46,119,51,46,111,114,103,47,50,48,48,49,47,88,77,76,83, + 99,104,101,109,97,106,100,0,5,102,97,108,115,101,106,100,0,8,111,112, + 116,105,111,110,97,108,100,0,9,117,110,100,101,102,105,110,101,100, + 100,0,9,117,110,100,101,102,105,110,101,100,100,0,9,117,110,100,101, + 102,105,110,101,100,104,2,104,2,100,0,14,97,116,116,114,105,98,117, + 116,101,71,114,111,117,112,104,3,100,0,12,115,112,101,99,105,97,108, + 65,116,116,114,115,106,100,0,36,104,116,116,112,58,47,47,119,119,119, + 46,119,51,46,111,114,103,47,88,77,76,47,49,57,57,56,47,110,97,109,101, + 115,112,97,99,101,104,5,100,0,22,115,99,104,101,109,97,95,97,116,116, + 114,105,98,117,116,101,95,103,114,111,117,112,104,3,100,0,12,115,112, + 101,99,105,97,108,65,116,116,114,115,106,100,0,36,104,116,116,112,58, + 47,47,119,119,119,46,119,51,46,111,114,103,47,88,77,76,47,49,57,57, + 56,47,110,97,109,101,115,112,97,99,101,100,0,9,117,110,100,101,102, + 105,110,101,100,100,0,9,117,110,100,101,102,105,110,101,100,108,0,0, + 0,3,104,2,100,0,9,97,116,116,114,105,98,117,116,101,104,3,100,0,4,98, + 97,115,101,106,106,104,2,100,0,9,97,116,116,114,105,98,117,116,101, + 104,3,100,0,4,108,97,110,103,106,106,104,2,100,0,9,97,116,116,114, + 105,98,117,116,101,104,3,100,0,5,115,112,97,99,101,106,106,106,104, + 2,104,2,100,0,10,115,105,109,112,108,101,84,121,112,101,104,3,100,0, + 15,95,120,109,101,114,108,95,110,111,95,110,97,109,101,95,108,0,0,0, + 1,100,0,5,115,112,97,99,101,106,106,104,9,100,0,18,115,99,104,101, + 109,97,95,115,105,109,112,108,101,95,116,121,112,101,104,3,100,0,15, + 95,120,109,101,114,108,95,110,111,95,110,97,109,101,95,108,0,0,0,1, + 100,0,5,115,112,97,99,101,106,106,108,0,0,0,1,100,0,5,115,112,97,99, + 101,106,104,3,100,0,6,78,67,78,97,109,101,106,100,0,32,104,116,116, + 112,58,47,47,119,119,119,46,119,51,46,111,114,103,47,50,48,48,49,47, + 88,77,76,83,99,104,101,109,97,100,0,5,102,97,108,115,101,106,108,0,0, + 0,1,104,2,100,0,11,101,110,117,109,101,114,97,116,105,111,110,108,0,0, + 0,2,107,0,7,100,101,102,97,117,108,116,107,0,8,112,114,101,115,101, + 114,118,101,106,106,100,0,6,97,116,111,109,105,99,108,0,0,0,1,104,2, + 100,0,11,114,101,115,116,114,105,99,116,105,111,110,104,2,104,3,100, + 0,6,78,67,78,97,109,101,106,100,0,32,104,116,116,112,58,47,47,119, + 119,119,46,119,51,46,111,114,103,47,50,48,48,49,47,88,77,76,83,99, + 104,101,109,97,108,0,0,0,2,104,2,100,0,11,101,110,117,109,101,114, + 97,116,105,111,110,107,0,7,100,101,102,97,117,108,116,104,2,100,0, + 11,101,110,117,109,101,114,97,116,105,111,110,107,0,8,112,114,101, + 115,101,114,118,101,106,106,104,2,104,2,100,0,10,115,105,109,112, + 108,101,84,121,112,101,104,3,100,0,15,95,120,109,101,114,108,95,110, + 111,95,110,97,109,101,95,108,0,0,0,1,100,0,4,108,97,110,103,106,106, + 104,9,100,0,18,115,99,104,101,109,97,95,115,105,109,112,108,101,95, + 116,121,112,101,104,3,100,0,15,95,120,109,101,114,108,95,110,111,95, + 110,97,109,101,95,108,0,0,0,1,100,0,4,108,97,110,103,106,106,108,0,0, + 0,1,100,0,4,108,97,110,103,106,100,0,9,117,110,100,101,102,105,110, + 101,100,100,0,5,102,97,108,115,101,106,106,100,0,6,97,116,111,109, + 105,99,108,0,0,0,1,104,2,100,0,5,117,110,105,111,110,108,0,0,0,2,104, + 2,100,0,10,115,105,109,112,108,101,84,121,112,101,104,3,100,0,8,108, + 97,110,103,117,97,103,101,106,100,0,32,104,116,116,112,58,47,47,119, + 119,119,46,119,51,46,111,114,103,47,50,48,48,49,47,88,77,76,83,99,104, + 101,109,97,104,2,100,0,10,115,105,109,112,108,101,84,121,112,101,104, + 3,100,0,15,95,120,109,101,114,108,95,110,111,95,110,97,109,101,95,108, + 0,0,0,2,100,0,15,95,120,109,101,114,108,95,110,111,95,110,97,109,101, + 95,100,0,4,108,97,110,103,106,106,106,106,104,2,104,2,100,0,9,97,116, + 116,114,105,98,117,116,101,104,3,100,0,2,105,100,106,100,0,36,104,116, + 116,112,58,47,47,119,119,119,46,119,51,46,111,114,103,47,88,77,76,47, + 49,57,57,56,47,110,97,109,101,115,112,97,99,101,104,9,100,0,16,115,99, + 104,101,109,97,95,97,116,116,114,105,98,117,116,101,104,3,100,0,2,105, + 100,106,100,0,36,104,116,116,112,58,47,47,119,119,119,46,119,51,46, + 111,114,103,47,88,77,76,47,49,57,57,56,47,110,97,109,101,115,112,97, + 99,101,108,0,0,0,1,104,2,100,0,10,115,105,109,112,108,101,84,121,112, + 101,104,3,100,0,2,73,68,106,100,0,32,104,116,116,112,58,47,47,119,119, + 119,46,119,51,46,111,114,103,47,50,48,48,49,47,88,77,76,83,99,104,101, + 109,97,106,100,0,5,102,97,108,115,101,106,100,0,8,111,112,116,105,111, + 110,97,108,100,0,9,117,110,100,101,102,105,110,101,100,100,0,9,117, + 110,100,101,102,105,110,101,100,100,0,9,117,110,100,101,102,105,110, + 101,100,104,2,104,2,100,0,9,97,116,116,114,105,98,117,116,101,104,3, + 100,0,4,108,97,110,103,106,100,0,36,104,116,116,112,58,47,47,119,119, + 119,46,119,51,46,111,114,103,47,88,77,76,47,49,57,57,56,47,110,97,109, + 101,115,112,97,99,101,104,9,100,0,16,115,99,104,101,109,97,95,97,116, + 116,114,105,98,117,116,101,104,3,100,0,4,108,97,110,103,106,100,0,36, + 104,116,116,112,58,47,47,119,119,119,46,119,51,46,111,114,103,47,88, + 77,76,47,49,57,57,56,47,110,97,109,101,115,112,97,99,101,108,0,0,0,1, + 104,2,100,0,10,115,105,109,112,108,101,84,121,112,101,104,3,100,0,15, + 95,120,109,101,114,108,95,110,111,95,110,97,109,101,95,108,0,0,0,1, + 100,0,4,108,97,110,103,106,106,106,100,0,5,102,97,108,115,101,106, + 100,0,8,111,112,116,105,111,110,97,108,100,0,9,117,110,100,101,102, + 105,110,101,100,100,0,9,117,110,100,101,102,105,110,101,100,100,0,9, + 117,110,100,101,102,105,110,101,100,104,2,104,2,100,0,10,115,105,109, + 112,108,101,84,121,112,101,104,3,100,0,15,95,120,109,101,114,108,95, + 110,111,95,110,97,109,101,95,108,0,0,0,2,100,0,15,95,120,109,101,114, + 108,95,110,111,95,110,97,109,101,95,100,0,4,108,97,110,103,106,106, + 104,9,100,0,18,115,99,104,101,109,97,95,115,105,109,112,108,101,95, + 116,121,112,101,104,3,100,0,15,95,120,109,101,114,108,95,110,111,95, + 110,97,109,101,95,108,0,0,0,2,100,0,15,95,120,109,101,114,108,95,110, + 111,95,110,97,109,101,95,100,0,4,108,97,110,103,106,106,108,0,0,0,2, + 100,0,15,95,120,109,101,114,108,95,110,111,95,110,97,109,101,95,100, + 0,4,108,97,110,103,106,104,3,100,0,6,115,116,114,105,110,103,106,100, + 0,32,104,116,116,112,58,47,47,119,119,119,46,119,51,46,111,114,103,47, + 50,48,48,49,47,88,77,76,83,99,104,101,109,97,100,0,5,102,97,108,115, + 101,106,108,0,0,0,1,104,2,100,0,11,101,110,117,109,101,114,97,116,105, + 111,110,108,0,0,0,1,106,106,106,100,0,6,97,116,111,109,105,99,108,0,0, + 0,1,104,2,100,0,11,114,101,115,116,114,105,99,116,105,111,110,104,2, + 104,3,100,0,6,115,116,114,105,110,103,106,100,0,32,104,116,116,112,58, + 47,47,119,119,119,46,119,51,46,111,114,103,47,50,48,48,49,47,88,77,76, + 83,99,104,101,109,97,108,0,0,0,1,104,2,100,0,11,101,110,117,109,101, + 114,97,116,105,111,110,106,106,106,106>>)). + +default_namespace_by_convention() -> + [{xml,'http://www.w3.org/XML/1998/namespace'}]. diff --git a/lib/xmerl/src/xmerl_xsd_type.erl b/lib/xmerl/src/xmerl_xsd_type.erl new file mode 100644 index 0000000000..19951f030f --- /dev/null +++ b/lib/xmerl/src/xmerl_xsd_type.erl @@ -0,0 +1,1558 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2006-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +-module(xmerl_xsd_type). + +-export([check_simpleType/3,facet_fun/2,compare_floats/2, + replace_ws/2,collapse_ws/1]). + +-export([fQuotient/2,fQuotient/3,modulo/2,modulo/3,maximumDayInMonthFor/2, + add_duration2dateTime/2,duration_atoms/1,dateTime_atoms/1, + normalize_dateTime/1]). + +-export([compare_durations/2,compare_dateTime/2]). + +-include("xmerl.hrl"). +-include("xmerl_xsd.hrl"). + + +-define(catch_exit(_Call_,_Value_,_ErrorCause_), + case catch (_Call_) of + {'EXIT',_} -> + {error,{type,_ErrorCause_,_Value_}}; + {error,_} -> + {error,{_ErrorCause_,_Value_}}; + _ -> + {ok,_Value_} + end). + +-define(is_whitespace(__WS__), + __WS__==16#20; __WS__==16#9;__WS__==16#a; __WS__==16#d). + +check_simpleType(Name,Value,S) when is_list(Name) -> + ?debug("simpleType name a list: "++Name++"~n",[]), + check_simpleType(list_to_atom(Name),Value,S); +check_simpleType(string,Value,_S) -> + case [X||X <- Value, + xmerl_lib:is_char(X)] of + Value -> + {ok,Value}; + _ -> + {error,{value_not_string,Value}} + end; +check_simpleType(normalizedString,Value,_S) -> + case [X||X <- Value,xmerl_lib:is_char(X), + ns_whitespace(X)==false] of + Value -> + {ok,Value}; + _ -> + {error,{value_not_normalizedString,Value}} + end; +check_simpleType(boolean,"true",_S) -> {ok,"true"}; +check_simpleType(boolean,"false",_S) -> {ok,"false"}; +check_simpleType(boolean,"1",_S) -> {ok,"1"}; +check_simpleType(boolean,"0",_S) -> {ok,"0"}; +check_simpleType(boolean,Other,_S) -> {error,{value_not_boolean,Other}}; +check_simpleType(decimal,Value,_S) -> + ?catch_exit(check_decimal(Value),Value,invalid_decimal); +check_simpleType(integer,Value,_S) -> + ?catch_exit(check_integer(Value),Value,invalid_integer); + +% float values: m * 2^e, where m is an integer whose absolute value is +% less than 2^24, and e is an integer between -149 and 104, inclusive. +check_simpleType(float,Value,_S) -> + ?catch_exit(check_float(Value),Value,invalid_float); +% double values: m * 2^e, where m is an integer whose absolute value +% is less than 2^53, and e is an integer between -1075 and 970, +% inclusive. +check_simpleType(double,Value,_S) -> + ?catch_exit(check_double(Value),Value,invalid_double); +% extended format PnYnMnDTnHnMnS where n is an integer. The n value +% before S may include decimal fraction. +check_simpleType(duration,Value,_S) -> + ?catch_exit(check_duration(Value),Value,invalid_duration); +check_simpleType(dateTime,Value,_S) -> + ?catch_exit(check_dateTime(Value),Value,invalid_dateTime); +check_simpleType(time,Value,_S) -> + ?catch_exit(check_time(Value),Value,invalid_time); +check_simpleType(date,Value,_S) -> + ?catch_exit(check_date(Value),Value,invalid_date); +check_simpleType(gYearMonth,Value,_S) -> + ?catch_exit(check_gYearMonth(Value),Value,invalid_gYearMonth); +check_simpleType(gYear,Value,_S) -> + ?catch_exit(check_gYear(Value),Value,invalid_gYear); +check_simpleType(gMonthDay,Value,_S) -> + ?catch_exit(check_gMonthDay(Value),Value,invalid_gMonthDay); +check_simpleType(gDay,Value,_S) -> + ?catch_exit(check_gDay(Value),Value,invalid_gDay); +check_simpleType(gMonth,Value,_S) -> + ?catch_exit(check_gMonth(Value),Value,invalid_gMonth); +check_simpleType(hexBinary,Value,_S) -> + IsEven = fun(X) -> + case X rem 2 of + 0 -> true; + _ -> false + end + end, + IsHex = fun(X) when X >= $A, X =< $F -> true; + (X) when X >= $a, X =< $f -> true; + (X) when X >= $0, X =< $9 -> true; + (_) -> false + end, + case [X|| X<-Value, + IsEven(length(Value)), + IsHex(X)] of + Value -> + {ok,Value}; + _ -> {error,{value_not_hexBinary,Value}} + end; +check_simpleType(base64Binary,Value,_S) -> + check_base64Binary(Value); +check_simpleType(anyURI,Value,S) -> + case xmerl_uri:parse(Value) of + {error,_} -> + %% might be a relative uri, then it has to be a path in the context + case catch file:read_file_info(filename:join(S#xsd_state.xsd_base,Value)) of + {ok,_} -> + {ok,Value}; + _ -> + {error,{value_not_anyURI,Value}} + end; + _ -> + {ok,Value} + end; +check_simpleType('QName',Value,_S) -> + case xmerl_lib:is_name(Value) of + true -> + {ok,Value}; + _ -> + {error,{value_not_QName,Value}} + end; +check_simpleType('NOTATION',Value,_S) -> + {ok,Value}; %% Must provide for check of all QNames in schema. +check_simpleType(token,Value,_S) -> + ?catch_exit(check_token(Value),Value,invalid_token); +%% conform to the pattern [a-zA-Z]{1,8}(-[a-zA-Z0-9]{1,8})* +check_simpleType(language,Value,_S) -> + ?catch_exit(check_language(Value),Value,illegal_language); +check_simpleType('NMTOKEN',Value,_S) -> + ?catch_exit(check_NMTOKEN(Value),Value,illegal_NMTOKEN); +check_simpleType('NMTOKENS',Value,_S) -> + ?catch_exit(check_NMTOKENS(Value),Value,illegal_NMTOKENS); +check_simpleType('Name',Value,_S) -> + ?catch_exit(check_Name(Value),Value,illegal_name); +check_simpleType('NCName',Value,_S) -> + ?catch_exit(check_NCName(Value),Value,illegal_name); +check_simpleType('ID',Value,_S) -> + ?catch_exit(check_ID(Value),Value,illegal_ID); +check_simpleType('IDREF',Value,_S) -> + ?catch_exit(check_IDREF(Value),Value,illegal_IDREF); +check_simpleType('IDREFS',Value,_S) -> + ?catch_exit(check_IDREFS(Value),Value,illegal_IDREFS); +check_simpleType('ENTITY',Value,_S) -> + ?catch_exit(check_ENTITY(Value),Value,illegal_ENTITY); +check_simpleType('ENTITIES',Value,_S) -> + ?catch_exit(check_ENTITIES(Value),Value,illegal_ENTITIES); +check_simpleType(nonPositiveInteger,Value,_S) -> + ?catch_exit(check_nonPositiveInteger(Value),Value, + illegal_nonPositiveInteger); +check_simpleType(negativeInteger,Value,_S) -> + ?catch_exit(check_negativeInteger(Value),Value, + illegal_negativeInteger); +check_simpleType(long,Value,_S) -> + ?catch_exit(check_long(Value),Value,illegal_long); +check_simpleType(int,Value,_S) -> + ?catch_exit(check_int(Value),Value,illegal_int); +check_simpleType(short,Value,_S) -> + ?catch_exit(check_short(Value),Value,illegal_short); +check_simpleType(byte,Value,_S) -> + ?catch_exit(check_byte(Value),Value,illegal_byte); +check_simpleType(nonNegativeInteger,Value,_S) -> + ?catch_exit(check_nonNegativeInteger(Value),Value, + illegal_nonNegativeInteger); +check_simpleType(unsignedLong,Value,_S) -> + ?catch_exit(check_unsignedLong(Value),Value,illegal_unsignedLong); +check_simpleType(unsignedInt,Value,_S) -> + ?catch_exit(check_unsignedInt(Value),Value,illegal_unsignedInt); +check_simpleType(unsignedShort,Value,_S) -> + ?catch_exit(check_unsignedShort(Value),Value,illegal_unsignedShort); +check_simpleType(unsignedByte,Value,_S) -> + ?catch_exit(check_unsignedByte(Value),Value,illegal_unsignedByte); +check_simpleType(positiveInteger,Value,_S) -> + ?catch_exit(check_positiveInteger(Value),Value,illegal_positiveInteger); +check_simpleType(Unknown,Value,_S) -> + {error,{unknown_type,Unknown,Value}}. + +check_decimal(Value) -> + case string:tokens(Value,".") of + L when length(L) == 1; length(L) == 2 -> + _ = [list_to_integer(X)||X <- L], + {ok,Value}; + _ -> + {error,{value_not_decimal,Value}} + end. +%% I=string:chr(Value,$.), +%% {NumberDot,Decimal}=lists:split(I,Value), +%% Number=string:strip(NumberDot,right,$.), +%% case catch {list_to_integer(Number),list_to_integer(Decimal)} of +%% {'EXIT',_} -> +%% {error,{value_not_decimal,Value}}; +%% _ -> {ok,Value} +%% end. + +check_float(V="-INF") -> + {ok,V}; +check_float(V="INF") -> + {ok,V}; +check_float(V="NaN") -> + {ok,V}; +check_float(Value) -> +%% Pred = fun(X) when X==$e;X==$E -> false;(_) -> true end, +%% {Mantissa,Exponent}=lists:splitwith(Pred,Value), +%% SkipEe = fun([]) -> [];(L) -> tl(L) end, + case string:tokens(Value,"eE") of + [Mantissa,Exponent] -> + {ok,_} = check_decimal(Mantissa), + {ok,_} = check_integer(Exponent); + [Mantissa] -> + check_decimal(Mantissa) + end, + {ok,Value}. +%% case {check_decimal(Mantissa), +%% check_simpleType(integer,SkipEe(Exponent))} of +%% {{ok,_},{ok,_}} -> +%% {ok,Value}; +%% _ -> +%% {error,{value_not_float,Value}} +%% end. + +check_double(Value) -> + check_float(Value). + + +%% format PnYnMnDTnHnMnS +%% P is always present +%% T is absent iff all time items are absent +%% At least one duration item must be present +check_duration("-"++Value) -> + check_duration(Value); +check_duration("P"++Value) -> + {Date,Time}=lists:splitwith(fun($T) -> false;(_) -> true end,Value), + {ok,_} = check_duration_date(Date,["Y","M","D"]), + {ok,_} = check_duration_time(Time,["T","H","M","S"]). + +check_duration_date("",_) -> + {ok,""}; +check_duration_date(Date,[H|T]) -> + case string:tokens(Date,H) of + [Date] -> + check_duration_date(Date,T); + [DateItem] -> + {ok,_} = check_positive_integer(DateItem); + [DateItem,Rest] -> + {ok,_} = check_positive_integer(DateItem), + check_duration_date(Rest,T) + end. +%% Time any combination of TnHnMfS +%% n unsigned integers and f unsigned decimal +%%check_duration_time(Time,["T","H","M","S"]) +check_duration_time("",[_H|_T]) -> + {ok,""}; +check_duration_time(Time,[S]) -> + [Sec] = string:tokens(Time,S), + {ok,_} = check_decimal(Sec); +check_duration_time("T"++Time,TTokens) -> + [_H|_] = Time, + check_duration_time(Time,tl(TTokens)); +check_duration_time(Time,[H|T]) -> + case string:tokens(Time,H) of + [Time] -> + check_duration_time(Time,T); + [TimeItem] -> + {ok,_} = check_positive_integer(TimeItem); + [TimeItem,Rest] -> + {ok,_} = check_positive_integer(TimeItem), + check_duration_time(Rest,T) + end. + +check_positive_integer(Value) -> + case catch list_to_integer(Value) of + Int when is_integer(Int),Int>=0 -> + {ok,Int}; + _ -> + {error,{value_not_integer,Value}} + end. + + +%% check_integer and thereof derived types +check_integer(Value) -> + {ok,list_to_integer(Value)}. + +check_nonPositiveInteger(Value) -> + check_constr_int(Value,undefined,0,illegal_nonPositiveInteger). + +check_negativeInteger(Value) -> + check_constr_int(Value,undefined,-1,illegal_negativeInteger). + +check_long(Value) -> + check_constr_int(Value,-9223372036854775808, + 9223372036854775807,illegal_long). + +check_int(Value) -> + check_constr_int(Value,-2147483648,2147483647,illegal_int). + +check_short(Value) -> + check_constr_int(Value,-32768,32767,illegal_short). + +check_byte(Value) -> + check_constr_int(Value,-128,127,illegal_byte). + +check_nonNegativeInteger(Value) -> + check_constr_int(Value,0,undefined,illegal_nonNegativeInteger). + +check_unsignedLong(Value) -> + check_constr_int(Value,0,18446744073709551615,illegal_unsignedLong). + +check_unsignedInt(Value) -> + check_constr_int(Value,0,4294967295,illegal_unsignedInt). + +check_unsignedShort(Value) -> + check_constr_int(Value,0,65535,illegal_unsignedShort). + +check_unsignedByte(Value) -> + check_constr_int(Value,0,255,illegal_unsignedByte). + +check_positiveInteger(Value) -> + check_constr_int(Value,1,undefined,illegal_positiveInteger). + +check_constr_int(Value,undefined,Max,ErrMsg) -> + case check_integer(Value) of + {ok,Int} when Int =< Max -> + {ok,Int}; + _ -> + {error,{ErrMsg}} + end; +check_constr_int(Value,Min,Max,ErrMsg) -> + case check_integer(Value) of + {ok,Int} when Int >= Min, Int =< Max -> + {ok,Int}; + _ -> + {error,{ErrMsg}} + end. + +%% DateTime on form: '-'? yyyy '-' mm '-' dd 'T' hh ':' mm ':' ss +%% ('.' s+)? (zzzzzz)? +check_dateTime("-"++DateTime) -> + check_dateTime(DateTime); +check_dateTime("+"++_DateTime) -> + {error,{invalid_dateTime,plus_sign}}; +check_dateTime(DateTime) -> + [Date,Time] = string:tokens(DateTime,"T"), + [Y,M,D] = string:tokens(Date,"-"), + check_year(Y), + {ok,_} = check_positive_integer(M), + {ok,_} = check_positive_integer(D), + check_time(Time). + +check_year(Y) when length(Y)>4 -> + Y = string:strip(Y,left,$0), + {ok,list_to_integer(Y)}; +check_year(Y) -> + case list_to_integer(Y) of + Year when Year =/= 0 -> + {ok,Year}; + _ -> + {error,{invalid_year,Y}} + end. + +check_month(Str) -> + case check_positive_integer(Str) of + {ok,Int} when Int >= 1,Int =< 12 -> + {ok,Int}; + _ -> + {error,{invalid_month,Str}} + end. +check_day(Str) -> + case check_positive_integer(Str) of + {ok,Int} when Int >= 1,Int =< 31 -> + {ok,Int}; + _ -> + {error,{invalid_day,Str}} + end. + + +check_time(Time) -> + %% hh:mm:ss (.s+)? TZ + {HMS,TZ} = + case lists:split(8,Time) of + {T,"."++SecFractionsTZ} -> + OnlyDigits = fun(X) when X>=$0,X=<$9 ->true;(_)->false end, + {SecFrac,TZone} = lists:splitwith(OnlyDigits,SecFractionsTZ), + {ok,_} = check_positive_integer(SecFrac), + {T,TZone}; + {T,TZone} -> + {T,TZone} + end, + [H,M,S] = string:tokens(HMS,":"), + {ok,_} = check_hour(H), + {ok,_} = check_minute(M), + {ok,_} = check_second(S), + case TZ of + [] -> + {ok,Time}; %% timezone optional + _ -> + check_timezone(TZ) + end. + +check_hour(Str) -> + case check_positive_integer(Str) of + {ok,H} when H >= 0,H =< 24 -> + {ok,H}; + _ -> + {error,{invalid_hour,Str}} + end. +check_minute(Str) -> + case check_positive_integer(Str) of + {ok,H} when H >= 0,H =< 60 -> + {ok,H}; + _ -> + {error,{invalid_minute,Str}} + end. +check_second(Str) -> + case check_positive_integer(Str) of + {ok,H} when H >= 0,H =< 60 -> + {ok,H}; + _ -> + {error,{invalid_second,Str}} + end. + +check_timezone("Z") -> + {ok,"Z"}; +check_timezone(TZ) -> + [H,M] = string:tokens(TZ,":"), + case check_integer(H) of + {ok,H2} when H2 >= -13, H2 =< 13 -> + case check_positive_integer(M) of + {ok,M2} when M2 >= 0, M2 =< 59 -> + {ok,{H2,M2}}; + _ -> + {error,{invalid_timezone,TZ,M}} + end; + {ok,H2} when H2==14;H2==-14 -> + case check_positive_integer(M) of + {ok,0} -> + {ok,{H2,0}}; + _ -> + {error,{invalid_timezone,TZ}} + end; + _ -> + {error,{invalid_timezone,TZ}} + end. + + +%% the form: '-'? yyyy '-' mm '-' dd zzzzzz? +check_date("-"++Date) -> + check_date(Date); +check_date("+"++_Date) -> + {error,{invalid_date,plus_sign}}; +check_date(Date) -> + {Year,Month,Day} = + case string:tokens(Date,"-+Z") of + [Y,M,D,TZ] -> + {ok,_}=check_timezone(TZ), + {Y,M,D}; + [Y,M,D] -> + {Y,M,D} + end, + {ok,_}=check_year(Year), + {ok,_}=check_month(Month), + {ok,_}=check_day(Day). + +%% gYearMonth on the form: '-'? ccyy '-' mm zzzzzz? +check_gYearMonth("-"++Value) -> + check_gYearMonth(Value); +check_gYearMonth("+"++_Value) -> + {error,{invalid_gYearMonth,plus_sign}}; +check_gYearMonth(Value) -> + {Year,Month} = + case string:tokens(Value,"-+Z") of + [Y,M,TZ] -> + {ok,_} = check_timezone(TZ), + {Y,M}; + [Y,M] -> + {Y,M} + end, + {ok,_} = check_year(Year), + {ok,_} = check_month(Month). + +%% gYear on the form: '-'? ccyy zzzzzz? +check_gYear("-"++Value) -> + check_gYear(Value); +check_gYear("+"++_Value) -> + {error,{invalid_gYear,plus_sign}}; +check_gYear(Value) -> + Year = + case string:tokens(Value,"-+Z") of + [Y,TZ] -> + {ok,_} = check_timezone(TZ), + Y; + [Y] -> + Y + end, + {ok,_} = check_year(Year). + +%% gMonthDay on the form: mm dd zzzzzz? +check_gMonthDay("--"++Value) -> + {M,"-"++DTZ} = lists:split(2,Value), + {ok,_} = check_month(M), + {ok,_} = check_gDay2(DTZ). + +%% dDay on the form dd zzzzzz? +check_gDay("---"++Value) -> + check_gDay2(Value). +check_gDay2(Value) -> + {D,TZ} = lists:split(2,Value), + {ok,_} = check_day(D), + case TZ of + [] -> + {ok,Value}; + _ -> + {ok,_} = check_timezone(TZ) + end. +%% dMonth on the form mm zzzzzz? +check_gMonth("--"++Value) -> + {M,TZ} = lists:split(2,Value), + {ok,_} = check_month(M), + case TZ of + [] -> + {ok,Value}; + _ -> + {ok,_} = check_timezone(TZ) + end. + +check_base64Binary(Value) -> + case catch xmerl_b64Bin:parse(xmerl_b64Bin_scan:scan(Value)) of + {ok,_} -> + {ok,Value}; + Err = {error,_} -> + Err; + {'EXIT',{error,Reason}} -> %% scanner failed on character + {error,Reason}; + {'EXIT',Reason} -> + {error,{internal_error,Reason}} + end. + +%% tokens may not contain the carriage return (#xD), line feed (#xA) +%% nor tab (#x9) characters, that have no leading or trailing spaces +%% (#x20) and that have no internal sequences of two or more spaces. +check_token(V=[32|_]) -> + {error,{invalid_token,leading_space,V}}; +check_token(Value) -> + check_token(Value,Value). +check_token([],Value) -> + {ok,Value}; +check_token([32],V) -> + {error,{invalid_token,trailing_space,V}}; +check_token([9|_T],V) -> + {error,{invalid_token,tab,V}}; +check_token([10|_T],V) -> + {error,{invalid_token,line_feed,V}}; +check_token([13|_T],V) -> + {error,{invalid_token,carriage_return,V}}; +check_token([32,32|_T],V) -> + {error,{invalid_token,double_space,V}}; +check_token([_H|T],V) -> + check_token(T,V). + +%% conform to the pattern [a-zA-Z]{1,8}(-[a-zA-Z0-9]{1,8})* +check_language(Value) -> + check_language(Value,0). +check_language([H|T],N) when H>=$A,H=<$Z -> + check_language(T,N+1); +check_language([H|T],N) when H>=$a,H=<$z -> + check_language(T,N+1); +check_language([$-|T],N) when N>=1,N=<8 -> + check_language2(T,0); +check_language([],N) when N>=1,N=<8 -> + {ok,[]}. +check_language2([H|T],N) when H>=$A,H=<$Z -> + check_language2(T,N+1); +check_language2([H|T],N) when H>=$a,H=<$z -> + check_language2(T,N+1); +check_language2([H|T],N) when H>=$0,H=<$9 -> + check_language2(T,N+1); +check_language2([$-|T],N) when N>=1,N=<8 -> + check_language2(T,0); +check_language2([],N) when N>=1,N=<8 -> + {ok,[]}. + +check_NMTOKEN([H|T]) -> + true = xmerl_lib:is_namechar(H), + check_NMTOKEN2(T). +check_NMTOKEN2([]) -> + {ok,[]}; +check_NMTOKEN2([H|T]) -> + true = xmerl_lib:is_namechar(H), + check_NMTOKEN2(T). + +check_NMTOKENS(Value) -> + TokList = string:tokens(Value," "), + lists:foreach(fun check_NMTOKEN/1,TokList), + {ok,Value}. + +check_Name(Value) -> + true = xmerl_lib:is_name(Value), + {ok,Value}. + +check_NCName(Value) -> + true = xmerl_lib:is_ncname(Value), + {ok,Value}. + +check_ID(Value) -> + %% ID must be a NCName and uniquely identify the element which + %% bear it. Only one ID per element. + true = xmerl_lib:is_ncname(Value), + {ok,Value}. + +check_IDREF(Value) -> + true = xmerl_lib:is_name(Value), + {ok,Value}. + +check_IDREFS(Value) -> + check_list_type(Value,fun check_IDREF/1). + +check_ENTITY(Value) -> + true = xmerl_lib:is_ncname(Value), + {ok,Value}. + +check_ENTITIES(Value) -> + check_list_type(Value,fun check_ENTITY/1). + +check_list_type(Value,BaseTypeFun) -> + Tokens = string:tokens(Value," "), + lists:foreach(BaseTypeFun,Tokens), + {ok,Value}. + +ns_whitespace(WS) when WS==16#9;WS==16#A;WS==16#D -> + true; +ns_whitespace(_) -> + false. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% facet functions +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +facet_fun(Type,{length,V}) -> + length_fun(Type,list_to_integer(V)); +facet_fun(Type,{minLength,V}) -> + minLength_fun(Type,list_to_integer(V)); +facet_fun(Type,{maxLength,V}) -> + maxLength_fun(Type,list_to_integer(V)); +facet_fun(Type,{pattern,V}) -> +%% fun(Val) -> +%% {ok,Val} +%% end; + pattern_fun(Type,V); +facet_fun(Type,{enumeration,V}) -> + enumeration_fun(Type,V); +facet_fun(Type,{whiteSpace,V}) -> + whiteSpace_fun(Type,V); +facet_fun(Type,{maxInclusive,V}) -> + maxInclusive_fun(Type,V); +facet_fun(Type,{maxExclusive,V}) -> + maxExclusive_fun(Type,V); +facet_fun(Type,{minExclusive,V}) -> + minExclusive_fun(Type,V); +facet_fun(Type,{minInclusive,V}) -> + minInclusive_fun(Type,V); +facet_fun(Type,{totalDigits,V}) -> + totalDigits_fun(Type,list_to_integer(V)); +facet_fun(Type,{fractionDigits,V}) -> + fractionDigits_fun(Type,list_to_integer(V)); +facet_fun(Type,F) -> + fun(_X_) -> + io:format("Warning: not valid facet on ~p ~p~n",[Type,F]) + end. + + +length_fun(T,V) + when T==string;T==normalizedString;T==token; + T=='Name';T=='NCName';T==language;T=='ID'; + T=='IDREF';T=='IDREFS';T=='ENTITY';T=='ENTITIES'; + T=='NMTOKEN';T=='NMTOKENS';T==anyURI -> + fun(Val) -> + case string:len(Val) == V of + true -> {ok,Val}; + false -> {error,{length,string:len(Val),should_be,V}} + end + end; +length_fun(T,_V) when T=='NOTATION';T=='QName' -> + fun(Val) -> + {ok,Val} + end; +length_fun(T,V) when T==base64Binary;T==hexBinary -> + fun(Val) -> + case length(Val)==V of + true -> {ok,Val}; + false -> {error,{length,length(Val),xhould_be,V}} + end + end; +length_fun(T,_V) -> + fun(_Val) -> + {error,{length_not_applicable_on,T}} + end. + +minLength_fun(T,V) + when T==string;T==normalizedString;T==token; + T=='Name';T=='NCName';T==language;T=='ID'; + T=='IDREF';T=='IDREFS';T=='ENTITY';T=='ENTITIES'; + T=='NMTOKEN';T=='NMTOKENS';T==anyURI -> + fun(Val) -> + case string:len(Val) >= V of + true -> {ok,Val}; + false -> {error,{minLength,string:len(Val),should_at_least_be,V}} + end + end; +minLength_fun(T,_V) when T=='NOTATION';T=='QName' -> + fun(Val) -> + {ok,Val} + end; +minLength_fun(T,V) when T==base64Binary;T==hexBinary -> + fun(Val) -> + case length(Val) >= V of + true -> {ok,Val}; + false -> {error,{minLength,length(Val),should_at_least_be,V}} + end + end; +minLength_fun(T,_V) -> + fun(_Val) -> + {error,{minLength_not_applicable_on,T}} + end. + +maxLength_fun(T,V) + when T==string;T==normalizedString;T==token; + T=='Name';T=='NCName';T==language;T=='ID'; + T=='IDREF';T=='IDREFS';T=='ENTITY';T=='ENTITIES'; + T=='NMTOKEN';T=='NMTOKENS';T==anyURI -> + fun(Val) -> + case length(Val) of + Len when Len =< V -> {ok,Val}; + _ -> {error,{maxLength,string:len(Val),should_not_be_more_than,V}} + end + end; +maxLength_fun(T,_V) when T=='NOTATION';T=='QName' -> + fun(Val) -> + {ok,Val} + end; +maxLength_fun(T,V) when T==base64Binary;T==hexBinary -> + fun(Val) -> + case length(Val) =< V of + true -> {ok,Val}; + false -> {error,{maxLength,length(Val),should_not_be_more_than,V}} + end + end; +maxLength_fun(T,_V) -> + fun(_Val) -> + {error,{maxLength_not_applicable_on,T}} + end. + +pattern_fun(_Type,RegExp) -> + case xmerl_regexp:setup(RegExp) of + {ok,RE} -> + fun(Val) -> + case xmerl_regexp:first_match(Val,RE) of + {match,_,_} -> {ok,Val}; + _ -> {error,{pattern_mismatch,Val,RegExp}} + end + end; + _ -> + fun(Val) -> + {error,{unsupported_pattern,Val,RegExp}} + end + end. + +enumeration_fun(_Type,V) -> + fun(Val) -> + case lists:member(Val,V) of + true -> {ok,Val}; + false -> {error,{enumeration,Val,should_be_one_of,V}} + end + end. + +whiteSpace_fun(_Type,"preserve") -> + fun(Val) -> + {ok,Val} + end; +whiteSpace_fun(_Type,"replace") -> + fun(Val) -> + {ok,?MODULE:replace_ws(Val,[])} + end; +whiteSpace_fun(_Type,"collapse") -> + fun(Val) -> + {ok,?MODULE:collapse_ws(Val)} + end. + +replace_ws([16#9|T],Acc) -> + replace_ws(T,[16#20|Acc]); +replace_ws([16#a|T],Acc) -> + replace_ws(T,[16#20|Acc]); +replace_ws([16#d|T],Acc) -> + replace_ws(T,[16#20|Acc]); +replace_ws([H|T],Acc) -> + replace_ws(T,[H|Acc]); +replace_ws([],Acc) -> + lists:reverse(Acc). + +collapse_ws(Val) -> + collapse_ws(lists:dropwhile(fun(WS) when ?is_whitespace(WS) ->true;(_) -> false end, + replace_ws(Val,[])),[]). +collapse_ws([16#20,16#20|T],Acc) -> + collapse_ws([16#20|T],Acc); +collapse_ws([H|T],Acc) -> + collapse_ws(T,[H|Acc]); +collapse_ws([],Acc) -> + lists:reverse(lists:dropwhile(fun($ ) ->true;(_) -> false end,Acc)). + +maxInclusive_fun(T,V) + when T==integer;T==positiveInteger;T==negativeInteger; + T==nonNegativeInteger;T==nonPositiveInteger;T==long; + T==unsignedLong;T==int;T==unsignedInt;T==short; + T==unsignedShort;T==byte;T==unsignedByte -> + fun(Val) -> + case (catch list_to_integer(Val) =< list_to_integer(V)) of + true -> + {ok,Val}; + _ -> + {error,{maxInclusive,Val,should_be_less_than_or_equal_with,V}} + end + end; +maxInclusive_fun(T,V) when T==decimal;T==float;T==double -> + fun(Val) -> + case ?MODULE:compare_floats(Val,V) of + gt -> + {error,{maxInclusive,Val,should_be_less_than_or_equal_with,V}}; + Err={error,_} -> Err; + _ -> + {ok,Val} + end + end; +maxInclusive_fun(T,V) when T==duration -> + fun(Val) -> + case ?MODULE:compare_durations(Val,V) of + GT when GT==gt;GT==indefinite -> + {error,{maxInclusive,Val,should_be_less_than_or_equal_with,V}}; + _ -> + {ok,Val} + end + end; +maxInclusive_fun(T,V) when T==dateTime -> + fun(Val) -> + case ?MODULE:compare_dateTime(Val,V) of + GT when GT==gt;GT==indefinite -> + {error,{maxInclusive,Val,should_be_less_than_or_equal_with,V}}; + _ -> + {ok,Val} + end + end; +maxInclusive_fun(T,_V) -> +%% when T==duration;T==dateTime;T==date;T==time;T==gYear;T==gYearMonth; +%% T==gMonth;T==gMonthDay;T==gDay -> + fun(_) -> {error,{maxInclusive,not_implemented_for,T}} end. + +maxExclusive_fun(T,V) + when T==integer;T==positiveInteger;T==negativeInteger; + T==nonNegativeInteger;T==nonPositiveInteger;T==long; + T==unsignedLong;T==int;T==unsignedInt;T==short; + T==unsignedShort;T==byte;T==unsignedByte -> + fun(Val) -> + case (catch list_to_integer(Val) < list_to_integer(V)) of + true -> + {ok,Val}; + _ -> + {error,{maxExclusive,Val,not_less_than,V}} + end + end; +maxExclusive_fun(T,V) when T==decimal;T==float;T==double -> + fun(Val) -> + case ?MODULE:compare_floats(Val,V) of + lt -> + {ok,Val}; + Err={error,_} -> Err; + _ -> + {error,{maxExclusive,Val,not_less_than,V}} + end + end; +maxExclusive_fun(T,V) when T==duration -> + fun(Val) -> + case ?MODULE:compare_durations(Val,V) of + lt -> + {ok,Val}; + _ -> + {error,{maxExclusive,Val,not_less_than,V}} + end + end; +maxExclusive_fun(T,V) when T==dateTime -> + fun(Val) -> + case ?MODULE:compare_dateTime(Val,V) of + lt -> + {ok,Val}; + _ -> + {error,{maxExclusive,Val,not_less_than,V}} + end + end; +maxExclusive_fun(T,_V) -> + fun(_) -> {error,{maxExclusive,not_implemented_for,T}} end. + +minExclusive_fun(T,V) + when T==integer;T==positiveInteger;T==negativeInteger; + T==nonNegativeInteger;T==nonPositiveInteger;T==long; + T==unsignedLong;T==int;T==unsignedInt;T==short; + T==unsignedShort;T==byte;T==unsignedByte -> + fun(Val) -> + case (catch list_to_integer(Val) > list_to_integer(V)) of + true -> + {ok,Val}; + _ -> + {error,{minExclusive,Val,not_greater_than,V}} + end + end; +minExclusive_fun(T,V) when T==decimal;T==float;T==double -> + fun(Val) -> + case ?MODULE:compare_floats(Val,V) of + gt -> + {ok,Val}; + Err={error,_} -> Err; + _ -> + {error,{minExclusive,Val,not_greater_than,V}} + end + end; +minExclusive_fun(T,V) when T==duration -> + fun(Val) -> + case ?MODULE:compare_durations(Val,V) of + gt -> + {ok,Val}; + _ -> + {error,{minExclusive,Val,not_greater_than,V}} + end + end; +minExclusive_fun(T,V) when T==dateTime -> + fun(Val) -> + case ?MODULE:compare_dateTime(Val,V) of + gt -> + {ok,Val}; + _ -> + {error,{minExclusive,Val,not_greater_than,V}} + end + end; +minExclusive_fun(T,_V) -> + fun(_) -> {error,{minExclusive,not_implemented_for,T}} end. + +minInclusive_fun(T,V) + when T==integer;T==positiveInteger;T==negativeInteger; + T==nonNegativeInteger;T==nonPositiveInteger;T==long; + T==unsignedLong;T==int;T==unsignedInt;T==short; + T==unsignedShort;T==byte;T==unsignedByte -> + fun(Val) -> + case (catch list_to_integer(Val) >= list_to_integer(V)) of + true -> + {ok,Val}; + _ -> + {error,{minInclusive,Val,not_greater_than_or_equal_with,V}} + end + end; +minInclusive_fun(T,V) when T==decimal;T==float;T==double -> + fun(Val) -> + case ?MODULE:compare_floats(Val,V) of + lt -> + {error,{minInclusive,Val,not_greater_than_or_equal_with,V}}; + Err={error,_} -> Err; + _ -> + {ok,Val} + end + end; +minInclusive_fun(T,V) when T==duration -> + fun(Val) -> + case ?MODULE:compare_durations(Val,V) of + lt -> + {error,{minInclusive,Val,not_greater_than_or_equal_with,V}}; + _ -> + {ok,Val} + end + end; +minInclusive_fun(T,V) when T==dateTime -> + fun(Val) -> + case ?MODULE:compare_dateTime(Val,V) of + lt -> + {error,{minInclusive,Val,not_greater_than_or_equal_with,V}}; + _ -> + {ok,Val} + end + end; +minInclusive_fun(T,_V) -> + fun(_) -> {error,{minInclusive,not_implemented_for,T}} end. + +totalDigits_fun(T,V) + when T==integer;T==positiveInteger;T==negativeInteger;T==nonNegativeInteger; + T==nonPositiveInteger;T==long;T==unsignedLong;T==int;T==unsignedInt; + T==short;T==unsignedShort;T==byte;T==unsignedByte;T==decimal -> + %% Val is expressible as i * 10^-n where i and n are integers + %% such that |i| < 10^Val and 0 =< n =< Val. + fun(Val) -> + Pred = fun($0)-> true; + (_) -> false + end, + Val2 = lists:dropwhile(Pred,Val), + Length = + case lists:member($.,Val2) of + true -> + length(lists:dropwhile(Pred,lists:reverse(Val2))) -1; + _ -> + length(Val2) + end, + if + Length =< V -> + {ok,Val}; + true -> + {error,{totalDigits,Length,to_many_digits}} + end + end; +totalDigits_fun(T,_V) -> + fun(_) -> {error,{totalDigits,not_applicable,T}} end. + +fractionDigits_fun(T,V) + when T==integer;T==positiveInteger;T==negativeInteger;T==nonNegativeInteger; + T==nonPositiveInteger;T==long;T==unsignedLong;T==int;T==unsignedInt; + T==short;T==unsignedShort;T==byte;T==unsignedByte;T==decimal -> + fun(Val) -> + Len = + case string:tokens(Val,".") of + [_I,Frc] when T==decimal -> + Pred = fun($0)-> true; + (_) -> false + end, + length(lists:dropwhile(Pred,lists:reverse(Frc))); + _ -> + 0 + end, + if + Len =< V -> + {ok,Val}; + true -> + {error,{fractionDigits,Len,to_many_digits_in,Val}} + end + end; +fractionDigits_fun(T,_V) -> + fun(_) -> {error,{fractionDigits,not_applicable,T}} end. + + +%% The relation between F1 and F2 may be eq,lt or gt. +%% lt: F1 < F2 +%% gt: F1 > F2 +compare_floats(F1,F2) when F1=="NaN";F2=="NaN" -> + {error,{not_comparable}}; +compare_floats(F1,F1) -> + eq; +compare_floats(F1,F2) when F1=="INF";F2=="-INF" -> + gt; +compare_floats(F1,F2) when F1=="-INF";F2=="INF" -> + lt; +compare_floats(Str1,Str2) -> + F1={S1,_B1,_D1,_E1} = str_to_float(Str1), + F2={S2,_B2,_D2,_E2} = str_to_float(Str2), +% io:format("F1: ~p~nF2: ~p~n",[F1,F2]), + if + S1=='-',S2=='+' -> lt; + S1=='+',S2=='-' -> gt; +% B1<0 -> compare_floats2(F2,F1); + true -> compare_floats2(F1,F2) + end. +compare_floats2({S1,B1,D1,E1},{_S2,B2,D2,E2}) when B1==0;B2==0 -> + I1 = pow(B1,D1,E1), + I2 = pow(B2,D2,E2), + if I1 > I2 -> + sign(S1,gt); + I1 < I2 -> + sign(S1,lt); + true -> + eq + end; +compare_floats2({S1,B1,D1,E1},{_S2,B2,D2,E2}) -> + %% S1 and S2 have same sign. + I1 = pow(B1,E1),% B1 * round(math:pow(10,E1)), + I2 = pow(B2,E2),%B2 * round(math:pow(10,E2)), + if + I1 > I2 -> sign(S1,gt); + I1 < I2 -> sign(S1,lt); + true -> + %% fractions are compared in lexicographic order + if + D1 == D2 -> eq; + D1 < D2 -> sign(S1,lt); + D1 > D2 -> sign(S1,gt) + end + end. + +str_to_float(String) -> + {Sign,Str} = + case String of + "-"++Str1 -> {'-',Str1}; + _ -> {'+',String} + end, + case string:tokens(Str,".") of + [B,DE] -> + case string:tokens(DE,"Ee") of + [D,E] -> + %% round(math:pow(10,list_to_integer(E))) + {Sign,list_to_integer(B), + remove_trailing_zeros(D), + list_to_integer(E)}; + [D] -> + {Sign,list_to_integer(B), + remove_trailing_zeros(D),0} + end; + [B] -> %% could also be 1E4, but no fraction + case string:tokens(Str,"Ee") of + [I,E] -> + {Sign,list_to_integer(I),"0",list_to_integer(E)}; + _ -> + {Sign,list_to_integer(B),"0",0} + end + end. + +pow(Mantissa,Exponent) -> + case (Mantissa * math:pow(10,Exponent)) of + I when I<1 -> + I; + I -> round(I) + end. + +pow(Mantissa,Fraction,Exponent) -> + (Mantissa * math:pow(10,Exponent)) + + (list_to_integer(Fraction) * math:pow(10,Exponent-length(Fraction))). + +sign('-',gt) -> + lt; +sign('-',lt) -> + gt; +sign(_,Rel) -> + Rel. + +remove_trailing_zeros(Str) -> + Pred = fun($0) ->true;(_) ->false end, + case lists:reverse(lists:dropwhile(Pred,lists:reverse(Str))) of + [] -> + "0"; + Fr -> Fr + end. + + +%% when T==duration;T==dateTime;T==date;T==time;T==gYear;T==gYearMonth; +%% T==gMonth;T==gMonthDay;T==gDay -> + +%% compare_duration(V1,V2) compares V1 to V2 +%% returns gt | lt | eq | indefinite +%% ex: V1 > V2 -> gt +%% +%% V1, V2 on format PnYnMnDTnHnMnS +%% P is always present +%% T is absent iff all time items are absent +%% compare_duration(V1,V2) -> +%% {Y1,M1,D1,H1,M1,S1} = duration_atoms(V1), +%% {Y2,M2,D2,H2,M2,S2} = duration_atoms(V2), +%% YearDiff = Y1 - Y2, +%% MonthsDiff = M1 - M2, +%% DaysDiff = D1 - D2, +compare_durations(V1,V2) -> + %% Four reference dateTimes are used, see XMLSchema part 2, + %% 3.2.6.2. + %% "The order-relation of two duration values x and y is x < y iff + %% s+x < s+y for each qualified dateTime s in the list below." + Ref1_dateTime = {1696,9,1,0,0,0,{pos,0,0}},%1696-09-01T00:00:00Z + Ref2_dateTime = {1697,2,1,0,0,0,{pos,0,0}},%1697-02-01T00:00:00Z + Ref3_dateTime = {1903,3,1,0,0,0,{pos,0,0}},%1903-03-01T00:00:00Z + Ref4_dateTime = {1903,7,1,0,0,0,{pos,0,0}},%1903-07-01T00:00:00Z + CmpRes1=compare_dateTime(normalize_dateTime(add_duration2dateTime(Ref1_dateTime,V1)), + normalize_dateTime(add_duration2dateTime(Ref1_dateTime,V2))), + CmpRes2=compare_dateTime(normalize_dateTime(add_duration2dateTime(Ref2_dateTime,V1)), + normalize_dateTime(add_duration2dateTime(Ref2_dateTime,V2))), + CmpRes3=compare_dateTime(normalize_dateTime(add_duration2dateTime(Ref3_dateTime,V1)), + normalize_dateTime(add_duration2dateTime(Ref3_dateTime,V2))), + CmpRes4=compare_dateTime(normalize_dateTime(add_duration2dateTime(Ref4_dateTime,V1)), + normalize_dateTime(add_duration2dateTime(Ref4_dateTime,V2))), + if + CmpRes1==CmpRes2, + CmpRes1==CmpRes3, + CmpRes1==CmpRes4 -> + CmpRes1; + true -> indefinite + end. + + +compare_dateTime(DT1={_,_,_,_,_,_,Z},DT2={_,_,_,_,_,_,Z}) -> + case DT1<DT2 of + true -> lt; + _ -> + case DT1>DT2 of + true -> + gt; + _ -> eq + end + end; +%% If P contains a time zone and Q does not, compare as follows: +%% 1. P < Q if P < (Q with time zone +14:00) +%% 2. P > Q if P > (Q with time zone -14:00) +%% 3. P <> Q otherwise, that is, if (Q with time zone +14:00) < P < +%% (Q with time zone -14:00) +compare_dateTime(P={_,_,_,_,_,_,{_,_,_}},_Q={Y,M,D,H,Min,S,none}) -> + case compare_dateTime(P,normalize_dateTime({Y,M,D,H,Min,S,{pos,14,0}})) of + lt -> + lt; + _ -> + case compare_dateTime(P,normalize_dateTime({Y,M,D,H,Min,S,{neg,14,0}})) of + gt -> + gt; + _ -> + indefinite + end + end; +%% If P does not contain a time zone and Q does, compare as follows: +%% 1. P < Q if (P with time zone -14:00) < Q. +%% 2. P > Q if (P with time zone +14:00) > Q. +%% 3. P <> Q otherwise, that is, if (P with time zone +14:00) < Q < +%% (P with time zone -14:00) +compare_dateTime(_P={Y,M,D,H,Min,S,none},Q={_,_,_,_,_,_,{_,_,_}}) -> + case compare_dateTime(normalize_dateTime({Y,M,D,H,Min,S,{neg,14,0}}),Q) of + lt -> + lt; + _ -> + case compare_dateTime(normalize_dateTime({Y,M,D,H,Min,S,{pos,14,0}}),Q) of + gt -> + gt; + _ -> + indefinite + end + end; +compare_dateTime(P,Q) when is_list(P) -> + compare_dateTime(normalize_dateTime(dateTime_atoms(P)),Q); +compare_dateTime(P,Q) when is_list(Q) -> + compare_dateTime(P,normalize_dateTime(dateTime_atoms(Q))); +compare_dateTime(_P,_Q) -> + indefinite. + +fQuotient(A,B) when is_float(A) -> + fQuotient(floor(A),B); +fQuotient(A,B) when is_float(B) -> + fQuotient(A,floor(B)); +fQuotient(A,B) when A >= 0, B >= 0 -> + A div B; +fQuotient(A,B) when A < 0, B < 0 -> + A div B; +fQuotient(A,B) -> + case A rem B of + 0 -> + A div B; + _ -> + (A div B) -1 + end. + +fQuotient(A, Low, High) -> + fQuotient(A - Low, High - Low). + +floor(A) -> + case round(A) of + I when I > A -> + I - 1; + I -> I + end. + +modulo(A,B) -> + A - (fQuotient(A,B) * B). + +modulo(A, Low, High) -> + modulo(A - Low, High - Low) + Low. + +maximumDayInMonthFor(YearValue, MonthValue) -> + M = modulo(MonthValue, 1, 13), + Y = YearValue + fQuotient(MonthValue, 1, 13), + monthValue(M,Y). + +monthValue(M,_Y) when M==1;M==3;M==5;M==7;M==8;M==10;M==12 -> + 31; +monthValue(M,_Y) when M==4;M==6;M==9;M==11 -> + 30; +monthValue(_M,Y) -> + case modulo(Y,400) of + 0 -> + 29; + _ -> + case {modulo(Y,100) /= 0,modulo(Y,4)} of + {true,0} -> + 29; + _ -> + 28 + end + end. + +%% S dateTime, D duration +%% result is E dateTime, end of time period with start S and duration +%% D. E = S + D. +add_duration2dateTime(S,D) when is_list(S),is_list(D) -> + Satoms = dateTime_atoms(S), + case duration_atoms(D) of + Datoms = {_,_,_,_,_,_} -> + add_duration2dateTime2(Satoms,Datoms); + Err -> + {error,Err} + end; +add_duration2dateTime(S={_,_,_,_,_,_,_},D) -> + case duration_atoms(D) of + Datoms = {_,_,_,_,_,_} -> + add_duration2dateTime2(S,Datoms); + Err -> + {error,Err} + end. + +add_duration2dateTime2({Syear,Smonth,Sday,Shour,Sminute,Ssec,Szone}, + {Dyears,Dmonths,Ddays,Dhours,Dminutes,Dsecs}) -> + + %% months + Temp1 = Smonth + Dmonths, + Emonth = modulo(Temp1,1,13), + Carry1 = fQuotient(Temp1,1,13), + + %% years + Eyear = Syear + Dyears + Carry1, + + %% seconds + Temp2 = Ssec + Dsecs, + Esecs = modulo(Temp2,60), + Carry2 = fQuotient(Temp2,60), + + %% minutes + Temp3 = Sminute + Dminutes + Carry2, + Eminute = modulo(Temp3,60), + Carry3 = fQuotient(Temp3,60), + + %% hours + Temp4 = Shour + Dhours + Carry3, + Ehour = modulo(Temp4,24), + Carry4 = fQuotient(Temp4,24), + + %% days + TempDays = + case maximumDayInMonthFor(Eyear,Emonth) of + MaxDay when Sday > MaxDay -> + MaxDay; + _ -> + case Sday < 1 of + true -> + 1; + _ -> + Sday + end + end, + {Eyear2,Emonth2,Eday} = + carry_loop(TempDays+Ddays+Carry4,Emonth,Eyear), + {Eyear2,Emonth2,Eday,Ehour,Eminute,Esecs,Szone}. + +carry_loop(Eday,Emonth,Eyear) when Eday < 1 -> + carry_loop(Eday + maximumDayInMonthFor(Eyear,Emonth - 1), + modulo(Emonth - 1,1,13), + Eyear + fQuotient(Emonth - 1,1,13)); +carry_loop(Eday,Emonth,Eyear) -> + case maximumDayInMonthFor(Eyear,Emonth) of + MaxD when Eday > MaxD -> + carry_loop(Eday - maximumDayInMonthFor(Eyear,Emonth), + modulo(Emonth + 1,1,13), + Eyear + fQuotient(Emonth+1,1,13)); + _ -> + {Eyear,Emonth,Eday} + end. + +%% Format: '-'? yyyy '-' mm '-' dd 'T' hh ':' mm ':' ss ('.' s+)? (zzzzzz)? +dateTime_atoms("-" ++ DT) -> + dateTime_atoms(DT,neg); +dateTime_atoms(DT) -> + dateTime_atoms(DT,pos). +dateTime_atoms(S,Sign) -> + [Date,TimeZone] = string:tokens(S,"T"), + [YY,MM,DD] = string:tokens(Date,"-"), + {Zone,ZoneSign,[Hour,Min,Sec]} = + case lists:reverse(TimeZone) of + "Z"++_ -> + {"Z",pos,string:tokens(TimeZone,"Z:")}; + _ -> + ZS = zone_sign(TimeZone), + case string:tokens(TimeZone,"-+") of + [Time,Z] -> + {Z,ZS,string:tokens(Time,":")}; + [Time] -> + {none,ZS,string:tokens(Time,":")} + end + end, + {set_sign(Sign,YY),list_to_integer(MM),list_to_integer(DD), + list_to_integer(Hour),list_to_integer(Min),sign_sec(pos,Sec), + zone_atoms(ZoneSign,Zone)}. + +zone_sign(TimeZone) -> + case lists:member($-,TimeZone) of + true -> + neg; + _ -> + pos + end. + +zone_atoms(_Sign,"Z") -> + {pos,0,0}; +zone_atoms(Sign,Zone) when is_list(Zone) -> + case string:tokens(Zone,":") of + [H,M] -> + {Sign,list_to_integer(H),list_to_integer(M)}; + _ -> none + end; +zone_atoms(_Sign,Zone) -> + Zone. + + +%% Format: '-'? PnYnMnDTnHnMnS +duration_atoms("-P"++Dur) -> + duration_atoms2(Dur,neg); +duration_atoms("P"++Dur) -> + duration_atoms2(Dur,pos); +duration_atoms(Dur) -> + {illegal_duration,Dur}. +duration_atoms2(Dur,Sign) -> + case lists:member($T,Dur) of + true -> %% time atoms exists + case string:tokens(Dur,"T") of + [Date,Time] -> + case duration_atoms_date(Date) of + {Y,M,D} -> + case duration_atoms_time(Time) of + {Hour,Min,Sec} -> + {set_sign(Sign,Y),set_sign(Sign,M), + set_sign(Sign,D),set_sign(Sign,Hour), + set_sign(Sign,Min),sign_sec(Sign,Sec)}; + Err -> + Err + end; + Err -> + Err + end; + [Time] -> + case duration_atoms_time(Time) of + {Hour,Min,Sec} -> + {0,0,0,set_sign(Sign,Hour),set_sign(Sign,Min), + sign_sec(Sign,Sec)}; + Err -> + Err + end; + Err -> + {illegal_duration,Err} + end; + _ -> %% only date coomponents + {Y,M,D} = duration_atoms_date(Dur), + {set_sign(Sign,Y),set_sign(Sign,M),set_sign(Sign,D),0,0,0} + end. + +duration_atoms_date(Date) -> + {Y,Date2} = get_digit(Date,$Y), + {M,Date3} = get_digit(Date2,$M), + {D,Rest} = get_digit(Date3,$D), + case Rest of + "" -> {Y,M,D}; + Err -> {illegal_duration,Err} + end. +duration_atoms_time(Time) -> + {H,Time2} = get_digit(Time,$H), + {M,Time3} = get_digit(Time2,$M), + {S,Rest} = get_sec(Time3), + case Rest of + "" -> + {H,M,S}; + Err -> + {illegal_duration,Err} + end. + +get_digit(Str,Delim) -> + get_digit(Str,Delim,[],Str). +get_digit([Delim|T],Delim,Acc,_Str) -> + {lists:reverse(Acc),T}; +get_digit([H|T],Delim,Acc,Str) when H>=$0,H=<$9 -> + get_digit(T,Delim,[H|Acc],Str); +get_digit([],_,[],_Str) -> + {"0",[]}; +get_digit([],_,_,Str) -> + {"0",Str}; +get_digit(_,_,_,Str) -> + %% this matches both the case when reaching another delimeter and + %% when the string already are emptied. + {"0",Str}. + +get_sec([]) -> + {"0",[]}; +get_sec(Str) -> + get_sec(Str,[],Str). +get_sec([H|T],Acc,Str) when H>=$0,H=<$9 -> + get_sec(T,[H|Acc],Str); +get_sec([$.|T],Acc,Str) -> + get_sec(T,[$.|Acc],Str); +get_sec([$S|T],Acc,_) -> + {lists:reverse(Acc),T}; +get_sec(_,_,Str) -> + {"0",Str}. + + +set_sign(pos,Istr) -> + list_to_integer(Istr); +set_sign(_,Istr) -> + list_to_integer("-"++Istr). +sign_sec(pos,Sec) -> + case lists:member($.,Sec) of + true -> + list_to_float(Sec); + _ -> + list_to_integer(Sec) + end; +sign_sec(_,Sec) -> + sign_sec(pos,"-"++Sec). + +invert_sign(pos) -> + neg; +invert_sign(neg) -> + pos; +invert_sign(S) -> + S. + +normalize_dateTime({Y,M,D,Hour,Min,Sec,{Sign,ZH,ZM}}) -> + %% minutes + TmpMin = Min + set_sign(invert_sign(Sign),integer_to_list(ZM)), + NMin = modulo(TmpMin,60), + Carry1 = fQuotient(TmpMin,60), + + %% hours + TmpHour = Hour + set_sign(invert_sign(Sign),integer_to_list(ZH)) + Carry1, + NHour = modulo(TmpHour,24), + Carry2 = fQuotient(TmpHour,24), + + {NY,NM,ND} = + carry_loop(D+Carry2,M,Y), + {NY,NM,ND,NHour,NMin,Sec,{pos,0,0}}; +normalize_dateTime(DT) -> + DT. |