aboutsummaryrefslogtreecommitdiffstats
path: root/lib/docbuilder/src/docb_xmerl_tree_cb.erl
diff options
context:
space:
mode:
authorErlang/OTP <[email protected]>2009-11-20 14:54:40 +0000
committerErlang/OTP <[email protected]>2009-11-20 14:54:40 +0000
commit84adefa331c4159d432d22840663c38f155cd4c1 (patch)
treebff9a9c66adda4df2106dfd0e5c053ab182a12bd /lib/docbuilder/src/docb_xmerl_tree_cb.erl
downloadotp-84adefa331c4159d432d22840663c38f155cd4c1.tar.gz
otp-84adefa331c4159d432d22840663c38f155cd4c1.tar.bz2
otp-84adefa331c4159d432d22840663c38f155cd4c1.zip
The R13B03 release.OTP_R13B03
Diffstat (limited to 'lib/docbuilder/src/docb_xmerl_tree_cb.erl')
-rw-r--r--lib/docbuilder/src/docb_xmerl_tree_cb.erl343
1 files changed, 343 insertions, 0 deletions
diff --git a/lib/docbuilder/src/docb_xmerl_tree_cb.erl b/lib/docbuilder/src/docb_xmerl_tree_cb.erl
new file mode 100644
index 0000000000..d57f55bff8
--- /dev/null
+++ b/lib/docbuilder/src/docb_xmerl_tree_cb.erl
@@ -0,0 +1,343 @@
+%% ``The contents of this file are subject to the Erlang Public License,
+%% Version 1.1, (the "License"); you may not use this file except in
+%% compliance with the License. You should have received a copy of the
+%% Erlang Public License along with this software. If not, it can be
+%% retrieved via the world wide web at http://www.erlang.org/.
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either expressed or implied. See
+%% the Licence for the specific language governing rights and limitations
+%% under the License.
+%%
+%% The Initial Developer of the Original Code is Ericsson AB.
+%% Portions created by Ericsson are Copyright 1999-2006, Ericsson AB.
+%% All Rights Reserved.��
+%%
+%% $Id$
+%%
+-module(docb_xmerl_tree_cb).
+
+%% This is the XMerL callback module for exporting XML to the internal
+%% tree format used by DocBuilder.
+%% {Doc, _Misc} = xmerl_scan:file("file.xml", [{validation,true}])
+%% Tree = xmerl:export([Doc], docb_xmerl_tree_cb)
+
+-export(['#xml-inheritance#'/0]).
+
+-export(['#root#'/4,
+ '#text#'/1,
+ '#element#'/5]).
+-include("xmerl.hrl").
+
+%%--Functions used by xmerl---------------------------------------------
+
+'#xml-inheritance#'() ->
+ [].
+
+'#root#'(Data, _Attrs, [], _E) ->
+ Data.
+
+'#text#'(Text) ->
+ Text2 = strip_leading_blanks(Text),
+%% before
+%% case Text2 of
+%% [$\n|T] ->
+%% case is_empty(T) of
+%% true -> [];
+%% false -> {pcdata, [], nl(Text2)}
+%% end;
+%%
+%% _ ->
+%% {pcdata, [], nl(Text2)}
+%% end.
+%% after
+ {pcdata, [], nl(Text2)}.
+
+'#element#'(Tag, Data, Attrs, Parents, _E) when Tag==pre; Tag==code ->
+ [H|T] = reinsert_nl(Data),
+ NewData = [strip_nl(H)|T],
+ NewData2 = case Tag of
+ code ->
+ fix_single_pcdata(NewData);
+ pre ->
+ NewData
+ end,
+ {Tag, attrs(get_dtd(Parents), Tag, Attrs), NewData2};
+'#element#'(Tag, Data, Attrs, Parents, _E) ->
+ NewData = case tag_content(Tag) of
+ no_pcdata -> % remove all pcdata
+ [Dat||
+ Dat <- Data,
+ begin
+ Fun = fun({pcdata,_,_}) -> false;
+ (_) -> true end,
+ Fun(Dat)
+ end];
+ single_pcdata when length(Data)>1 ->
+ %% merge several pcdata's into one single pcdata
+ fix_single_pcdata(Data);
+ _ ->
+ lists:flatten(Data)
+ end,
+ {Tag, attrs(get_dtd(Parents), Tag, Attrs), NewData}.
+
+%%--Internal functions--------------------------------------------------
+
+%% is_empty(Str) -> bool()
+%% Returns true if the string Str only contains blanks, tabs and
+%% newlines, false otherwise.
+%% is_empty("\n" ++ Text) ->
+%% is_empty(Text);
+%% is_empty("\t" ++ Text) ->
+%% is_empty(Text);
+%% is_empty(" " ++ Text) ->
+%% is_empty(Text);
+%% is_empty("") ->
+%% true;
+%% is_empty(_) ->
+%% false.
+
+%% reinsert_nl(L1) -> L2
+%% Workaround for <pre>: Normally empty lines are ignored. However,
+%% Xmerl splits lines whenever it encounters an entity. In the case of
+%% <pre>, this may lead to that we ignores what we think is an empty
+%% line but is actually a line break that should be kept, for example
+%% in this case:
+%% <pre>
+%% <input>some command</input> <-- this line break is lost!
+%% &lt;some result&gt;
+%% </pre>
+%% This function reinserts line breaks where necessary.
+reinsert_nl([[]|T]) ->
+ [{pcdata,[],"\\n"} | reinsert_nl(T)];
+reinsert_nl([H|T]) ->
+ [H | reinsert_nl(T)];
+reinsert_nl([]) ->
+ [].
+
+%% sgmls treats line breaks in a way that DocBuilder relies on and
+%% which must be imitated here. Replace all "\n" with "\\n" and add
+%% "\n" to the end of each text element.
+nl("") ->
+ "\n";
+nl("\n"++Text) ->
+ "\\n"++nl(Text);
+nl([Ch|Text]) ->
+ [Ch|nl(Text)].
+
+
+%% strip_leading_blanks(Str) -> Str
+%% Leading spaces and tabs before a newline are always redundant
+%% and are therefore stripped of here
+%% If no newline is found the original string is returned unchanged
+
+strip_leading_blanks(Str) ->
+ strip_leading_blanks(Str,Str).
+
+strip_leading_blanks([],Str) ->
+ Str;
+strip_leading_blanks([$\s|T],Str) ->
+ strip_leading_blanks(T,Str);
+strip_leading_blanks([$\t|T],Str) ->
+ strip_leading_blanks(T,Str);
+strip_leading_blanks(Rest=[$\n|_],_) ->
+ Rest;
+strip_leading_blanks(_,Str) ->
+ Str.
+
+%% strip_nl(Str) -> Str
+%% The XMerL scan will often result in the contents of <pre> or <code>
+%% starting with a newline, as the format is normally:
+%% <pre>
+%% ..contents..
+%% </pre>
+%% However, this newline must be removed, or the resulting HTML will be
+%% <pre>
+%%
+%% ..content..
+%% </pre>
+strip_nl({pcdata,[],"\\n"++Str}) -> {pcdata,[],Str};
+strip_nl(E) -> E.
+
+get_dtd([]) ->
+ none;
+get_dtd(Parents) ->
+ {DTD, _} = lists:last(Parents),
+ DTD.
+
+%% attrs(DTD, Tag, GivenAttrs) -> AllAttrs
+%% DTD = Tag = atom() DTD and tag name
+%% GivenAttrs = [#xmlAttribute{}]
+%% AllAttrs = [{Name, Type, Val}]
+%% Name = string() (uppercase) Example: "VALIGN"
+%% Type = "CDATA" | "TOKEN"
+%% Val = string() (uppercase if type is "TOKEN", as-is otherwise)
+%% The XMerL scanning of <file>.xml renders only the given attributes.
+%% However, DocBuilder needs also the optional attributes (which not
+%% necessarily have been given), so we add them here, using the default
+%% values according to the DTDs.
+%% NOTE: Uses the information from the DTDs. That is, if some change is
+%% done to the DTDs, also this file must be updated. Ideally, the DTDs
+%% should be parsed automatically in some way.
+%% It can also be noted that this check is superfluous in the case where
+%% all attributes are required (except that the attributes are sorted
+%% in the same order as in the DTD) and where an optional attribute has
+%% type "CDATA" as no sensible default value can be specified in this
+%% case.
+attrs(DTD, Tag, GivenAttrs) ->
+ merge_attrs(Tag, default_attrs(DTD, Tag), GivenAttrs).
+
+merge_attrs(Tag, [{NameA, Type, DefVal}|Default], GivenAttrs) ->
+ Val = case lists:keysearch(NameA, #xmlAttribute.name, GivenAttrs) of
+ {value, #xmlAttribute{value=Val0}} -> Val0;
+ false -> DefVal
+ end,
+ Attr = {attr_name(NameA), Type, attr_val(Type, Val)},
+ [Attr | merge_attrs(Tag, Default, GivenAttrs)];
+merge_attrs(_Tag, [], _GivenAttrs) ->
+ [].
+
+attr_name(Atom) ->
+ string:to_upper(atom_to_list(Atom)).
+
+attr_val("CDATA", Val) -> Val;
+attr_val("TOKEN", Val) -> string:to_upper(Val).
+
+%% Given the DTD and element tag, return a list [{Name, Value}] where
+%% Name (atom) is the name of each possible attribute and
+%% Value (lowercase string) its default value.
+default_attrs(_, cell) ->
+ [{align, "TOKEN", "left"},
+ {valign, "TOKEN", "middle"}];
+default_attrs(_, cite) ->
+ [{id, "CDATA", ""}]; % required
+default_attrs(_, code) ->
+ [{type, "TOKEN", "none"}];
+default_attrs(_, codeinclude) ->
+ [{file, "CDATA", ""}, % required
+ {tag, "CDATA", ""},
+ {type, "TOKEN", "none"}];
+default_attrs(book, contents) ->
+ [{level, "TOKEN", "2"}];
+default_attrs(_, erleval) ->
+ [{expr, "CDATA", ""}]; % required
+default_attrs(report, erlinclude) ->
+ [{file, "CDATA", ""}, % required
+ {tag, "CDATA", ""}]; % required
+default_attrs(_, fascicule) ->
+ [{file, "CDATA", ""}, % required
+ {href, "CDATA", ""}, % required
+ {entry, "TOKEN", "no"}];
+default_attrs(book, header) ->
+ [{titlestyle, "TOKEN", "normal"}];
+default_attrs(_, image) ->
+ [{file, "CDATA", ""}]; % required
+default_attrs(_, include) ->
+ [{file, "CDATA", ""}]; % required
+default_attrs(report, index) ->
+ [{txt, "CDATA", ""}]; % required
+default_attrs(_, list) ->
+ [{type, "TOKEN", "bulleted"}];
+default_attrs(_, marker) ->
+ [{id, "CDATA", ""}]; % required
+default_attrs(book, onepart) ->
+ [{lift, "TOKEN", "no"}];
+default_attrs(book, parts) ->
+ [{lift, "TOKEN", "no"}];
+default_attrs(_, path) ->
+ [{unix, "CDATA", ""},
+ {windows, "CDATA", ""}];
+default_attrs(_, seealso) ->
+ [{marker, "CDATA", ""}]; % required
+default_attrs(report, table) ->
+ [{width, "CDATA", "0"},
+ {colspec, "CDATA", ""}];
+default_attrs(_, table) ->
+ [{align, "TOKEN", "center"}];
+default_attrs(_, term) ->
+ [{id, "CDATA", ""}]; % required
+default_attrs(book, theheader) ->
+ [{tag, "TOKEN", "none"}];
+default_attrs(bookinsidecover, theheader) ->
+ [{tag, "TOKEN", "none"}];
+default_attrs(_, url) ->
+ [{href, "CDATA", ""}]; % required
+default_attrs(_, _) -> [].
+
+%%--Single PCDATA broken into several fix-------------------------------
+
+%% When text contains an entity, then XMERL splits it into two
+%% PCDATA elements, the second starting with the entity.
+%%
+%% Example:
+%% Magnus Fr�berg => [{pcdata,[],"Magnus Fr\n"},{pcdata,[],"�berg\n"}]
+%%
+%% This is not handled by DocBuilder which expects many tags, for
+%% example title and aname, to contain a single PCDATA element. (That
+%% is also what nsgmls returned.)
+
+fix_single_pcdata([{pcdata,[],Str1}, {pcdata,[],Str2}|T]) ->
+ fix_single_pcdata([{pcdata,[],Str1++Str2}|T]);
+fix_single_pcdata(FixedData) ->
+ FixedData.
+
+tag_content(aname) -> single_pcdata;
+tag_content(app) -> single_pcdata;
+tag_content(approved) -> single_pcdata;
+tag_content(appsummary) -> single_pcdata;
+tag_content(b) -> single_pcdata;
+tag_content(c) -> single_pcdata;
+tag_content(cauthor) -> single_pcdata;
+tag_content(cell) -> mixed_content;
+tag_content(checked) -> single_pcdata;
+tag_content(chowpublished) -> single_pcdata;
+tag_content(code) -> single_pcdata; % mixed?
+tag_content(com) -> single_pcdata;
+tag_content(comsummary) -> single_pcdata;
+tag_content(copyright) -> mixed_content;
+tag_content(ctitle) -> single_pcdata;
+tag_content(d) -> mixed_content;
+tag_content(date) -> single_pcdata;
+tag_content(docno) -> single_pcdata;
+tag_content(em) -> mixed_content;
+tag_content(email) -> single_pcdata;
+tag_content(fascicule) -> single_pcdata;
+tag_content(file) -> single_pcdata;
+tag_content(filesummary) -> single_pcdata;
+tag_content(fsummary) -> mixed_content;
+tag_content(headline) -> single_pcdata;
+tag_content(holder) -> single_pcdata;
+tag_content(i) -> single_pcdata;
+tag_content(icaption) -> single_pcdata;
+tag_content(id) -> single_pcdata;
+tag_content(input) -> mixed_content;
+tag_content(item) -> mixed_content;
+tag_content(legalnotice) -> single_pcdata;
+tag_content(lib) -> single_pcdata;
+tag_content(libsummary) -> single_pcdata;
+tag_content(module) -> single_pcdata;
+tag_content(modulesummary) -> single_pcdata;
+tag_content(name) -> single_pcdata;
+tag_content(nametext) -> single_pcdata;
+tag_content(p) -> mixed_content;
+tag_content(pagetext) -> single_pcdata;
+tag_content(path) -> single_pcdata; % mixed?
+tag_content(pre) -> mixed_content;
+tag_content(prepared) -> single_pcdata;
+tag_content(resp) -> single_pcdata;
+tag_content(responsible) -> single_pcdata;
+tag_content(ret) -> single_pcdata;
+tag_content(rev) -> single_pcdata;
+tag_content(seealso) -> single_pcdata; % mixed?
+tag_content(shortdef) -> single_pcdata;
+tag_content(shorttitle) -> single_pcdata;
+tag_content(tag) -> mixed_content;
+tag_content(tcaption) -> single_pcdata;
+tag_content(termdef) -> single_pcdata;
+tag_content(title) -> single_pcdata;
+tag_content(url) -> single_pcdata; % mixed
+tag_content(v) -> single_pcdata;
+tag_content(year) -> single_pcdata;
+tag_content(_) -> no_pcdata.
+
+