diff options
Diffstat (limited to 'lib/docbuilder/src/docb_xmerl_tree_cb.erl')
-rw-r--r-- | lib/docbuilder/src/docb_xmerl_tree_cb.erl | 343 |
1 files changed, 343 insertions, 0 deletions
diff --git a/lib/docbuilder/src/docb_xmerl_tree_cb.erl b/lib/docbuilder/src/docb_xmerl_tree_cb.erl new file mode 100644 index 0000000000..d57f55bff8 --- /dev/null +++ b/lib/docbuilder/src/docb_xmerl_tree_cb.erl @@ -0,0 +1,343 @@
%% ``The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved via the world wide web at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either expressed or implied. See
%% the Licence for the specific language governing rights and limitations
%% under the License.
%%
%% The Initial Developer of the Original Code is Ericsson AB.
%% Portions created by Ericsson are Copyright 1999-2006, Ericsson AB.
%% All Rights Reserved.''
%%
%% $Id$
%%
-module(docb_xmerl_tree_cb).

%% This is the XMerL callback module for exporting XML to the internal
%% tree format used by DocBuilder.
%%   {Doc, _Misc} = xmerl_scan:file("file.xml", [{validation,true}])
%%   Tree = xmerl:export([Doc], docb_xmerl_tree_cb)

-export(['#xml-inheritance#'/0]).

-export(['#root#'/4,
         '#text#'/1,
         '#element#'/5]).
-include("xmerl.hrl").

%%--Functions used by xmerl---------------------------------------------

%% '#xml-inheritance#'() -> []
%%   Returns an empty list, i.e. this callback module names no other
%%   callback modules to inherit from.
'#xml-inheritance#'() ->
    [].

%% '#root#'(Data, Attrs, [], E) -> Data
%%   Returns the already exported Data unchanged. Only matches when the
%%   third argument is the empty list.
'#root#'(Data, _Attrs, [], _E) ->
    Data.

%% '#text#'(Text) -> {pcdata, [], Str}
%%   Converts a text node into a pcdata tuple: redundant blanks in front
%%   of a leading newline are stripped (strip_leading_blanks/1) and line
%%   breaks are encoded the way DocBuilder expects (nl/1).
%%   Every text node yields a pcdata tuple. An earlier revision returned
%%   [] for text consisting of a newline followed only by blanks, tabs
%%   and newlines; that special case is no longer applied.
'#text#'(Text) ->
    {pcdata, [], nl(strip_leading_blanks(Text))}.

%% '#element#'(Tag, Data, Attrs, Parents, E) -> {Tag, AllAttrs, NewData}
%%   Tag     = atom()            element name
%%   Data    = list()            already exported children
%%   Attrs   = [#xmlAttribute{}] attributes given in the XML source
%%   Parents = [{Tag, Pos}]      ancestors; the last one names the DTD
%%   Converts an element to the DocBuilder tuple format.
%%   <pre> and <code>: lost line breaks are reinserted and a leading
%%   "\\n" of the first child is dropped; for <code> the leading pcdata
%%   chunks are additionally merged into one.
%%   Other elements are handled according to tag_content/1: pcdata is
%%   removed, merged, or the children are just flattened.
'#element#'(Tag, Data, Attrs, Parents, _E) when Tag =:= pre; Tag =:= code ->
    Content = preformatted_content(Tag, reinsert_nl(Data)),
    {Tag, attrs(get_dtd(Parents), Tag, Attrs), Content};
'#element#'(Tag, Data, Attrs, Parents, _E) ->
    Content = case {tag_content(Tag), Data} of
                  {no_pcdata, _} ->
                      %% remove all pcdata
                      [Child || Child <- Data, not is_pcdata(Child)];
                  {single_pcdata, [_, _ | _]} ->
                      %% merge several pcdata's into one single pcdata
                      fix_single_pcdata(Data);
                  _ ->
                      lists:flatten(Data)
              end,
    {Tag, attrs(get_dtd(Parents), Tag, Attrs), Content}.

%% preformatted_content(Tag, Children) -> NewChildren
%%   Content handling for <pre> and <code>. An element without any
%%   children (for example <pre></pre>) is valid input and yields [],
%%   instead of crashing on a [H|T] match.
preformatted_content(_Tag, []) ->
    [];
preformatted_content(code, [First | Rest]) ->
    fix_single_pcdata([strip_nl(First) | Rest]);
preformatted_content(pre, [First | Rest]) ->
    [strip_nl(First) | Rest].

%% is_pcdata(Term) -> bool()
%%   True for a {pcdata, Attrs, Str} tuple, false for anything else.
is_pcdata({pcdata, _, _}) -> true;
is_pcdata(_) -> false.

%%--Internal functions--------------------------------------------------

%% reinsert_nl(L1) -> L2
%% Workaround for <pre>: Normally empty lines are ignored. However,
%% Xmerl splits lines whenever it encounters an entity. In the case of
%% <pre>, this can cause us to ignore what looks like an empty line
%% but is actually a line break that should be kept, for example
%% in this case:
%% <pre>
%% <input>some command</input>      <-- this line break is lost!
%% &lt;some result&gt;
%% </pre>
%% This function reinserts line breaks where necessary: every empty
%% list in L1 is replaced by a pcdata element holding "\\n".
reinsert_nl(Children) ->
    [case Child of
         [] -> {pcdata, [], "\\n"};
         _  -> Child
     end || Child <- Children].

%% sgmls treats line breaks in a way that DocBuilder relies on and
%% which must be imitated here. Replace all "\n" with "\\n" and add
%% "\n" to the end of each text element.
%% nl(Text) -> Text2
%%   Each newline character in Text becomes the two characters
%%   backslash and 'n'; a single real newline is appended at the end.
nl(Text) ->
    Escaped = lists:flatmap(fun($\n) -> "\\n";
                               (Ch)  -> [Ch]
                            end, Text),
    Escaped ++ "\n".


%% strip_leading_blanks(Str) -> Str
%% Leading spaces and tabs before a newline are always redundant
%% and are therefore stripped off here.
%% If the first character after the leading blanks is not a newline
%% (or there is nothing left), the original string is returned unchanged.

strip_leading_blanks(Str) ->
    IsBlank = fun(Ch) -> Ch =:= $\s orelse Ch =:= $\t end,
    case lists:dropwhile(IsBlank, Str) of
        [$\n | _] = FromNewline -> FromNewline;
        _ -> Str
    end.

%% strip_nl(Elem) -> Elem
%% The XMerL scan will often result in the contents of <pre> or <code>
%% starting with a newline, as the format is normally:
%% <pre>
%% ..contents..
%% </pre>
%% However, this newline must be removed, or the resulting HTML will be
%% <pre>
%%
%% ..content..
%% </pre>
%% Only a pcdata element with empty attribute list whose text starts
%% with the encoded newline "\\n" is changed; anything else is returned
%% as is.
strip_nl(Elem) ->
    case Elem of
        {pcdata, [], [$\\, $n | Text]} -> {pcdata, [], Text};
        _ -> Elem
    end.

%% get_dtd(Parents) -> DTD | none
%%   The DTD is the first element of the last {Tag, Pos} pair in
%%   Parents; none is returned when Parents is empty.
get_dtd(Parents) ->
    case Parents of
        [] ->
            none;
        _ ->
            {DTD, _Pos} = lists:last(Parents),
            DTD
    end.

%% attrs(DTD, Tag, GivenAttrs) -> AllAttrs
%%   DTD = Tag = atom() DTD and tag name
%%   GivenAttrs = [#xmlAttribute{}]
%%   AllAttrs = [{Name, Type, Val}]
%%     Name = string() (uppercase) Example: "VALIGN"
%%     Type = "CDATA" | "TOKEN"
%%     Val = string() (uppercase if type is "TOKEN", as-is otherwise)
%% The XMerL scanning of <file>.xml renders only the given attributes.
%% However, DocBuilder needs also the optional attributes (which not
%% necessarily have been given), so we add them here, using the default
%% values according to the DTDs.
%% NOTE: Uses the information from the DTDs. That is, if some change is
%% done to the DTDs, also this file must be updated. Ideally, the DTDs
%% should be parsed automatically in some way.
%% It can also be noted that this check is superfluous in the case where
%% all attributes are required (except that the attributes are sorted
%% in the same order as in the DTD) and where an optional attribute has
%% type "CDATA" as no sensible default value can be specified in this
%% case.
attrs(DTD, Tag, GivenAttrs) ->
    Defaults = default_attrs(DTD, Tag),
    merge_attrs(Tag, Defaults, GivenAttrs).

%% merge_attrs(Tag, Defaults, GivenAttrs) -> [{Name, Type, Val}]
%%   For each default {Name, Type, DefVal}, in order, use the value of
%%   the given attribute with the same name if there is one, otherwise
%%   DefVal. Given attributes that have no default entry are not part
%%   of the result.
merge_attrs(_Tag, Defaults, GivenAttrs) ->
    lists:map(
      fun({Name, Type, DefVal}) ->
              Val = case lists:keyfind(Name, #xmlAttribute.name, GivenAttrs) of
                        #xmlAttribute{value=Given} -> Given;
                        false -> DefVal
                    end,
              {attr_name(Name), Type, attr_val(Type, Val)}
      end,
      Defaults).

%% attr_name(Atom) -> string()
%%   The attribute name as an uppercase string.
attr_name(Atom) ->
    Name = atom_to_list(Atom),
    string:to_upper(Name).

%% attr_val(Type, Val) -> string()
%%   "CDATA" values are kept as is, "TOKEN" values are uppercased.
attr_val("CDATA", Val) ->
    Val;
attr_val("TOKEN", Val) ->
    string:to_upper(Val).

%% Given the DTD and element tag, return a list [{Name, Type, Value}]
%% where Name (atom) is the name of each possible attribute,
%% Type is "CDATA" or "TOKEN", and
%% Value (lowercase string) its default value.
%% default_attrs(DTD, Tag) -> [{Name, Type, Default}]
%%   Entries that depend on the DTD take precedence over the entries
%%   that are the same for every DTD. Unknown tags have no attributes.
default_attrs(DTD, Tag) ->
    case dtd_attrs(DTD, Tag) of
        undefined -> common_attrs(Tag);
        Attrs     -> Attrs
    end.

%% dtd_attrs(DTD, Tag) -> [{Name, Type, Default}] | undefined
%%   Attribute lists that apply to one particular DTD only.
dtd_attrs(book, contents) ->
    [{level, "TOKEN", "2"}];
dtd_attrs(book, header) ->
    [{titlestyle, "TOKEN", "normal"}];
dtd_attrs(book, onepart) ->
    [{lift, "TOKEN", "no"}];
dtd_attrs(book, parts) ->
    [{lift, "TOKEN", "no"}];
dtd_attrs(book, theheader) ->
    [{tag, "TOKEN", "none"}];
dtd_attrs(bookinsidecover, theheader) ->
    [{tag, "TOKEN", "none"}];
dtd_attrs(report, erlinclude) ->
    [{file, "CDATA", ""},    % required
     {tag,  "CDATA", ""}];   % required
dtd_attrs(report, index) ->
    [{txt, "CDATA", ""}];    % required
dtd_attrs(report, table) ->
    [{width,   "CDATA", "0"},
     {colspec, "CDATA", ""}];
dtd_attrs(_, _) ->
    undefined.

%% common_attrs(Tag) -> [{Name, Type, Default}]
%%   Attribute lists that are the same regardless of DTD.
common_attrs(cell) ->
    [{align,  "TOKEN", "left"},
     {valign, "TOKEN", "middle"}];
common_attrs(cite) ->
    [{id, "CDATA", ""}];     % required
common_attrs(code) ->
    [{type, "TOKEN", "none"}];
common_attrs(codeinclude) ->
    [{file, "CDATA", ""},    % required
     {tag,  "CDATA", ""},
     {type, "TOKEN", "none"}];
common_attrs(erleval) ->
    [{expr, "CDATA", ""}];   % required
common_attrs(fascicule) ->
    [{file,  "CDATA", ""},   % required
     {href,  "CDATA", ""},   % required
     {entry, "TOKEN", "no"}];
common_attrs(image) ->
    [{file, "CDATA", ""}];   % required
common_attrs(include) ->
    [{file, "CDATA", ""}];   % required
common_attrs(list) ->
    [{type, "TOKEN", "bulleted"}];
common_attrs(marker) ->
    [{id, "CDATA", ""}];     % required
common_attrs(path) ->
    [{unix,    "CDATA", ""},
     {windows, "CDATA", ""}];
common_attrs(seealso) ->
    [{marker, "CDATA", ""}]; % required
common_attrs(table) ->
    [{align, "TOKEN", "center"}];
common_attrs(term) ->
    [{id, "CDATA", ""}];     % required
common_attrs(url) ->
    [{href, "CDATA", ""}];   % required
common_attrs(_) ->
    [].

%%--Single PCDATA broken into several fix-------------------------------

%% When text contains an entity, then XMERL splits it into two
%% PCDATA elements, the second starting with the entity.
%%
%% Example:
%% Magnus Fröberg => [{pcdata,[],"Magnus Fr\n"},{pcdata,[],"öberg\n"}]
%%
%% This is not handled by DocBuilder which expects many tags, for
%% example title and aname, to contain a single PCDATA element. (That
%% is also what nsgmls returned.)

%% fix_single_pcdata(Data) -> Data2
%%   When Data starts with two or more pcdata elements (with empty
%%   attribute lists), that leading run is replaced by one pcdata
%%   element holding the concatenated text. Everything after the run,
%%   and any other input, is returned untouched.
fix_single_pcdata([{pcdata,[],_}, {pcdata,[],_} | _] = Data) ->
    {Chunks, Rest} = take_pcdata(Data, []),
    [{pcdata, [], lists:append(Chunks)} | Rest];
fix_single_pcdata(Data) ->
    Data.

%% take_pcdata(Data, Acc) -> {Strings, Rest}
%%   Collects the text of the leading pcdata elements, in order.
take_pcdata([{pcdata,[],Str} | Tail], Acc) ->
    take_pcdata(Tail, [Str | Acc]);
take_pcdata(Rest, Acc) ->
    {lists:reverse(Acc), Rest}.

%% tag_content(Tag) -> single_pcdata | mixed_content | no_pcdata
%%   Classifies what kind of content DocBuilder expects inside Tag.
%%   Tags not listed below are considered to contain no pcdata at all.
tag_content(Tag) ->
    case lists:member(Tag, mixed_content_tags()) of
        true ->
            mixed_content;
        false ->
            case lists:member(Tag, single_pcdata_tags()) of
                true  -> single_pcdata;
                false -> no_pcdata
            end
    end.

%% Tags whose content is a mix of pcdata and elements.
mixed_content_tags() ->
    [cell, copyright, d, em, fsummary, input, item, p, pre, tag].

%% Tags expected to contain one single pcdata element.
%% The original author marked code, path, seealso and url as
%% possibly being mixed content instead.
single_pcdata_tags() ->
    [aname, app, approved, appsummary, b, c, cauthor, checked,
     chowpublished,
     code,                                      % mixed?
     com, comsummary, ctitle, date, docno, email, fascicule, file,
     filesummary, headline, holder, i, icaption, id, legalnotice,
     lib, libsummary, module, modulesummary, name, nametext,
     pagetext,
     path,                                      % mixed?
     prepared, resp, responsible, ret, rev,
     seealso,                                   % mixed?
     shortdef, shorttitle, tcaption, termdef, title,
     url,                                       % mixed
     v, year].
