aboutsummaryrefslogblamecommitdiffstats
path: root/lib/docbuilder/src/docb_xmerl_tree_cb.erl
blob: bc6206923013c091e585964bcd151038ae603380 (plain) (tree)




























































































































































































                                                                         
                                                                      





















































































































































                                                                        
%% ``The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved via the world wide web at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either expressed or implied. See
%% the Licence for the specific language governing rights and limitations
%% under the License.
%%
%% The Initial Developer of the Original Code is Ericsson AB.
%% Portions created by Ericsson are Copyright 1999-2006, Ericsson AB.
%% All Rights Reserved.��
%%
%%     $Id$
%%
-module(docb_xmerl_tree_cb).

%% This is the XMerL callback module for exporting XML to the internal
%% tree format used by DocBuilder.
%%   {Doc, _Misc} = xmerl_scan:file("file.xml", [{validation,true}])
%%   Tree = xmerl:export([Doc], docb_xmerl_tree_cb)

-export(['#xml-inheritance#'/0]).

-export(['#root#'/4,
	 '#text#'/1,
	 '#element#'/5]).
-include("xmerl.hrl").

%%--Functions used by xmerl---------------------------------------------

'#xml-inheritance#'() ->
    [].

'#root#'(Data, _Attrs, [], _E) ->
    Data.

'#text#'(Text) ->
    Text2 = strip_leading_blanks(Text),
%% before
%%    case Text2 of
%%	[$\n|T] ->
%%	    case is_empty(T) of 
%%		true -> [];
%%		false -> {pcdata, [], nl(Text2)}
%%	    end;
%%
%%	_ ->
%%	    {pcdata, [], nl(Text2)}
%%    end.
%% after
    {pcdata, [], nl(Text2)}.

'#element#'(Tag, Data, Attrs, Parents, _E) when Tag==pre; Tag==code ->
    [H|T] = reinsert_nl(Data),
    NewData = [strip_nl(H)|T],
    NewData2 = case Tag of
		   code ->
		       fix_single_pcdata(NewData);
		   pre ->
		       NewData
	       end,
    {Tag, attrs(get_dtd(Parents), Tag, Attrs), NewData2};
'#element#'(Tag, Data, Attrs, Parents, _E) ->
    NewData = case tag_content(Tag) of
		  no_pcdata -> % remove all pcdata
		      [Dat|| 
			  Dat <- Data, 
			  begin 
			      Fun = fun({pcdata,_,_}) -> false;
				       (_) -> true end,
			      Fun(Dat)
			  end];
		  single_pcdata when length(Data)>1 -> 
		      %% merge several pcdata's into one single pcdata
		      fix_single_pcdata(Data);
		  _ ->
		      lists:flatten(Data)
	      end,
    {Tag, attrs(get_dtd(Parents), Tag, Attrs), NewData}. 

%%--Internal functions--------------------------------------------------

%% is_empty(Str) -> bool()
%% Returns true if the string Str only contains blanks, tabs and
%% newlines, false otherwise.
%% is_empty("\n" ++ Text) ->
%%     is_empty(Text);
%% is_empty("\t" ++ Text) ->
%%     is_empty(Text);
%% is_empty(" " ++ Text) ->
%%     is_empty(Text);
%% is_empty("") ->
%%     true;
%% is_empty(_) ->
%%     false.

%% reinsert_nl(L1) -> L2
%% Workaround for <pre>: Normally empty lines are ignored. However,
%% Xmerl splits lines whenever it encounters an entity. In the case of
%% <pre>, this may lead to that we ignores what we think is an empty
%% line but is actually a line break that should be kept, for example
%% in this case:
%% <pre>
%%   <input>some command</input> <-- this line break is lost!
%%   &lt;some result&gt;
%% </pre>
%% This function reinserts line breaks where necessary.
reinsert_nl([[]|T]) ->
    [{pcdata,[],"\\n"} | reinsert_nl(T)];
reinsert_nl([H|T]) ->
    [H | reinsert_nl(T)];
reinsert_nl([]) ->
    [].

%% sgmls treats line breaks in a way that DocBuilder relies on and
%% which must be imitated here. Replace all "\n" with "\\n" and add
%% "\n" to the end of each text element.
nl("") ->
    "\n";
nl("\n"++Text) ->
    "\\n"++nl(Text);
nl([Ch|Text]) ->
    [Ch|nl(Text)].


%% strip_leading_blanks(Str) -> Str
%% Leading spaces and tabs before a newline are always redundant
%% and are therefore stripped of here
%% If no newline is found the original string is returned unchanged

strip_leading_blanks(Str) ->
    strip_leading_blanks(Str,Str).

strip_leading_blanks([],Str) ->
    Str;
strip_leading_blanks([$\s|T],Str) ->
    strip_leading_blanks(T,Str);
strip_leading_blanks([$\t|T],Str) ->
    strip_leading_blanks(T,Str);
strip_leading_blanks(Rest=[$\n|_],_) ->
    Rest;
strip_leading_blanks(_,Str) ->
    Str.

%% strip_nl(Str) -> Str
%% The XMerL scan will often result in the contents of <pre> or <code>
%% starting with a newline, as the format is normally:
%%   <pre>
%%     ..contents..
%%   </pre>
%% However, this newline must be removed, or the resulting HTML will be
%%   <pre>
%%
%%     ..content..
%%   </pre>
strip_nl({pcdata,[],"\\n"++Str}) -> {pcdata,[],Str};
strip_nl(E) -> E.

get_dtd([]) ->
    none;
get_dtd(Parents) ->
    {DTD, _} = lists:last(Parents),
    DTD.

%% attrs(DTD, Tag, GivenAttrs) -> AllAttrs
%%   DTD = Tag = atom()  DTD and tag name
%%   GivenAttrs = [#xmlAttribute{}]
%%   AllAttrs = [{Name, Type, Val}]
%%     Name = string()  (uppercase) Example: "VALIGN"
%%     Type = "CDATA" | "TOKEN"
%%     Val  = string()  (uppercase if type is "TOKEN", as-is otherwise)
%% The XMerL scanning of <file>.xml renders only the given attributes.
%% However, DocBuilder needs also the optional attributes (which not
%% necessarily have been given), so we add them here, using the default
%% values according to the DTDs.
%% NOTE: Uses the information from the DTDs. That is, if some change is
%% done to the DTDs, also this file must be updated. Ideally, the DTDs
%% should be parsed automatically in some way.
%% It can also be noted that this check is superfluous in the case where
%% all attributes are required (except that the attributes are sorted
%% in the same order as in the DTD) and where an optional attribute has
%% type "CDATA" as no sensible default value can be specified in this
%% case.
attrs(DTD, Tag, GivenAttrs) ->
    merge_attrs(Tag, default_attrs(DTD, Tag), GivenAttrs).

merge_attrs(Tag, [{NameA, Type, DefVal}|Default], GivenAttrs) ->
    Val = case lists:keyfind(NameA, #xmlAttribute.name, GivenAttrs) of
	      #xmlAttribute{value=Val0} -> Val0;
	      false -> DefVal
	  end,
    Attr = {attr_name(NameA), Type, attr_val(Type, Val)},
    [Attr | merge_attrs(Tag, Default, GivenAttrs)];
merge_attrs(_Tag, [], _GivenAttrs) ->
    [].

attr_name(Atom) ->
    string:to_upper(atom_to_list(Atom)).

attr_val("CDATA", Val) -> Val;
attr_val("TOKEN", Val) -> string:to_upper(Val).

%% Given the DTD and element tag, return a list [{Name, Value}] where
%% Name (atom) is the name of each possible attribute and
%% Value (lowercase string) its default value.
default_attrs(_, cell) ->
    [{align, "TOKEN", "left"},
     {valign, "TOKEN", "middle"}];
default_attrs(_, cite) ->
    [{id, "CDATA", ""}]; % required
default_attrs(_, code) ->
    [{type, "TOKEN", "none"}];
default_attrs(_, codeinclude) ->
    [{file, "CDATA", ""}, % required
     {tag, "CDATA", ""},
     {type, "TOKEN", "none"}];
default_attrs(book, contents) ->
    [{level, "TOKEN", "2"}];
default_attrs(_, erleval) ->
    [{expr, "CDATA", ""}]; % required
default_attrs(report, erlinclude) ->
    [{file, "CDATA", ""}, % required
     {tag, "CDATA", ""}]; % required
default_attrs(_, fascicule) ->
    [{file, "CDATA", ""}, % required
     {href, "CDATA", ""}, % required
     {entry, "TOKEN", "no"}];
default_attrs(book, header) ->
    [{titlestyle, "TOKEN", "normal"}];
default_attrs(_, image) ->
    [{file, "CDATA", ""}]; % required
default_attrs(_, include) ->
    [{file, "CDATA", ""}]; % required
default_attrs(report, index) ->
    [{txt, "CDATA", ""}]; % required
default_attrs(_, list) ->
    [{type, "TOKEN", "bulleted"}];
default_attrs(_, marker) ->
    [{id, "CDATA", ""}]; % required
default_attrs(book, onepart) ->
    [{lift, "TOKEN", "no"}];
default_attrs(book, parts) ->
    [{lift, "TOKEN", "no"}];
default_attrs(_, path) ->
    [{unix, "CDATA", ""},
     {windows, "CDATA", ""}];
default_attrs(_, seealso) ->
    [{marker, "CDATA", ""}]; % required
default_attrs(report, table) ->
    [{width, "CDATA", "0"},
     {colspec, "CDATA", ""}];
default_attrs(_, table) ->
    [{align, "TOKEN", "center"}];
default_attrs(_, term) ->
    [{id, "CDATA", ""}]; % required
default_attrs(book, theheader) ->
    [{tag, "TOKEN", "none"}];
default_attrs(bookinsidecover, theheader) ->
    [{tag, "TOKEN", "none"}];
default_attrs(_, url) ->
    [{href, "CDATA", ""}]; % required
default_attrs(_, _) -> [].

%%--Single PCDATA broken into several fix-------------------------------

%% When text contains an entity, then XMERL splits it into two
%% PCDATA elements, the second starting with the entity.
%%
%% Example:
%%  Magnus Fr�berg => [{pcdata,[],"Magnus Fr\n"},{pcdata,[],"�berg\n"}]
%%
%% This is not handled by DocBuilder which expects many tags, for
%% example title and aname, to contain a single PCDATA element. (That
%% is also what nsgmls returned.)

fix_single_pcdata([{pcdata,[],Str1}, {pcdata,[],Str2}|T]) ->
    fix_single_pcdata([{pcdata,[],Str1++Str2}|T]);
fix_single_pcdata(FixedData) ->
    FixedData.

tag_content(aname) -> single_pcdata;
tag_content(app) -> single_pcdata;
tag_content(approved) -> single_pcdata;
tag_content(appsummary) -> single_pcdata;
tag_content(b) -> single_pcdata;
tag_content(c) -> single_pcdata;
tag_content(cauthor) -> single_pcdata;
tag_content(cell) -> mixed_content;
tag_content(checked) -> single_pcdata;
tag_content(chowpublished) -> single_pcdata;
tag_content(code) -> single_pcdata; % mixed?
tag_content(com) -> single_pcdata;
tag_content(comsummary) -> single_pcdata;
tag_content(copyright) -> mixed_content;
tag_content(ctitle) -> single_pcdata;
tag_content(d) -> mixed_content;
tag_content(date) -> single_pcdata;
tag_content(docno) -> single_pcdata;
tag_content(em) -> mixed_content;
tag_content(email) -> single_pcdata;
tag_content(fascicule) -> single_pcdata;
tag_content(file) -> single_pcdata;
tag_content(filesummary) -> single_pcdata;
tag_content(fsummary) -> mixed_content;
tag_content(headline) -> single_pcdata;
tag_content(holder) -> single_pcdata;
tag_content(i) -> single_pcdata;
tag_content(icaption) -> single_pcdata;
tag_content(id) -> single_pcdata;
tag_content(input) -> mixed_content;
tag_content(item) -> mixed_content;
tag_content(legalnotice) -> single_pcdata;
tag_content(lib) -> single_pcdata;
tag_content(libsummary) -> single_pcdata;
tag_content(module) -> single_pcdata;
tag_content(modulesummary) -> single_pcdata;
tag_content(name) -> single_pcdata;
tag_content(nametext) -> single_pcdata;
tag_content(p) -> mixed_content;
tag_content(pagetext) -> single_pcdata;
tag_content(path) -> single_pcdata; % mixed?
tag_content(pre) -> mixed_content;
tag_content(prepared) -> single_pcdata;
tag_content(resp) -> single_pcdata;
tag_content(responsible) -> single_pcdata;
tag_content(ret) -> single_pcdata;
tag_content(rev) -> single_pcdata;
tag_content(seealso) -> single_pcdata; % mixed?
tag_content(shortdef) -> single_pcdata;
tag_content(shorttitle) -> single_pcdata;
tag_content(tag) -> mixed_content;
tag_content(tcaption) -> single_pcdata;
tag_content(termdef) -> single_pcdata;
tag_content(title) -> single_pcdata;
tag_content(url) -> single_pcdata; % mixed
tag_content(v) -> single_pcdata;
tag_content(year) -> single_pcdata;
tag_content(_) -> no_pcdata.