%%% -*- mode: Erlang; fill-column: 80; comment-column: 75; -*-
%%% Copyright 2012 Erlware, LLC. All Rights Reserved.
%%%
%%% This file is provided to you under the Apache License,
%%% Version 2.0 (the "License"); you may not use this file
%%% except in compliance with the License. You may obtain
%%% a copy of the License at
%%%
%%% http://www.apache.org/licenses/LICENSE-2.0
%%%
%%% Unless required by applicable law or agreed to in writing,
%%% software distributed under the License is distributed on an
%%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%%% KIND, either express or implied. See the License for the
%%% specific language governing permissions and limitations
%%% under the License.
%%%---------------------------------------------------------------------------
%%% @copyright (C) Erlware, LLC.
%%% @copyright (C) 2009, Gordon Guthrie
%%% @doc
%%% Markdown converter: the input is lexed, chopped into typed lines and
%%% then parsed, interpolating reference definitions along the way.
-module(erlmarkdown).

-export([conv/1,
         conv_utf8/1,
         conv_file/2]).

%% Character codes used throughout the module.
-define(SPACE, 32).
-define(TAB, 9).
-define(LF, 10).
-define(CR, 13).
-define(NBSP, 160).
%% Character sequences; these expand to comma separated character
%% literals, so they are only usable inside a list or argument list.
-define(AMP, $&, $a, $m, $p, $;).
-define(COPY, $&, $c, $o, $p, $y, $;).

%%============================================================================
%% API
%%============================================================================

%% the lexer first lexes the input
%% make_lines does 2 passes:
%% * it chops the lexed strings into lines which it represents as a
%%   list of lists
%% * it then types the lines into the following:
%%   * normal lines
%%   * reference style links
%%   * reference style images
%%   * special line types
%%     - blank
%%     - SETEXT header lines
%%     - ATX header lines
%%     - blockquote
%%     - unordered lists
%%     - ordered lists
%%     - code blocks
%%     - horizontal rules
%% the parser then does its magic interpolating the references as appropriate
%%
%% Runs the whole pipeline (lex -> make_lines -> type_lines -> parse) on
%% String and returns whatever parse/2 produces for the typed lines.
-spec conv(list()) -> list().
conv(String) ->
    Lex = lex(String),
    UntypedLines = make_lines(Lex),
    {TypedLines, Refs} = type_lines(UntypedLines),
    parse(TypedLines, Refs).

-spec conv_utf8(list()) -> list().
conv_utf8(Utf8) ->
    %% Decode the UTF-8 input, convert it, and encode the result again.
    Str = xmerl_ucs:from_utf8(Utf8),
    Res = conv(Str),
    xmerl_ucs:to_utf8(Res).

%% Reads FileIn, converts its content with conv/1 and writes the result to
%% FileOut. Returns the result of write/2, or the atom 'error' when FileIn
%% cannot be opened.
conv_file(FileIn, FileOut) ->
    case file:open(FileIn, [read]) of
        {ok, Device} ->
            Input = get_all_lines(Device, []),
            Output = conv(Input),
            write(FileOut, Output);
        _ ->
            error
    end.

%%============================================================================
%% Internal Functions
%%============================================================================

%% Reads every remaining line from Device, closes the device and returns
%% Accum followed by the concatenated lines.
%% The lines are collected in reverse and joined once at the end, so the
%% cost stays linear in the size of the file.
get_all_lines(Device, Accum) ->
    Accum ++ lists:append(get_all_lines1(Device, [])).

%% Worker for get_all_lines/2: returns the lines in file order.
%% A read error closes the device before raising, so the handle is not
%% leaked.
get_all_lines1(Device, RevLines) ->
    case io:get_line(Device, "") of
        eof ->
            file:close(Device),
            lists:reverse(RevLines);
        {error, Reason} ->
            file:close(Device),
            erlang:error({read_failed, Reason});
        Line ->
            get_all_lines1(Device, [Line | RevLines])
    end.

%% Writes Text followed by a newline to File, creating the parent
%% directory first. Returns the result of file:close/1, or the atom
%% 'error' when the file cannot be opened for writing.
write(File, Text) ->
    _Return = filelib:ensure_dir(File),
    case file:open(File, [write]) of
        {ok, Id} ->
            io:fwrite(Id, "~s~n", [Text]),
            file:close(Id);
        _ ->
            error
    end.

%%
%% Parse the lines interpolating the references as appropriate
%%
parse(TypedLines, Refs) ->
    string:strip(p1(TypedLines, Refs, 0, []), both, $\n).

%% goes through the lines
%% Variable 'R' contains the References and 'I' is the indent level
%%
%% NOTE(review): the tag literals in this section ("<p>", "<h1>".."<h6>",
%% "<blockquote>", "<ul>", "<ol>", "<li>", "<pre><code>", "<hr />",
%% " <br />") were re-entered by hand because the previous text of this
%% file had them blanked out. The 3/4 character trimming in parse_list/6
%% confirms the "<p>"/"</p>" pair; the exact spelling of the others should
%% be confirmed against the upstream erlmarkdown sources.

%% Terminal clause
p1([], _R, _I, Acc) ->
    lists:flatten(lists:reverse(Acc));

%% Tags have the highest precedence...
p1([{tag, Tag} | T], R, I, Acc) ->
    case T of
        [] ->
            %% Acc is reversed at the end, hence closing tag first
            p1([], R, I, ["</p>", make_tag_str(Tag, R), "<p>" | Acc]);
        [{blank, _} | T2] ->
            p1(T2, R, I, [make_tag_str(Tag, R) | Acc]);
        _Other ->
            p1(T, R, I, [pad(I) ++ make_tag_str(Tag, R) | Acc])
    end;

p1([{blocktag, [{{{tag, open}, Type}, Tg}] = _Tag} | T], R, I, Acc) ->
    {Block, Rest} = grab_for_blockhtml(T, Type, []),
    Str = lists:flatten([Tg, "\n" | Block]),
    p1(Rest, R, I, [Str | Acc]);

%% blank lines/linefeeds are gobbled down
p1([{Type, _} | T], R, I, Acc)
  when Type == blank orelse Type == linefeed ->
    Rest = grab_empties(T),
    p1(Rest, R, I, [pad(I) ++ "\n" | Acc]);

%% two consecutive normal lines should be concatenated...
%% remembering the pad the second line with the indent...
p1([{normal, P1}, {normal, P2} | T], R, I, Acc) ->
    p1([{normal, merge(P1, pad(I), P2)} | T], R, I, Acc);
%% as should a normal and linefeed

%% setext h1 is a look behind and it overrides blockquote and code...
p1([{normal, P}, {setext_h1, _} | T], R, I, Acc) ->
    p1(T, R, I, [pad(I) ++ "<h1>" ++ make_string(snip(P), R)
                 ++ "</h1>\n\n" | Acc]);
p1([{blockquote, P}, {setext_h1, _} | T], R, I, Acc) ->
    p1(T, R, I, [pad(I) ++ "<h1>" ++ make_string(snip(P), R)
                 ++ "</h1>\n\n" | Acc]);
p1([{{codeblock, P}, _}, {setext_h1, _} | T], R, I, Acc) ->
    p1(T, R, I, [pad(I) ++ "<h1>" ++ make_string(snip(P), R)
                 ++ "</h1>\n\n" | Acc]);
p1([{blockquote, P}, {h2_or_hr, _} | T], R, I, Acc) ->
    p1(T, R, I, [pad(I) ++ "<h2>" ++ make_string(snip(P), R)
                 ++ "</h2>\n\n" | Acc]);
p1([{{codeblock, P}, _}, {h2_or_hr, _} | T], R, I, Acc) ->
    p1(T, R, I, [pad(I) ++ "<h2>" ++ make_string(snip(P), R)
                 ++ "</h2>\n\n" | Acc]);

%% but a setext with no lookbehind is just rendered as a normal line,
%% so change its type and rethrow it
p1([{setext_h1, P} | T], R, I, Acc) ->
    p1([{normal, P} | T], R, I, Acc);

%% setext h2 might be a look behind
p1([{normal, P}, {h2_or_hr, _} | T], R, I, Acc) ->
    P2 = string:strip(make_string(snip(P), R), both, ?SPACE),
    p1(T, R, I, [pad(I) ++ "<h2>" ++ P2 ++ "</h2>\n\n" | Acc]);

%% blockquotes swallow each other
%% replace the first blockquote mark with a space...
p1([{blockquote, P1}, {blockquote, [_ | P2]} | T], R, I, Acc) ->
    p1([{blockquote, merge(P1, pad(I), [{{ws, sp}, " "} | P2])} | T],
       R, I, Acc);
%% blockquotes swallow normal
p1([{blockquote, P1}, {normal, P2} | T], R, I, Acc) ->
    p1([{blockquote, merge(P1, pad(I + 1), P2)} | T], R, I, Acc);
%% blockquote
p1([{blockquote, P} | T], R, I, Acc) ->
    %% the first token must be the '>' marker; it is dropped
    [{{md, gt}, _} | T1] = P,
    T2 = string:strip(make_string(T1, R)),
    p1(T, R, I,
       ["\n<blockquote>\n" ++ pad(I + 1) ++ "<p>" ++ T2
        ++ "</p>\n</blockquote>" | Acc]);

%% one normal is just normal...
p1([{normal, P} | T], R, I, Acc) ->
    P2 = string:strip(make_string(snip(P), R), both, ?SPACE),
    p1(T, R, I, [pad(I) ++ "<p>" ++ P2 ++ "</p>\n" | Acc]);

%% atx headings
p1([{{h1, P}, _} | T], R, I, Acc) ->
    NewP = string:strip(make_string(snip(P), R), right),
    p1(T, R, I, [pad(I) ++ "<h1>" ++ NewP ++ "</h1>\n\n" | Acc]);
p1([{{h2, P}, _} | T], R, I, Acc) ->
    NewP = string:strip(make_string(snip(P), R), right),
    p1(T, R, I, [pad(I) ++ "<h2>" ++ NewP ++ "</h2>\n\n" | Acc]);
p1([{{h3, P}, _} | T], R, I, Acc) ->
    NewP = string:strip(make_string(snip(P), R), right),
    p1(T, R, I, [pad(I) ++ "<h3>" ++ NewP ++ "</h3>\n\n" | Acc]);
p1([{{h4, P}, _} | T], R, I, Acc) ->
    NewP = string:strip(make_string(snip(P), R), right),
    p1(T, R, I, [pad(I) ++ "<h4>" ++ NewP ++ "</h4>\n\n" | Acc]);
p1([{{h5, P}, _} | T], R, I, Acc) ->
    NewP = string:strip(make_string(snip(P), R), right),
    p1(T, R, I, [pad(I) ++ "<h5>" ++ NewP ++ "</h5>\n\n" | Acc]);
p1([{{h6, P}, _} | T], R, I, Acc) ->
    NewP = string:strip(make_string(snip(P), R), right),
    p1(T, R, I, [pad(I) ++ "<h6>" ++ NewP ++ "</h6>\n\n" | Acc]);

%% unordered lists swallow normal and codeblock lines
p1([{{ul, P1}, S1}, {{normal, P2}, S2} | T], R, I , Acc) ->
    p1([{{ul, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc);
p1([{{ul, P1}, S1}, {{codeblock, P2}, S2} | T], R, I , Acc) ->
    p1([{{ul, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc);
p1([{{ul, _P}, _} | _T] = List, R, I, Acc) ->
    {Rest, NewAcc} = parse_list(ul, List, R, I, [], false),
    %% emit the collected items, exactly as the 'ol' clause below does
    p1(Rest, R, I, [pad(I) ++ "<ul>\n" ++ NewAcc
                    ++ pad(I) ++ "</ul>\n" | Acc]);

%% ordered lists swallow normal and codeblock lines
p1([{{ol, P1}, S1}, {{normal, P2}, S2} | T], R, I , Acc) ->
    p1([{{ol, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc);
p1([{{ol, P1}, S1}, {{codeblock, P2}, S2} | T], R, I , Acc) ->
    p1([{{ol, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc);
p1([{{ol, _P}, _} | _T] = List, R, I, Acc) ->
    {Rest, NewAcc} = parse_list(ol, List, R, I, [], false),
    p1(Rest, R, I, [pad(I) ++ "<ol>\n" ++ NewAcc
                    ++ pad(I) ++ "</ol>\n" | Acc]);

%% codeblock consumes any following empty lines
%% and other codeblocks
p1([{{codeblock, P1}, S1}, {{codeblock, P2}, S2} | T], R, I, Acc) ->
    p1([{{codeblock, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc);
p1([{{codeblock, P}, _} | T], R, I, Acc) ->
    Rest = grab_empties(T),
    p1(Rest, R, I, ["<pre><code>" ++ make_string(snip(P), R)
                    ++ "\n</code></pre>\n\n" | Acc]);

%% horizontal rules
p1([{hr, _} | T], R, I, Acc) ->
    p1(T, R, I, ["<hr />" | Acc]);
%% h2_or_hr is greedy for normal lines
p1([{h2_or_hr, P1}, {normal, P2} | T], R, I, Acc) ->
    p1([{normal, lists:flatten([P1 | P2])} | T], R, I, Acc);
%% the clause with a normal before an 'h2_or_hr' has already been
%% handled further up the tree, so this is a bona fide 'hr'...
p1([{h2_or_hr, _} | T], R, I, Acc) ->
    p1(T, R, I, ["<hr />" | Acc]);

%% Now start pulling out inline refs etc, etc
p1([{inlineref, _P} | T], R, I, Acc) ->
    p1(T, R, I, Acc).

%% Collects the lines that follow an opening block tag of type Type until
%% the matching closing block tag. Returns {Strings, RemainingLines}.
%% When the input runs out first, a closing tag built from Type is pushed.
%% NOTE(review): this assumes Type is a string - confirm against the lexer.
grab_for_blockhtml([], Type, Acc) ->
    {lists:reverse(["</" ++ Type ++ ">" | Acc]), []};
grab_for_blockhtml([{blocktag, [{{{tag, close}, Type}, Tg}]} | T],
                   Type, Acc) ->
    {lists:reverse([Tg | Acc]), T};
grab_for_blockhtml([{blocktag, [{{{tag, _}, GrabType}, Tg}]} | T],
                   Type, Acc) when GrabType =/= Type ->
    %% blocktags grabbed in a blocktag need a line ending pushed
    grab_for_blockhtml(T, Type, ["\n", Tg | Acc]);
grab_for_blockhtml([{tag, {{{tag, self_closing}, _Ty}, Tg}} | T],
                   Type, Acc) ->
    grab_for_blockhtml(T, Type, [Tg | Acc]);
grab_for_blockhtml([H | T], Type, Acc) ->
    {_Type, Content} = H,
    Str = make_plain_string(Content),
    grab_for_blockhtml(T, Type, [Str | Acc]).

%% Drops leading linefeed and blank lines.
grab_empties([{linefeed, _} | T]) -> grab_empties(T);
grab_empties([{blank, _} | T])    -> grab_empties(T);
grab_empties(List)                -> List.

%% Joins the tokens of two lines, inserting Pad as a string token between
%% them and turning a hard line end on the first line into a break.
merge(P1, Pad, P2) ->
    NewP1 = make_br(P1),
    lists:flatten([NewP1, {string, Pad} | P2]).

%% Replaces a trailing "whitespace + linefeed" token pair with a break tag.
make_br(List) -> make_br1(lists:reverse(List)).

%% Works on the reversed token list, so the line end comes first.
make_br1([{{lf, _}, _}, {{ws, comp}, _} | T]) ->
    lists:reverse([{tags, " <br />\n"} | T]);
make_br1([{{lf, _}, _}, {{ws, tab}, _} | T]) ->
    lists:reverse([{tags, " <br />\n"} | T]);
make_br1(List) ->
    lists:reverse(List).

%% Builds the padding for indent level N (a list of N one-space strings).
pad(N) -> pad1(N, []).

pad1(0, Acc)            -> Acc;
pad1(N, Acc) when N > 0 -> pad1(N - 1, [" " | Acc]).

%% this is a bit messy because of the way that hard lines are treated...
%% If your li's have a blank line between them the item gets wrapped in a para,
%% if not, they don't
%% BUT if one item is <p> wrapped then the next is too
%%
%% Returns {RemainingLines, ListItemStrings}.
parse_list(_Type, [], _R, _I, A, _) ->
    {[], lists:reverse(A)};
parse_list(Type, [{{Type, P}, _} | T], R, I, A, Wrap) ->
    {Rest, NewP, NewWrap} = grab(T, R, [], Wrap),
    Li = case NewWrap of
             false ->
                 Ret = parse([{normal, P}], R),
                 %% need to strip off the extra <p></p>'s
                 %% (4 characters at the end, 3 at the start)
                 Ret2 = string:left(Ret, length(Ret) - 4),
                 Ret3 = string:right(Ret2, length(Ret2) - 3),
                 Ret3 ++ "\n" ++ NewP ++ pad(I);
             true ->
                 string:strip(parse([{normal, P}], R), right, ?LF)
                     ++ NewP ++ pad(I)
         end,
    %% the next item is wrapped when this one is directly followed by a
    %% linefeed; for an empty tail the value does not matter
    NewWrap2 = case T of
                   [{linefeed, _} | _] -> true;
                   _                   -> false
               end,
    parse_list(Type, Rest, R, I,
               [pad(I) ++ "<li>" ++ string:strip(Li, right, ?LF)
                ++ "</li>\n" | A],
               NewWrap2);
parse_list(_Type, List, _R, _I, A, _) ->
    {List, lists:reverse(A)}.

%% grab grabs normals, double codeblocks, linefeeds and blanks
%% BUT stop grabbing if a normal if preceeded by a linefeed or blank
%% UNLESS the normal starts with white space :(
%% the third return parameter is 'true' if the 'li' should be
%% wrapped in '<p></p>' and false if it shouldn't
grab([{{codeblock, _}, S} | T] = List, R, Acc, W) ->
    case is_blockquote(S, T) of
        {{true, R1}, T2} ->
            %% Acc is reversed on return, hence closing tag first
            grab(T2, R,
                 ["</blockquote>", make_escape_string(R1, R),
                  "<blockquote>" | Acc],
                 W);
        {{esc_false, R1}, _T2} ->
            {R1, lists:reverse(Acc), false};
        {false, T2} ->
            case is_double_indent(S) of
                false ->
                    {List, lists:reverse(Acc), false};
                {true, R2} ->
                    %% if it is a double indent - delete 4 spaces
                    %% no it makes not sense to me neither :(
                    grab(T2, R, [" " ++ make_escape_string(R2, R) | Acc], W)
            end
    end;
grab([{linefeed, _} | T], R, Acc, false) ->
    grab2(T, R, Acc, T, Acc, true);
grab([{linefeed, _} | T], R, Acc, true) ->
    grab2(T, R, ["\n" | Acc], T, Acc, true);
grab([{blank, _} | T], R, Acc, false) ->
    grab2(T, R, Acc, T, Acc, true);
grab([{blank, _} | T], R, Acc, true) ->
    grab2(T, R, ["\n" | Acc], T, Acc, true);
grab([{normal, P} | T], R, Acc, W) ->
    Li = case W of
             false ->
                 make_escape_string(P, R);
             true ->
                 "<p>" ++ string:strip(make_escape_string(P, R), right, ?LF)
                     ++ "</p>"
         end,
    grab(T, R, [Li | Acc], W);
grab(List, _R, Acc, W) ->
    {List, lists:reverse(Acc), W}.

%% the problem is knowing when to grab, if the list is followed by a long
%% string of blank lines and linefeeds and a normal then the linefeeds aren't
%% grabbed
%% if the list if followed by blank lines and linefeeds and a normal with an
%% initial whitespace it is grabbed...
%%
%% LO and AO are the line list and accumulator as they were before the
%% look-ahead started; they are handed back when nothing is grabbed.
grab2([{normal, P2} | T], R, Acc, LO, AO, W) ->
    case P2 of
        [{{ws, _}, _} | T2] ->
            Li = case W of
                     false ->
                         make_escape_string(T2, R);
                     true ->
                         "<p>"
                             ++ string:strip(make_escape_string(T2, R),
                                             right, ?LF)
                             ++ "</p>"
                 end,
            grab(T, R, [Li | Acc], W);
        _ ->
            {LO, AO, false}
    end;
grab2([{linefeed, _} | T], R, Acc, LO, AO, _W) ->
    grab2(T, R, ["\n" | Acc], LO, AO, true);
grab2([{blank, _} | T], R, Acc, LO, AO, _W) ->
    grab2(T, R, ["\n" | Acc], LO, AO, true);
%% We dont want to grab this stuff so return the old list and the old acc
grab2(_List, _R, _Acc, LO, AO, _W) ->
    {LO, AO, true}.

%% Returns {true, RestOfTokens} when the token list starts with a double
%% indent, false otherwise.
is_double_indent(List) -> is_double_indent1(List, 0).

%% double indent is any combination of tabs and spaces that add
%% up to 8
is_double_indent1([], _N)                  -> false;
is_double_indent1(Rest, N) when N > 7      -> {true, Rest};
is_double_indent1([{{ws, sp}, _} | T], N)  -> is_double_indent1(T, N + 1);
is_double_indent1([{{ws, tab}, _} | T], N) -> is_double_indent1(T, N + 4);
is_double_indent1(_List, _N)               -> false.

%% Decides whether the tokens in List open an indented blockquote.
%% Returns {false, T}, {{esc_false, Tokens}, T} for an escaped '>' or
%% {{true, Tokens}, RemainingLines} with the following lines folded in.
is_blockquote(List, T) ->
    case is_bq1(List, 0) of
        false ->
            {false, T};
        {esc_false, R} ->
            {{esc_false, R}, T};
        {true, R} ->
            {NewT, NewR} = grab2(T, R),
            {{true, NewR}, NewT}
    end.

%% N counts the indent seen so far: 1 per space, 4 per tab.
is_bq1([], _N) ->
    false;
is_bq1([{{ws, sp}, _} | T], N) ->
    is_bq1(T, N + 1);
is_bq1([{{ws, tab}, _} | T], N) ->
    is_bq1(T, N + 4);
is_bq1([{{md, gt}, _}, {{ws, _}, _} | T], N) when N > 3 ->
    {true, T};
is_bq1([{{punc, bslash}, _}, {{md, gt}, GT}, {{ws, _}, WS} | T], N)
  when N > 3 ->
    {esc_false, [GT, WS | T]};
is_bq1(_List, _N) ->
    false.

%% Appends the content of the following lines to R, stopping at (and
%% consuming) the first blank line. Returns {RemainingLines, Tokens}.
grab2(List, R) -> gb2(List, lists:reverse(R)).

gb2([], Acc) ->
    {[], lists:flatten(lists:reverse(Acc))};
gb2([{blank, _} | T], Acc) ->
    {T, lists:flatten(lists:reverse(Acc))};
gb2([{_Type, P} | T], Acc) ->
    gb2(T, [P | Acc]).

%%
%% Make the lines from the raw tokens
%%
make_lines(Tokens) -> ml1(Tokens, [], []).

%% A1 holds the current line (reversed), A2 the finished lines (reversed).
ml1([], [], A2) ->
    lists:reverse(A2);
ml1([], A1, A2) ->
    ml1([], [], [lists:reverse(A1) | A2]);
ml1([{{lf, _}, _} = H | T], A1, A2) ->
    %% a linefeed token ends the current line and is kept as its last token
    ml1(T, [], [ml2(H, A1) | A2]);
ml1([H | T], A1, A2) ->
    ml1(T, [H | A1], A2).

ml2(H, List) -> lists:reverse([H | List]).

%%%
%%% Process the lines and give each line a type.
The valid types are: %%% * normal line %%% * reference style links %%% * reference style images %%% * special line types %%% - blank %%% - SETEXT header lines %%% - ATX header lines %%% - unordered lists (including code blocks) %%% - ordered lists (including code blocks) %%% - blockquotes %%% - code blocks %%% - horizontal rules %%% type_lines(Lines) -> {Refs, TypedLines} = type_lines1(Lines, [], []), {strip_lines(TypedLines), Refs}. type_lines1([], A1, A2) -> {A1, lists:reverse(A2)}; type_lines1([[{{ws, sp}, _}, {{inline, open}, _} | T1] = H | T2], A1, A2) -> %% this clause extracts URL and Image refs %% (it is the only one that uses A1 and A2... %% inlines can have up to 3 spaces before it t_inline(H, T1, T2, A1, A2); type_lines1([[{{ws, tab}, _}, {{inline, open}, _} | T1] = H | T2], A1, A2) -> t_inline(H, T1, T2, A1, A2); type_lines1([[{{ws, comp}, W}, {{inline, open}, _} | T1] = H | T2], A1, A2) -> case gt(W, 3) of {true, _R} -> t_inline(H, T1, T2, A1, A2); false -> type_lines1(T1, A1, [{normal , H} | A2]) % same exit at the final clause! end, t_inline(H, T1, T2, A1, A2); type_lines1([[{{inline, open}, _} | T1] = H | T2], A1, A2) -> t_inline(H, T1, T2, A1, A2); type_lines1([[{{md, eq}, _} | _T] = H | T], A1, A2) -> %% types setext lines type_lines1(T, A1, [type_setext_h1(H) | A2]); type_lines1([[{{md, dash}, _} | _T] = H | T], A1, A2) -> %% NOTE 1: generates a ul as the default not a normal line %% NOTE 2: depending on the context this might generate an

    header %% or an
    %% NOTE 3: space - is typed to a bullet down in