diff options
Diffstat (limited to 'examples/markdown_middleware/src')
4 files changed, 1588 insertions, 18 deletions
diff --git a/examples/markdown_middleware/src/erlmarkdown.erl b/examples/markdown_middleware/src/erlmarkdown.erl new file mode 100644 index 0000000..6cab50d --- /dev/null +++ b/examples/markdown_middleware/src/erlmarkdown.erl @@ -0,0 +1,1581 @@ +%%% -*- mode: Erlang; fill-column: 80; comment-column: 75; -*- +%%% Copyright 2012 Erlware, LLC. All Rights Reserved. +%%% +%%% This file is provided to you under the Apache License, +%%% Version 2.0 (the "License"); you may not use this file +%%% except in compliance with the License. You may obtain +%%% a copy of the License at +%%% +%%% http://www.apache.org/licenses/LICENSE-2.0 +%%% +%%% Unless required by applicable law or agreed to in writing, +%%% software distributed under the License is distributed on an +%%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%%% KIND, either express or implied. See the License for the +%%% specific language governing permissions and limitations +%%% under the License. +%%%--------------------------------------------------------------------------- +%%% @copyright (C) Erlware, LLC. +%%% @copyright (C) 2009, Gordon Guthrie +%%% @doc +-module(erlmarkdown). + +-export([conv/1, + conv_utf8/1, + conv_file/2]). + +-define(SPACE, 32). +-define(TAB, 9). +-define(LF, 10). +-define(CR, 13). +-define(NBSP, 160). +-define(AMP, $&, $a, $m, $p, $;). +-define(COPY, $&, $c, $o, $p, $y, $;). + +%%============================================================================ +%% API +%%============================================================================ + +%% the lexer first lexes the input +%% make_lines does 2 passes: +%% * it chops the lexed strings into lines which it represents as a +%% list of lists +%% * it then types the lines into the following: +%% * normal lines +%% * reference style links +%% * reference style images +%% * special line types +%% - blank +%% - SETEXT header lines +%% - ATX header lines +%% - blockquote +%% - unordered lists +%% - ordered lists +%% - code blocks +%% - horizontal rules +%% the parser then does its magic interpolating the references as appropriate +conv(String) -> + Lex = lex(String), + UntypedLines = make_lines(Lex), + {TypedLines, Refs} = type_lines(UntypedLines), + parse(TypedLines, Refs). + +-spec conv_utf8(list()) -> list(). +conv_utf8(Utf8) -> + Str = xmerl_ucs:from_utf8(Utf8), + Res = conv(Str), + xmerl_ucs:to_utf8(Res). + +conv_file(FileIn, FileOut) -> + case file:open(FileIn, [read]) of + {ok, Device} -> + Input = get_all_lines(Device,[]), + Output = conv(Input), + write(FileOut, Output); + _ -> + error + end. + +%%============================================================================ +%% Internal Functions +%%============================================================================ + +get_all_lines(Device, Accum) -> + case io:get_line(Device,"") of + eof -> + file:close(Device), + Accum; + Line -> + get_all_lines(Device,Accum ++ Line) + end. + +write(File, Text) -> + _Return=filelib:ensure_dir(File), + case file:open(File, [write]) of + {ok, Id} -> + io:fwrite(Id, "~s~n", [Text]), + file:close(Id); + _ -> + error + end. + + +%% +%% Parse the lines interpolating the references as appropriate +%% + +parse(TypedLines, Refs) -> + string:strip(p1(TypedLines, Refs, 0, []), both, $\n). + +%% goes through the lines +%% Variable 'R' contains the References and 'I' is the indent level + +%% Terminal clause +p1([], _R, _I, Acc) -> + lists:flatten(lists:reverse(Acc)); + +%% Tags have the highest precedence... +p1([{tag, Tag} | T], R, I, Acc) -> + case T of + [] -> + p1([], R, I, ["</p>", make_tag_str(Tag, R), "<p>" | Acc]); + [{blank, _} | T2] -> + p1(T2, R, I, [make_tag_str(Tag, R) | Acc]); + _Other -> + p1(T, R, I, [pad(I) ++ make_tag_str(Tag, R) | Acc]) + end; + +p1([{blocktag, [{{{tag, open}, Type}, Tg}] = _Tag} | T], R, I, Acc) -> + {Block, Rest} = grab_for_blockhtml(T, Type, []), + Str = lists:flatten([Tg, "\n" | Block]), + p1(Rest, R, I, [Str | Acc]); + +%% blank lines/linefeeds are gobbled down +p1([{Type, _} | T], R, I, Acc) + when Type == blank orelse Type == linefeed -> + Rest = grab_empties(T), + p1(Rest, R, I, [pad(I) ++ "\n" | Acc]); + +%% two consecutive normal lines should be concatenated... +%% remembering the pad the second line with the indent... +p1([{normal, P1}, {normal, P2} | T], R, I, Acc) -> + p1([{normal, merge(P1, pad(I), P2)} | T], R, I, Acc); +%% as should a normal and linefeed + +%% setext h1 is a look behind and it overrides blockquote and code... +p1([{normal, P}, {setext_h1, _} | T], R, I, Acc) -> + p1(T, R, I, [pad(I) ++ "<h1>" ++ make_string(snip(P), R) + ++ "</h1>\n\n" | Acc]); +p1([{blockquote, P}, {setext_h1, _} | T], R, I, Acc) -> + p1(T, R, I, [pad(I) ++ "<h1>" ++ make_string(snip(P), R) + ++ "</h1>\n\n" | Acc]); +p1([{{codeblock, P}, _}, {setext_h1, _} | T], R, I, Acc) -> + p1(T, R, I, [pad(I) ++ "<h1>" ++ make_string(snip(P), R) + ++ "</h1>\n\n" | Acc]); +p1([{blockquote, P}, {h2_or_hr, _} | T], R, I, Acc) -> + p1(T, R, I, [pad(I) ++ "<h2>" ++ make_string(snip(P), R) + ++ "</h2>\n\n" | Acc]); +p1([{{codeblock, P}, _}, {h2_or_hr, _} | T], R, I, Acc) -> + p1(T, R, I, [pad(I) ++ "<h2>" ++ make_string(snip(P), R) + ++ "</h2>\n\n" | Acc]); + +%% but a setext with no lookbehind is just rendered as a normal line, +%% so change its type and rethrow it +p1([{setext_h1, P} | T], R, I, Acc) -> + p1([{normal, P} | T], R, I, Acc); + +%% setext h2 might be a look behind +p1([{normal, P}, {h2_or_hr, _} | T], R, I, Acc) -> + P2 = string:strip(make_string(snip(P), R), both, ?SPACE), + p1(T, R, I, [pad(I) ++ "<h2>" ++ P2 ++ "</h2>\n\n" | Acc]); + +%% blockquotes swallow each other +%% replace the first blockquote mark with a space... +p1([{blockquote, P1}, {blockquote, [_ | P2]} | T], R, I, Acc) -> + p1([{blockquote, merge(P1, pad(I), [{{ws, sp}, " "} | P2])} | T], R, I, Acc); +%% blockquotes swallow normal +p1([{blockquote, P1}, {normal, P2} | T], R, I, Acc) -> + p1([{blockquote, merge(P1, pad(I + 1), P2)} | T], R, I, Acc); +%% blockquote +p1([{blockquote, P} | T], R, I, Acc) -> + [{{md, gt}, _} | T1] = P, + T2 = string:strip(make_string(T1, R)), + p1(T, R, I, + ["\n<blockquote>\n" ++ pad(I + 1) ++ "<p>" ++ T2 ++ "</p>\n</blockquote>" | Acc]); + +%% one normal is just normal... +p1([{normal, P} | T], R, I, Acc) -> + P2 = string:strip(make_string(snip(P), R), both, ?SPACE), + p1(T, R, I, [pad(I) ++ "<p>" ++ P2 ++ "</p>\n" | Acc]); + +%% atx headings +p1([{{h1, P}, _} | T], R, I, Acc) -> + NewP = string:strip(make_string(snip(P), R), right), + p1(T, R, I, [pad(I) ++ "<h1>" ++ NewP ++ "</h1>\n\n" | Acc]); +p1([{{h2, P}, _} | T], R, I, Acc) -> + NewP = string:strip(make_string(snip(P), R), right), + p1(T, R, I, [pad(I) ++ "<h2>" ++ NewP ++ "</h2>\n\n" | Acc]); +p1([{{h3, P}, _} | T], R, I, Acc) -> + NewP = string:strip(make_string(snip(P), R), right), + p1(T, R, I, [pad(I) ++ "<h3>" ++ NewP ++ "</h3>\n\n" | Acc]); +p1([{{h4, P}, _} | T], R, I, Acc) -> + NewP = string:strip(make_string(snip(P), R), right), + p1(T, R, I, [pad(I) ++ "<h4>" ++ NewP ++ "</h4>\n\n" | Acc]); +p1([{{h5, P}, _} | T], R, I, Acc) -> + NewP = string:strip(make_string(snip(P), R), right), + p1(T, R, I, [pad(I) ++ "<h5>" ++ NewP ++ "</h5>\n\n" | Acc]); +p1([{{h6, P}, _} | T], R, I, Acc) -> + NewP = string:strip(make_string(snip(P), R), right), + p1(T, R, I, [pad(I) ++ "<h6>" ++ NewP ++ "</h6>\n\n" | Acc]); + +%% unordered lists swallow normal and codeblock lines +p1([{{ul, P1}, S1}, {{normal, P2}, S2} | T], R, I , Acc) -> + p1([{{ul, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc); +p1([{{ul, P1}, S1}, {{codeblock, P2}, S2} | T], R, I , Acc) -> + p1([{{ul, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc); +p1([{{ul, _P}, _} | _T] = List, R, I, Acc) -> + {Rest, NewAcc} = parse_list(ul, List, R, I, [], false), + p1(Rest, R, I, [pad(I) ++ "<ul>\n" ++ NewAcc + ++ pad(I) ++ "</ul>\n" | Acc]); + +%% ordered lists swallow normal and codeblock lines +p1([{{ol, P1}, S1}, {{normal, P2}, S2} | T], R, I , Acc) -> + p1([{{ol, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc); +p1([{{ol, P1}, S1}, {{codeblock, P2}, S2} | T], R, I , Acc) -> + p1([{{ol, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc); +p1([{{ol, _P}, _} | _T] = List, R, I, Acc) -> + {Rest, NewAcc} = parse_list(ol, List, R, I, [], false), + p1(Rest, R, I, [pad(I) ++ "<ol>\n" ++ NewAcc + ++ pad(I) ++ "</ol>\n" | Acc]); + +%% codeblock consumes any following empty lines +%% and other codeblocks +p1([{{codeblock, P1}, S1}, {{codeblock, P2}, S2} | T], R, I, Acc) -> + p1([{{codeblock, merge(P1, pad(I), P2)}, S1 ++ S2} | T], R, I, Acc); +p1([{{codeblock, P}, _} | T], R, I, Acc) -> + Rest = grab_empties(T), + p1(Rest, R, I, ["<pre><code>" ++ make_string(snip(P), R) + ++ "\n</code></pre>\n\n" | Acc]); + +%% horizontal rules +p1([{hr, _} | T], R, I, Acc) -> + p1(T, R, I, ["<hr />" | Acc]); +%% h2_or_hr is greedy for normal lines +p1([{h2_or_hr, P1}, {normal, P2} | T], R, I, Acc) -> + p1([{normal, lists:flatten([P1 | P2])} | T], R, I, Acc); +%% the clause with a normal before an 'h2_or_hr' has already been +%% handled further up the tree, so this is a bona fide 'hr'... +p1([{h2_or_hr, _} | T], R, I, Acc) -> + p1(T, R, I, ["<hr />" | Acc]); + +%% Now start pulling out inline refs etc, etc +p1([{inlineref, _P} | T], R, I, Acc) -> + p1(T, R, I, Acc). + +grab_for_blockhtml([], Type, Acc) -> + {lists:reverse(["</" ++ Type ++ ">" | Acc]), []}; +grab_for_blockhtml([{blocktag, [{{{tag, close}, Type}, Tg}]} + | T], Type, Acc) -> + {lists:reverse([Tg | Acc]), T}; +grab_for_blockhtml([{blocktag, [{{{tag, _}, GrabType}, Tg}]} + | T], Type, Acc) when GrabType =/= Type -> + %% blocktags grabbed in a blocktag need a line ending pushed + grab_for_blockhtml(T, Type, ["\n", Tg | Acc]); +grab_for_blockhtml([{tag, {{{tag, self_closing}, _Ty}, Tg}} + | T], Type, Acc) -> + grab_for_blockhtml(T, Type, [Tg | Acc]); +grab_for_blockhtml([H | T], Type, Acc) -> + {_Type, Content} = H, + Str = make_plain_string(Content), + grab_for_blockhtml(T, Type, [Str | Acc]). + +grab_empties([{linefeed, _} | T]) -> grab_empties(T); +grab_empties([{blank, _} | T]) -> grab_empties(T); +grab_empties(List) -> List. + +merge(P1, Pad, P2) -> + NewP1 = make_br(P1), + lists:flatten([NewP1, {string, Pad} | P2]). + +make_br(List) -> make_br1(lists:reverse(List)). + +make_br1([{{lf, _}, _}, + {{ws, comp}, _} | T]) -> lists:reverse([{tags, " <br />\n"} | T]); +make_br1([{{lf, _}, _}, + {{ws, tab}, _} | T]) -> lists:reverse([{tags, " <br />\n"} | T]); +make_br1(List) -> lists:reverse(List). + +pad(N) -> + pad1(N, []). + +pad1(0, Acc) -> + Acc; +pad1(N, Acc) + when N > 0 -> + pad1(N - 1, [" " | Acc]). + +%% this is a bit messy because of the way that hard lines are treated... +%% If your li's have a blank line between them the item gets wrapped in a para, +%% if not, they don't +%% BUT if one item is <p> wrapped then the next is too +parse_list(_Type, [], _R, _I, A, _) -> + {[], lists:reverse(A)}; +parse_list(Type, [{{Type, P}, _} | T], R, I, A, Wrap) -> + {Rest, NewP, NewWrap} = grab(T, R, [], Wrap), + Li = + case NewWrap of + false -> + Ret = parse([{normal, P}], R), + %% need to strip off the extra <p></p>'s + Ret2 = string:left(Ret, length(Ret) - 4), + Ret3 = string:right(Ret2, length(Ret2) -3), + Ret3 ++ "\n" ++ NewP ++ pad(I); + true -> + string:strip(parse([{normal, P}], R), right, ?LF) + ++ NewP ++ pad(I) + end, + NewWrap2 = + case T of + [] -> + false; % doesnt matter + [H2 | _T2] -> + case H2 of + {linefeed, _} -> + true; + _ -> + false + end + end, + parse_list(Type, Rest, R, I, [pad(I) ++ "<li>" + ++ string:strip(Li, right, ?LF) + ++ "</li>\n" | A], NewWrap2); +parse_list(_Type, List, _R, _I, A, _) -> + {List, lists:reverse(A)}. + +%% grab grabs normals, double codeblocks, linefeeds and blanks +%% BUT stop grabbing if a normal if preceeded by a linefeed or blank +%% UNLESS the normal starts with white space :( +%% the third return parameter is 'true' if the 'li' should be +%% wrapped in '<p></p>' and false if it shouldn't +grab([{{codeblock, _}, S} | T] = List, R, Acc, W) -> + case is_blockquote(S, T) of + {{true, R1}, T2} -> + grab(T2, R, ["</blockquote>", + make_escape_string(R1, R), + "<blockquote>" | Acc], W); + {{esc_false, R1}, _T2} -> + {R1, lists:reverse(Acc), false}; + {false, T2} -> + case is_double_indent(S) of + false -> + {List, lists:reverse(Acc), false}; + {true, R2} -> + %% if it is a double indent - delete 4 spaces + %% no it makes not sense to me neither :( + grab(T2, R, [" " ++ make_escape_string(R2, R) | Acc], W) + end + end; +grab([{linefeed, _} | T], R, Acc, false) -> + grab2(T, R, Acc, T, Acc, true); +grab([{linefeed, _} | T], R, Acc, true) -> + grab2(T, R, ["\n" | Acc], T, Acc, true); +grab([{blank, _} | T], R, Acc, false) -> + grab2(T, R, Acc, T, Acc, true); +grab([{blank, _} | T], R, Acc, true) -> + grab2(T, R, ["\n" | Acc], T, Acc, true); +grab([{normal, P} | T], R, Acc, W) -> + Li = case W of + false -> + make_escape_string(P, R); + true -> + "<p>"++ string:strip(make_escape_string(P, R), right, ?LF) + ++ "</p>" + end, + grab(T, R, [Li | Acc], W); +grab(List, _R, Acc, W) -> + {List, lists:reverse(Acc), W}. + +%% the problem is knowing when to grab, if the list is followed by a long +%% string of blank lines and linefeeds and a normal then the linefeeds aren't +%% grabbed +%% if the list if followed by blank lines and linefeeds and a normal with an +%% initial whitespace it is grabbed... +grab2([{normal, P2} | T], R, Acc, LO, AO, W) -> + case P2 of + [{{ws, _}, _} | T2] -> + Li = case W of + false -> + make_escape_string(T2, R); + true -> + "<p>" ++ + string:strip(make_escape_string(T2, R), right, ?LF) + ++ "</p>" + end, + grab(T, R, [Li | Acc], W); + _ -> + {LO, AO, false} + end; +grab2([{linefeed, _} | T], R, Acc, LO, AO, _W) -> + grab2(T, R, ["\n" | Acc], LO, AO, true); +grab2([{blank, _} | T], R, Acc, LO, AO, _W) -> + grab2(T, R, ["\n" | Acc], LO, AO, true); +%% We dont want to grab this stuff so return the old list and the old acc +grab2(_List, _R, _Acc, LO, AO, _W) -> + {LO, AO, true}. + +is_double_indent(List) -> + is_double_indent1(List, 0). + +%% double indent is any combination of tabs and spaces that add +%% up to 8 +is_double_indent1([], _N) -> + false; +is_double_indent1(Rest, N) + when N > 7 -> + {true, Rest}; +is_double_indent1([{{ws, sp}, _} | T], N) -> + is_double_indent1(T, N + 1); +is_double_indent1([{{ws, tab}, _} | T], N) -> + is_double_indent1(T, N + 4); +is_double_indent1(_List, _N) -> + false. + +is_blockquote(List, T) -> + case is_bq1(List, 0) of + false -> + {false, T}; + {esc_false, R} -> + {{esc_false, R}, T}; + {true, R} -> + {NewT, NewR} = grab2(T, R), + {{true, NewR}, NewT} + end. + +is_bq1([], _N) -> + false; +is_bq1([{{ws, sp}, _} | T], N) -> + is_bq1(T, N + 1); +is_bq1([{{ws, tab}, _} | T], N) -> + is_bq1(T, N + 4); +is_bq1([{{md, gt}, _}, {{ws, _}, _} | T], N) + when N > 3 -> + {true, T}; +is_bq1([{{punc, bslash}, _}, {{md, gt}, GT}, {{ws, _}, WS} | T], N) + when N > 3 -> + {esc_false, [GT, WS | T]}; +is_bq1(_List, _N) -> + false. + +grab2(List, R) -> + gb2(List, lists:reverse(R)). + +gb2([], Acc) -> + {[], lists:flatten(lists:reverse(Acc))}; +gb2([{blank, _} | T], Acc) -> + {T, lists:flatten(lists:reverse(Acc))}; +gb2([{_Type, P} | T], Acc) -> + gb2(T, [P | Acc]). + +%% +%% Make the lines from the raw tokens +%% +make_lines(Tokens) -> + ml1(Tokens, [], []). + +ml1([], [], A2) -> + lists:reverse(A2); +ml1([], A1, A2) -> + ml1([], [], [lists:reverse(A1) | A2]); +ml1([{{lf, _}, _} = H | T], A1, A2) -> + ml1(T, [], [ml2(H, A1) | A2]); +ml1([H | T], A1, A2) -> + ml1(T, [H | A1], A2). + +ml2(H, List) -> + lists:reverse([H | List]). + +%%% +%%% Process the lines and give each line a type. The valid types are: +%%% * normal line +%%% * reference style links +%%% * reference style images +%%% * special line types +%%% - blank +%%% - SETEXT header lines +%%% - ATX header lines +%%% - unordered lists (including code blocks) +%%% - ordered lists (including code blocks) +%%% - blockquotes +%%% - code blocks +%%% - horizontal rules +%%% +type_lines(Lines) -> + {Refs, TypedLines} = type_lines1(Lines, [], []), + {strip_lines(TypedLines), Refs}. + +type_lines1([], A1, A2) -> + {A1, lists:reverse(A2)}; +type_lines1([[{{ws, sp}, _}, {{inline, open}, _} | T1] = H | T2], A1, A2) -> + %% this clause extracts URL and Image refs + %% (it is the only one that uses A1 and A2... + %% inlines can have up to 3 spaces before it + t_inline(H, T1, T2, A1, A2); +type_lines1([[{{ws, tab}, _}, {{inline, open}, _} | T1] = H | T2], A1, A2) -> + t_inline(H, T1, T2, A1, A2); +type_lines1([[{{ws, comp}, W}, {{inline, open}, _} | T1] = H | T2], A1, A2) -> + case gt(W, 3) of + {true, _R} -> + t_inline(H, T1, T2, A1, A2); + false -> + type_lines1(T1, A1, [{normal , H} | A2]) % same exit at the final clause! + end, + t_inline(H, T1, T2, A1, A2); +type_lines1([[{{inline, open}, _} | T1] = H | T2], A1, A2) -> + t_inline(H, T1, T2, A1, A2); +type_lines1([[{{md, eq}, _} | _T] = H | T], A1, A2) -> + %% types setext lines + type_lines1(T, A1, [type_setext_h1(H) | A2]); +type_lines1([[{{md, dash}, _} | _T] = H | T], A1, A2) -> + %% NOTE 1: generates a ul as the default not a normal line + %% NOTE 2: depending on the context this might generate an <h2> header + %% or an <hr /> + %% NOTE 3: space - is typed to a bullet down in <ul> land... + type_lines1(T, A1, [type_setext_h2(H) | A2]); +type_lines1([[{{md, atx}, _} | _T] = H | T], A1, A2) -> + %% types atx lines + type_lines1(T, A1, [type_atx(H) | A2]); +type_lines1([[{{md, gt}, _} | []] = H | T], A1, A2) -> + %% types blockquotes + %% a blockquote on its own or followed by a linefeed is + %% displayed 'as is' by showdown + type_lines1(T, A1, [{normal, H} | A2]); +type_lines1([[{{md, gt}, _}, {{lf, _}, _} | []] = H | T], A1, A2) -> + type_lines1(T, A1, [{normal, H} | A2]); +%% one with anything after it starts a blockquote +type_lines1([[{{md, gt}, _} | _T1] = H | T], A1, A2) -> + type_lines1(T, A1, [{blockquote, H} | A2]); +type_lines1([[{{ws, _}, _}, {{md, star}, _} = ST1, {{ws, _}, _} = WS1 | T1] = H | T], A1, A2) -> + %% types unordered lists lines + %% NOTE 1: the dashed version is generated in type_setext_h2 + %% NOTE 2: the asterix version also might generate a horizontal rule + %% which is why it jumps to type_star2 <-- note the 2!! + type_lines1(T, A1, [{type_star2([ST1, WS1 | T1]), H} | A2]); +type_lines1([[{{md, star}, _}, {{ws, _}, _} | _T1] = H | T], A1, A2) -> + type_lines1(T, A1, [{type_star2(H), H} | A2]); +type_lines1([[{{ws, _}, _}, {{md, plus}, _}, {{ws, _}, _} = W | T1] = H | T], A1, A2) -> + type_lines1(T, A1, [{{ul, make_list_str([W | T1])}, H} | A2]); +type_lines1([[{{md, plus}, _}, {{ws, _}, _} = W | T1] = H | T], A1, A2) -> + type_lines1(T, A1, [{{ul, make_list_str([W | T1])}, H} | A2]); +%% UL based on dashes +type_lines1([[{{ws, _}, _}, {{md, dash}, _}, {{ws, _}, _} = W | T1] = H | T], A1, A2) -> + type_lines1(T, A1, [{{ul, make_list_str([W | T1])}, H} | A2]); +type_lines1([[{{ws, _}, _}, {num, _} = N1| T1] | T], A1, A2) -> + %% types ordered lists... + type_lines1(T, A1, [type_ol([N1 | T1]) | A2]); +type_lines1([[{num, _} | _T] = H | T], A1, A2) -> + type_lines1(T, A1, [type_ol(H) | A2]); +type_lines1([[{{md, underscore}, _} | _T1] = H | T], A1, A2) -> + %% types horizontal rules for stars and underscores + %% dashes and some stars are done elsewhere... + type_lines1(T, A1, [type_underscore(H) | A2]); +type_lines1([[{{md, star}, _} | _T1] = H | T], A1, A2) -> + type_lines1(T, A1, [type_star(H) | A2]); +type_lines1([[{{{tag, _Type}, Tag}, _ } = H | T1] = List | T], A1, A2) -> + %% Block level tags - these are look ahead they must be + %% on a single line (ie directly followed by a lf and nothing else + case is_blank(T1) of + false -> + type_lines1(T, A1, [{normal , List} | A2]); + true -> + case is_block_tag(Tag) of + true -> + type_lines1(T, A1, [{blocktag , [H]} | A2]); + false -> + type_lines1(T, A1, [{tag, [H | T1]} | A2]) + end + end; +type_lines1([[{{lf, _}, _}| []] = H | T], A1, A2) -> + %% types a blank line or a code block + type_lines1(T, A1, [{linefeed, H} | A2]); +type_lines1([[{{ws, _}, _} | _T1] = H | T], A1, A2) -> + type_lines1(T, A1, [type_ws(H) | A2]); + +%% Final clause... +type_lines1([H | T], A1, A2) -> + type_lines1(T, A1, [{normal , H} | A2]). + +t_inline(H, T1, T2, A1, A2) -> + case snip_ref(T1) of + {Type, {Id, {Url, Title}}} -> + type_lines1(T2, lists:flatten([{Id, {Url, Title}} | A1]), + [{Type, H} | A2]); + normal -> + type_lines1(T2, A1, [{normal, H} | A2]) + end. + +%% strips blanks from the beginning and end +strip_lines(List) -> + lists:reverse(strip_lines1(lists:reverse(strip_lines1(List)))). + +strip_lines1([{linefeed, _} | T]) -> + strip_lines1(T); +strip_lines1([{blank, _} | T]) -> + strip_lines1(T); +strip_lines1(List) -> + List. + +%% +%% Loads of type rules... +%% +is_blank([]) -> + true; +is_blank([{{lf, _}, _} | []]) -> + true; +is_blank([{{ws, _}, _} | T]) -> + is_blank(T); +is_blank(_List) -> + false. + +is_block_tag("address") -> + true; +is_block_tag("blockquote") -> + true; +is_block_tag("center") -> + true; +is_block_tag("dir") -> + true; +is_block_tag("div") -> + true; +is_block_tag("dl") -> + true; +is_block_tag("fieldset") -> + true; +is_block_tag("form") -> + true; +is_block_tag("h1") -> + true; +is_block_tag("h2") -> + true; +is_block_tag("h3") -> + true; +is_block_tag("h4") -> + true; +is_block_tag("h5") -> + true; +is_block_tag("h6") -> + true; +is_block_tag("hr") -> + true; +is_block_tag("isindex") -> + true; +is_block_tag("menu") -> + true; +is_block_tag("noframes") -> + true; +is_block_tag("noscript") -> + true; +is_block_tag("ol") -> + true; +is_block_tag("p") -> + true; +is_block_tag("pre") -> + true; +is_block_tag("table") -> + true; +is_block_tag("thead") -> + true; +is_block_tag("tbody") -> + true; +is_block_tag("tr") -> + true; +is_block_tag("td") -> + true; +is_block_tag("ul") -> + true; +is_block_tag(_Other) -> + false. + +type_underscore(List) -> + case type_underscore1(trim_right(List)) of + hr -> + {hr, List}; + maybe -> + {type_underscore2(List), List} + end. + +type_underscore1([]) -> + hr; +type_underscore1([{{md, underscore}, _} | T]) -> + type_underscore1(T); +type_underscore1(_List) -> + maybe. + +type_underscore2(List) -> + case trim_right(List) of % be permissive of trailing spaces + [{{md, underscore}, _}, {{ws, _}, _}, + {{md, underscore}, _}, {{ws, _}, _}, + {{md, underscore}, _}] -> + hr; + _Other -> + normal + end. + +type_star(List) -> + Trim = trim_right(List), + case type_star1(Trim) of % be permssive of trailing spaces + hr -> + {hr, trim_right(Trim)}; + maybe -> + Type = type_star2(List), + %% if it is a normal line we prepend it with a special + %% non-space filling white space character + case Type of + normal -> + {normal, [{{ws, none}, none} | List]}; + _ -> + {Type, List} + end + end. + +type_star1([]) -> + hr; +type_star1([{{md, star}, _} | T]) -> + type_star1(T); +type_star1(_List) -> + maybe. + +type_star2(List) -> + case trim_right(List) of + [{{md, star}, _}, {{ws, _}, _}, + {{md, star}, _}, {{ws, _}, _}, + {{md, star}, _}] -> + hr; + _Other -> + case List of + [{{md, star}, _}, + {{ws, _}, _}= WS | T] -> + {ul, make_list_str([WS | T])}; + _Other2 -> + normal + end + end. + +type_ol(List) -> + case type_ol1(List, []) of + normal -> + {normal, List}; + {ol, Str} -> + {{ol, Str}, List}; + {esc_normal, Str} -> + {normal, Str} + end. + +%% this line terminates on an escaped fullstop after a number +%% (but you need to drop the bslash...) +type_ol1([{num, _} = N, + {{punc, bslash}, _}, + {{punc, fullstop}, _} = P | T], Acc) -> + {esc_normal, lists:flatten([lists:reverse(Acc), N, P | T])}; +type_ol1([{num, _} = H | T], Acc) -> + %% we accumulate the digits in case we need to escape a full stop in a normal line + type_ol1(T, [H | Acc]); +type_ol1([{{punc, fullstop}, _}, {{ws, _}, _} | T], _Acc) -> + {ol, T}; +type_ol1(_List, _Acc) -> + normal. + +%% You need to understand what this function is trying to d... +%% '### blah' is fine +%% '### blah ###' is reduced to '### blah' because trailing #'s are +%% just for show but... +%% '##' is like appling '#' to '#' <-- applying 1 less styling to a single # +%% and '###' is like appling '##' to '#' etc, etc +%% but after you hit 6#'s you just get this for a single hash +%% ie '#############' is like applying '######' to a single '#' +%% but/and '######## blah' is like apply '######' to '## blah' +%% strip trailing #'s as they are decorative only... +type_atx(List) -> + {Sz, R} = get_atx_size(List), + A = [{{md, atx}, "#"}], + Type = + case is_all_hashes(R) of + true -> + if + Sz == 1 -> + normal; + ((Sz > 1) andalso (Sz < 6)) -> + Ns = integer_to_list(Sz - 1), + Hn = list_to_atom("h" ++ Ns), + {Hn, A}; + ((Sz == 6) andalso (R == [])) -> + {h5, A}; + ((Sz == 6) andalso (R == [{{lf, lf}, "\n"}])) -> + {h5, A}; + ((Sz == 6) andalso (R == [{{lf, crlf}, "\r\n"}])) -> + {h5, A}; + ((Sz == 6) andalso (R =/= [])) -> + {h6, A} + end; + false -> + Ns = integer_to_list(Sz), + Hn = list_to_atom("h" ++ Ns), + {Hn, strip_atx(R)} + end, + {Type, List}. + +is_all_hashes([]) -> + true; +is_all_hashes([{{md, atx}, _} | T]) -> + is_all_hashes(T); +is_all_hashes([{{lf, _}, _} | []]) -> + true; +is_all_hashes(_List) -> + false. + +get_atx_size(List) -> + g_atx_size1(List, 0). + + % this function also strips whitespace to the left... +g_atx_size1([{{md, atx}, _} = A | T], N) when N == 6 -> + {6, [A | T]}; +g_atx_size1([{{md, atx}, _} | T], N) -> + g_atx_size1(T, N + 1); +g_atx_size1([{{ws, _}, _} | T], N) -> + g_atx_size1(T, N); +g_atx_size1(List, N) -> + {N, List}. + +strip_atx(List) -> + lists:reverse(s_atx1(lists:reverse(List))). + +s_atx1([{{lf, _}, _}, {{md, atx}, _} | T]) -> + s_atx1(T); +s_atx1([{{md, atx}, _} | T]) -> + s_atx1(T); +s_atx1(List) -> + List. + +type_setext_h1(List) -> + type_s_h1_1(List, []). + +%% terminates on running out or new line +type_s_h1_1([{{lf, _}, _} = L | []], Acc) -> + {setext_h1, lists:reverse([L | Acc])}; +type_s_h1_1([], Acc)-> + {setext_h1, lists:reverse(Acc)}; +type_s_h1_1([[] | T], Acc) -> + type_s_h1_1(T, Acc); +type_s_h1_1([{{md, eq}, _} = H | T], Acc) -> + type_s_h1_1(T, [H | Acc]); +type_s_h1_1(L, Acc) -> + {normal, lists:flatten([Acc | L])}. + +type_setext_h2(List) -> + case type_s_h2_1(List) of + h2_or_hr -> + {h2_or_hr, List}; + not_h2 -> + {type_s_h2_2(trim_right(List)), List} + end. +%% terminates on running out or new line +type_s_h2_1([{{lf, _}, _} | []]) -> + h2_or_hr; +type_s_h2_1([]) -> + h2_or_hr; +type_s_h2_1([[] | T]) -> + type_s_h2_1(T); +type_s_h2_1([{{md, dash}, _} | T]) -> + type_s_h2_1(T); +type_s_h2_1(_L) -> + not_h2. + +type_s_h2_2([{{md, dash}, _}, {{ws,_}, _}, + {{md, dash}, _}, {{ws, _}, _}, + {{md, dash}, _}]) -> + hr; +type_s_h2_2([{{md, dash}, _}, + {{ws, _}, _} = WS | T]) -> + {ul, make_list_str([WS | T])}; +type_s_h2_2(_List) -> + normal. + +type_ws(List) -> + case type_ws1(List) of + blank -> + {blank, List}; + try_codeblock -> + case type_ws2(List) of + normal -> + {normal, List}; + {codeblock, Ret} -> + {{codeblock, Ret}, List} + end + end. + +type_ws1([]) -> + blank; +type_ws1([{{lf, _}, _} | []]) -> + blank; +type_ws1([[] | T]) -> + type_ws1(T); +type_ws1([{{ws, _}, _} | T]) -> + type_ws1(T); +type_ws1(_L) -> + try_codeblock. + +%% 4 or more spaces takes you over the limit +%% (a tab is 4...) +type_ws2([{{ws, tab}, _} | T]) -> + {codeblock, T}; +type_ws2([{{ws, comp}, W} | T]) -> + case gt(W, 4) of + {true, R} -> + {codeblock, [R| T]}; + false -> + normal + end; +type_ws2([{{ws, sp}, _} | _T]) -> + normal. + +gt(String, Len) -> + ExpString = re:replace(String, "\t", " ", [{return, list}]), + ExpStringLen = length(ExpString), + if + ExpStringLen >= Len -> + WS = string:substr(ExpString, Len + 1, + ExpStringLen), + {true, {{ws, sp}, WS}}; + ExpStringLen < Len -> + false + end. + +%% make a tag into a string +make_tag_str(L, R) -> + make_tag1(L, R, []). + +make_tag1([], _R, Acc) -> + lists:reverse(Acc); +make_tag1([{{{tag, _Type}, _Tag}, B} | T], R, Acc) -> + make_tag1(T, R, [B | Acc]); +make_tag1([H | T], R, Acc) -> + make_tag1(T, R, [make_string([H], R) | Acc]). + +esc_tag(String) -> + esc_tag1(String, []). + +esc_tag1([], Acc) -> + lists:reverse(Acc); +esc_tag1([?NBSP | T], Acc) -> + esc_tag1(T, [?SPACE | Acc]); % non-breaking space to space +esc_tag1([H | T], Acc) -> + esc_tag1(T, [H | Acc]). + +%% if it is a list we need to discard the initial white space... +make_list_str([{{ws, _}, _} | T] = List) -> + case is_double_indent(List) of + false -> + T; + {true, R} -> + lists:flatten([{tags, "<pre><code>"} ,R , + {tags, "</code></pre>\n\n"} | []]) + end. + +%% All ref processing can ignore the original values 'cos those +%% have already been captured at a higher level +snip_ref(List) -> + case get_id(List) of + {[{_, Id}], Rest} -> + {_Rest2, Ref, Title} = parse_inline(Rest), + Ref2 = trim(Ref), + Rs = htmlencode(make_plain_string(Ref2)), + Ts = make_plain_string(Title), + {inlineref, {Id, {Rs, Ts}}}; + normal -> + normal + end. + +get_id(List) -> + g_id1(List, []). + +g_id1([], _Acc) -> + normal; +g_id1([{{inline, close}, _}, {{punc, colon}, _}, {{ws, _}, _} | T], Acc) -> + {lists:reverse(Acc), T}; +g_id1([H | T], Acc) -> + g_id1(T, [H | Acc]). + +parse_inline(List) -> + parse_inline1(List, []). + +%% snip off the terminal linefeed (if there is one...) +parse_inline1([{{lf, _}, _} | []], A) -> + {[], lists:reverse(A), []}; +parse_inline1([], A) -> + {[], lists:reverse(A), []}; +parse_inline1([{{punc, bslash}, _}, {bra, _} = B | T], A) -> + %% brackets can be escaped + parse_inline1(T, [B | A]); +parse_inline1([{{punc, bslash}, _}, {ket, _} = B | T], A) -> + parse_inline1(T, [B | A]); +parse_inline1([{{punc, bslash}, _}, {{punc, doubleq}, _} = Q | T], A) -> + parse_inline1(T, [Q | A]); +parse_inline1([{{punc, bslash}, _}, {{punc, singleq}, _} = Q | T], A) -> + parse_inline1(T, [Q | A]); +parse_inline1([{{punc, doubleq}, _} | T], A) -> + %% these clauses capture the start of the title... + parse_inline2(T, lists:reverse(A), doubleq, []); +parse_inline1([{{punc, singleq}, _} | T], A) -> + parse_inline2(T, lists:reverse(A), singleq, []); +parse_inline1([{bra, _} | T], A) -> + parse_inline2(T, lists:reverse(A), brackets, []); +parse_inline1([{ket, _} | T], A) -> + {T, lists:reverse(A), []}; +parse_inline1([H | T], A) -> + parse_inline1(T, [H | A]). + +%% this gets titles in single and double quotes +%% the delimiter type is passed in as 'D' +parse_inline2([], Url, _D, A) -> + {[], Url, lists:flatten(lists:reverse(A))}; + +parse_inline2([{{punc, bslash}, _}, {bra, _} = B | T], Url, D, A) -> + %% brackets can be escaped + parse_inline2(T, Url, D, [B | A]); +parse_inline2([{{punc, bslash}, _}, {ket, _} = B | T], Url, D, A) -> + parse_inline2(T, Url, D, [B | A]); +parse_inline2([{{punc, bslash}, _}, + {{punc, doubleq}, _}= Q | T], Url, D, A) -> + %% quotes can be escaped + parse_inline2(T, Url, D, [Q | A]); +parse_inline2([{{punc, bslash}, _}, {{punc, singleq}, _} = Q | T], Url, D, A) -> + parse_inline2(T, Url, D, [Q | A]); +parse_inline2([{{punc, doubleq}, _} | T], Url, doubleq, A) -> + %% these clauses capture the end of the title and drop the delimiter... + parse_inline2(T, Url, none, A); +parse_inline2([{{punc, singleq}, _} | T], Url, singleq, A) -> + parse_inline2(T, Url, none, A); +parse_inline2([{ket, _} | T], Url, brackets, A) -> + parse_inline2(T, Url, none, A); +parse_inline2([{ket, _} | T], Url, none, A) -> + %% terminator clause + {T, Url, lists:flatten(lists:reverse(A))}; +parse_inline2([_H | T], Url, none, A) -> + %% this clause silently discards stuff after the delimiter... + parse_inline2(T, Url, none, [A]); +parse_inline2([H | T], Url, D, A) -> + parse_inline2(T, Url, D, [H | A]). + +trim(String) -> + trim_left(trim_right(String)). + +trim_right(String) -> + lists:reverse(trim_left(lists:reverse(String))). + +trim_left([{{ws, _}, _} | T]) -> + trim_left(T); +trim_left([[] | T]) -> + trim_left(T); +trim_left(List) -> + List. + +snip(List) -> + List2 = lists:reverse(List), + case List2 of + [{{lf, _}, _} | T] -> + lists:reverse(T); + _ -> + List + end. + +%% end of ref processing + +%% +%% Build the Lexed Token List +%% This is a two part lexer, first it chunks the input and then on the second +%% pass it gathers it into lines and types the lines +%% +%% NOTE that there are two different styles of processing lines: +%% * markdown transformed +%% * block +%% inside block processing the whole text is dumped and just url encoded +%% and the original text is always maintained during the lexing/parsing +%% so that it can be recreated if the context requires it... +%% + +lex(String) -> + merge_ws(lex1(String, [], [])). + +merge_ws(List) -> + merge_ws1(List, []). + +merge_ws1([], Acc) -> + lists:reverse(Acc); +merge_ws1([{{ws, _}, W1}, {{ws, _}, W2} | T], Acc) -> + merge_ws1([{{ws, comp}, W1 ++ W2} | T], Acc); +merge_ws1([H | T], Acc) -> + merge_ws1(T, [H | Acc]). + +%% this is the terminal head which ends the parsing... +lex1([], [], A2) -> + lists:flatten(lists:reverse(A2)); +lex1([], A1, A2) -> + lex1([], [], [lex2(A1) | A2]); +%% these two heads capture opening and closing tags +lex1([$<, $/|T], A1, A2) -> + {Tag, NewT} = closingdiv(T, []), + lex1(NewT, [], [Tag, lex2(A1) | A2]); +lex1([$< | T], A1, A2) -> + {Tag, NewT} = openingdiv(T), + lex1(NewT, [], [Tag , lex2(A1) | A2]); +%% these clauses are the normal lexer clauses +lex1([$= | T], A1, A2) -> + lex1(T, [], [{{md, eq}, "="}, lex2(A1) | A2]); +lex1([$- | T], A1, A2) -> + lex1(T, [], [{{md, dash}, "-"}, lex2(A1) | A2]); +lex1([$# | T], A1, A2) -> + lex1(T, [], [{{md, atx}, "#"}, lex2(A1) | A2]); +lex1([$> | T], A1, A2) -> + lex1(T, [], [{{md, gt}, ">"}, lex2(A1) | A2]); +lex1([$+ | T], A1, A2) -> + lex1(T, [], [{{md, plus}, "+"}, lex2(A1) | A2]); +lex1([$* | T], A1, A2) -> + lex1(T, [], [{{md, star}, "*"}, lex2(A1) | A2]); +lex1([$_ | T], A1, A2) -> + lex1(T, [], [{{md, underscore}, "_"}, lex2(A1) | A2]); +lex1([$1 | T], A1, A2) -> + lex1(T, [], [{num, "1"}, lex2(A1) | A2]); +lex1([$2 | T], A1, A2) -> + lex1(T, [], [{num, "2"}, lex2(A1) | A2]); +lex1([$3 | T], A1, A2) -> + lex1(T, [], [{num, "3"}, lex2(A1) | A2]); +lex1([$4 | T], A1, A2) -> + lex1(T, [], [{num, "4"}, lex2(A1) | A2]); +lex1([$5 | T], A1, A2) -> + lex1(T, [], [{num, "5"}, lex2(A1) | A2]); +lex1([$6 | T], A1, A2) -> + lex1(T, [], [{num, "6"}, lex2(A1) | A2]); +lex1([$7 | T], A1, A2) -> + lex1(T, [], [{num, "7"}, lex2(A1) | A2]); +lex1([$8 | T], A1, A2) -> + lex1(T, [], [{num, "8"}, lex2(A1) | A2]); +lex1([$9 | T], A1, A2) -> + lex1(T, [], [{num, "9"}, lex2(A1) | A2]); +lex1([$0 | T], A1, A2) -> + lex1(T, [], [{num, "0"}, lex2(A1) | A2]); +lex1([$. | T], A1, A2) -> + lex1(T, [], [{{punc, fullstop}, "."}, lex2(A1) | A2]); +lex1([$: | T], A1, A2) -> + lex1(T, [], [{{punc, colon}, ":"}, lex2(A1) | A2]); +lex1([$' | T], A1, A2) -> + lex1(T, [], [{{punc, singleq}, "'"}, lex2(A1) | A2]); %' +lex1([$" | T], A1, A2) -> + lex1(T, [], [{{punc, doubleq}, "\""}, lex2(A1) | A2]); %" +lex1([$` | T], A1, A2) -> + lex1(T, [], [{{punc, backtick}, "`"}, lex2(A1) | A2]); %" +lex1([$! | T], A1, A2) -> + lex1(T, [], [{{punc, bang}, "!"}, lex2(A1) | A2]); %" +lex1([$\\ | T], A1, A2)-> + lex1(T, [], [{{punc, bslash}, "\\"}, lex2(A1) | A2]); %" +lex1([$/ | T], A1, A2) -> + lex1(T, [], [{{punc, fslash}, "/"}, lex2(A1) | A2]); %" +lex1([$( | T], A1, A2) -> + lex1(T, [], [{bra, "("}, lex2(A1) | A2]); +lex1([$) | T], A1, A2) -> + lex1(T, [], [{ket, ")"}, lex2(A1) | A2]); +lex1([$[ | T], A1, A2) -> + lex1(T, [], [{{inline, open}, "["}, lex2(A1) | A2]); +lex1([$] | T], A1, A2) -> + lex1(T, [], [{{inline, close}, "]"}, lex2(A1) | A2]); +lex1([?SPACE | T], A1, A2) -> + %% note there is a special 'whitespace' {{ws, none}, ""} which is used to generate non-space + %% filling whitespace for cases like '*bob* is great' which needs a non-space filling + %% whitespace prepended to trigger emphasis so it renders as "<em>bob</em> is great... + %% that 'character' doesn't exist so isn't in the lexer but appears in the parser + lex1(T, [], [{{ws, sp}, " "}, lex2(A1) | A2]); +lex1([?TAB | T], A1, A2) -> + lex1(T, [], [{{ws, tab}, "\t"}, lex2(A1) | A2]); +lex1([?NBSP | T], A1, A2) -> + lex1(T, [], [{{ws, sp}, " "}, lex2(A1) | A2]); +lex1([?CR, ?LF | T], A1, A2) -> + lex1(T, [], [{{lf, crlf}, [?CR , ?LF]}, lex2(A1) | A2]); +lex1([?LF | T], A1, A2) -> + lex1(T, [], [{{lf, lf}, [?LF]}, lex2(A1) | A2]); +lex1([H|T], A1, A2) -> + %% this final clause accumulates line fragments + lex1(T, [H |A1] , A2). + +lex2([]) -> + []; +lex2(List) -> + {string, lists:flatten(lists:reverse(List))}. + +%% need to put in regexes for urls and e-mail addies +openingdiv(String) -> + case get_url(String) of + {{url, URL}, R1} -> + {{url, URL}, R1}; + not_url -> + case get_email_addie(String) of + {{email, EM}, R2} -> + {{email, EM}, R2}; + not_email -> + openingdiv1(String, []) + end + end. + + % dumps out a list if it is not an opening div +openingdiv1([], Acc) -> + {lists:flatten([{{punc, bra}, "<"} | lex(lists:reverse(Acc))]), []}; +openingdiv1([$/,$>| T], Acc) -> + Acc2 = lists:flatten(lists:reverse(Acc)), + Acc3 = string:to_lower(Acc2), + [Tag | _T] = string:tokens(Acc3, " "), + {{{{tag, self_closing}, Tag}, "<" + ++ Acc2 ++ "/>"}, T}; +openingdiv1([$>| T], []) -> + %% special for non-tags + {[{{punc, bra}, "<"}, + {{punc, ket}, ">"}], T}; +openingdiv1([$>| T], Acc) -> + Acc2 = lists:flatten(lists:reverse(Acc)), + Acc3 = string:to_lower(Acc2), + [Tag | _T] = string:tokens(Acc3, " "), + {{{{tag, open}, Tag}, "<" + ++ Acc2 ++ ">"}, T}; +openingdiv1([H|T], Acc) -> + openingdiv1(T, [H | Acc]). + + % dumps out a list if it is not an closing div +closingdiv([], Acc) -> + {lists:flatten([{{punc, bra}, "<"}, {{punc, fslash}, "/"} + | lex(lists:reverse(Acc))]), []}; +closingdiv([$>| T], Acc) -> + Acc2 = lists:flatten(lists:reverse(Acc)), + Acc3 = string:to_lower(Acc2), + [Tag | _T] = string:tokens(Acc3, " "), + {{{{tag, close}, Tag}, "</" + ++ Acc2 ++ ">"}, T}; +closingdiv([H|T], Acc) -> + closingdiv(T, [H | Acc]). + +get_url(String) -> + HTTP_regex = "^(H|h)(T|t)(T|t)(P|p)(S|s)*://", + case re:run(String, HTTP_regex) of + nomatch -> + not_url; + {match, _} -> + get_url1(String, []) + end. + +get_url1([], Acc) -> + URL = lists:flatten(lists:reverse(Acc)), + {{url, URL}, []}; +get_url1([$\\, $> | T], Acc) -> + %% allow escaped kets + get_url1(T, [$>, $\\ | Acc]); +get_url1([$> | T], Acc) -> + URL = lists:flatten(lists:reverse(Acc)), + {{url, URL}, T}; +get_url1([H | T], Acc) -> + get_url1(T, [H | Acc]). + +get_email_addie(String) -> + Snip_regex = ">", + case re:run(String, Snip_regex) of + nomatch -> + not_email; + {match, [{N, _} | _T]} -> + {Possible, [$> | T]} = lists:split(N, String), + EMail_regex = "[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+" + ++ "(?:\.[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+)*" + ++ "@(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?\.)+" + ++ "(?:[a-zA-Z]{2}|com|org|net|gov|mil" + ++ "|biz|info|mobi|name|aero|jobs|museum)", + case re:run(Possible, EMail_regex) of + nomatch -> + not_email; + {match, _} -> + {{email, Possible}, T} + end + end. + +make_plain_string(List) -> + make_plain_string(List, []). + +make_plain_string([], Acc) -> + lists:flatten(lists:reverse(Acc)); +make_plain_string([{{ws, none}, none} | T], Acc) -> + make_plain_string(T, [" " | Acc]); +make_plain_string([{_, Str} | T], Acc) -> + make_plain_string(T, [Str | Acc]). + +make_escape_string(List, Refs) -> + make_escape_string(List, Refs, []). + +make_escape_string([], _R, A) -> + lists:flatten(lists:reverse(A)); +make_escape_string([{tags, Tag} | T], R, A) -> + make_escape_string(T, R, [{tags, Tag} | A]); +make_escape_string([H | T], R, A) -> + make_escape_string(T, R, [make_string([H], R) | A]). + + +make_string(List, Refs) -> + make_string(List, Refs, []). + +make_string([], _R, A) -> + Flat = lists:flatten(lists:reverse(A)), + htmlchars(Flat); +make_string([{{punc, bang}, B}, {{inline, open}, O} | T], R, A) -> + case get_inline(T, R, [], img) of + {Rest, {Url, Title, Acc}} -> + Tag = [make_img_tag(Url, Acc, Title)], + make_string(Rest, R, [Tag | A]); + {Rest, Tag} -> + make_string(Rest, R, [Tag, O, B | A]) + end; +%% escape inline open's... +make_string([{{punc, bslash}, _}, {{inline, open}, O} | T], R, A) -> + make_string(T, R, [O | A]); +make_string([{{inline, open}, O} | T], R, A) -> + case get_inline(T, R, [], url) of + {Rest, {Url, Title, Acc}} -> + Tit = case Title of + [] -> + []; + _ -> + " title=\"" ++ Title ++ "\"" + end, + Tag = [{tags, "<a href=\"" ++ Url ++ "\"" + ++ Tit ++ ">"}, Acc, + {tags, "</a>"} | []], + make_string(Rest, R, [Tag | A]); + {Rest, Tag} -> + make_string(Rest, R, [Tag, O | A]) + end; +make_string([{email, Addie} | T], R, A) -> + make_string(T, R, [{tags, "\" />"}, Addie, {tags, "<a href=\"mailto:"}| A]); +make_string([{url, Url} | T], R, A) -> + make_string(T, R, [ {tags, "</a>"}, Url, {tags, "\">"}, Url, + {tags, "<a href=\""} | A]); +make_string([{tags, _} = Tag | T], R, A) -> + make_string(T, R, [Tag | A]); +make_string([{{{tag, Type}, Tag}, _} | T], R, A) -> + Tag2 = esc_tag(Tag), + TagStr = case Type of + open -> + {tags, "<" ++ Tag2 ++ ">"}; + close -> + {tags, "</" ++ Tag2 ++ ">"}; + self_closing -> + {tags, "<" ++ Tag2 ++ " />"} + end, + make_string(T, R, [TagStr | A]); +make_string([{_, Orig} | T], R, A) -> + make_string(T, R, [Orig | A]). + +get_inline([], _R, A, _) -> + %% if the inline doesn't terminate its not an inline... + {[], make_plain_string(lists:reverse(A))}; +get_inline([{{punc, bang}, _B}, {{inline, open}, _O} | T], R, A, url) -> + %% a url can contain an image inline + {Rest, {Url, Title, Acc}} = get_inline(T, R, A, img), + Tag = make_img_tag(Url, Acc, Title), + %% We double tag the tag so that it can get through the lists:flatteners.. + get_inline(Rest, R, [{tags, Tag} | A], url); +get_inline([{{inline, close}, _}, {bra, _} | T], _R, A, _) -> + {Rest, Url, Title} = parse_inline(T), + Tag = {string:strip(make_plain_string(Url)), + make_plain_string(Title), + make_plain_string(lists:reverse(A))}, + {Rest, Tag}; +get_inline([{{inline, close}, _}, {{ws, sp}, _}, {bra, _} | T], _R, A, img) -> + %% for img's but not url's you need to allow a single space between them + %% to be compatible with showdown :( + {Rest, Url, Title} = parse_inline(T), + Tag = {string:strip(make_plain_string(Url)), + make_plain_string(Title), + make_plain_string(lists:reverse(A))}, + {Rest, Tag}; +get_inline([{{inline, close}, _}, {{inline, open}, _} | T], R, A, _) -> + %% this clause detects references to images/links... + Text = make_plain_string(lists:reverse(A)), + case get_id_diff(T) of + normal -> + {[], make_plain_string(lists:reverse(A))}; + {[{_, Id}], Rest} -> + {Url, Title} = case lists:keyfind(Id, 1, R) of + false -> + {"", ""}; + {Id, {U, Tit}} -> + {U, Tit} + end, + Tag = {Url, Title, Text}, + {Rest, Tag}; + _Other -> + {[], make_plain_string(lists:reverse(A))} % random failing id's + end; +get_inline([{{inline, close}, _} = C , {{ws, _}, _}, + {{inline, open}, _} = O | T], R, A, Type) -> + %% so does this one - just delete the space and rethrow it + get_inline([C, O | T], R, A, Type); +get_inline([{{inline, close}, _} | T], R, A, _) -> + %% this is the markdown extension clause that takes an id in square brackets without + %% any additional stuff as a valid id marker + Id = make_plain_string(lists:reverse(A)), + case lists:keyfind(Id, 1, R) of + false -> + {T, lists:flatten([Id , $]])}; + {Id, {Url, Title}} -> + Tag = {Url, Title, Id}, + {T, Tag} + end; +get_inline([H | T], R, A, Type) -> + get_inline(T, R, [H | A], Type). + +get_id_diff(List) -> + g_id_diff1(List, []). + +g_id_diff1([], _Acc) -> + normal; +g_id_diff1([{{inline, close}, _}| T], Acc) -> + {lists:reverse(Acc), T}; +g_id_diff1([H | T], Acc) -> + g_id_diff1(T, [H | Acc]). + +%% convert ascii into html characters +htmlencode(List) -> + htmlencode(List, []). + +htmlencode([], Acc) -> + lists:flatten(lists:reverse(Acc)); + +htmlencode([$& | Rest], Acc) -> + htmlencode(Rest, ["&" | Acc]); +htmlencode([$< | Rest], Acc) -> + htmlencode(Rest, ["<" | Acc]); +htmlencode([$> | Rest], Acc) -> + htmlencode(Rest, [">" | Acc]); +htmlencode([160 | Rest], Acc) -> + htmlencode(Rest, [" " | Acc]); +htmlencode([Else | Rest], Acc) -> + htmlencode(Rest, [Else | Acc]). + +htmlchars(List) -> + htmlchars1(List, []). + +htmlchars1([], Acc) -> + lists:flatten(lists:reverse(Acc)); +htmlchars1([{tags, Tag} | T], Acc) -> + %% tags are just wheeched out unescaped + htmlchars1(T, [Tag | Acc]); +htmlchars1([?CR, ?LF | T], Acc) -> + %% line ends are pushed to a space.. + htmlchars1(T, ["\n" | Acc]); +htmlchars1([?LF | T], Acc) -> + %% line ends are pushed to a space.. + htmlchars1(T, ["\n" | Acc]); +htmlchars1([?CR | T], Acc) -> + htmlchars1(T, ["\r" | Acc]); +htmlchars1([$\\, $*, $*, $* | T], A) -> + %% there is a non-space filling white space represented by the atom 'none' + %% which is created in the parser (NOT IN THE LEXER!) and which triggers + %% emphasis or strong tags being turned on... + htmlchars1(T, [$*, $*, $* | A]); +htmlchars1([$*, $*, $* | T], A) -> + {T2, NewA} = superstrong(T, $*), + htmlchars1(T2, [NewA | A]); +htmlchars1([$\\, $*, $* | T], A) -> + %% repeat for strong + htmlchars1(T, [$*, $* | A]); +htmlchars1([$*, $* | T], A) -> + {T2, NewA} = strong(T, $*), + htmlchars1(T2, [NewA | A]); +htmlchars1([$\\, $* | T], A) -> + %% likewise for strong + htmlchars1(T, [$* | A]); +htmlchars1([$* | T], A) -> + {T2, NewA} = emphasis(T, $*), + htmlchars1(T2, [NewA | A]); +htmlchars1([$\\, $_, $_, $_ | T], A) -> + %% and again for underscores + htmlchars1(T, [$_, $_, $_ | A]); +htmlchars1([$_, $_, $_ | T], A) -> + %% the none atom is the non-space filling whitespace + {T2, NewA} = superstrong(T, $_), + htmlchars1(T2, [NewA | A]); +htmlchars1([$\\, $_, $_ | T], A) -> + %% and strong + %% and again for underscores + htmlchars1(T, [$_, $_ | A]); +htmlchars1([$_, $_ | T], A) -> + {T2, NewA} = strong(T, $_), + htmlchars1(T2, [NewA | A]); +htmlchars1([$\\, $_ | T], A) -> + %% likewise for strong + htmlchars1(T, [$_ | A]); +htmlchars1([$_ | T], A) -> + {T2, NewA} = emphasis(T, $_), + htmlchars1(T2, [NewA | A]); +htmlchars1([$\\, $` | T], A) -> + %% handle backtick escaping + htmlchars1(T, [$` | A]); +htmlchars1([$`, $` | T], A) -> + {T2, NewA} = dblcode(T), + htmlchars1(T2, [NewA | A]); +htmlchars1([$` | T], A) -> + {T2, NewA} = code(T), + htmlchars1(T2, [NewA | A]); +htmlchars1([?COPY | T], A) -> + htmlchars1(T, ["©" | A]); +htmlchars1([?AMP | T], A) -> + htmlchars1(T, ["&" | A]); +htmlchars1([$& | T], A) -> + htmlchars1(T, ["&" | A]); +htmlchars1([$< | T], A) -> + htmlchars1(T, ["<" | A]); +htmlchars1([?NBSP | T], A) -> + htmlchars1(T, [" " | A]); +htmlchars1([?TAB | T], A) -> + htmlchars1(T, [" " | A]); +htmlchars1([none | T], A) -> + htmlchars1(T, A); +htmlchars1([H | T], A) -> + htmlchars1(T, [H | A]). + +emphasis(List, Delim) -> + interpolate(List, Delim, "em", "" ,[]). +strong(List, Delim) -> + interpolate2(List, Delim, "strong", "", []). +superstrong(List, Delim) -> + interpolate3(List, Delim, "strong", "em", "", []). +dblcode(List) -> + {T, Tag} = interpolate2(List, $`, "code", "" ,[]), + {T, "<pre>" ++ Tag ++ "</pre>"}. +code(List) -> + interpolateX(List, $`, "code", "", []). + +%% pain in the arse - sometimes the closing tag should be preceded by +%% a "\n" and sometimes not in showdown.js +%% interpolate is for single delimiters... +interpolateX([], Delim, _Tag, _X, Acc) -> + {[], [Delim] ++ htmlchars(lists:reverse(Acc))}; +interpolateX([Delim | T], Delim, Tag, X, Acc) -> + {T, "<" ++ Tag ++ ">" ++ htmlchars(lists:reverse(Acc)) ++ X ++ + "</" ++ Tag ++ ">"}; +interpolateX([H | T], Delim, Tag, X, Acc) -> + interpolateX(T, Delim, Tag, X, [H | Acc]). + +interpolate([], Delim, _Tag, _X, Acc) -> + {[], [Delim] ++ htmlchars(lists:reverse(Acc))}; +interpolate([Delim | T], Delim, Tag, X, Acc) -> + {T, "<" ++ Tag ++ ">" ++ htmlchars(lists:reverse(Acc)) ++ X ++ + "</" ++ Tag ++ ">"}; +interpolate([H | T], Delim, Tag, X, Acc) -> + interpolate(T, Delim, Tag, X, [H | Acc]). + +%% interpolate two is for double delimiters... +interpolate2([], Delim, _Tag, _X, Acc) -> + {[], [Delim] ++ [Delim] ++ htmlchars(lists:reverse(Acc))}; +interpolate2([Delim, Delim | T], Delim, Tag, X, Acc) -> + {T, "<" ++ Tag ++ ">" ++ htmlchars(lists:reverse(Acc)) ++ X ++ + "</" ++ Tag ++ ">"}; +interpolate2([H | T], Delim, Tag, X, Acc) -> + interpolate2(T, Delim, Tag, X, [H | Acc]). + +%% interpolate three is for double delimiters... +interpolate3([], D, _Tag1, Tag2, _X, Acc) -> + {[], "<" ++ Tag2 ++ ">" ++ [D] ++ "</" ++ Tag2 ++ ">" + ++ htmlchars(lists:reverse(Acc))}; +interpolate3([D, D, D | T], D, Tag1, Tag2, _X, Acc) -> + {T, "<" ++ Tag1 ++ ">" ++ "<" ++ Tag2 ++ ">" + ++ htmlchars(lists:reverse(Acc)) ++ "</" ++ Tag2 ++ ">" + ++ "</" ++ Tag1 ++ ">"}; +interpolate3([H | T], D, Tag1, Tag2, X, Acc) -> + interpolate3(T, D, Tag1, Tag2, X, [H | Acc]). + +make_img_tag(Url, Acc, Title) -> + {tags, "<img src=\"" ++ Url ++ "\"" + ++ " alt=\"" ++ Acc ++ "\"" + ++ " title=\"" ++ Title ++ "\"" + ++ " />"}. diff --git a/examples/markdown_middleware/src/markdown_converter.erl b/examples/markdown_middleware/src/markdown_converter.erl index 24bb132..f380332 100644 --- a/examples/markdown_middleware/src/markdown_converter.erl +++ b/examples/markdown_middleware/src/markdown_converter.erl @@ -22,8 +22,8 @@ maybe_generate_markdown(Path) -> end. resource_path(Path) -> - {ok, Cwd} = file:get_cwd(), - filename:join([Cwd, "priv", Path]). + PrivDir = code:priv_dir(markdown_middleware), + filename:join([PrivDir, Path]). source_path(Path) -> << (filename:rootname(Path))/binary, ".md" >>. diff --git a/examples/markdown_middleware/src/markdown_middleware.erl b/examples/markdown_middleware/src/markdown_middleware.erl deleted file mode 100644 index 6e94350..0000000 --- a/examples/markdown_middleware/src/markdown_middleware.erl +++ /dev/null @@ -1,15 +0,0 @@ -%% Feel free to use, reuse and abuse the code in this file. - --module(markdown_middleware). - -%% API. --export([start/0]). - -%% API. - -start() -> - ok = application:start(crypto), - ok = application:start(cowlib), - ok = application:start(ranch), - ok = application:start(cowboy), - ok = application:start(markdown_middleware). diff --git a/examples/markdown_middleware/src/markdown_middleware_app.erl b/examples/markdown_middleware/src/markdown_middleware_app.erl index 75ad886..98a909d 100644 --- a/examples/markdown_middleware/src/markdown_middleware_app.erl +++ b/examples/markdown_middleware/src/markdown_middleware_app.erl @@ -15,7 +15,11 @@ start(_Type, _Args) -> {'_', [ {"/[...]", cowboy_static, [ {directory, {priv_dir, markdown_middleware, []}}, - {mimetypes, {fun mimetypes:path_to_mimes/2, default}} + {mimetypes, [ + {<<".html">>, [<<"text/html">>]}, + {<<".mp4">>, [<<"video/mp4">>]}, + {<<".ogv">>, [<<"video/ogg">>]} + ]} ]} ]} ]), |