diff options
Diffstat (limited to 'lib/parsetools/src/leex.erl')
-rw-r--r-- | lib/parsetools/src/leex.erl | 1608 |
1 files changed, 1608 insertions, 0 deletions
diff --git a/lib/parsetools/src/leex.erl b/lib/parsetools/src/leex.erl new file mode 100644 index 0000000000..fd494eaf06 --- /dev/null +++ b/lib/parsetools/src/leex.erl @@ -0,0 +1,1608 @@ +%% Copyright (c) 2008,2009 Robert Virding. All rights reserved. +%% +%% Redistribution and use in source and binary forms, with or without +%% modification, are permitted provided that the following conditions +%% are met: +%% +%% 1. Redistributions of source code must retain the above copyright +%% notice, this list of conditions and the following disclaimer. +%% 2. Redistributions in binary form must reproduce the above copyright +%% notice, this list of conditions and the following disclaimer in the +%% documentation and/or other materials provided with the distribution. +%% +%% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +%% "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +%% LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +%% FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +%% COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +%% INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +%% BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +%% LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +%% CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +%% LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +%% ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +%% POSSIBILITY OF SUCH DAMAGE. + +%%% A Lexical Analyser Generator for Erlang. +%%% +%%% Most of the algorithms used here are taken pretty much as +%%% described in the "Dragon Book" by Aho, Sethi and Ullman. Some +%%% completing details were taken from "Compiler Design in C" by +%%% Hollub. + +-module(leex). + +-export([compile/3,file/1,file/2,format_error/1]). + +-import(lists, [member/2,reverse/1,sort/1,delete/2, + keysearch/3,keysort/2,keydelete/3,keyfind/3, + map/2,foldl/3,foreach/2,flatmap/2]). +-import(string, [substr/2,substr/3,span/2]). +-import(ordsets, [is_element/2,add_element/2,union/2]). +-import(orddict, [store/3]). + +-include("erl_compile.hrl"). +%%-include_lib("stdlib/include/erl_compile.hrl"). + +-define(LEEXINC, "leexinc.hrl"). % Include file +-define(LEEXLIB, parsetools). % Leex is in lib parsetools +%%-define(LEEXLIB, leex). % Leex is in lib leex + +-define(DEFS_HEAD, "Definitions."). +-define(RULE_HEAD, "Rules."). +-define(CODE_HEAD, "Erlang code."). + +-record(leex, {xfile=[], % Xrl file + efile=[], % Erl file + ifile=[], % Include file + gfile=[], % Graph file + module, % Module name + opts=[], % Options + posix=false, % POSIX regular expressions + errors=[], + warnings=[] + }). + +-record(nfa_state, {no,edges=[],accept=noaccept}). +-record(dfa_state, {no,nfa=[],trans=[],accept=noaccept}). + +%%% +%%% Exported functions +%%% + +%%% Interface to erl_compile. + +compile(Input0, Output0, + #options{warning = WarnLevel, verbose=Verbose, includes=Includes}) -> + Input = assure_extension(shorten_filename(Input0), ".xrl"), + Output = assure_extension(shorten_filename(Output0), ".erl"), + Includefile = lists:sublist(Includes, 1), + Opts = [{scannerfile,Output},{includefile,Includefile},{verbose,Verbose}, + {report_errors,true},{report_warnings,WarnLevel > 0}], + case file(Input, Opts) of + {ok, _} -> + ok; + error -> + error + end. + +%% file(File) -> ok | error. +%% file(File, Options) -> ok | error. + +file(File) -> file(File, []). + +file(File, Opts0) -> + case is_filename(File) of + no -> erlang:error(badarg, [File,Opts0]); + _ -> ok + end, + Opts = case options(Opts0) of + badarg -> + erlang:error(badarg, [File,Opts0]); + Options -> + Options + end, + St0 = #leex{}, + St1 = filenames(File, Opts, St0), % Get all the filenames + St = try + {ok,REAs,Actions,Code,St2} = parse_file(St1), + {DFA,DF} = make_dfa(REAs, St2), + St3 = out_file(St2, DFA, DF, Actions, Code), + case lists:member(dfa_graph, St3#leex.opts) of + true -> out_dfa_graph(St3, DFA, DF); + false -> St3 + end + catch #leex{}=St4 -> + St4 + end, + leex_ret(St). + +format_error({file_error, Reason}) -> + io_lib:fwrite("~s",[file:format_error(Reason)]); +format_error(missing_defs) -> "missing Definitions"; +format_error(missing_rules) -> "missing Rules"; +format_error(missing_code) -> "missing Erlang code"; +format_error(empty_rules) -> "no rules"; +format_error(bad_rule) -> "bad rule"; +format_error({regexp,E})-> + Es = case E of + {interval_range,_} -> "interval range"; + {unterminated,Cs} -> + "unterminated " ++ Cs; + {illegal_char,Cs} -> + "illegal character " ++ Cs; + {posix_cc,What} -> + ["illegal POSIX character class ",io_lib:write_string(What)]; + {char_class,What} -> + ["illegal character class ",io_lib:write_string(What)] + end, + ["bad regexp `",Es,"'"]; +format_error(ignored_characters) -> + "ignored characters". + +%%% +%%% Local functions +%%% + +assure_extension(File, Ext) -> + lists:concat([strip_extension(File, Ext), Ext]). + +%% Assumes File is a filename. +strip_extension(File, Ext) -> + case filename:extension(File) of + Ext -> filename:rootname(File); + _Other -> File + end. + +options(Options0) when is_list(Options0) -> + try + Options = flatmap(fun(return) -> short_option(return, true); + (report) -> short_option(report, true); + ({return,T}) -> short_option(return, T); + ({report,T}) -> short_option(report, T); + (T) -> [T] + end, Options0), + options(Options, [scannerfile,includefile,report_errors, + report_warnings,return_errors,return_warnings, + verbose,dfa_graph], []) + catch error: _ -> badarg + end; +options(Option) -> + options([Option]). + +short_option(return, T) -> + [{return_errors,T}, {return_warnings,T}]; +short_option(report, T) -> + [{report_errors,T}, {report_warnings,T}]. + +options(Options0, [Key|Keys], L) when is_list(Options0) -> + Options = case member(Key, Options0) of + true -> + [atom_option(Key)|delete(Key, Options0)]; + false -> + Options0 + end, + V = case keysearch(Key, 1, Options) of + {value, {Key, Filename0}} when Key =:= includefile; + Key =:= scannerfile -> + case is_filename(Filename0) of + no -> + badarg; + Filename -> + {ok,[{Key,Filename}]} + end; + {value,{Key,Bool}} when Bool; not Bool -> + {ok,[{Key, Bool}]}; + {value,{Key, _}} -> + badarg; + false -> + {ok,[{Key,default_option(Key)}]} + end, + case V of + badarg -> + badarg; + {ok,KeyValueL} -> + NewOptions = keydelete(Key, 1, Options), + options(NewOptions, Keys, KeyValueL ++ L) + end; +options([], [], L) -> + foldl(fun({_,false}, A) -> A; + ({Tag,true}, A) -> [Tag|A]; + (F,A) -> [F|A] + end, [], L); +options(_Options, _, _L) -> + badarg. + +default_option(dfa_graph) -> false; +default_option(includefile) -> []; +default_option(report_errors) -> true; +default_option(report_warnings) -> true; +default_option(return_errors) -> false; +default_option(return_warnings) -> false; +default_option(scannerfile) -> []; +default_option(verbose) -> false. + +atom_option(dfa_graph) -> {dfa_graph,true}; +atom_option(report_errors) -> {report_errors,true}; +atom_option(report_warnings) -> {report_warnings,true}; +atom_option(return_errors) -> {return_errors,true}; +atom_option(return_warnings) -> {return_warnings,true}; +atom_option(verbose) -> {verbose,true}; +atom_option(Key) -> Key. + +is_filename(T) -> + try filename:flatten(T) of + Filename -> Filename + catch error: _ -> no + end. + +shorten_filename(Name0) -> + {ok,Cwd} = file:get_cwd(), + case lists:prefix(Cwd, Name0) of + false -> Name0; + true -> + case lists:nthtail(length(Cwd), Name0) of + "/"++N -> N; + N -> N + end + end. + +leex_ret(St) -> + report_errors(St), + report_warnings(St), + Es = pack_errors(St#leex.errors), + Ws = pack_warnings(St#leex.warnings), + if + Es =:= [] -> + case member(return_warnings, St#leex.opts) of + true -> {ok, St#leex.efile, Ws}; + false -> {ok, St#leex.efile} + end; + true -> + case member(return_errors, St#leex.opts) of + true -> {error, Es, Ws}; + false -> error + end + end. + +pack_errors([{File,_} | _] = Es) -> + [{File, flatmap(fun({_,E}) -> [E] end, sort(Es))}]; +pack_errors([]) -> + []. + +pack_warnings([{File,_} | _] = Ws) -> + [{File, flatmap(fun({_,W}) -> [W] end, sort(Ws))}]; +pack_warnings([]) -> + []. + +report_errors(St) -> + when_opt(fun () -> + foreach(fun({File,{none,Mod,E}}) -> + io:fwrite("~s: ~s\n", + [File,Mod:format_error(E)]); + ({File,{Line,Mod,E}}) -> + io:fwrite("~s:~w: ~s\n", + [File,Line,Mod:format_error(E)]) + end, sort(St#leex.errors)) + end, report_errors, St#leex.opts). + +report_warnings(St) -> + when_opt(fun () -> + foreach(fun({File,{none,Mod,W}}) -> + io:fwrite("~s: Warning: ~s\n", + [File,Mod:format_error(W)]); + ({File,{Line,Mod,W}}) -> + io:fwrite("~s:~w: Warning: ~s\n", + [File,Line,Mod:format_error(W)]) + end, sort(St#leex.warnings)) + end, report_warnings, St#leex.opts). + +add_error(E, St) -> + add_error(St#leex.xfile, E, St). + +add_error(File, Error, St) -> + throw(St#leex{errors = [{File,Error}|St#leex.errors]}). + +add_warning(Line, W, St) -> + St#leex{warnings = [{St#leex.xfile,{Line,leex,W}}|St#leex.warnings]}. + +%% filenames(File, Options, State) -> State. +%% The default output dir is the current directory unless an +%% explicit one has been given in the options. + +filenames(File, Opts, St0) -> + Dir = filename:dirname(File), + Base = filename:basename(File, ".xrl"), + Xfile = filename:join(Dir, Base ++ ".xrl"), + Efile = Base ++ ".erl", + Gfile = Base ++ ".dot", + Module = list_to_atom(Base), + St1 = St0#leex{xfile=Xfile, + opts=Opts, + module=Module}, + {value,{includefile,Ifile0}} = keysearch(includefile, 1, Opts), + Ifile = inc_file_name(Ifile0), + %% Test for explicit scanner file. + {value,{scannerfile,Ofile}} = keysearch(scannerfile, 1, Opts), + if + Ofile =:= [] -> + St1#leex{efile=filename:join(Dir, Efile), + ifile=Ifile, + gfile=filename:join(Dir, Gfile)}; + true -> + D = filename:dirname(Ofile), + St1#leex{efile=Ofile, + ifile=Ifile, + gfile=filename:join(D, Gfile)} + end. + +when_opt(Do, Opt, Opts) -> + case member(Opt, Opts) of + true -> Do(); + false -> ok + end. + +verbose_print(St, Format, Args) -> + when_opt(fun () -> io:fwrite(Format, Args) end, verbose, St#leex.opts). + +%% parse_file(State) -> {ok,[REA],[Action],Code,NewState} | throw(NewState) +%% when +%% REA = {RegExp,ActionNo}; +%% Action = {ActionNo,ActionString}; +%% Code = {StartLine, StartPos, NumOfLines}. Where the Erlang code is. +%% +%% Read and parse the file Xfile. +%% After each section of the file has been parsed we directly call the +%% next section. This is done when we detect a line we don't recognise +%% in the current section. The file format is very simple and Erlang +%% token based, we allow empty lines and Erlang style comments. + +parse_file(St0) -> + case file:open(St0#leex.xfile, [read]) of + {ok,Xfile} -> + try + verbose_print(St0, "Parsing file ~s, ", [St0#leex.xfile]), + %% We KNOW that errors throw so we can ignore them here. + {ok,Line1,St1} = parse_head(Xfile, St0), + {ok,Line2,Macs,St2} = parse_defs(Xfile, Line1, St1), + {ok,Line3,REAs,Actions,St3} = + parse_rules(Xfile, Line2, Macs, St2), + {ok,Code,St4} = parse_code(Xfile, Line3, St3), + verbose_print(St1, "contained ~w rules.~n", [length(REAs)]), + {ok,REAs,Actions,Code,St4} + after file:close(Xfile) + end; + {error,Error} -> + add_error({none,leex,{file_error,Error}}, St0) + end. + +%% parse_head(File, State) -> {ok,NextLine,State}. +%% Parse the head of the file. Skip all comments and blank lines. + +parse_head(Ifile, St) -> {ok,nextline(Ifile, 0),St}. + +%% parse_defs(File, Line, State) -> {ok,NextLine,Macros,State}. +%% Parse the macro definition section of a file. This must exist. +%% The section is ended by a non-blank line which is not a macro def. + +parse_defs(Ifile, {ok,?DEFS_HEAD ++ Rest,L}, St) -> + St1 = warn_ignored_chars(L, Rest, St), + parse_defs(Ifile, nextline(Ifile, L), [], St1); +parse_defs(_, {ok,_,L}, St) -> + add_error({L,leex,missing_defs}, St); +parse_defs(_, {eof,L}, St) -> + add_error({L,leex,missing_defs}, St). + +parse_defs(Ifile, {ok,Chars,L}=Line, Ms, St) -> + %% This little beauty matches out a macro definition, RE's are so clear. + MS = "^[ \t]*([A-Z_][A-Za-z0-9_]*)[ \t]*=[ \t]*([^ \t\r\n]*)[ \t\r\n]*\$", + case re:run(Chars, MS, [{capture,all_but_first,list}]) of + {match,[Name,Def]} -> + %%io:fwrite("~p = ~p\n", [Name,Def]), + parse_defs(Ifile, nextline(Ifile, L), [{Name,Def}|Ms], St); + _ -> {ok,Line,Ms,St} % Anything else + end; +parse_defs(_, Line, Ms, St) -> + {ok,Line,Ms,St}. + +%% parse_rules(File, Line, Macros, State) -> {ok,NextLine,REAs,Actions,State}. +%% Parse the RE rules section of the file. This must exist. + +parse_rules(Ifile, {ok,?RULE_HEAD ++ Rest,L}, Ms, St) -> + St1 = warn_ignored_chars(L, Rest, St), + parse_rules(Ifile, nextline(Ifile, L), Ms, [], [], 0, St1); +parse_rules(_, {ok,_,L}, _, St) -> + add_error({L,leex,missing_rules}, St); +parse_rules(_, {eof,L}, _, St) -> + add_error({L,leex,missing_rules}, St). + +%% parse_rules(File, Result, Macros, RegExpActions, Actions, Acount, State) -> +%% {ok,NextCLine,RegExpActions,Actions,NewState} | throw(NewState) + +parse_rules(Ifile, NextLine, Ms, REAs, As, N, St) -> + case NextLine of + {ok,?CODE_HEAD ++ _Rest,_} -> + parse_rules_end(Ifile, NextLine, REAs, As, St); + {ok,Chars,L0} -> + %%io:fwrite("~w: ~p~n", [L0,Chars]), + case collect_rule(Ifile, Chars, L0) of + {ok,Re,Atoks,L1} -> + {ok,REA,A,St1} = parse_rule(Re, L0, Atoks, Ms, N, St), + parse_rules(Ifile, nextline(Ifile, L1), Ms, + [REA|REAs], [A|As], N+1, St1); + {error,E} -> add_error(E, St) + end; + {eof,_} -> + parse_rules_end(Ifile, NextLine, REAs, As, St) + end. + +parse_rules_end(_, {ok,_,L}, [], [], St) -> + add_error({L,leex,empty_rules}, St); +parse_rules_end(_, {eof,L}, [], [], St) -> + add_error({L,leex,empty_rules}, St); +parse_rules_end(_, NextLine, REAs, As, St) -> + %% Must be *VERY* careful to put rules in correct order! + {ok,NextLine,reverse(REAs),reverse(As),St}. + +%% collect_rule(File, Line, Lineno) -> +%% {ok,RegExp,ActionTokens,NewLineno} | {error,E}. +%% Collect a complete rule by reading lines until the the regexp and +%% action has been read. Keep track of line number. + +collect_rule(Ifile, Chars, L0) -> + %% Erlang strings are 1 based, but re 0 :-( + {match,[{St0,Len}|_]} = re:run(Chars, "[^ \t\r\n]+"), + St = St0 + 1, + %%io:fwrite("RE = ~p~n", [substr(Chars, St, Len)]), + case collect_action(Ifile, substr(Chars, St+Len), L0, []) of + {ok,[{':',_}|Toks],L1} -> {ok,substr(Chars, St, Len),Toks,L1}; + {ok,_,_} -> {error,{L0,leex,bad_rule}}; + {eof,L1} -> {error,{L1,leex,bad_rule}}; + {error,E,_} -> {error,E} + end. + +collect_action(Ifile, Chars, L0, Cont0) -> + case erl_scan:tokens(Cont0, Chars, L0) of + {done,{ok,Toks,_},_} -> {ok,Toks,L0}; + {done,{eof,_},_} -> {eof,L0}; + {done,{error,E,_},_} -> {error,E,L0}; + {more,Cont1} -> + collect_action(Ifile, io:get_line(Ifile, leex), L0+1, Cont1) + end. + +%% parse_rule(RegExpString, RegExpLine, ActionTokens, Macros, Counter, State) -> +%% {ok,{RE,Action},ActionData,State}. +%% Parse one regexp after performing macro substition. + +parse_rule(S, Line, [{dot,_}], Ms, N, St) -> + case parse_rule_regexp(S, Ms, St) of + {ok,R} -> + {ok,{R,N},{N,empty_action},St}; + {error,E} -> + add_error({Line,leex,E}, St) + end; +parse_rule(S, Line, Atoks, Ms, N, St) -> + case parse_rule_regexp(S, Ms, St) of + {ok,R} -> + %%io:fwrite("RE = ~p~n", [R]), + %% Check for token variables. + TokenChars = var_used('TokenChars', Atoks), + TokenLen = var_used('TokenLen', Atoks), + TokenLine = var_used('TokenLine', Atoks), + {ok,{R,N},{N,Atoks,TokenChars,TokenLen,TokenLine},St}; + {error,E} -> + add_error({Line,leex,E}, St) + end. + +var_used(Name, Toks) -> + case keyfind(Name, 3, Toks) of + {var,_,Name} -> true; %It's the var we want + _ -> false + end. + +%% parse_rule_regexp(RegExpString, Macros, State) -> +%% {ok,RegExp} | {error,Error}. +%% Substitute in macros and parse RegExpString. Cannot use re:replace +%% here as it uses info in replace string (&). + +parse_rule_regexp(RE0, [{M,Exp}|Ms], St) -> + Split= re:split(RE0, "\\{" ++ M ++ "\\}", [{return,list}]), + RE1 = string:join(Split, Exp), + parse_rule_regexp(RE1, Ms, St); +parse_rule_regexp(RE, [], St) -> + %%io:fwrite("RE = ~p~n", [RE]), + case re_parse(RE, St) of + {ok,R} -> {ok,R}; + {error,E} -> {error,{regexp,E}} + end. + +%% parse_code(File, Line, State) -> {ok,Code,NewState}. +%% Finds the line and the position where the code section of the file +%% begins. This must exist. + +parse_code(Ifile, {ok,?CODE_HEAD ++ Rest,CodeL}, St) -> + St1 = warn_ignored_chars(CodeL, Rest, St), + {ok, CodePos} = file:position(Ifile, cur), + %% Just count the lines; copy the code from file to file later. + NCodeLines = count_lines(Ifile, 0), + {ok,{CodeL,CodePos,NCodeLines},St1}; +parse_code(_, {ok,_,L}, St) -> + add_error({L,leex,missing_code}, St); +parse_code(_, {eof,L}, St) -> + add_error({L,leex,missing_code}, St). + +count_lines(File, N) -> + case io:get_line(File, leex) of + eof -> N; + _Line -> count_lines(File, N+1) + end. + +%% nextline(InputFile, PrevLineNo) -> {ok,Chars,LineNo} | {eof,LineNo}. +%% Get the next line skipping comment lines and blank lines. + +nextline(Ifile, L) -> + case io:get_line(Ifile, leex) of + eof -> {eof,L}; + Chars -> + case substr(Chars, span(Chars, " \t\n")+1) of + [$%|_Rest] -> nextline(Ifile, L+1); + [] -> nextline(Ifile, L+1); + _Other -> {ok,Chars,L+1} + end + end. + +warn_ignored_chars(Line, S, St) -> + case non_white(S) of + [] -> St; + _ -> add_warning(Line, ignored_characters, St) + end. + +non_white(S) -> + [C || C <- S, C > $\s, C < $\200 orelse C > $\240]. + +%% This is the regular expression grammar used. It is equivalent to the +%% one used in AWK, except that we allow ^ $ to be used anywhere and fail +%% in the matching. +%% +%% reg -> alt : '$1'. +%% alt -> seq "|" seq ... : {alt,['$1','$2'...]}. +%% seq -> repeat repeat ... : {seq,['$1','$2'...]}. +%% repeat -> repeat "*" : {kclosure,'$1'}. +%% repeat -> repeat "+" : {pclosure,'$1'}. +%% repeat -> repeat "?" : {optional,'$1'}. +%% repeat -> repeat "{" [Min],[Max] "}" : {interval,'$1',Min,Max} +%% repeat -> single : '$1'. +%% single -> "(" reg ")" : {sub,'$2',Number}. +%% single -> "^" : bos/bol. +%% single -> "$" : eos/eol. +%% single -> "." : any. +%% single -> "[" class "]" : {char_class,char_class('$2')} +%% single -> "[" "^" class "]" : {comp_class,char_class('$3')}. +%% single -> "\"" chars "\"" : {lit,'$2'}. +%% single -> "\\" char : {lit,['$2']}. +%% single -> char : {lit,['$1']}. +%% single -> empty : epsilon. +%% The grammar of the current regular expressions. The actual parser +%% is a recursive descent implementation of the grammar. + +%% re_parse(Chars, State) -> {ok,RegExp} | {error,Error}. + +re_parse(Cs0, St) -> + case catch re_reg(Cs0, 0, St) of + {RE,_,[]} -> {ok,RE}; + {_,_,[C|_]} -> {error,{illegal_char,[C]}}; + {parse_error,E} -> {error,E} + end. + +parse_error(E) -> throw({parse_error,E}). + +re_reg(Cs, Sn, St) -> re_alt(Cs, Sn, St). + +re_alt(Cs0, Sn0, St) -> + {L,Sn1,Cs1} = re_seq(Cs0, Sn0, St), + case re_alt1(Cs1, Sn1, St) of + {[],Sn2,Cs2} -> {L,Sn2,Cs2}; + {Rs,Sn2,Cs2} -> {{alt,[L|Rs]},Sn2,Cs2} + end. + +re_alt1([$||Cs0], Sn0, St) -> + {L,Sn1,Cs1} = re_seq(Cs0, Sn0, St), + {Rs,Sn2,Cs2} = re_alt1(Cs1, Sn1, St), + {[L|Rs],Sn2,Cs2}; +re_alt1(Cs, Sn, _) -> {[],Sn,Cs}. + +%% Parse a sequence of regexps. Don't allow the empty sequence. +%% re_seq(Cs0, Sn0, St) -> +%% {L,Sn1,Cs1} = repeat(Cs0, Sn0, St), +%% case re_seq1(Cs1, Sn1, St) of +%% {[],Sn2,Cs2} -> {L,Sn2,Cs2}; +%% {Rs,Sn2,Cs2} -> {{seq,[L|Rs]},Sn2,Cs2} +%% end. + +%% re_seq(Chars, SubNumber, State) -> {RegExp,SubNumber,Chars}. +%% Parse a sequence of regexps. Allow the empty sequence, returns epsilon. + +re_seq(Cs0, Sn0, St) -> + case re_seq1(Cs0, Sn0, St) of + {[],Sn1,Cs1} -> {epsilon,Sn1,Cs1}; + {[R],Sn1,Cs1} -> {R,Sn1,Cs1}; + {Rs,Sn1,Cs1} -> {{seq,Rs},Sn1,Cs1} + end. + +re_seq1([C|_]=Cs0, Sn0, St) when C /= $|, C /= $) -> + {L,Sn1,Cs1} = re_repeat(Cs0, Sn0, St), + {Rs,Sn2,Cs2} = re_seq1(Cs1, Sn1, St), + {[L|Rs],Sn2,Cs2}; +re_seq1(Cs, Sn, _) -> {[],Sn,Cs}. + +%% re_repeat(Chars, SubNumber, State) -> {RegExp,SubNumber,Chars}. + +re_repeat(Cs0, Sn0, St) -> + {S,Sn1,Cs1} = re_single(Cs0, Sn0, St), + re_repeat1(Cs1, Sn1, S, St). + +re_repeat1([$*|Cs], Sn, S, St) -> re_repeat1(Cs, Sn, {kclosure,S}, St); +re_repeat1([$+|Cs], Sn, S, St) -> re_repeat1(Cs, Sn, {pclosure,S}, St); +re_repeat1([$?|Cs], Sn, S, St) -> re_repeat1(Cs, Sn, {optional,S}, St); +%% { only starts interval when ere is true, otherwise normal character. +re_repeat1([${|Cs0], Sn, S, #leex{posix=true}=St) -> % $} + case re_interval_range(Cs0) of + {Min,Max,[$}|Cs1]} when is_integer(Min), is_integer(Max), Min =< Max -> + re_repeat1(Cs1, Sn, {interval,S,Min,Max}, St); + {Min,Max,[$}|Cs1]} when is_integer(Min), is_atom(Max) -> + re_repeat1(Cs1, Sn, {interval,S,Min,Max}, St); + {_,_,Cs1} -> parse_error({interval_range,string_between([${|Cs0], Cs1)}) + end; +re_repeat1(Cs, Sn, S, _) -> {S,Sn,Cs}. + +%% re_single(Chars, SubNumber, State) -> {RegExp,SubNumber,Chars}. +%% Parse a re_single regexp. + +re_single([$(|Cs0], Sn0, St) -> % $) + Sn1 = Sn0 + 1, % Keep track of sub count anyway + case re_reg(Cs0, Sn1, St) of + {S,Sn2,[$)|Cs1]} -> {S,Sn2,Cs1}; + %%{S,Sn2,[$)|Cs1]} -> {{sub,S,Sn1},Sn2,Cs1}; + _ -> parse_error({unterminated,"("}) + end; +%% These are not legal inside a regexp. +%% re_single([$^|Cs], Sn, St) -> {bos,Sn,Cs}; +%% re_single([$$|Cs], Sn, St) -> {eos,Sn,Cs}; +%% re_single([$.|Cs], Sn, St) -> {any,Sn,Cs}; +re_single([$.|Cs], Sn, _) -> {{comp_class,"\n"},Sn,Cs}; % Do this here? +re_single("[^" ++ Cs0, Sn, St) -> + case re_char_class(Cs0, St) of + {Cc,[$]|Cs1]} -> {{comp_class,Cc},Sn,Cs1}; + _ -> parse_error({unterminated,"["}) + end; +re_single([$[|Cs0], Sn, St) -> + case re_char_class(Cs0, St) of + {Cc,[$]|Cs1]} -> {{char_class,Cc},Sn,Cs1}; + _ -> parse_error({unterminated,"["}) + end; +re_single([$\\|Cs0], Sn, _) -> + {C,Cs1} = re_char($\\, Cs0), + {{lit,[C]},Sn,Cs1}; +re_single([C|Cs0], Sn, St) -> + case special_char(C, St) of + true -> parse_error({illegal_char,[C]}); + false -> + {C,Cs1} = re_char(C, Cs0), + {{lit,[C]},Sn,Cs1} + end. + +-define(IS_HEX(C), C >= $0 andalso C =< $9 orelse + C >= $A andalso C =< $F orelse + C >= $a andalso C =< $f). + +%% re_char(Char, Chars) -> {CharValue,Chars}. +%% Reads one character value from the input list, it knows about escapes. + +re_char($\\, [O1,O2,O3|S]) when + O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 -> + {(O1*8 + O2)*8 + O3 - 73*$0,S}; +re_char($\\, [$x,H1,H2|S]) when ?IS_HEX(H1), ?IS_HEX(H2) -> + {erlang:list_to_integer([H1,H2], 16),S}; +re_char($\\,[$x,${|S0]) -> + re_hex(S0, []); +re_char($\\,[$x|_]) -> + parse_error({illegal_char,"\\x"}); +re_char($\\, [C|S]) -> {escape_char(C),S}; +re_char($\\, []) -> parse_error({unterminated,"\\"}); +re_char(C, S) -> {C,S}. % Just this character + +re_hex([C|Cs], L) when ?IS_HEX(C) -> re_hex(Cs, [C|L]); +re_hex([$}|S], L0) -> + L = lists:reverse(L0), + case erlang:list_to_integer(L, 16) of + C when C =< 16#10FFFF -> {C,S}; + _ -> parse_error({illegal_char,[$\\,$x,${|L]++"}"}) + end; +re_hex(_, _) -> parse_error({unterminated,"\\x{"}). + +%% special_char(Char, State) -> bool(). +%% These are the special characters for an ERE. +%% N.B. ]}) are only special in the context after [{(. + +special_char($^, _) -> true; +special_char($., _) -> true; +special_char($[, _) -> true; +special_char($$, _) -> true; +special_char($(, _) -> true; +special_char($), _) -> true; +special_char($|, _) -> true; +special_char($*, _) -> true; +special_char($+, _) -> true; +special_char($?, _) -> true; +special_char(${, #leex{posix=true}) -> true; % Only when POSIX set +special_char($\\, _) -> true; +special_char(_, _) -> false. + +%% re_char_class(Chars, State) -> {CharClass,Chars}. +%% Parse a character class. + +re_char_class([$]|Cs], St) -> % Must special case this. + re_char_class(Cs, [$]], St); +re_char_class(Cs, St) -> re_char_class(Cs, [], St). + +re_char_class("[:" ++ Cs0, Cc, #leex{posix=true}=St) -> + %% POSIX char class only. + case posix_cc(Cs0) of + {Pcl,":]" ++ Cs1} -> re_char_class(Cs1, [{posix,Pcl}|Cc], St); + {_,Cs1} -> parse_error({posix_cc,string_between(Cs0, Cs1)}) + end; +re_char_class([C1|Cs0], Cc, St) when C1 /= $] -> + case re_char(C1, Cs0) of + {Cf,[$-,C2|Cs1]} when C2 /= $] -> + case re_char(C2, Cs1) of + {Cl,Cs2} when Cf < Cl -> + re_char_class(Cs2, [{range,Cf,Cl}|Cc], St); + {_,Cs2} -> + parse_error({char_class,string_between([C1|Cs0], Cs2)}) + end; + {C,Cs1} -> re_char_class(Cs1, [C|Cc], St) + end; +re_char_class(Cs, Cc, _) -> {reverse(Cc),Cs}. % Preserve order + +%% posix_cc(String) -> {PosixClass,RestString}. +%% Handle POSIX character classes. + +posix_cc("alnum" ++ Cs) -> {alnum,Cs}; +posix_cc("alpha" ++ Cs) -> {alpha,Cs}; +posix_cc("blank" ++ Cs) -> {blank,Cs}; +posix_cc("cntrl" ++ Cs) -> {cntrl,Cs}; +posix_cc("digit" ++ Cs) -> {digit,Cs}; +posix_cc("graph" ++ Cs) -> {graph,Cs}; +posix_cc("lower" ++ Cs) -> {lower,Cs}; +posix_cc("print" ++ Cs) -> {print,Cs}; +posix_cc("punct" ++ Cs) -> {punct,Cs}; +posix_cc("space" ++ Cs) -> {space,Cs}; +posix_cc("upper" ++ Cs) -> {upper,Cs}; +posix_cc("xdigit" ++ Cs) -> {xdigit,Cs}; +posix_cc(Cs) -> parse_error({posix_cc,substr(Cs, 1, 5)}). + +escape_char($n) -> $\n; % \n = LF +escape_char($r) -> $\r; % \r = CR +escape_char($t) -> $\t; % \t = TAB +escape_char($v) -> $\v; % \v = VT +escape_char($b) -> $\b; % \b = BS +escape_char($f) -> $\f; % \f = FF +escape_char($e) -> $\e; % \e = ESC +escape_char($s) -> $\s; % \s = SPACE +escape_char($d) -> $\d; % \d = DEL +escape_char(C) -> C. % Pass it straight through + +%% re_interval_range(Chars) -> {Min,Max,RestChars}. +%% NoInt -> none,none +%% Int -> Int,none +%% Int, -> Int,any +%% Int1,Int2 -> Int1,Int2 + +re_interval_range(Cs0) -> + case re_number(Cs0) of + {none,Cs1} -> {none,none,Cs1}; + {N,[$,|Cs1]} -> + case re_number(Cs1) of + {none,Cs2} -> {N,any,Cs2}; + {M,Cs2} -> {N,M,Cs2} + end; + {N,Cs1} -> {N,none,Cs1} + end. + +re_number([C|Cs]) when C >= $0, C =< $9 -> + re_number(Cs, C - $0); +re_number(Cs) -> {none,Cs}. + +re_number([C|Cs], Acc) when C >= $0, C =< $9 -> + re_number(Cs, 10*Acc + (C - $0)); +re_number(Cs, Acc) -> {Acc,Cs}. + +string_between(Cs1, Cs2) -> + substr(Cs1, 1, length(Cs1)-length(Cs2)). + +%% We use standard methods, Thompson's construction and subset +%% construction, to create first an NFA and then a DFA from the +%% regexps. A non-standard feature is that we work with sets of +%% character ranges (crs) instead sets of characters. This is most +%% noticeable when constructing DFAs. The major benefit is that we can +%% handle characters from any set, not just limited ASCII or 8859, +%% even 16/32 bit unicode. +%% +%% The whole range of characters is 0-maxchar, where maxchar is a BIG +%% number. We don't make any assumptions about the size of maxchar, it +%% is just bigger than any character. +%% +%% Using character ranges makes describing many regexps very simple, +%% for example the regexp "." just becomes the range +%% [{0-9},{11-maxchar}]. + +%% make_nfa(RegExpActions) -> {ok,{NFA,StartState}} | {error,E}. +%% Build a complete nfa from a list of {RegExp,Action}. The NFA field +%% accept has values {yes,Action}|no. The NFA is a list of states. + +make_dfa(REAs, St) -> + {NFA,NF} = build_combined_nfa(REAs), + verbose_print(St, "NFA contains ~w states, ", [tuple_size(NFA)]), + {DFA0,DF0} = build_dfa(NFA, NF), + verbose_print(St, "DFA contains ~w states, ", [length(DFA0)]), + {DFA,DF} = minimise_dfa(DFA0, DF0), + verbose_print(St, "minimised to ~w states.~n", [length(DFA)]), + %%io:fwrite("~p\n", [{NF,NFA}]), + %%io:fwrite("~p\n", [{DF0,DFA0}]), + %%io:fwrite("~p\n", [{DF,DFA}]), + {DFA,DF}. + +%% build_combined_nfa(RegExpActionList) -> {NFA,FirstState}. +%% Build the combined NFA using Thompson's construction straight out +%% of the book. Build the separate NFAs in the same order as the +%% rules so that the accepting have ascending states have ascending +%% state numbers. Start numbering the states from 1 as we put the +%% states in a tuple with the state number as the index. +%% +%% The edges from a state are a list of {CharRange,State} | {epsilon,State}. + +build_combined_nfa(REAs) -> + {NFA0,Firsts,Free} = build_nfa_list(REAs, [], [], 1), + F = #nfa_state{no=Free,edges=epsilon_trans(Firsts)}, + {list_to_tuple(keysort(#nfa_state.no, [F|NFA0])),Free}. + +build_nfa_list([{RE,Action}|REAs], NFA0, Firsts, Free0) -> + {NFA1,Free1,First} = build_nfa(RE, Free0, Action), + build_nfa_list(REAs, NFA1 ++ NFA0, [First|Firsts], Free1); +build_nfa_list([], NFA, Firsts, Free) -> + {NFA,reverse(Firsts),Free}. + +epsilon_trans(Firsts) -> [ {epsilon,F} || F <- Firsts ]. + +%% build_nfa(RegExp, NextState, Action) -> {NFA,NextState,FirstState}. +%% When building the NFA states for a regexp we don't build the end +%% state, just allocate a State for it and return this state's +%% number. This allows us to avoid building unnecessary states for +%% concatenation which would then have to be removed by overwriting +%% an existing state. + +build_nfa(RE, N0, Action) -> + {NFA,N1,E} = build_nfa(RE, N0+1, N0, []), + {[#nfa_state{no=E,accept={accept,Action}}|NFA],N1,N0}. + +%% build_nfa(RegExp, NextState, FirstState, NFA) -> {NFA,NextState,EndState}. +%% Build an NFA from the RegExp. NFA is a list of #nfa_state{} in no +%% predefined order. NextState is the number of the next free state +%% to use, FirstState is the the state which must be the start for +%% this regexp as a previous regexp refers to it, EndState is the +%% state to which this NFA will exit to. The number of the returned +%% EndState is already allocated! + +build_nfa({alt,REs}, N, F, NFA) -> + build_nfa_alt(REs, N, F, NFA); +build_nfa({seq,REs}, N, F, NFA) -> + build_nfa_seq(REs, N, F, NFA); +build_nfa({kclosure,RE}, N0, F, NFA0) -> + {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), + E = N1, % End state + {[#nfa_state{no=F,edges=[{epsilon,N0},{epsilon,E}]}, + #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1], + N1+1,E}; +build_nfa({pclosure,RE}, N0, F, NFA0) -> + {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), + E = N1, % End state + {[#nfa_state{no=F,edges=[{epsilon,N0}]}, + #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1], + N1+1,E}; +build_nfa({optional,RE}, N0, F, NFA0) -> + {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), + E = N1, % End state + {[#nfa_state{no=F,edges=[{epsilon,N0},{epsilon,E}]}, + #nfa_state{no=E1,edges=[{epsilon,E}]}|NFA1], + N1+1,E}; +build_nfa({char_class,Cc}, N, F, NFA) -> + {[#nfa_state{no=F,edges=[{pack_cc(Cc),N}]}|NFA],N+1,N}; +build_nfa({comp_class,Cc}, N, F, NFA) -> + {[#nfa_state{no=F,edges=[{comp_class(Cc),N}]}|NFA],N+1,N}; +build_nfa({lit,Cs}, N, F, NFA) -> % Implicit concatenation + build_nfa_lit(Cs, N, F, NFA); +build_nfa(epsilon, N, F, NFA) -> % Just an epsilon transition + {[#nfa_state{no=F,edges=[{epsilon,N}]}|NFA],N+1,N}. + +%% build_nfa_lit(Chars, NextState, FirstState, NFA) -> {NFA,NextState,EndState}. +%% Build an NFA for the sequence of literal characters. + +build_nfa_lit(Cs, N0, F0, NFA0) -> + foldl(fun (C, {NFA,N,F}) -> + {[#nfa_state{no=F,edges=[{[{C,C}],N}]}|NFA],N+1,N} + end, {NFA0,N0,F0}, Cs). + +%% build_nfa_lit([C|Cs], N, F, NFA0) when is_integer(C) -> +%% NFA1 = [#nfa_state{no=F,edges=[{[{C,C}],N}]}|NFA0], +%% build_nfa_lit(Cs, N+1, N, NFA1); +%% build_nfa_lit([], N, F, NFA) -> {NFA,N,F}. + +%% build_nfa_seq(REs, NextState, FirstState, NFA) -> {NFA,NextState,EndState}. +%% Build an NFA for the regexps in a sequence. + +build_nfa_seq(REs, N0, F0, NFA0) -> + foldl(fun (RE, {NFA,N,F}) -> build_nfa(RE, N, F, NFA) end, + {NFA0,N0,F0}, REs). + +%% build_nfa_seq([RE|REs], N0, F, NFA0) -> +%% {NFA1,N1,E1} = build_nfa(RE, N0, F, NFA0), +%% build_nfa_seq(REs, N1, E1, NFA1); +%% build_nfa_seq([], N, F, NFA) -> {NFA,N,F}. + +%% build_nfa_alt(REs, NextState, FirstState, NFA) -> {NFA,NextState,EndState}. +%% Build an NFA for the regexps in an alternative. N.B. we don't +%% handle empty alts here but the parser should never generate them +%% anyway. + +build_nfa_alt([RE], N, F, NFA) -> build_nfa(RE, N, F, NFA); +build_nfa_alt([RE|REs], N0, F, NFA0) -> + {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), + {NFA2,N2,E2} = build_nfa_alt(REs, N1+1, N1, NFA1), + E = N2, % End state + {[#nfa_state{no=F,edges=[{epsilon,N0},{epsilon,N1}]}, + #nfa_state{no=E1,edges=[{epsilon,E}]}, + #nfa_state{no=E2,edges=[{epsilon,E}]}|NFA2], + N2+1,E}. + +%% build_nfa_alt(REs, NextState, FirstState, NFA) -> {NFA,NextState,EndState}. +%% Build an NFA for the regexps in an alternative. Make one big +%% epsilon split state, not necessary but fun. + +%% build_nfa_alt(REs, N0, F0, NFA0) -> +%% E = N0, % Must reserve End state first +%% {Fs,{NFA1,N1}} = mapfoldl(fun (RE, {NFA,N}) -> +%% build_nfa_alt1(RE, N, E, NFA) +%% end, {NFA0,N0+1}, REs), +%% {[#nfa_state{no=F0,edges=epsilon_trans(Fs)}, +%% #nfa_state{no=E,edges=[{epsilon,N1}]}|NFA1],N1+1,N1}. + +%% build_nfa_alt1(RE, N0, End, NFA0) -> +%% {NFA1,N1,E} = build_nfa(RE, N0+1, N0, NFA0), +%% {N0,{[#nfa_state{no=E,edges=[{epsilon,End}]}|NFA1],N1}}. + +%% pack_cc(CharClass) -> CharClass +%% Pack and optimise a character class specification (bracket +%% expression). First sort it and then compact it. + +pack_cc(Cc) -> + Crs = foldl(fun ({range,Cf,Cl}, Set) -> add_element({Cf,Cl}, Set); + (C, Set) -> add_element({C,C}, Set) + end, ordsets:new(), Cc), + pack_crs(Crs). % An ordset IS a list! + +pack_crs([{C1,C2}=Cr,{C3,C4}|Crs]) when C1 =< C3, C2 >= C4 -> + %% C1 C2 + %% C3 C4 + pack_crs([Cr|Crs]); +pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 >= C3, C2 < C4 -> + %% C1 C2 + %% C3 C4 + pack_crs([{C1,C4}|Crs]); +pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 + 1 == C3 -> + %% C1 C2 + %% C3 C4 + pack_crs([{C1,C4}|Crs]); +pack_crs([Cr|Crs]) -> [Cr|pack_crs(Crs)]; +pack_crs([]) -> []. + +comp_class(Cc) -> + Crs = pack_cc(Cc), + Comp = comp_crs(Crs, 0), + %% io:fwrite("comp: ~p\n ~p\n", [Crs,Comp]), + Comp. + +comp_crs([{0,C2}|Crs], 0) -> % Get first range right + comp_crs(Crs, C2+1); +comp_crs([{C1,C2}|Crs], Last) -> + [{Last,C1-1}|comp_crs(Crs, C2+1)]; +comp_crs([], Last) -> [{Last,maxchar}]. + +%% build_dfa(NFA, NfaFirstState) -> {DFA,DfaFirstState}. +%% Build a DFA from an NFA using "subset construction". The major +%% difference from the book is that we keep the marked and unmarked +%% DFA states in seperate lists. New DFA states are added to the +%% unmarked list and states are marked by moving them to the marked +%% list. We assume that the NFA accepting state numbers are in +%% ascending order for the rules and use ordsets to keep this order. + +build_dfa(NFA, Nf) -> + D = #dfa_state{no=0,nfa=eclosure([Nf], NFA)}, + {build_dfa([D], 1, [], NFA),0}. + +%% build_dfa([UnMarked], NextState, [Marked], NFA) -> DFA. +%% Traverse the unmarked states. Temporarily add the current unmarked +%% state to the marked list before calculating translation, this is +%% to avoid adding too many duplicate states. Add it properly to the +%% marked list afterwards with correct translations. + +build_dfa([U|Us0], N0, Ms, NFA) -> + {Ts,Us1,N1} = build_dfa(U#dfa_state.nfa, Us0, N0, [], [U|Ms], NFA), + M = U#dfa_state{trans=Ts,accept=accept(U#dfa_state.nfa, NFA)}, + build_dfa(Us1, N1, [M|Ms], NFA); +build_dfa([], _, Ms, _) -> Ms. + +%% build_dfa([NfaState], [Unmarked], NextState, [Transition], [Marked], NFA) -> +%% {Transitions,UnmarkedStates,NextState}. +%% Foreach NFA state set calculate the legal translations. N.B. must +%% search *BOTH* the unmarked and marked lists to check if DFA state +%% already exists. As the range of characters is potentially VERY +%% large we cannot explicitly test all characters. Instead we first +%% calculate the set of all disjoint character ranges which are +%% possible candidates to the set of NFA states. The transitions are +%% an orddict so we get the transition lists in ascending order. + +build_dfa(Set, Us, N, Ts, Ms, NFA) -> + %% List of all transition sets. + Crs0 = [Cr || S <- Set, + {Crs,_St} <- (element(S, NFA))#nfa_state.edges, + Crs /= epsilon, % Not an epsilon transition + Cr <- Crs ], + Crs1 = lists:usort(Crs0), % Must remove duplicates! + %% Build list of disjoint test ranges. + Test = disjoint_crs(Crs1), + %% io:fwrite("bd: ~p\n ~p\n ~p\n ~p\n", [Set,Crs0,Crs1,Test]), + build_dfa(Test, Set, Us, N, Ts, Ms, NFA). + +%% disjoint_crs([CharRange]) -> [CharRange]. +%% Take a sorted list of char ranges and make a sorted list of +%% disjoint char ranges. No new char range extends past an existing +%% char range. + +disjoint_crs([{_C1,C2}=Cr1,{C3,_C4}=Cr2|Crs]) when C2 < C3 -> + %% C1 C2 + %% C3 C4 + [Cr1|disjoint_crs([Cr2|Crs])]; +disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 == C3 -> + %% C1 C2 + %% C3 C4 + [{C1,C2}|disjoint_crs(add_element({C2+1,C4}, Crs))]; +disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 >= C3, C2 < C4 -> + %% C1 C2 + %% C3 C4 + [{C1,C3-1}|disjoint_crs(union([{C3,C2},{C2+1,C4}], Crs))]; +disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 == C4 -> + %% C1 C2 + %% C3 C4 + [{C1,C3-1}|disjoint_crs(add_element({C3,C4}, Crs))]; +disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 > C4 -> + %% C1 C2 + %% C3 C4 + [{C1,C3-1}|disjoint_crs(union([{C3,C4},{C4+1,C2}], Crs))]; +disjoint_crs([Cr|Crs]) -> [Cr|disjoint_crs(Crs)]; +disjoint_crs([]) -> []. + +build_dfa([Cr|Crs], Set, Us, N, Ts, Ms, NFA) -> + case eclosure(move(Set, Cr, NFA), NFA) of + S when S /= [] -> + case dfa_state_exist(S, Us, Ms) of + {yes,T} -> + build_dfa(Crs, Set, Us, N, store(Cr, T, Ts), Ms, NFA); + no -> + U = #dfa_state{no=N,nfa=S}, + build_dfa(Crs, Set, [U|Us], N+1, store(Cr, N, Ts), Ms, NFA) + end; + [] -> + build_dfa(Crs, Set, Us, N, Ts, Ms, NFA) + end; +build_dfa([], _, Us, N, Ts, _, _) -> + {Ts,Us,N}. + +%% dfa_state_exist(Set, Unmarked, Marked) -> {yes,State} | no. + +dfa_state_exist(S, Us, Ms) -> + case keysearch(S, #dfa_state.nfa, Us) of + {value,#dfa_state{no=T}} -> {yes,T}; + false -> + case keysearch(S, #dfa_state.nfa, Ms) of + {value,#dfa_state{no=T}} -> {yes,T}; + false -> no + end + end. + +%% eclosure([State], NFA) -> [State]. +%% move([State], Char, NFA) -> [State]. +%% These are straight out of the book. As eclosure uses ordsets then +%% the generated state sets are in ascending order. + +eclosure(Sts, NFA) -> eclosure(Sts, NFA, []). + +eclosure([St|Sts], NFA, Ec) -> + #nfa_state{edges=Es} = element(St, NFA), + eclosure([ N || {epsilon,N} <- Es, + not is_element(N, Ec) ] ++ Sts, + NFA, add_element(St, Ec)); +eclosure([], _, Ec) -> Ec. + +move(Sts, Cr, NFA) -> + %% io:fwrite("move1: ~p\n", [{Sts,Cr}]), + [ St || N <- Sts, + {Crs,St} <- (element(N, NFA))#nfa_state.edges, + Crs /= epsilon, % Not an epsilon transition + in_crs(Cr, Crs) ]. + +in_crs({C1,C2}, [{C3,C4}|_Crs]) when C1 >= C3, C2 =< C4 -> true; +in_crs(Cr, [Cr|_Crs]) -> true; % Catch bos and eos. +in_crs(Cr, [_|Crs]) -> in_crs(Cr, Crs); +in_crs(_Cr, []) -> false. + +%% accept([State], NFA) -> {accept,A} | noaccept. +%% Scan down the state list until we find an accepting state. + +accept([St|Sts], NFA) -> + case element(St, NFA) of + #nfa_state{accept={accept,A}} -> {accept,A}; + #nfa_state{accept=noaccept} -> accept(Sts, NFA) + end; +accept([], _) -> noaccept. + +%% minimise_dfa(DFA, DfaFirst) -> {DFA,DfaFirst}. +%% Minimise the DFA by removing equivalent states. We consider a +%% state if both the transitions and the their accept state is the +%% same. First repeatedly run throught the DFA state list removing +%% equivalent states and updating remaining transitions with +%% remaining equivalent state numbers. When no more reductions are +%% possible then pack the remaining state numbers to get consecutive +%% states. + +minimise_dfa(DFA0, Df0) -> + case min_dfa(DFA0) of + {DFA1,[]} -> % No reduction! + {DFA2,Rs} = pack_dfa(DFA1), + {min_update(DFA2, Rs),min_use(Df0, Rs)}; + {DFA1,Rs} -> + minimise_dfa(min_update(DFA1, Rs), min_use(Df0, Rs)) + end. + +min_dfa(DFA) -> min_dfa(DFA, [], []). + +min_dfa([D|DFA0], Rs0, MDFA) -> + {DFA1,Rs1} = min_delete(DFA0, D#dfa_state.trans, D#dfa_state.accept, + D#dfa_state.no, Rs0, []), + min_dfa(DFA1, Rs1, [D|MDFA]); +min_dfa([], Rs, MDFA) -> {MDFA,Rs}. + +%% min_delete(States, Trans, Action, NewN, Rs, MiniDFA) -> {MiniDFA,Rs}. +%% Delete all states with same transactions and action. Return +%% rewrites and minimised DFA with no duplicate states. + +min_delete([#dfa_state{no=N,trans=T,accept=A}|DFA], T, A, NewN, Rs, MDFA) -> + min_delete(DFA, T, A, NewN, [{N,NewN}|Rs], MDFA); +min_delete([D|DFA], T, A, NewN, Rs, MDFA) -> + min_delete(DFA, T, A, NewN, Rs, [D|MDFA]); +min_delete([], _, _, _, Rs, MDFA) -> {MDFA,Rs}. + +min_update(DFA, Rs) -> + [ D#dfa_state{trans=min_update_trans(D#dfa_state.trans, Rs)} || D <- DFA ]. + +min_update_trans(Tr, Rs) -> + [ {C,min_use(S, Rs)} || {C,S} <- Tr ]. + +min_use(Old, [{Old,New}|_]) -> New; +min_use(Old, [_|Reds]) -> min_use(Old, Reds); +min_use(Old, []) -> Old. + +pack_dfa(DFA) -> pack_dfa(DFA, 0, [], []). + +pack_dfa([D|DFA], NewN, Rs, PDFA) -> + pack_dfa(DFA, NewN+1, + [{D#dfa_state.no,NewN}|Rs], [D#dfa_state{no=NewN}|PDFA]); +pack_dfa([], _, Rs, PDFA) -> {PDFA,Rs}. + +%% The main output is the yystate function which is built from the +%% DFA. It has the spec: +%% +%% yystate() -> InitialState. +%% yystate(State, InChars, Line, CurrTokLen, AcceptAction, AcceptLen) -> +%% {Action, AcceptLength, RestChars, Line} | Accepting end state +%% {Action, AcceptLength, RestChars, Line, State} | Accepting state +%% {reject, AcceptLength, CurrTokLen, RestChars, Line, State} | +%% {Action, AcceptLength, CurrTokLen, RestChars, Line, State}. + +%% The return CurrTokLen is always the current number of characters +%% scanned in the current token. The returns have the follwoing +%% meanings: +%% {Action, AcceptLength, RestChars, Line} - +%% The scanner has reached an accepting end-state, for example after +%% a regexp "abc". Action is the action number and AcceptLength is +%% the length of the matching token. +%% +%% {Action, AcceptLength, RestChars, Line, State} - +%% The scanner has reached an accepting transition state, for example +%% after c in regexp "abc(xyz)?", continuation depends on +%% RestChars. If RestChars == [] (no more current characters) then we +%% need to get more characters to see if it is an end-state, +%% otherwise (eof or chars) then we have not found continuing +%% characters and it is an end state. +%% +%% {reject, AcceptLength, CurrTokLen, RestChars, Line, State} - +%% {Action, AcceptLength, CurrTokLen, RestChars, Line, State} - +%% The scanner has reached a non-accepting transistion state. If +%% RestChars == [] we need to get more characters to continue. +%% Otherwise if 'reject' then no accepting state has been reached it +%% is an error. If we have an Action and AcceptLength then these are +%% the last accept state, use them and continue from there. + +%% out_file(LeexState, DFA, DfaStart, [Action], Code) -> ok | error. +%% Generate an output .erl file from the include file, the DFA and +%% the code for the actions. + +out_file(St0, DFA, DF, Actions, Code) -> + verbose_print(St0, "Writing file ~s, ", [St0#leex.efile]), + case open_inc_file(St0) of + {ok,Ifile} -> + try + case file:open(St0#leex.efile, [write]) of + {ok,Ofile} -> + try + output_file_directive(Ofile, St0#leex.ifile, 0), + out_file(Ifile, Ofile, St0, DFA, DF, Actions, + Code, 1), + verbose_print(St0, "ok~n", []), + St0 + after file:close(Ofile) + end; + {error,Error} -> + verbose_print(St0, "error~n", []), + add_error({none,leex,{file_error,Error}}, St0) + end + after file:close(Ifile) + end; + {{error,Error},Ifile} -> + add_error(Ifile, {none,leex,{file_error,Error}}, St0) + end. + +open_inc_file(State) -> + Ifile = State#leex.ifile, + case file:open(Ifile, [read]) of + {ok,F} -> {ok,F}; + Error -> {Error,Ifile} + end. + +inc_file_name([]) -> + Incdir = filename:join(code:lib_dir(parsetools), "include"), + filename:join(Incdir, ?LEEXINC); +inc_file_name(Filename) -> + Filename. + +%% out_file(IncFile, OutFile, State, DFA, DfaStart, Actions, Code, Line) -> ok +%% Copy the include file line by line substituting special lines with +%% generated code. We cheat by only looking at the first 5 +%% characters. + +out_file(Ifile, Ofile, St, DFA, DF, Actions, Code, L) -> + case io:get_line(Ifile, leex) of + eof -> output_file_directive(Ofile, St#leex.ifile, L); + Line -> + case substr(Line, 1, 5) of + "##mod" -> out_module(Ofile, St); + "##cod" -> out_erlang_code(Ofile, St, Code, L); + "##dfa" -> out_dfa(Ofile, St, DFA, Code, DF, L); + "##act" -> out_actions(Ofile, St#leex.xfile, Actions); + _ -> io:put_chars(Ofile, Line) + end, + out_file(Ifile, Ofile, St, DFA, DF, Actions, Code, L+1) + end. + +out_module(File, St) -> + io:fwrite(File, "-module(~w).\n", [St#leex.module]). + +out_erlang_code(File, St, Code, L) -> + {CodeL,CodePos,_NCodeLines} = Code, + output_file_directive(File, St#leex.xfile, CodeL), + {ok,Xfile} = file:open(St#leex.xfile, [read]), + try + {ok,_} = file:position(Xfile, CodePos), + {ok,_} = file:copy(Xfile, File) + after + file:close(Xfile) + end, + io:nl(File), + output_file_directive(File, St#leex.ifile, L). + +out_dfa(File, St, DFA, Code, DF, L) -> + {_CodeL,_CodePos,NCodeLines} = Code, + %% Three file attributes before this one... + output_file_directive(File, St#leex.efile, L+(NCodeLines-1)+3), + io:fwrite(File, "yystate() -> ~w.~n~n", [DF]), + foreach(fun (S) -> out_trans(File, S) end, DFA), + io:fwrite(File, "yystate(S, Ics, Line, Tlen, Action, Alen) ->~n", []), + io:fwrite(File, " {Action,Alen,Tlen,Ics,Line,S}.~n", []). + +out_trans(File, #dfa_state{no=N,trans=[],accept={accept,A}}) -> + %% Accepting end state, guaranteed done. + io:fwrite(File, "yystate(~w, Ics, Line, Tlen, _, _) ->~n", [N]), + io:fwrite(File, " {~w,Tlen,Ics,Line};~n", [A]); +out_trans(File, #dfa_state{no=N,trans=Tr,accept={accept,A}}) -> + %% Accepting state, but there maybe more. + foreach(fun (T) -> out_accept_tran(File, N, A, T) end, pack_trans(Tr)), + io:fwrite(File, "yystate(~w, Ics, Line, Tlen, _, _) ->~n", [N]), + io:fwrite(File, " {~w,Tlen,Ics,Line,~w};~n", [A,N]); +out_trans(File, #dfa_state{no=N,trans=Tr,accept=noaccept}) -> + %% Non-accepting transition state. + foreach(fun (T) -> out_noaccept_tran(File, N, T) end, pack_trans(Tr)), + io:fwrite(File, "yystate(~w, Ics, Line, Tlen, Action, Alen) ->~n", [N]), + io:fwrite(File, " {Action,Alen,Tlen,Ics,Line,~w};~n", [N]). + +out_accept_tran(File, N, A, {{Cf,maxchar},S}) -> + out_accept_head_max(File, N, Cf), + out_accept_body(File, S, "Line", A); +out_accept_tran(File, N, A, {{Cf,Cl},S}) -> + out_accept_head_range(File, N, Cf, Cl), + out_accept_body(File, S, "Line", A); +out_accept_tran(File, N, A, {$\n,S}) -> + out_accept_head_1(File, N, $\n), + out_accept_body(File, S, "Line+1", A); +out_accept_tran(File, N, A, {C,S}) -> + out_accept_head_1(File, N, C), + out_accept_body(File, S, "Line", A). + +out_accept_head_1(File, State, Char) -> + out_head_1(File, State, Char, "_", "_"). + +out_accept_head_max(File, State, Min) -> + out_head_max(File, State, Min, "_", "_"). + +out_accept_head_range(File, State, Min, Max) -> + out_head_range(File, State, Min, Max, "_", "_"). + +out_accept_body(File, Next, Line, Action) -> + out_body(File, Next, Line, io_lib:write(Action), "Tlen"). + +out_noaccept_tran(File, N, {{Cf,maxchar},S}) -> + out_noaccept_head_max(File, N, Cf), + out_noaccept_body(File, S, "Line"); +out_noaccept_tran(File, N, {{Cf,Cl},S}) -> + out_noaccept_head_range(File, N, Cf, Cl), + out_noaccept_body(File, S, "Line"); +out_noaccept_tran(File, N, {$\n,S}) -> + out_noaccept_head_1(File, N, $\n), + out_noaccept_body(File, S, "Line+1"); +out_noaccept_tran(File, N, {C,S}) -> + out_noaccept_head_1(File, N, C), + out_noaccept_body(File, S, "Line"). + +out_noaccept_head_1(File, State, Char) -> + out_head_1(File, State, Char, "Action", "Alen"). + +out_noaccept_head_max(File, State, Min) -> + out_head_max(File, State, Min, "Action", "Alen"). + +out_noaccept_head_range(File, State, Min, Max) -> + out_head_range(File, State, Min, Max, "Action", "Alen"). + +out_noaccept_body(File, Next, Line) -> + out_body(File, Next, Line, "Action", "Alen"). + +out_head_1(File, State, Char, Action, Alen) -> + io:fwrite(File, "yystate(~w, [~w|Ics], Line, Tlen, ~s, ~s) ->\n", + [State,Char,Action,Alen]). + +out_head_max(File, State, Min, Action, Alen) -> + io:fwrite(File, "yystate(~w, [C|Ics], Line, Tlen, ~s, ~s) when C >= ~w ->\n", + [State,Action,Alen,Min]). + +out_head_range(File, State, Min, Max, Action, Alen) -> + io:fwrite(File, "yystate(~w, [C|Ics], Line, Tlen, ~s, ~s) when C >= ~w, C =< ~w ->\n", + [State,Action,Alen,Min,Max]). + +out_body(File, Next, Line, Action, Alen) -> + io:fwrite(File, " yystate(~w, Ics, ~s, Tlen+1, ~s, ~s);\n", + [Next,Line,Action,Alen]). + +%% pack_trans([{Crange,State}]) -> [{Crange,State}] when +%% Crange = {Char,Char} | Char. +%% Pack the translation table into something more suitable for +%% generating code. We KNOW how the pattern matching compiler works +%% so solitary characters are stored before ranges. We do this by +%% prepending singletons to the front of the packed transitions and +%% appending ranges to the back. This preserves the smallest to +%% largest order of ranges. Newline characters, $\n, are always +%% extracted and handled as singeltons. + +pack_trans(Trs) -> pack_trans(Trs, []). + +%% pack_trans(Trs) -> +%% Trs1 = pack_trans(Trs, []), +%% io:fwrite("tr:~p\n=> ~p\n", [Trs,Trs1]), +%% Trs1. + +pack_trans([{{C,C},S}|Trs], Pt) -> % Singletons to the head + pack_trans(Trs, [{C,S}|Pt]); +%% Special detection and handling of $\n. +pack_trans([{{Cf,$\n},S}|Trs], Pt) -> + pack_trans([{{Cf,$\n-1},S}|Trs], [{$\n,S}|Pt]); +pack_trans([{{$\n,Cl},S}|Trs], Pt) -> + pack_trans([{{$\n+1,Cl},S}|Trs], [{$\n,S}|Pt]); +pack_trans([{{Cf,Cl},S}|Trs], Pt) when Cf < $\n, Cl > $\n -> + pack_trans([{{Cf,$\n-1},S},{{$\n+1,Cl},S}|Trs], [{$\n,S}|Pt]); +%% Small ranges become singletons. +pack_trans([{{Cf,Cl},S}|Trs], Pt) when Cl == Cf + 1 -> + pack_trans(Trs, [{Cf,S},{Cl,S}|Pt]); +pack_trans([Tr|Trs], Pt) -> % The default uninteresting case + pack_trans(Trs, Pt ++ [Tr]); +pack_trans([], Pt) -> Pt. + +%% out_actions(File, XrlFile, ActionList) -> ok. +%% Write out the action table. + +out_actions(File, XrlFile, As) -> + As1 = prep_out_actions(As), + foreach(fun (A) -> out_action(File, A) end, As1), + io:fwrite(File, "yyaction(_, _, _, _) -> error.~n", []), + foreach(fun (A) -> out_action_code(File, XrlFile, A) end, As1). + +prep_out_actions(As) -> + map(fun ({A,empty_action}) -> + {A,empty_action}; + ({A,Code,TokenChars,TokenLen,TokenLine}) -> + Vs = [{TokenChars,"TokenChars"}, + {TokenLen,"TokenLen"}, + {TokenLine,"TokenLine"}, + {TokenChars,"YYtcs"}, + {TokenLen or TokenChars,"TokenLen"}], + Vars = [if F -> S; true -> "_" end || {F,S} <- Vs], + Name = list_to_atom(lists:concat([yyaction_,A])), + [Chars,Len,Line,_,_] = Vars, + Args = [V || V <- [Chars,Len,Line], V =/= "_"], + ArgsChars = string:join(Args, ", "), + {A,Code,Vars,Name,Args,ArgsChars} + end, As). + +out_action(File, {A,empty_action}) -> + io:fwrite(File, "yyaction(~w, _, _, _) -> skip_token;~n", [A]); +out_action(File, {A,_Code,Vars,Name,_Args,ArgsChars}) -> + [_,_,Line,Tcs,Len] = Vars, + io:fwrite(File, "yyaction(~w, ~s, ~s, ~s) ->~n", [A,Len,Tcs,Line]), + if + Tcs =/= "_" -> + io:fwrite(File, " TokenChars = yypre(YYtcs, TokenLen),~n", []); + true -> ok + end, + io:fwrite(File, " ~s(~s);~n", [Name, ArgsChars]). + +out_action_code(_File, _XrlFile, {_A,empty_action}) -> + ok; +out_action_code(File, XrlFile, {_A,Code,_Vars,Name,Args,ArgsChars}) -> + %% Should set the file to the .erl file, but instead assumes that + %% ?LEEXINC is syntactically correct. + io:fwrite(File, "\n-compile({inline,~w/~w}).\n", [Name, length(Args)]), + {line, L} = erl_scan:token_info(hd(Code), line), + output_file_directive(File, XrlFile, L-2), + io:fwrite(File, "~s(~s) ->~n", [Name, ArgsChars]), + io:fwrite(File, " ~s\n", [pp_tokens(Code, L)]). + +%% pp_tokens(Tokens, Line) -> [char()]. +%% Prints the tokens keeping the line breaks of the original code. + +pp_tokens(Tokens, Line0) -> pp_tokens(Tokens, Line0, none). + +pp_tokens([], _Line0, _) -> []; +pp_tokens([T | Ts], Line0, Prev) -> + {line, Line} = erl_scan:token_info(T, line), + [pp_sep(Line, Line0, Prev, T), pp_symbol(T) | pp_tokens(Ts, Line, T)]. + +pp_symbol({var,_,Var}) -> atom_to_list(Var); +pp_symbol({_,_,Symbol}) -> io_lib:fwrite("~p", [Symbol]); +pp_symbol({dot, _}) -> "."; +pp_symbol({Symbol, _}) -> atom_to_list(Symbol). + +pp_sep(Line, Line0, Prev, T) when Line > Line0 -> + ["\n " | pp_sep(Line - 1, Line0, Prev, T)]; +pp_sep(_, _, {'.',_}, _) -> ""; % No space after '.' (not a dot) +pp_sep(_, _, {'#',_}, _) -> ""; % No space after '#' +pp_sep(_, _, {'(',_}, _) -> ""; % No space after '(' +pp_sep(_, _, {'[',_}, _) -> ""; % No space after '[' +pp_sep(_, _, _, {'.',_}) -> ""; % No space before '.' +pp_sep(_, _, _, {'#',_}) -> ""; % No space before '#' +pp_sep(_, _, _, {',',_}) -> ""; % No space before ',' +pp_sep(_, _, _, {')',_}) -> ""; % No space before ')' +pp_sep(_, _, _, _) -> " ". + +%% out_dfa_graph(LeexState, DFA, DfaStart) -> ok | error. +%% Writes the DFA to a .dot file in DOT-format which can be viewed +%% with Graphviz. + +out_dfa_graph(St, DFA, DF) -> + verbose_print(St, "Writing DFA to file ~s, ", [St#leex.gfile]), + case file:open(St#leex.gfile, [write]) of + {ok,Gfile} -> + try + io:fwrite(Gfile, "digraph DFA {~n", []), + out_dfa_states(Gfile, DFA, DF), + out_dfa_edges(Gfile, DFA), + io:fwrite(Gfile, "}~n", []), + verbose_print(St, "ok~n", []), + St + after file:close(Gfile) + end; + {error,Error} -> + verbose_print(St, "error~n", []), + add_error({none,leex,{file_error,Error}}, St) + end. + +out_dfa_states(File, DFA, DF) -> + foreach(fun (S) -> out_dfa_state(File, DF, S) end, DFA), + io:fwrite(File, "~n", []). + +out_dfa_state(File, DF, #dfa_state{no=DF, accept={accept,_}}) -> + io:fwrite(File, " ~b [shape=doublecircle color=green];~n", [DF]); +out_dfa_state(File, DF, #dfa_state{no=DF, accept=noaccept}) -> + io:fwrite(File, " ~b [shape=circle color=green];~n", [DF]); +out_dfa_state(File, _, #dfa_state{no=S, accept={accept,_}}) -> + io:fwrite(File, " ~b [shape=doublecircle];~n", [S]); +out_dfa_state(File, _, #dfa_state{no=S, accept=noaccept}) -> + io:fwrite(File, " ~b [shape=circle];~n", [S]). + +out_dfa_edges(File, DFA) -> + foreach(fun (#dfa_state{no=S,trans=Trans}) -> + Pt = pack_trans(Trans), + Tdict = foldl(fun ({Cr,T}, D) -> + orddict:append(T, Cr, D) + end, orddict:new(), Pt), + foreach(fun (T) -> + Crs = orddict:fetch(T, Tdict), + Edgelab = dfa_edgelabel(Crs), + io:fwrite(File, " ~b -> ~b [label=\"~s\"];~n", + [S,T,Edgelab]) + end, sort(orddict:fetch_keys(Tdict))) + end, DFA). + +dfa_edgelabel([C]) when is_integer(C) -> quote(C); +dfa_edgelabel(Cranges) -> + %% io:fwrite("el: ~p\n", [Cranges]), + "[" ++ map(fun ({A,B}) -> [quote(A), "-", quote(B)]; + (C) -> [quote(C)] + end, Cranges) ++ "]". + +output_file_directive(File, Filename, Line) -> + io:fwrite(File, <<"-file(~s, ~w).\n">>, + [format_filename(Filename), Line]). + +format_filename(Filename) -> + io_lib:write_string(filename:flatten(Filename)). + +quote($^) -> "\\^"; +quote($.) -> "\\."; +quote($$) -> "\\$"; +quote($-) -> "\\-"; +quote($[) -> "\\["; +quote($]) -> "\\]"; +quote($\s) -> "\\\\s"; +quote($\") -> "\\\""; +quote($\b) -> "\\\\b"; +quote($\f) -> "\\\\f"; +quote($\n) -> "\\\\n"; +quote($\r) -> "\\\\r"; +quote($\t) -> "\\\\t"; +quote($\e) -> "\\\\e"; +quote($\v) -> "\\\\v"; +quote($\d) -> "\\\\d"; +quote($\\) -> "\\\\"; +quote(C) when is_integer(C) -> + %% Must remove the $ and get the \'s right. + case io_lib:write_unicode_char(C) of + [$$,$\\|Cs] -> "\\\\" ++ Cs; + [$$|Cs] -> Cs + end; +quote(maxchar) -> + "MAXCHAR". |