Diffstat (limited to 'lib/xmerl/src/xmerl_regexp.erl')
-rw-r--r-- | lib/xmerl/src/xmerl_regexp.erl | 1437
1 files changed, 1437 insertions, 0 deletions
diff --git a/lib/xmerl/src/xmerl_regexp.erl b/lib/xmerl/src/xmerl_regexp.erl
new file mode 100644
index 0000000000..0c53e6f34a
--- /dev/null
+++ b/lib/xmerl/src/xmerl_regexp.erl
@@ -0,0 +1,1437 @@
+%%
+%% %CopyrightBegin%
+%%
+%% Copyright Ericsson AB 2006-2009. All Rights Reserved.
+%%
+%% The contents of this file are subject to the Erlang Public License,
+%% Version 1.1, (the "License"); you may not use this file except in
+%% compliance with the License. You should have received a copy of the
+%% Erlang Public License along with this software. If not, it can be
+%% retrieved online at http://www.erlang.org/.
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and limitations
+%% under the License.
+%%
+%% %CopyrightEnd%
+%%
+
+%%
+-module(xmerl_regexp).
+
+%% This module provides a basic set of regular expression functions
+%% for strings. The functions provided are taken from AWK.
+%%
+%% Note that we interpret the syntax tree of a regular expression
+%% directly instead of converting it to an NFA and then interpreting
+%% that. This method seems to go significantly faster.
+
+-export([sh_to_awk/1,parse/1,format_error/1,match/2,first_match/2,matches/2]).
+-export([sub/3,gsub/3,split/2,sub_match/2,sub_first_match/2]).
+
+-export([make_nfa/1,make_dfa/1,make_dfa/2,compile/1]).
+
+-import(string, [substr/2,substr/3]).
+-import(lists, [reverse/1,reverse/2,last/1,duplicate/2,seq/2]).
+-import(lists, [member/2,keysearch/3,keysort/2,map/2,foldl/3]).
+-import(ordsets, [is_element/2,add_element/2,union/2,subtract/2]).
+
+%%-compile([export_all]).
+
+-export([setup/1,compile_proc/2]).
+
+setup(RE0) ->
+    RE = setup(RE0, [$^]),
+    Pid = spawn(?MODULE,compile_proc,[self(),RE]),
+    receive
+        {ok,Result} ->
+            Result
+    after 2000 ->
+            exit(Pid,force),
+            parse(RE)
+    end.
+    %% compile(RE).
+%%RE.
+compile_proc(From,RE) ->
+    Res = compile(RE),
+    From ! {ok,Res}.
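For orientation, a minimal sketch of how the new setup/1 entry point is driven (illustrative only, not part of the patch): setup/1 rewrites the \d, \s, \i and \c shorthands (and their upper-case negations) into plain character classes, anchors the pattern as ^...$, and compiles it to a DFA in a helper process, falling back to the merely parsed form if compilation exceeds the 2-second timeout. Either result can be passed to first_match/2 or match/2:

    %% Illustrative usage; because setup/1 anchors the pattern it
    %% matches complete strings only.
    {ok,RE} = xmerl_regexp:setup("\\d+"),
    xmerl_regexp:first_match("123", RE),     %% => {match,1,3}
    xmerl_regexp:first_match("abc123", RE).  %% => nomatch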
+ + +setup([$\\,$d|S],Acc) -> setup(S,"]9-0[" ++Acc); +setup([$\\,$D|S],Acc) -> setup(S,"]9-0^[" ++Acc); +setup([$\\,$s|S],Acc) -> setup(S,"]s\\t\\n\\r\\[" ++Acc); +setup([$\\,$S|S],Acc) -> setup(S,"]\\s\\t\\n\\r^[" ++Acc); +setup([$\\,$i|S],Acc) -> setup(S,"]z-aZ-A_:[" ++Acc); %% Only Latin-1 now +setup([$\\,$I|S],Acc) -> setup(S,"]z-aZ-A_:^[" ++Acc); +setup([$\\,$c|S],Acc) -> setup(S,"]9-0z-aZ-A_:."++[183]++"-[" ++Acc); +setup([$\\,$C|S],Acc) -> setup(S,"]9-0z-aZ-A_:."++[183]++"-^[" ++Acc); +%% fixme setup([$\\,$w|S]) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup([$\\,$W|S]) -> {{comp_class,"\s\t\n\r"},S}; +%% Letter, Any +%% fixme setup(["\\p{L}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{L}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Letter, Uppercase +%% fixme setup(["\\p{Lu}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Lu}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Letter, Lowercase +%% fixme setup(["\\p{Ll}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Ll}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Letter, Titlecase +%% fixme setup(["\\p{Lt}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Lt}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Letter, Modifier +%% fixme setup(["\\p{Lm}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Lm}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Letter, Other +%% fixme setup(["\\p{Lo}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Lo}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Mark, Any +%% fixme setup(["\\p{M}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{M}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Mark, Nonspacing +%% fixme setup(["\\p{Mn}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Mn}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Mark, Spacing Combining +%% fixme setup(["\\p{Mc}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Mc}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Mark, Enclosing +%% fixme setup(["\\p{Me}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Me}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Number, Any +%% fixme setup(["\\p{N}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{N}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Number, Decimal Digit +%% fixme setup(["\\p{Nd}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Nd}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Number, Letter +%% fixme setup(["\\p{Nl}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Nl}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Number, Other +%% fixme setup(["\\p{No}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{No}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Punctuation, Any +%% fixme setup(["\\p{P}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{P}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Punctuation, Connector +%% fixme setup(["\\p{Pc}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Pc}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Punctuation, Dash +%% fixme setup(["\\p{Pd}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Pd}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Punctuation, Open +%% fixme setup(["\\p{Ps}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Ps}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Punctuation, Close +%% fixme setup(["\\p{Pe}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Pe}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Punctuation, Initial quote (may behave like Ps or Pe, depending on 
usage) +%% fixme setup(["\\p{Pi}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Pi}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Punctuation, Final quote (may behave like Ps or Pe, depending on usage) +%% fixme setup(["\\p{Pf}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Pf}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Punctuation, Other +%% fixme setup(["\\p{Po}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Po}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Symbol, Any +%% fixme setup(["\\p{S}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{S}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Symbol, Math +%% fixme setup(["\\p{Sm}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Sm}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Symbol, Currency +%% fixme setup(["\\p{Sc}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Sc}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Symbol, Modifier +%% fixme setup(["\\p{Sk}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Sk}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Symbol, Other +%% fixme setup(["\\p{So}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{So}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Separator, Any +%% fixme setup(["\\p{Z}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Z}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Separator, Space +%% fixme setup(["\\p{Zs}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Zs}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Separator, Line +%% fixme setup(["\\p{Zl}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Zl}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Separator, Paragraph +%% fixme setup(["\\p{Zp}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Zp}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Other, Any +%% fixme setup(["\\p{C}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{C}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Other, Control +%% fixme setup(["\\p{Cc}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Cc}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Other, Format +%% fixme setup(["\\p{Cf}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Cf}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Other, Surrogate not supported by schema recommendation +%% fixme setup(["\\p{Cs}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Cs}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Other, Private Use +%% fixme setup(["\\p{Co}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Co}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +%% Other, Not assigned (no characters in the file have this property) +%% fixme setup(["\\p{Cn}" ++ S) -> {{char_class,"\s\t\n\r"},S}; +%% fixme setup(["\\P{Cn}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; +setup([A|S], Acc) -> setup(S, [A|Acc]); +setup([],Acc) -> reverse([$$|Acc]). + +%% sh_to_awk(ShellRegExp) +%% Convert a sh style regexp into a full AWK one. The main difficulty is +%% getting character sets right as the conventions are different. + +sh_to_awk(Sh) -> "^(" ++ sh_to_awk_1(Sh). %Fix the beginning + +sh_to_awk_1([$*|Sh]) -> %This matches any string + ".*" ++ sh_to_awk_1(Sh); +sh_to_awk_1([$?|Sh]) -> %This matches any character + [$.|sh_to_awk_1(Sh)]; +sh_to_awk_1([$[,$^,$]|Sh]) -> %This takes careful handling + "\\^" ++ sh_to_awk_1(Sh); +%% Must move '^' to end. +sh_to_awk_1("[^" ++ Sh) -> [$[|sh_to_awk_2(Sh, true)]; +sh_to_awk_1("[!" 
++ Sh) -> "[^" ++ sh_to_awk_2(Sh, false); +sh_to_awk_1([$[|Sh]) -> [$[|sh_to_awk_2(Sh, false)]; +sh_to_awk_1([C|Sh]) -> + %% Unspecialise everything else which is not an escape character. + case sh_special_char(C) of + true -> [$\\,C|sh_to_awk_1(Sh)]; + false -> [C|sh_to_awk_1(Sh)] + end; +sh_to_awk_1([]) -> ")$". %Fix the end + +sh_to_awk_2([$]|Sh], UpArrow) -> [$]|sh_to_awk_3(Sh, UpArrow)]; +sh_to_awk_2(Sh, UpArrow) -> sh_to_awk_3(Sh, UpArrow). + +sh_to_awk_3([$]|Sh], true) -> "^]" ++ sh_to_awk_1(Sh); +sh_to_awk_3([$]|Sh], false) -> [$]|sh_to_awk_1(Sh)]; +sh_to_awk_3([C|Sh], UpArrow) -> [C|sh_to_awk_3(Sh, UpArrow)]; +sh_to_awk_3([], true) -> [$^|sh_to_awk_1([])]; +sh_to_awk_3([], false) -> sh_to_awk_1([]). + +%% -type sh_special_char(char()) -> bool(). +%% Test if a character is a special character. + +sh_special_char($|) -> true; +sh_special_char($*) -> true; +sh_special_char($+) -> true; +sh_special_char($?) -> true; +sh_special_char($() -> true; +sh_special_char($)) -> true; +sh_special_char($\\) -> true; +sh_special_char($^) -> true; +sh_special_char($$) -> true; +sh_special_char($.) -> true; +sh_special_char($[) -> true; +sh_special_char($]) -> true; +sh_special_char($") -> true; +sh_special_char(_C) -> false. + +%% parse(RegExp) -> {ok,RE} | {error,E}. +%% Parse the regexp described in the string RegExp. + +parse(S) -> + case catch reg(S, 0) of + {R,Sc,[]} -> {ok,{regexp,{R,Sc}}}; + {_R,_Sc,[C|_]} -> {error,{illegal,[C]}}; + {error,E} -> {error,E} + end. + +%% format_error(Error) -> String. + +format_error({interval_range,What}) -> + ["illegal interval range",io_lib:write_string(What)]; +format_error({illegal,What}) -> ["illegal character `",What,"'"]; +format_error({unterminated,What}) -> ["unterminated `",What,"'"]; +format_error({posix_cc,What}) -> + ["illegal POSIX character class ",io_lib:write_string(What)]; +format_error({char_class,What}) -> + ["illegal character class ",io_lib:write_string(What)]. + +%% match(String, RegExp) -> {match,Start,Length} | nomatch | {error,E}. +%% Find the longest match of RegExp in String. + +match(S, RegExp) when is_list(RegExp) -> + case parse(RegExp) of + {ok,RE} -> match(S, RE); + {error,E} -> {error,E} + end; +match(S, {regexp,RE}) -> + case match_re(RE, S, 1, 0, -1) of + {Start,Len} when Len >= 0 -> + {match,Start,Len}; + {_Start,_Len} -> nomatch + end; +match(S, {comp_regexp,RE}) -> + case match_comp(RE, S, 1, 0, -1) of + {Start,Len} when Len >= 0 -> + {match,Start,Len}; + {_Start,_Len} -> nomatch + end. + +match_re(RE, [_|Cs]=S0, P0, Mst, Mlen) -> + case re_apply(S0, P0, RE) of + {match,P1,_S1,_Subs} -> + Len = P1-P0, + if Len > Mlen -> match_re(RE, Cs, P0+1, P0, Len); + true -> match_re(RE, Cs, P0+1, Mst, Mlen) + end; + nomatch -> match_re(RE, Cs, P0+1, Mst, Mlen); + never_match -> {Mst,Mlen} %No need to go on + end; +match_re(_RE, _S, _P, Mst, Mlen) -> {Mst,Mlen}. + +match_comp(RE, [_|Cs]=S0, P0, Mst, Mlen) -> + case comp_apply(S0, P0, RE) of + {match,P1,_S1} -> + Len = P1-P0, + if Len > Mlen -> match_comp(RE, Cs, P0+1, P0, Len); + true -> match_comp(RE, Cs, P0+1, Mst, Mlen) + end; + nomatch -> match_comp(RE, Cs, P0+1, Mst, Mlen) + end; +match_comp(_RE, _S, _P, Mst, Mlen) -> {Mst,Mlen}. + +%% match_re(RE, S0, Pos0, Mst, Mlen) -> +%% case first_match_re(RE, S0, Pos0) of +%% {St,Len,_} -> %Found a match +%% Pos1 = St + 1, %Where to start next match +%% S1 = lists:nthtail(Pos1-Pos0, S0), +%% if Len > Mlen -> match_re(RE, S1, Pos1, St, Len); +%% true -> match_re(RE, S1, Pos1, Mst, Mlen) +%% end; +%% nomatch -> {Mst,Mlen} +%% end. 
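To make the conversion and matching entry points above concrete, a couple of illustrative calls (the results follow from the clauses above; the shell prompts are only for presentation):

    1> xmerl_regexp:sh_to_awk("*.erl").
    "^(.*\\.erl)$"
    2> xmerl_regexp:match("erl to erlang", "erl[a-z]*").
    {match,8,6}

match/2 reports the longest match, which is why the later "erlang" (position 8, length 6) wins over the leading "erl".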
+ +%% match_comp(RE, S0, Pos0, Mst, Mlen) -> +%% case first_match_comp(RE, S0, Pos0) of +%% {St,Len} -> %Found a match +%% Pos1 = St + 1, %Where to start next match +%% S1 = lists:nthtail(Pos1-Pos0, S0), +%% if Len > Mlen -> match_comp(RE, S1, Pos1, St, Len); +%% true -> match_comp(RE, S1, Pos1, Mst, Mlen) +%% end; +%% nomatch -> {Mst,Mlen} +%% end. + +%% first_match(String, RegExp) -> {match,Start,Length} | nomatch | {error,E}. +%% Find the first match of RegExp in String. + +first_match(S, RegExp) when is_list(RegExp) -> + case parse(RegExp) of + {ok,RE} -> first_match(S, RE); + {error,E} -> {error,E} + end; +first_match(S, {regexp,RE}) -> + case first_match_re(RE, S, 1) of + {Start,Len,_} -> {match,Start,Len}; + nomatch -> nomatch + end; +first_match(S, {comp_regexp,RE}) -> + case first_match_comp(RE, S, 1) of + {Start,Len} -> {match,Start,Len}; + nomatch -> nomatch + end. + +first_match_re(RE, S, St) when S /= [] -> + case re_apply(S, St, RE) of + {match,P,_Rest,Subs} -> {St,P-St,Subs}; + nomatch -> first_match_re(RE, tl(S), St+1); + never_match -> nomatch + end; +first_match_re(_RE, [], _St) -> nomatch. + +first_match_comp(RE, S, St) when S /= [] -> + case comp_apply(S, St, RE) of + {match,P,_Rest} -> {St,P-St}; + nomatch -> first_match_comp(RE, tl(S), St+1) + end; +first_match_comp(_RE, [], _St) -> nomatch. + +%% matches(String, RegExp) -> {match,[{Start,Length}]} | {error,E}. +%% Return the all the non-overlapping matches of RegExp in String. + +matches(S, RegExp) when is_list(RegExp) -> + case parse(RegExp) of + {ok,RE} -> matches(S, RE); + {error,E} -> {error,E} + end; +matches(S, {regexp,RE}) -> {match,matches_re(S, RE, 1)}; +matches(S, {comp_regexp,RE}) -> {match,matches_comp(S, RE, 1)}. + +matches_re([_|Cs]=S0, RE, P0) -> + case re_apply(S0, P0, RE) of + {match,P0,S1,_Subs} -> %0 length match + [{P0,0}|matches_re(tl(S1), RE, P0+1)]; + {match,P1,S1,_Subs} -> + [{P0,P1-P0}|matches_re(S1, RE, P1)]; + nomatch -> matches_re(Cs, RE, P0+1); + never_match -> [] + end; +matches_re([], _RE, _P) -> []. + +matches_comp([_|Cs]=S0, RE, P0) -> + case comp_apply(S0, P0, RE) of + {match,P0,S1} -> %0 length match + [{P0,0}|matches_comp(tl(S1), RE, P0+1)]; + {match,P1,S1} -> + [{P0,P1-P0}|matches_comp(S1, RE, P1)]; + nomatch -> matches_comp(Cs, RE, P0+1) + end; +matches_comp([], _RE, _P) -> []. + +%% sub(String, RegExp, Replace) -> {ok,RepString,RepCount} | {error,E}. +%% Substitute the first match of the regular expression RegExp with +%% the string Replace in String. Accept pre-parsed regular +%% expressions. + +sub(String, RegExp, Rep) when is_list(RegExp) -> + case parse(RegExp) of + {ok,RE} -> sub(String, RE, Rep); + {error,E} -> {error,E} + end; +sub(String, {regexp,RE}, Rep) -> + case sub_re(String, 1, RE, [], Rep) of + {yes,NewStr} -> {ok,NewStr,1}; + no -> {ok,String,0} + end; +sub(String, {comp_regexp,RE}, Rep) -> + case sub_comp(String, 1, RE, [], Rep) of + {yes,NewStr} -> {ok,NewStr,1}; + no -> {ok,String,0} + end. + +%% sub_re(String, Position, Regexp, Before, Replacement) -> +%% {NewString,Count}. +%% sub_comp(String, Position, Regexp, Before, Replacement) -> +%% {NewString,Count}. +%% Step forward over String until a match is found saving stepped over +%% chars in Before. Return reversed Before prepended to replacement +%% and rest of string. 
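The public wrappers defined above (together with sub_re/sub_comp below) give AWK-style behaviour; a few illustrative results, derived from the code rather than from running the patch:

    3> xmerl_regexp:first_match("erl to erlang", "erl[a-z]*").
    {match,1,3}
    4> xmerl_regexp:matches("abc abc", "abc").
    {match,[{1,3},{5,3}]}
    5> xmerl_regexp:sub("hello world", "o", "0").
    {ok,"hell0 world",1}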
+ +sub_re([C|Cs]=S0, P0, RE, Bef, Rep) -> + case re_apply(S0, P0, RE) of + {match,P0,_S1,_} -> %Ignore 0 length match + sub_re(Cs, P0+1, RE, [C|Bef], Rep); + {match,P1,Rest,_Gps} -> + {yes,reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), Rest))}; + nomatch -> sub_re(Cs, P0+1, RE, [C|Bef], Rep); + never_match -> no %No need to go on + end; +sub_re([], _P, _RE, _Bef, _Rep) -> no. + +sub_comp([C|Cs]=S0, P0, RE, Bef, Rep) -> + case comp_apply(S0, P0, RE) of + {match,P0,_S1} -> %Ignore 0 length match + sub_comp(Cs, P0+1, RE, [C|Bef], Rep); + {match,P1,Rest} -> + {yes,reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), Rest))}; + nomatch -> sub_comp(Cs, P0+1, RE, [C|Bef], Rep) + end; +sub_comp([], _P, _RE, _Bef, _Rep) -> no. + +sub_repl([$&|Rep], M, Rest) -> M ++ sub_repl(Rep, M, Rest); +sub_repl("\\&" ++ Rep, M, Rest) -> [$&|sub_repl(Rep, M, Rest)]; +sub_repl([C|Rep], M, Rest) -> [C|sub_repl(Rep, M, Rest)]; +sub_repl([], _M, Rest) -> Rest. + +%% gsub(String, RegExp, Replace) -> {ok,RepString,RepCount} | {error,E}. +%% Substitute every match of the regular expression RegExp with the +%% string New in String. Accept pre-parsed regular expressions. + +gsub(String, RegExp, Rep) when is_list(RegExp) -> + case parse(RegExp) of + {ok,RE} -> gsub(String, RE, Rep); + {error,E} -> {error,E} + end; +gsub(String, {regexp,RE}, Rep) -> + case gsub_re(String, 1, RE, [], Rep) of + {NewStr,N} -> {ok,NewStr,N}; + no -> {ok,String,0} %No substitutions + end; +gsub(String, {comp_regexp,RE}, Rep) -> + case gsub_comp(String, 1, RE, [], Rep) of + {NewStr,N} -> {ok,NewStr,N}; + no -> {ok,String,0} %No substitutions + end. + +%% gsub_re(String, Position, Regexp, Before, Replacement) -> +%% {NewString,Count}. +%% gsub_comp(String, Position, Regexp, Before, Replacement) -> +%% {NewString,Count}. +%% Step forward over String until a match is found saving stepped over +%% chars in Before. Call recursively to do rest of string after +%% match. Return reversed Before prepended to return from recursive +%% call. + +gsub_re([C|Cs]=S0, P0, RE, Bef, Rep) -> + case re_apply(S0, P0, RE) of + {match,P0,_S1,_} -> %Ignore 0 length match + gsub_re(Cs, P0+1, RE, [C|Bef], Rep); + {match,P1,S1,_Gps} -> + case gsub_re(S1, P1, RE, [], Rep) of + {NewStr,N0} -> %Substituitions + {reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), NewStr)), + N0+1}; + no -> %No substituitions. + {reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), S1)),1} + end; + %%No match so step forward saving C on Bef. + nomatch -> gsub_re(Cs, P0+1, RE, [C|Bef], Rep); + never_match -> no %No need to go on + end; +gsub_re([], _P, _RE, _Bef, _Rep) -> no. + +gsub_comp([C|Cs]=S0, P0, RE, Bef, Rep) -> + case comp_apply(S0, P0, RE) of + {match,P0,_S1} -> %Ignore 0 length match + gsub_comp(Cs, P0+1, RE, [C|Bef], Rep); + {match,P1,S1} -> + case gsub_comp(S1, P1, RE, [], Rep) of + {NewStr,N0} -> %Substituitions + {reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), NewStr)), + N0+1}; + no -> %No substituitions. + {reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), S1)),1} + end; + %%No match so step forward saving C on Bef. + nomatch -> gsub_comp(Cs, P0+1, RE, [C|Bef], Rep) + end; +gsub_comp([], _P, _RE, _Bef, _Rep) -> no. + +%% split(String, RegExp) -> {ok,[SubString]} | {error,E}. +%% Split a string into substrings where the RegExp describes the +%% field seperator. The RegExp " " is specially treated. 
+ +split(String, " ") -> %This is really special + {ok,{regexp,RE}} = parse("[ \t]+"), + case split_apply_re(String, RE, true) of + [[]|Ss] -> {ok,Ss}; + Ss -> {ok,Ss} + end; +split(String, RegExp) when is_list(RegExp) -> + case parse(RegExp) of + {ok,{regexp,RE}} -> {ok,split_apply_re(String, RE, false)}; + {error,E} -> {error,E} + end; +split(String, {regexp,RE}) -> {ok,split_apply_re(String, RE, false)}; +split(String, {comp_regexp,RE}) -> {ok,split_apply_comp(String, RE, false)}. + +split_apply_re(S, RE, Trim) -> split_apply_re(S, 1, RE, Trim, []). + +split_apply_re([], _P, _RE, true, []) -> []; +split_apply_re([], _P, _RE, _T, Sub) -> [reverse(Sub)]; +split_apply_re([C|Cs]=S, P0, RE, T, Sub) -> + case re_apply(S, P0, RE) of + {match,P0,_S1,_} -> %Ignore 0 length match + split_apply_re(Cs, P0+1, RE, T, [C|Sub]); + {match,P1,S1,_} -> + [reverse(Sub)|split_apply_re(S1, P1, RE, T, [])]; + nomatch -> + split_apply_re(Cs, P0+1, RE, T, [C|Sub]); + never_match -> [reverse(Sub, S)] %No need to go on + end. + +split_apply_comp(S, RE, Trim) -> split_apply_comp(S, 1, RE, Trim, []). + +%%split_apply_comp([], _P, _RE, true, []) -> []; +split_apply_comp([], _P, _RE, _T, Sub) -> [reverse(Sub)]; +split_apply_comp([C|Cs]=S, P0, RE, T, Sub) -> + case comp_apply(S, P0, RE) of + {match,P0,_S1} -> %Ignore 0 length match + split_apply_comp(Cs, P0+1, RE, T, [C|Sub]); + {match,P1,S1} -> + [reverse(Sub)|split_apply_comp(S1, P1, RE, T, [])]; + nomatch -> + split_apply_comp(Cs, P0+1, RE, T, [C|Sub]) + end. + +%% sub_match(String, RegExp) -> +%% {match,Start,Length,SubExprs} | nomatch | {error,E}. +%% Find the longest match of RegExp in String. + +sub_match(S, RegExp) when is_list(RegExp) -> + case parse(RegExp) of + {ok,RE} -> sub_match(S, RE); + {error,E} -> {error,E} + end; +sub_match(S, {regexp,RE}) -> + case sub_match_re(RE, S, 1, 0, -1, none) of + {Start,Len,Subs} when Len >= 0 -> + {match,Start,Len,Subs}; + {_Start,_Len,_Subs} -> nomatch + end. + +sub_match_re(RE, S0, Pos0, Mst, Mlen, Msubs) -> + case first_match_re(RE, S0, Pos0) of + {St,Len,Subs} -> %Found a match + Pos1 = St + 1, %Where to start next match + S1 = lists:nthtail(Pos1-Pos0, S0), + if Len > Mlen -> sub_match_re(RE, S1, Pos1, St, Len, Subs); + true -> sub_match_re(RE, S1, Pos1, Mst, Mlen, Msubs) + end; + nomatch -> {Mst,Mlen,Msubs} + end. + +%% sub_first_match(String, RegExp) -> +%% {match,Start,Length,SubExprs} | nomatch | {error,E}. +%% Find the longest match of RegExp in String, return Start and Length +%% as well as tuple of sub-expression matches. + +sub_first_match(S, RegExp) when is_list(RegExp) -> + {ok,RE} = parse(RegExp), + sub_first_match(S, RE); +sub_first_match(S, {regexp,RE}) -> + case first_match_re(RE, S, 1) of + {St,Len,Subs} -> {match,St,Len,Subs}; + nomatch -> nomatch + end. + + +%% This is the regular expression grammar used. It is equivalent to the +%% one used in AWK, except that we allow ^ $ to be used anywhere and fail +%% in the matching. +%% +%% reg -> reg1 : '$1'. +%% reg1 -> reg1 "|" reg2 : {'or','$1','$2'}. +%% reg1 -> reg2 : '$1'. +%% reg2 -> reg2 reg3 : {concat,'$1','$2'}. +%% reg2 -> reg3 : '$1'. +%% reg3 -> reg3 "*" : {kclosure,'$1'}. +%% reg3 -> reg3 "+" : {pclosure,'$1'}. +%% reg3 -> reg3 "?" : {optional,'$1'}. +%% reg3 -> reg3 "{" [Min],[Max] "}" : {closure_range, Num, '$1'} see below +%% reg3 -> reg4 : '$1'. +%% reg4 -> "(" reg ")" : '$2'. +%% reg4 -> "\\" char : '$2'. +%% reg4 -> "^" : bos. +%% reg4 -> "$" : eos. +%% reg4 -> "." : char. 
+%% reg4 -> "[" class "]" : {char_class,char_class('$2')} +%% reg4 -> "[" "^" class "]" : {comp_class,char_class('$3')} +%% reg4 -> "\"" chars "\"" : char_string('$2') +%% reg4 -> char : '$1'. +%% reg4 -> empty : epsilon. +%% The grammar of the current regular expressions. The actual parser +%% is a recursive descent implementation of the grammar. + +reg(S, Sc) -> reg1(S, Sc). + +%% reg1 -> reg2 reg1' +%% reg1' -> "|" reg2 +%% reg1' -> empty + +reg1(S0, Sc0) -> + {L,Sc1,S1} = reg2(S0, Sc0), + reg1p(S1, L, Sc1). + +reg1p([$||S0], L, Sc0) -> + {R,Sc1,S1} = reg2(S0, Sc0), + reg1p(S1, {'or',L,R}, Sc1); +reg1p(S, L, Sc) -> {L,Sc,S}. + +%% reg2 -> reg3 reg2' +%% reg2' -> reg3 +%% reg2' -> empty + +reg2(S0, Sc0) -> + {L,Sc1,S1} = reg3(S0, Sc0), + reg2p(S1, L, Sc1). + +reg2p([C|S0], L, Sc0) when C /= $|, C /= $) -> + {R,Sc1,S1} = reg3([C|S0], Sc0), + %% reg2p(S1, {concat,L,R}, Sc1); + case is_integer(R) of + true -> + case L of + {literal,Lit} -> + reg2p(S1, {literal,Lit ++[R]}, Sc1); + {concat,S2,Char} when is_integer(Char) -> + reg2p(S1, {concat,S2,{literal,[Char,R]}}, Sc1); + {concat,S2,{literal,Lit}} -> + reg2p(S1, {concat,S2,{literal,Lit ++ [R]}}, Sc1); + Char when is_integer(Char) -> + reg2p(S1, {literal,[Char,R]}, Sc1); + _ -> + reg2p(S1, {concat,L,R}, Sc1) + end; + false -> + reg2p(S1, {concat,L,R}, Sc1) + end; +reg2p(S, L, Sc) -> {L,Sc,S}. + +%% reg3 -> reg4 reg3' +%% reg3' -> "*" reg3' +%% reg3' -> "+" reg3' +%% reg3' -> "?" reg3' +%% reg3' -> "{" [Min],[Max] "}" reg3' +%% reg3' -> empty + +reg3(S0, Sc0) -> + {L,Sc1,S1} = reg4(S0, Sc0), + reg3p(S1, L, Sc1). + +reg3p([$*|S], L, Sc) -> reg3p(S, {kclosure,L}, Sc); +reg3p([$+|S], L, Sc) -> reg3p(S, {pclosure,L}, Sc); +reg3p([$?|S], L, Sc) -> reg3p(S, {optional,L}, Sc); +reg3p([${|Cs0], L, Sc) -> % $} + case interval_range(Cs0) of + {none,none,_Cs1} -> parse_error({interval_range,[${|Cs0]}); + {N,M,[$}|Cs1]} -> reg3p(Cs1, {iclosure,L,N,M}, Sc); + {_N,_M,_Cs1} -> parse_error({unterminated,"{"}) + end; +reg3p(S, L, Sc) -> {L,Sc,S}. + +reg4([$(|S0], Sc0) -> + Sc1 = Sc0+1, + case reg(S0, Sc1) of + {R,Sc2,[$)|S1]} -> {{subexpr,Sc1,R},Sc2,S1}; + {_R,_Sc,_S} -> parse_error({unterminated,"("}) + end; +reg4([$^|S], Sc) -> {bos,Sc,S}; +reg4([$$|S], Sc) -> {eos,Sc,S}; +reg4([$.|S], Sc) -> {{comp_class,"\n"},Sc,S}; +reg4("[^" ++ S0, Sc) -> + case char_class(S0) of + {Cc,[$]|S1]} -> {{comp_class,Cc},Sc,S1}; + {_Cc,_S} -> parse_error({unterminated,"["}) + end; +reg4([$[|S0], Sc) -> + case char_class(S0) of + {Cc,[$]|S1]} -> {{char_class,Cc},Sc,S1}; + {_Cc,_S1} -> parse_error({unterminated,"["}) + end; +%reg4([$"|S0], Sc) -> +% case char_string(S0) of +% {St,[$"|S1]} -> {St,Sc,S1}; +% {St,S1} -> parse_error({unterminated,"\""}) +% end; +reg4([C0|S0], Sc) when + is_integer(C0), C0 /= $*, C0 /= $+, C0 /= $?, C0 /= $], C0 /= $), C0 /= $} -> + %% Handle \ quoted characters as well, at least those we see. + {C1,S1} = char(C0, S0), + {C1,Sc,S1}; +reg4(S=[$)|_], Sc) -> {epsilon,Sc,S}; +reg4([C|_S], _Sc) -> parse_error({illegal,[C]}); +reg4([], Sc) -> {epsilon,Sc,[]}. + +char($\\, [O1,O2,O3|S]) when + O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 -> + {(O1*8 + O2)*8 + O3 - 73*$0,S}; +char($\\, [C|S]) -> {escape_char(C),S}; +char($\\, []) -> parse_error({unterminated,"\\"}); +char(C, S) -> {C,S}. 
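A worked example of what the recursive-descent parser above produces for this grammar (illustrative): adjacent single characters are merged into {literal,...} nodes, 99 is the character code of $c, and the trailing 0 is the sub-expression count:

    1> xmerl_regexp:parse("ab|c").
    {ok,{regexp,{{'or',{literal,"ab"},99},0}}}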
+ +escape_char($n) -> $\n; %\n = LF +escape_char($r) -> $\r; %\r = CR +escape_char($t) -> $\t; %\t = TAB +escape_char($v) -> $\v; %\v = VT +escape_char($b) -> $\b; %\b = BS +escape_char($f) -> $\f; %\f = FF +escape_char($e) -> $\e; %\e = ESC +escape_char($s) -> $\s; %\s = SPACE +escape_char($d) -> $\d; %\d = DEL +escape_char(C) -> C. + +char_class([$]|S0]) -> + {Cc,S1} = char_class(S0, [$]]), + {pack_cc(Cc),S1}; +char_class(S0) -> + {Cc,S1} = char_class(S0, []), + {pack_cc(Cc),S1}. + +pack_cc(Cc0) -> + %% First sort the list. + Cc1 = lists:usort(fun ({Cf1,_}, {Cf2,_}) -> Cf1 < Cf2; + ({Cf1,_}, C) -> Cf1 < C; + (C, {Cf,_}) -> C < Cf; + (C1, C2) -> C1 =< C2 + end, Cc0), + pack_cc1(Cc1). + +pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1 >= Cf2, Cl1 =< Cl2 -> + pack_cc1([{Cf1,Cl2}|Cc]); +pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1 >= Cf2, Cl1 >= Cl2 -> + pack_cc1([{Cf1,Cl1}|Cc]); +pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1+1 == Cf2 -> + pack_cc1([{Cf1,Cl2}|Cc]); +pack_cc1([{Cf,Cl},C|Cc]) when Cl >= C -> pack_cc1([{Cf,Cl}|Cc]); +pack_cc1([{Cf,Cl},C|Cc]) when Cl+1 == C -> pack_cc1([{Cf,C}|Cc]); +pack_cc1([C,{Cf,Cl}|Cc]) when C == Cf-1 -> pack_cc1([{C,Cl}|Cc]); +pack_cc1([C1,C2|Cc]) when C1+1 == C2 -> pack_cc1([{C1,C2}|Cc]); +pack_cc1([C|Cc]) -> [C|pack_cc1(Cc)]; +pack_cc1([]) -> []. + +char_class("[:" ++ S0, Cc0) -> %Start of POSIX char class + case posix_cc(S0, Cc0) of + {Cc1,":]" ++ S1} -> char_class(S1, Cc1); + {_,_S1} -> parse_error({posix_cc,"[:" ++ S0}) + end; +char_class([C1|S0], Cc) when C1 /= $] -> + case char(C1, S0) of + {Cf,[$-,C2|S1]} when C2 /= $] -> + case char(C2, S1) of + {Cl,S2} when Cf < Cl -> char_class(S2, [{Cf,Cl}|Cc]); + {_Cl,_S2} -> parse_error({char_class,[C1|S0]}) + end; + {C,S1} -> char_class(S1, [C|Cc]) + end; +char_class(S, Cc) -> {Cc,S}. + +%% posix_cc(String, CharClass) -> {NewCharClass,RestString}. +%% Handle POSIX character classes, use Latin-1 character set. + +posix_cc("alnum" ++ S, Cc) -> + {[{$0,$9},{$A,$Z},{192,214},{216,223},{$a,$z},{224,246},{248,255}|Cc],S}; +posix_cc("alpha" ++ S, Cc) -> + {[{$A,$Z},{192,214},{216,223},{$a,$z},{224,246},{248,255}|Cc],S}; +posix_cc("blank" ++ S, Cc) -> {[$\s,$\t,160|Cc],S}; +posix_cc("cntrl" ++ S, Cc) -> {[{0,31},{127,159}|Cc],S}; +posix_cc("digit" ++ S, Cc) -> {[{$0,$9}|Cc],S}; +posix_cc("graph" ++ S, Cc) -> {[{33,126},{161,255}|Cc],S}; +posix_cc("lower" ++ S, Cc) -> {[{$a,$z},{224,246},{248,255}|Cc],S}; +posix_cc("print" ++ S, Cc) -> {[{32,126},{160,255}|Cc],S}; +posix_cc("punct" ++ S, Cc) -> {[{$!,$/},{$:,$?},{${,$~},{161,191}|Cc],S}; +posix_cc("space" ++ S, Cc) -> {[$\s,$\t,$\f,$\r,$\v,160|Cc],S}; +posix_cc("upper" ++ S, Cc) -> {[{$A,$Z},{192,214},{216,223}|Cc],S}; +posix_cc("xdigit" ++ S, Cc) -> {[{$a,$f},{$A,$F},{$0,$9}|Cc],S}; +posix_cc(S, _Cc) -> parse_error({posix_cc,"[:" ++ S}). + +interval_range(Cs0) -> + case number(Cs0) of + {none,Cs1} -> {none,none,Cs1}; + {N,[$,|Cs1]} -> + case number(Cs1) of + {none,Cs2} -> {N,any,Cs2}; + {M,Cs2} -> {N,M,Cs2} + end; + {N,Cs1} -> {N,none,Cs1} + end. + +number([C|Cs]) when C >= $0, C =< $9 -> + number(Cs, C - $0); +number(Cs) -> {none,Cs}. + +number([C|Cs], Acc) when C >= $0, C =< $9 -> + number(Cs, 10*Acc + (C - $0)); +number(Cs, Acc) -> {Acc,Cs}. + +parse_error(E) -> throw({error,E}). + +%char_string([C|S]) when C /= $" -> char_string(S, C); +%char_string(S) -> {epsilon,S}. + +%char_string([C|S0], L) when C /= $" -> +% char_string(S0, {concat,L,C}); +%char_string(S, L) -> {L,S}. + +%% re_apply(String, StartPos, RegExp) -> +%% {match,RestPos,Rest,SubExprs} | nomatch. 
+%% +%% Apply the (parse of the) regular expression RegExp to String. If +%% there is a match return the position of the remaining string and +%% the string if else return 'nomatch'. +%% +%% StartPos should be the real start position as it is used to decide +%% if we are at the beginning of the string. + +re_apply(S, St, {RE,Sc}) -> + Subs = erlang:make_tuple(Sc, none), %Make a sub-regexp table. + Res = re_apply(RE, [], S, St, Subs), + %% io:format("~p x ~p -> ~p\n", [RE,S,Res]), + Res. + +re_apply(epsilon, More, S, P, Subs) -> %This always matches + re_apply_more(More, S, P, Subs); +re_apply({'or',RE1,RE2}, More, S, P, Subs) -> + re_apply_or(re_apply(RE1, More, S, P, Subs), + re_apply(RE2, More, S, P, Subs)); +re_apply({concat,RE1,RE2}, More, S0, P, Subs) -> + re_apply(RE1, [RE2|More], S0, P, Subs); +re_apply({literal,[C|Lcs]}, More, [C|S], P, Subs) -> + re_apply_lit(Lcs, More, S, P+1, Subs); %Have matched first char +re_apply({kclosure,RE}, More, S0, P0, Subs0) -> + %% Greedy so try RE first, no difference here actually. + Loop = case re_apply(RE, [], S0, P0, Subs0) of + {match,P0,_S1,_Subs1} -> %0 length match, don't loop! + nomatch; + {match,P1,S1,Subs1} -> + re_apply_more([{kclosure,RE}|More], S1, P1, Subs1); + nomatch -> nomatch; + never_match -> never_match + end, + re_apply_or(Loop, re_apply_more(More, S0, P0, Subs0)); +re_apply({pclosure,RE}, More, S, P, Subs) -> + re_apply(RE, [{kclosure,RE}|More], S, P, Subs); +re_apply({optional,RE}, More, S, P, Subs) -> + %% Greedy so try RE first, no difference here actually. + re_apply_or(re_apply(RE, More, S, P, Subs), + re_apply_more(More, S, P, Subs)); +re_apply({iclosure,RE,N,M}, More, S, P, Subs) when N > 0 -> + re_apply(RE, [{iclosure,RE,N-1,M}|More], S, P, Subs); +re_apply({iclosure,RE,0,M}, More, S, P, Subs) -> + Exp = expand_opt(RE, M), + re_apply(Exp, More, S, P, Subs); +re_apply({subexpr,N,RE}, More, S, P, Subs) -> + re_apply(RE, [{endsub,N,P}|More], S, P, Subs); +re_apply({endsub,N,St}, More, S, P, Subs0) -> + Subs1 = setelement(N, Subs0, {St,P-St}), %Record sub-expr + re_apply_more(More, S, P, Subs1); +re_apply(bos, More, S, 1, Subs) -> re_apply_more(More, S, 1, Subs); +re_apply(bos, _More, _S, _, _) -> never_match; +re_apply(eos, More, [$\n], P, Subs) -> re_apply_more(More, [], P, Subs); +re_apply(eos, More, [], P, Subs) -> re_apply_more(More, [], P, Subs); +re_apply({char_class,Cc}, More, [C|S], P, Subs) -> + case in_char_class(C, Cc) of + true -> re_apply_more(More, S, P+1, Subs); + false -> nomatch + end; +re_apply({comp_class,Cc}, More, [C|S], P, Subs) -> + case in_char_class(C, Cc) of + true -> nomatch; + false -> re_apply_more(More, S, P+1, Subs) + end; +re_apply(C, More, [C|S], P, Subs) when is_integer(C) -> + re_apply_more(More, S, P+1, Subs); +re_apply(_RE, _More, _S, _P, _Subs) -> + %% io:format("~p : ~p\n", [_RE,_S]), + nomatch. + +%% re_apply_more([RegExp], String, Length, SubsExprs) -> +%% {match,RestPos,Rest,SubExprs} | nomatch. + +re_apply_more([RE|More], S, P, Subs) -> re_apply(RE, More, S, P, Subs); +re_apply_more([], S, P, Subs) -> {match,P,S,Subs}. + +%% re_apply_lit(Literal, More, String, Position, SubExprs) -> +%% {match,RestPos,Rest,SubExprs} | nomatch. +re_apply_lit([C|Lit], More, [C|Cs], P, Subs) -> + re_apply_lit(Lit, More, Cs, P+1, Subs); +re_apply_lit([], More, Cs, P, Subs) -> + re_apply_more(More, Cs, P, Subs); +re_apply_lit(_Lit, _More, _Cs, _P, _Subs) -> + nomatch. + +%% expand_iclosure(RE, N, M) -> RE. 
+ +expand_iclosure(RE, 0, M) -> expand_opt(RE, M); +expand_iclosure(RE, N, M) -> + {concat,RE,expand_iclosure(RE, N-1, M)}. + +%% expand_opt(RegExp, Count) -> RE. +%% Handle all the cases. + +expand_opt(_RE, none) -> epsilon; +expand_opt(RE, any) -> {kclosure,RE}; +expand_opt(_RE, 0) -> epsilon; +expand_opt(RE, 1) -> {optional,RE}; +expand_opt(RE, N) -> + {optional,{concat,RE,expand_opt(RE, N-1)}}. + +%% find_prefix(PrefixStr, SourceStr) +%% if PrefixStr is a prefix of Str then return {ok,RemainingStr} +%% otherwise return false + +%% find_prefix([C|Prest], [C|Rest]) -> +%% find_prefix(Prest, Rest); +%% find_prefix([], Rest) -> {yes,Rest}; +%% find_prefix(_, _) -> no. + +%% in_char_class(Char, Class) -> bool(). + +in_char_class(C, [{C1,C2}|_Cc]) when C >= C1, C =< C2 -> true; +in_char_class(C, [C|_Cc]) -> true; +in_char_class(C, [_|Cc]) -> in_char_class(C, Cc); +in_char_class(_C, []) -> false. + +%% re_apply_or(Match1, Match2, SubExprs) -> +%% {match,RestPos,Rest,SubExprs} | nomatch. +%% If we want the best match then choose the longest match, else just +%% choose one by trying sequentially. + +re_apply_or(M1={match,P1,_,_},{match,P2,_,_}) when P1 >= P2 -> M1; +re_apply_or({match,_,_,_}, M2={match,_,_,_}) -> M2; +re_apply_or(never_match, R2) -> R2; +re_apply_or(R1, never_match) -> R1; +re_apply_or(nomatch, R2) -> R2; +re_apply_or(R1, nomatch) -> R1. + +%% Record definitions for the NFA, DFA and compiler. + +-record(nfa_state, {no,edges=[],accept=no}). +-record(dfa_state, {no,nfa=[],trans=[],accept=no}). + +-record(c_state, {no,trans=[],tmin=0,smin=none,tmax=0,smax=none, + accept=false,spec=[]}). + +%% We use standard methods, Thompson's construction and subset +%% construction, to create first an NFA and then a DFA from the +%% regexps. A non-standard feature is that we work with sets of +%% character ranges (crs) instead sets of characters. This is most +%% noticeable when constructing DFAs. The major benefit is that we can +%% handle characters from any set, not just limited ASCII or 8859, +%% even 16/32 bit unicode. +%% +%% The whole range of characters is 0-maxchar, where maxchar is a BIG +%% number. We don't make any assumptions about the size of maxchar, it +%% is just bigger than any character. +%% +%% Using character ranges makes describing many regexps very simple, +%% for example the regexp "." just becomes the range +%% [{0-9},{11-maxchar}]. + +%% make_nfa(RegExpActions) -> {ok,{NFA,StartState}} | {error,E}. +%% Build a complete nfa from a list of {RegExp,Action}. The NFA field +%% accept has values {yes,Action}|no. The NFA is a list of states. + +make_nfa(REAs0) -> + case parse_reas(REAs0) of + {ok,REAs1} -> + {NFA,Start} = build_combined_nfa(REAs1), + {ok,{NFA,Start}}; + {error,E} -> {error,E} + end. + +%% make_dfa(RegExpActions) -> {ok,{DFA,StartState}} | {error,E}. +%% make_dfa(RegExpActions, LowestState) -> {ok,{DFA,StartState}} | {error,E}. +%% Build a complete dfa from a list of {RegExp,Action}. The DFA field +%% accept has values {yes,Action}|no. If multiple Regexps can result +%% in same match string then RegExpActions list define priority. + +make_dfa(REAs) -> make_dfa(REAs, 0). + +make_dfa(REAs0, Low) -> + case parse_reas(REAs0) of + {ok,REAs1} -> + {NFA,Start0} = build_combined_nfa(REAs1), + {DFA0,Start1} = build_dfa(NFA, Start0), + {DFA,Start} = minimise_dfa(DFA0, Start1, Low), + {ok,{DFA,Start}}; + {error,E} -> {error,E} + end. + +parse_reas(REAs) -> parse_reas(REAs, []). 
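A sketch of how the NFA/DFA builders described above are used on their own (illustrative; rule_a and rule_num are arbitrary action terms chosen for the example):

    {ok,{DFA,Start}} = xmerl_regexp:make_dfa([{"a+",rule_a},{"[0-9]+",rule_num}]),
    %% DFA is a list of #dfa_state{} records whose accept field is
    %% {yes,Action} or no; Start is the initial state number.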
+
+parse_reas([{{regexp,{R,_Sc}},A}|REAs], S) ->   %Already parsed
+    parse_reas(REAs, [{R,A}|S]);
+parse_reas([{RegExp,A}|REAs], S) ->
+    case parse(RegExp) of
+        {ok,{regexp,{R,_Sc}}} -> parse_reas(REAs, [{R,A}|S]);
+        {error,E} -> {error,E}
+    end;
+parse_reas([], Stack) -> {ok,reverse(Stack)}.
+
+%% build_combined_nfa(RegExpActionList) -> {NFA,StartState}.
+%% Build the combined NFA using Thompson's construction straight out
+%% of the book. Build the separate NFAs in the same order as the
+%% rules so that the accepting states have ascending state numbers.
+%% Start numbering the states from 1 as we put the states in a tuple
+%% with the state number as the index.
+
+build_combined_nfa(REAs) ->
+    {NFA,Starts,Next} = build_nfa_list(REAs, [], [], 1),
+    F = #nfa_state{no=Next,edges=epsilon_trans(Starts),accept=no},
+    {[F|NFA],Next}.
+
+build_nfa_list([{RE,Action}|REAs], NFA0, Starts, Next0) ->
+    {NFA1,Next1,Start} = build_nfa(RE, Next0, Action),
+    build_nfa_list(REAs, NFA1 ++ NFA0, [Start|Starts], Next1);
+build_nfa_list([], NFA, Starts, Next) ->
+    {NFA,reverse(Starts),Next}.
+
+epsilon_trans(Sts) -> [ {epsilon,S} || S <- Sts ].
+
+%% build_nfa(RegExp, NextState, Action) -> {NFA,NextFreeState,StartState}.
+%% When building the NFA states for a regexp we don't build the end
+%% state, just allocate a State for it and return this state
+%% number. This allows us to avoid building unnecessary states for
+%% concatenation which would then have to be removed by overwriting
+%% an existing state.
+
+build_nfa(RE, Next, Action) ->
+    {NFA,N,E} = build_nfa(RE, Next+1, Next, []),
+    {[#nfa_state{no=E,accept={yes,Action}}|NFA],N,Next}.
+
+%% build_nfa(RegExp, NextState, StartState, NFA) -> {NFA,NextState,EndState}.
+%% The NFA is a list of nfa_state in no predefined order. The state
+%% number of the returned EndState is already allocated!
+ +build_nfa({'or',RE1,RE2}, N0, S, NFA0) -> + {NFA1,N1,E1} = build_nfa(RE1, N0+1, N0, NFA0), + {NFA2,N2,E2} = build_nfa(RE2, N1+1, N1, NFA1), + E = N2, + {[#nfa_state{no=S,edges=[{epsilon,N0},{epsilon,N1}]}, + #nfa_state{no=E1,edges=[{epsilon,E}]}, + #nfa_state{no=E2,edges=[{epsilon,E}]}|NFA2], + N2+1,E}; +build_nfa({literal,[]}, N, S, NFA) -> + {NFA,N,S}; +build_nfa({literal,[C|Cs]}, N0, S, NFA0) -> + {NFA1,N1,E1} = build_nfa(C, N0, S, NFA0), + build_nfa({literal,Cs}, N1, E1, NFA1); +build_nfa({concat,RE1,RE2}, N0, S, NFA0) -> + {NFA1,N1,E1} = build_nfa(RE1, N0, S, NFA0), + {NFA2,N2,E2} = build_nfa(RE2, N1, E1, NFA1), + {NFA2,N2,E2}; +build_nfa({kclosure,RE}, N0, S, NFA0) -> + {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), + E = N1, + {[#nfa_state{no=S,edges=[{epsilon,N0},{epsilon,E}]}, + #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1], + N1+1,E}; +build_nfa({pclosure,RE}, N0, S, NFA0) -> + {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), + E = N1, + {[#nfa_state{no=S,edges=[{epsilon,N0}]}, + #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1], + N1+1,E}; +build_nfa({optional,RE}, N0, S, NFA0) -> + {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), + E = N1, + {[#nfa_state{no=S,edges=[{epsilon,N0},{epsilon,E}]}, + #nfa_state{no=E1,edges=[{epsilon,E}]}|NFA1], + N1+1,E}; +build_nfa({iclosure,RE,I1,I2}, N, S, NFA) -> + Exp = expand_iclosure(RE, I1, I2), + build_nfa(Exp, N, S, NFA); +build_nfa({char_class,Cc}, N, S, NFA) -> + {[#nfa_state{no=S,edges=[{nfa_char_class(Cc),N}]}|NFA],N+1,N}; +build_nfa({comp_class,Cc}, N, S, NFA) -> + {[#nfa_state{no=S,edges=[{nfa_comp_class(Cc),N}]}|NFA],N+1,N}; +build_nfa(epsilon, N, S, NFA) -> + {NFA,N,S}; +build_nfa({group,RE}, N, S, NFA) -> %%% FIXME %%%%%%% + build_nfa(RE, N, S, NFA); +build_nfa({subexpr,_N,RE}, N, S, NFA) -> %%% FIXME %%%%%%% + build_nfa(RE, N, S, NFA); +build_nfa(bos, N, S, NFA) -> + {[#nfa_state{no=S,edges=[{[bos],N}]}|NFA],N+1,N}; +build_nfa(eos, N, S, NFA) -> + {[#nfa_state{no=S,edges=[{[eos],N}]}|NFA],N+1,N}; +%%{[#nfa_state{no=S,edges=[{[eos],N}]}|NFA],N+1,N}; +build_nfa(C, N, S, NFA) when is_integer(C) -> + {[#nfa_state{no=S,edges=[{[{C,C}],N}]}|NFA],N+1,N}. + +nfa_char_class(Cc) -> + Crs = lists:foldl(fun({C1,C2}, Set) -> add_element({C1,C2}, Set); + (C, Set) -> add_element({C,C}, Set) end, [], Cc), + %% io:fwrite("cc: ~p\n", [Crs]), + pack_crs(Crs). + +pack_crs([{C1,C2}=Cr,{C3,C4}|Crs]) when C1 =< C3, C2 >= C4 -> + %% C1 C2 + %% C3 C4 + pack_crs([Cr|Crs]); +pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 >= C3, C2 < C4 -> + %% C1 C2 + %% C3 C4 + pack_crs([{C1,C4}|Crs]); +pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 + 1 == C3 -> + %% C1 C2 + %% C3 C4 + pack_crs([{C1,C4}|Crs]); +pack_crs([Cr|Crs]) -> [Cr|pack_crs(Crs)]; +pack_crs([]) -> []. + +nfa_comp_class(Cc) -> + Crs = nfa_char_class(Cc), + %% io:fwrite("comp: ~p\n", [Crs]), + comp_crs(Crs, 0). + +comp_crs([{C1,C2}|Crs], Last) -> + [{Last,C1-1}|comp_crs(Crs, C2+1)]; +comp_crs([], Last) -> [{Last,maxchar}]. + +%% build_dfa(NFA, NfaStartState) -> {DFA,DfaStartState}. +%% Build a DFA from an NFA using "subset construction". The major +%% difference from the book is that we keep the marked and unmarked +%% DFA states in seperate lists. New DFA states are added to the +%% unmarked list and states are marked by moving them to the marked +%% list. We assume that the NFA accepting state numbers are in +%% ascending order for the rules and use ordsets to keep this order. + +build_dfa(NFA0, Start) -> + %% We want NFA as sorted tuple for fast access, assume lowest state 1. 
+ NFA1 = list_to_tuple(keysort(#nfa_state.no, NFA0)), + D = #dfa_state{no=0,nfa=eclosure([Start], NFA1),accept=no}, + {build_dfa([D], 1, [], NFA1),0}. + +%% build_dfa([UnMarked], NextState, [Marked], NFA) -> DFA. +%% Traverse the unmarked states. Temporarily add the current unmarked +%% state to the marked list before calculating translation, this is +%% to avoid adding too many duplicate states. Add it properly to the +%% marked list afterwards with correct translations. + +build_dfa([U|Us0], N0, Ms, NFA) -> + {Ts,Us1,N1} = build_dfa(U#dfa_state.nfa, Us0, N0, [], [U|Ms], NFA), + M = U#dfa_state{trans=Ts,accept=accept(U#dfa_state.nfa, NFA)}, + build_dfa(Us1, N1, [M|Ms], NFA); +build_dfa([], _N, Ms, _NFA) -> Ms. + +%% build_dfa([NfaState], [Unmarked], NextState, [Transition], [Marked], NFA) -> +%% {Transitions,UnmarkedStates,NextState}. +%% Foreach NFA state set calculate the legal translations. N.B. must +%% search *BOTH* the unmarked and marked lists to check if DFA state +%% already exists. As the range of characters is potentially VERY +%% large we cannot explicitly test all characters. Instead we first +%% calculate the set of all disjoint character ranges which are +%% possible candidates to the set of NFA states. + +build_dfa(Set, Us, N, Ts, Ms, NFA) -> + %% List of all transition sets. + Crs0 = [Cr || S <- Set, + {Crs,_St} <- (element(S, NFA))#nfa_state.edges, + is_list(Crs), + Cr <- Crs ], + Crs1 = lists:usort(Crs0), %Must remove duplicates! + %% Build list of disjoint test ranges. + Test = disjoint_crs(Crs1), + %% io:fwrite("bd: ~p\n ~p\n ~p\n ~p\n", [Set,Crs0,Crs1,Test]), + build_dfa(Test, Set, Us, N, Ts, Ms, NFA). + +%% disjoint_crs([CharRange]) -> [CharRange]. +%% Take a sorted list of char ranges and make a sorted list of +%% disjoint char ranges. No new char range extends past an existing +%% char range. + +disjoint_crs([{_C1,C2}=Cr1,{C3,_C4}=Cr2|Crs]) when C2 < C3 -> + %% C1 C2 + %% C3 C4 + [Cr1|disjoint_crs([Cr2|Crs])]; +disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 == C3 -> + %% C1 C2 + %% C3 C4 + [{C1,C2}|disjoint_crs(add_element({C2+1,C4}, Crs))]; +disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 >= C3, C2 < C4 -> + %% C1 C2 + %% C3 C4 + [{C1,C3-1}|disjoint_crs(union([{C3,C2},{C2+1,C4}], Crs))]; +disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 == C4 -> + %% C1 C2 + %% C3 C4 + [{C1,C3-1}|disjoint_crs(add_element({C3,C4}, Crs))]; +disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 > C4 -> + %% C1 C2 + %% C3 C4 + [{C1,C3-1}|disjoint_crs(union([{C3,C4},{C4+1,C2}], Crs))]; +disjoint_crs([Cr|Crs]) -> [Cr|disjoint_crs(Crs)]; +disjoint_crs([]) -> []. + +build_dfa([Cr|Crs], Set, Us, N, Ts, Ms, NFA) -> + case eclosure(move(Set, Cr, NFA), NFA) of + S when S /= [] -> + case keysearch(S, #dfa_state.nfa, Us) of + {value,#dfa_state{no=T}} -> + build_dfa(Crs, Set, Us, N, [{Cr,T}|Ts], Ms, NFA); + false -> + case keysearch(S, #dfa_state.nfa, Ms) of + {value,#dfa_state{no=T}} -> + build_dfa(Crs, Set, Us, N, [{Cr,T}|Ts], Ms, NFA); + false -> + U = #dfa_state{no=N,nfa=S}, + build_dfa(Crs, Set, [U|Us], N+1, [{Cr,N}|Ts], Ms, NFA) + end + end; + [] -> + build_dfa(Crs, Set, Us, N, Ts, Ms, NFA) + end; +build_dfa([], _Set, Us, N, Ts, _Ms, _NFA) -> + {Ts,Us,N}. + +%% eclosure([State], NFA) -> [State]. +%% move([State], Char, NFA) -> [State]. +%% These are straight out of the book. As eclosure uses ordsets then +%% the generated state sets are in ascending order. + +eclosure(Sts, NFA) -> eclosure(Sts, NFA, []). 
+ +eclosure([St|Sts], NFA, Ec) -> + #nfa_state{edges=Es} = element(St, NFA), + eclosure([ N || {epsilon,N} <- Es, + not is_element(N, Ec) ] ++ Sts, + NFA, add_element(St, Ec)); +eclosure([], _NFA, Ec) -> Ec. + +move(Sts, Cr, NFA) -> + [ St || N <- Sts, + {Crs,St} <- (element(N, NFA))#nfa_state.edges, + is_list(Crs), +%% begin +%% io:fwrite("move1: ~p\n", [{Sts,Cr,Crs,in_crs(Cr,Crs)}]), +%% true +%% end, + in_crs(Cr, Crs) ]. + +in_crs({C1,C2}, [{C3,C4}|_Crs]) when C1 >= C3, C2 =< C4 -> true; +in_crs(Cr, [Cr|_Crs]) -> true; %Catch bos and eos. +in_crs(Cr, [_|Crs]) -> in_crs(Cr, Crs); +in_crs(_Cr, []) -> false. + +%% accept([State], NFA) -> true | false. +%% Scan down the state list until we find an accepting state. + +accept([St|Sts], NFA) -> + case element(St, NFA) of + #nfa_state{accept={yes,A}} -> {yes,A}; + #nfa_state{accept=no} -> accept(Sts, NFA) + end; +accept([], _NFA) -> no. + +%% minimise_dfa(DFA, StartState, FirstState) -> {DFA,StartState}. +%% Minimise the DFA by removing equivalent states. We consider a +%% state if both the transitions and the their accept state is the +%% same. First repeatedly run throught the DFA state list removing +%% equivalent states and updating remaining transitions with +%% remaining equivalent state numbers. When no more reductions are +%% possible then pack the remaining state numbers to get consecutive +%% states. + +minimise_dfa(DFA0, Start, N) -> + case min_dfa(DFA0) of + {DFA1,[]} -> %No reduction! + {DFA2,Rs} = pack_dfa(DFA1, N), + {min_update(DFA2, Rs),min_new_state(Start, Rs)}; + {DFA1,Rs} -> + minimise_dfa(min_update(DFA1, Rs), min_new_state(Start, Rs), N) + end. + +min_dfa(DFA) -> min_dfa(DFA, [], []). + +min_dfa([D|DFA0], Rs0, MDFA) -> + {DFA1,Rs1} = min_delete(DFA0, D#dfa_state.trans, D#dfa_state.accept, + D#dfa_state.no, Rs0, []), + min_dfa(DFA1, Rs1, [D|MDFA]); +min_dfa([], Rs, MDFA) -> {MDFA,Rs}. + +min_delete([#dfa_state{no=N,trans=T,accept=A}|DFA], T, A, NewN, Rs, MDFA) -> + min_delete(DFA, T, A, NewN, [{N,NewN}|Rs], MDFA); +min_delete([D|DFA], T, A, NewN, Rs, MDFA) -> + min_delete(DFA, T, A, NewN, Rs, [D|MDFA]); +min_delete([], _T, _A, _NewN, Rs, MDFA) -> {MDFA,Rs}. + +min_update(DFA, Rs) -> + [ D#dfa_state{trans=min_update_trans(D#dfa_state.trans, Rs)} || D <- DFA ]. + +min_update_trans(Tr, Rs) -> + [ {C,min_new_state(S, Rs)} || {C,S} <- Tr ]. + +min_new_state(Old, [{Old,New}|_Reds]) -> New; +min_new_state(Old, [_R|Reds]) -> min_new_state(Old, Reds); +min_new_state(Old, []) -> Old. + +pack_dfa(DFA, N) -> pack_dfa(DFA, N, [], []). + +pack_dfa([D|DFA], NewN, Rs, PDFA) -> + pack_dfa(DFA, NewN+1, [{D#dfa_state.no,NewN}|Rs], + [D#dfa_state{no=NewN}|PDFA]); +pack_dfa([], _NewN, Rs, PDFA) -> {PDFA,Rs}. + +%% comp_apply(String, StartPos, DFAReg) -> {match,RestPos,Rest} | nomatch. +%% Apply the DFA of a regular expression to a string. If +%% there is a match return the position of the remaining string and +%% the string if else return 'nomatch'. +%% +%% StartPos should be the real start position as it is used to decide +%% if we are at the beginning of the string. + +comp_apply(Cs, P, {DFA,Start,_Fail}) -> + comp_apply(element(Start, DFA), Cs, P, DFA, nomatch). + +comp_apply(#c_state{spec=[]}=St, Cs, P, DFA, Accept) -> + comp_apply_tr(St, Cs, P, DFA, Accept); +comp_apply(#c_state{spec=Sp}=St, Cs, P, DFA, Accept) -> + comp_apply_sp(St, Cs, P, DFA, Accept, Sp). + +comp_apply_tr(#c_state{trans=none,accept=A}, Cs, P, _DFA, Accept) -> + %% End state. 
+ accept_value(A, Cs, P, Accept); +comp_apply_tr(#c_state{trans=Tr,tmin=Tmin,smin=Smin,tmax=Tmax,smax=Smax,accept=A}, + [C|Cs]=Cs0, P, DFA, Accept) -> + %% Get the next state number to go to. + NextSt = if C =< Tmin -> Smin; %Below transition table + C >= Tmax -> Smax; %Above transition table + true -> %Otherwise use table + element(C - Tmin, Tr) + end, + comp_apply(element(NextSt, DFA), Cs, P+1, DFA, + accept_value(A, Cs0, P, Accept)); +comp_apply_tr(#c_state{trans=_Tr,accept=A}, [], P, _DFA, Accept) -> + accept_value(A, [], P, Accept). + +comp_apply_sp(_St, Cs, 1, DFA, Accept, [{bos,S}|_]) -> + comp_apply(element(S, DFA), Cs, 1, DFA, Accept); +comp_apply_sp(_St, [$\n], P, DFA, Accept, [{eos,S}|_]) -> + comp_apply(element(S, DFA), [], P, DFA, Accept); +comp_apply_sp(_St, [], P, DFA, Accept, [{eos,S}|_]) -> + comp_apply(element(S, DFA), [], P, DFA, Accept); +comp_apply_sp(St, Cs, P, DFA, Accept, [_|Sp]) -> + comp_apply_sp(St, Cs, P, DFA, Accept, Sp); +comp_apply_sp(St, Cs, P, DFA, Accept, []) -> + comp_apply_tr(St, Cs, P, DFA, Accept). + +accept_value(true, Cs, P, _Accept) -> {match,P,Cs}; +accept_value(false, _Cs, _P, Accept) -> Accept. + +%% compile(RegExp) -> {ok,RE} | {error,E}. +%% Parse the regexp described in the string RegExp. + +compile(RegExp) -> + case make_dfa([{RegExp,yes}], 2) of + {ok,{DFA0,Start}} -> + Fail = 1, + DFA1 = [#dfa_state{no=Fail,accept=no,trans=[]}|DFA0], + DFA = tuplelise_dfa(DFA1, 1), + {ok,{comp_regexp,{DFA,Start,Fail}}}; + {error,E} -> {error,E} + end. + +%% tuplelise_dfa(DFAstates, NoAcceptState) -> {{CompState},FirstState}. + +tuplelise_dfa(DFA0, NoAccept) -> + DFA1 = map(fun (#dfa_state{no=N,trans=Ts,accept=A}) -> + {Tr,Tmin,Smin,Tmax,Smax,Sp} = build_trans(Ts, NoAccept), + #c_state{no=N,trans=Tr,tmin=Tmin,smin=Smin, + tmax=Tmax,smax=Smax, + accept=fix_accept(A),spec=Sp} + end, DFA0), + list_to_tuple(keysort(#dfa_state.no, DFA1)). + +build_trans(Ts0, NoAccept) -> + %% Split transitions into character ranges and specials. + {Ts1,Sp1} = foldl(fun ({{_,_},_}=T, {Ts,Sp}) -> {[T|Ts],Sp}; + ({_,_}=T, {Ts,Sp}) -> {Ts,[T|Sp]} + end, {[],[]}, Ts0), + if Ts1 == [] -> + {none,none,none,none,none,Sp1}; + true -> + %% Have transitions, convert to tuple. + Ts2 = keysort(1, Ts1), + {Tmin,Smin,Ts3} = min_trans(Ts2, NoAccept), + %% io:fwrite("exptr: ~p\n", [{Ts3,Tmin}]), + {Trans,Tmax,Smax} = expand_trans(Ts3, Tmin, NoAccept), + {list_to_tuple(Trans),Tmin,Smin,Tmax,Smax,Sp1} + end. + +min_trans([{{0,C2},S}|Crs], _Def) -> {C2,S,Crs}; +min_trans([{{C1,_C2},_S}|_]=Crs, Def) -> {C1-1,Def,Crs}. + +expand_trans([{{C1,maxchar},S}], Last, Def) -> + Trs = duplicate(C1-(Last+1), Def), + {Trs,C1,S}; +expand_trans([{{C1,C2},S}], Last, Def) -> + Trs = duplicate(C1-(Last+1), Def) ++ duplicate(C2-C1+1, S), + {Trs,C2+1,Def}; +expand_trans([{{C1,C2},S}|Crs], Last, Def) -> + {Trs0,Tmax,Smax} = expand_trans(Crs, C2, Def), + Trs1 = duplicate(C1-(Last+1), Def) ++ duplicate(C2-C1+1, S) ++ Trs0, + {Trs1,Tmax,Smax}. + +fix_accept({yes,_}) -> true; +fix_accept(no) -> false. + |
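Finally, a minimal sketch of the compiled path that ties compile/1, tuplelise_dfa/2 and comp_apply/3 together (illustrative; the tuple layout is as built above):

    {ok,CRE} = xmerl_regexp:compile("[0-9]+"),
    %% CRE = {comp_regexp,{DfaTuple,StartState,FailState}}
    xmerl_regexp:first_match("abc123", CRE).  %% => {match,4,3}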