%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1996-2011. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%
-module(regexp).
%% This entire module is deprecated and will be removed in a future
%% release. Use the 're' module instead.
%%
%% This module provides a basic set of regular expression functions
%% for strings. The functions provided are taken from AWK.
%%
%% Note that we interpret the syntax tree of a regular expression
%% directly instead of converting it to an NFA and then interpreting
%% that. This method seems to go significantly faster.
-export([sh_to_awk/1,parse/1,format_error/1,match/2,first_match/2,matches/2]).
-export([sub/3,gsub/3,split/2]).
-deprecated([sh_to_awk/1,parse/1,format_error/1,match/2,first_match/2,matches/2]).
-deprecated([sub/3,gsub/3,split/2]).
-import(string, [substr/2,substr/3]).
-import(lists, [reverse/1]).
-type errordesc() :: term().
-opaque regexp() :: term().
%% -type matchres() = {match,Start,Length} | nomatch | {error,E}.
%% -type subres() = {ok,RepString,RepCount} | {error,E}.
%% -type splitres() = {ok,[SubString]} | {error,E}.
%%-compile([export_all]).
%% This is the regular expression grammar used. It is equivalent to the
%% one used in AWK, except that we allow ^ $ to be used anywhere and fail
%% in the matching.
%%
%% reg -> reg1 : '$1'.
%% reg1 -> reg1 "|" reg2 : {'or','$1','$2'}.
%% reg1 -> reg2 : '$1'.
%% reg2 -> reg2 reg3 : {concat,'$1','$2'}.
%% reg2 -> reg3 : '$1'.
%% reg3 -> reg3 "*" : {kclosure,'$1'}.
%% reg3 -> reg3 "+" : {pclosure,'$1'}.
%% reg3 -> reg3 "?" : {optional,'$1'}.
%% reg3 -> reg4 : '$1'.
%% reg4 -> "(" reg ")" : '$2'.
%% reg4 -> "\\" char : '$2'.
%% reg4 -> "^" : bos.
%% reg4 -> "$" : eos.
%% reg4 -> "." : char.
%% reg4 -> "[" class "]" : {char_class,char_class('$2')}
%% reg4 -> "[" "^" class "]" : {comp_class,char_class('$3')}
%% reg4 -> "\"" chars "\"" : char_string('$2')
%% reg4 -> char : '$1'.
%% reg4 -> empty : epsilon.
%% The grammar of the current regular expressions. The actual parser
%% is a recursive descent implementation of the grammar.
reg(S) -> reg1(S).
%% reg1 -> reg2 reg1'
%% reg1' -> "|" reg2
%% reg1' -> empty
reg1(S0) ->
{L,S1} = reg2(S0),
reg1p(S1, L).
reg1p([$||S0], L) ->
{R,S1} = reg2(S0),
reg1p(S1, {'or',L,R});
reg1p(S, L) -> {L,S}.
%% reg2 -> reg3 reg2'
%% reg2' -> reg3
%% reg2' -> empty
reg2(S0) ->
{L,S1} = reg3(S0),
reg2p(S1, L).
reg2p([C|S0], L) when C =/= $|, C =/= $) ->
{R,S1} = reg3([C|S0]),
reg2p(S1, {concat,L,R});
reg2p(S, L) -> {L,S}.
%% reg3 -> reg4 reg3'
%% reg3' -> "*" reg3'
%% reg3' -> "+" reg3'
%% reg3' -> "?" reg3'
%% reg3' -> empty
reg3(S0) ->
{L,S1} = reg4(S0),
reg3p(S1, L).
reg3p([$*|S], L) -> reg3p(S, {kclosure,L});
reg3p([$+|S], L) -> reg3p(S, {pclosure,L});
reg3p([$?|S], L) -> reg3p(S, {optional,L});
reg3p(S, L) -> {L,S}.
-define(HEX(C), C >= $0 andalso C =< $9 orelse
C >= $A andalso C =< $F orelse
C >= $a andalso C =< $f).
reg4([$(|S0]) ->
case reg(S0) of
{R,[$)|S1]} -> {R,S1};
{_R,_S} -> throw({error,{unterminated,"("}})
end;
reg4([$\\,O1,O2,O3|S]) when
O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 ->
{(O1*8 + O2)*8 + O3 - 73*$0,S};
reg4([$\\,$x,H1,H2|S]) when ?HEX(H1), ?HEX(H2) ->
{erlang:list_to_integer([H1,H2], 16),S};
reg4([$\\,$x,${|S]) ->
hex(S, []);
reg4([$\\,$x|_]) ->
throw({error,{illegal,[$x]}});
reg4([$\\,C|S]) -> {escape_char(C),S};
reg4([$\\]) -> throw({error,{unterminated,"\\"}});
reg4([$^|S]) -> {bos,S};
reg4([$$|S]) -> {eos,S};
reg4([$.|S]) -> {{comp_class,"\n"},S};
reg4("[^" ++ S0) ->
case char_class(S0) of
{Cc,[$]|S1]} -> {{comp_class,Cc},S1};
{_Cc,_S} -> throw({error,{unterminated,"["}})
end;
reg4([$[|S0]) ->
case char_class(S0) of
{Cc,[$]|S1]} -> {{char_class,Cc},S1};
{_Cc,_S1} -> throw({error,{unterminated,"["}})
end;
%reg4([$"|S0]) ->
% case char_string(S0) of
% {St,[$"|S1]} -> {St,S1};
% {St,S1} -> throw({error,{unterminated,"\""}})
% end;
reg4([C|S]) when C =/= $*, C =/= $+, C =/= $?, C =/= $] -> {C,S};
reg4([C|_S]) -> throw({error,{illegal,[C]}});
reg4([]) -> {epsilon,[]}.
hex([C|Cs], L) when ?HEX(C) ->
hex(Cs, [C|L]);
hex([$}|S], L) ->
case catch erlang:list_to_integer(lists:reverse(L), 16) of
V when V =< 16#FF ->
{V,S};
_ ->
throw({error,{illegal,[$}]}})
end;
hex(_S, _) ->
throw({error,{unterminated,"\\x{"}}).
escape_char($n) -> $\n; %\n = LF
escape_char($r) -> $\r; %\r = CR
escape_char($t) -> $\t; %\t = TAB
escape_char($v) -> $\v; %\v = VT
escape_char($b) -> $\b; %\b = BS
escape_char($f) -> $\f; %\f = FF
escape_char($e) -> $\e; %\e = ESC
escape_char($s) -> $\s; %\s = SPACE
escape_char($d) -> $\d; %\d = DEL
escape_char(C) -> C.
char_class([$]|S]) -> char_class(S, [$]]);
char_class(S) -> char_class(S, []).
char($\\, [O1,O2,O3|S]) when
O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 ->
{(O1*8 + O2)*8 + O3 - 73*$0,S};
char($\\, [$x,H1,H2|S]) when ?HEX(H1), ?HEX(H2) ->
{erlang:list_to_integer([H1,H2], 16),S};
char($\\,[$x,${|S]) ->
hex(S, []);
char($\\,[$x|_]) ->
throw({error,{illegal,[$x]}});
char($\\, [C|S]) -> {escape_char(C),S};
char(C, S) -> {C,S}.
char_class([C1|S0], Cc) when C1 =/= $] ->
case char(C1, S0) of
{Cf,[$-,C2|S1]} when C2 =/= $] ->
case char(C2, S1) of
{Cl,S2} when Cf < Cl -> char_class(S2, [{Cf,Cl}|Cc]);
{Cl,_S2} -> throw({error,{char_class,[Cf,$-,Cl]}})
end;
{C,S1} -> char_class(S1, [C|Cc])
end;
char_class(S, Cc) -> {Cc,S}.
%char_string([C|S]) when C =/= $" -> char_string(S, C);
%char_string(S) -> {epsilon,S}.
%char_string([C|S0], L) when C =/= $" ->
% char_string(S0, {concat,L,C});
%char_string(S, L) -> {L,S}.
%% -deftype re_app_res() = {match,RestPos,Rest} | nomatch.
%% re_apply(String, StartPos, RegExp) -> re_app_res().
%%
%% Apply the (parse of the) regular expression RegExp to String. If
%% there is a match return the position of the remaining string and
%% the string if else return 'nomatch'. BestMatch specifies if we want
%% the longest match, or just a match.
%%
%% StartPos should be the real start position as it is used to decide
%% if we ae at the beginning of the string.
%%
%% Pass two functions to re_apply_or so it can decide, on the basis
%% of BestMatch, whether to just any take any match or try both to
%% find the longest. This is slower but saves duplicatng code.
re_apply(S, St, RE) -> re_apply(RE, [], S, St).
re_apply(epsilon, More, S, P) -> %This always matches
re_apply_more(More, S, P);
re_apply({'or',RE1,RE2}, More, S, P) ->
re_apply_or(re_apply(RE1, More, S, P),
re_apply(RE2, More, S, P));
re_apply({concat,RE1,RE2}, More, S0, P) ->
re_apply(RE1, [RE2|More], S0, P);
re_apply({kclosure,CE}, More, S, P) ->
%% Be careful with the recursion, explicitly do one call before
%% looping.
re_apply_or(re_apply_more(More, S, P),
re_apply(CE, [{kclosure,CE}|More], S, P));
re_apply({pclosure,CE}, More, S, P) ->
re_apply(CE, [{kclosure,CE}|More], S, P);
re_apply({optional,CE}, More, S, P) ->
re_apply_or(re_apply_more(More, S, P),
re_apply(CE, More, S, P));
re_apply(bos, More, S, 1) -> re_apply_more(More, S, 1);
re_apply(eos, More, [$\n|S], P) -> re_apply_more(More, S, P);
re_apply(eos, More, [], P) -> re_apply_more(More, [], P);
re_apply({char_class,Cc}, More, [C|S], P) ->
case in_char_class(C, Cc) of
true -> re_apply_more(More, S, P+1);
false -> nomatch
end;
re_apply({comp_class,Cc}, More, [C|S], P) ->
case in_char_class(C, Cc) of
true -> nomatch;
false -> re_apply_more(More, S, P+1)
end;
re_apply(C, More, [C|S], P) when is_integer(C) ->
re_apply_more(More, S, P+1);
re_apply(_RE, _More, _S, _P) -> nomatch.
%% re_apply_more([RegExp], String, Length) -> re_app_res().
re_apply_more([RE|More], S, P) -> re_apply(RE, More, S, P);
re_apply_more([], S, P) -> {match,P,S}.
%% in_char_class(Char, Class) -> bool().
in_char_class(C, [{C1,C2}|_Cc]) when C >= C1, C =< C2 -> true;
in_char_class(C, [C|_Cc]) -> true;
in_char_class(C, [_|Cc]) -> in_char_class(C, Cc);
in_char_class(_C, []) -> false.
%% re_apply_or(Match1, Match2) -> re_app_res().
%% If we want the best match then choose the longest match, else just
%% choose one by trying sequentially.
re_apply_or({match,P1,S1}, {match,P2,_S2}) when P1 >= P2 -> {match,P1,S1};
re_apply_or({match,_P1,_S1}, {match,P2,S2}) -> {match,P2,S2};
re_apply_or(nomatch, R2) -> R2;
re_apply_or(R1, nomatch) -> R1.
%% sh_to_awk(ShellRegExp)
%% Convert a sh style regexp into a full AWK one. The main difficulty is
%% getting character sets right as the conventions are different.
-spec sh_to_awk(ShRegExp) -> AwkRegExp when
ShRegExp :: string(),
AwkRegExp :: string().
sh_to_awk(Sh) -> "^(" ++ sh_to_awk_1(Sh). %Fix the beginning
sh_to_awk_1([$*|Sh]) -> %This matches any string
".*" ++ sh_to_awk_1(Sh);
sh_to_awk_1([$?|Sh]) -> %This matches any character
[$.|sh_to_awk_1(Sh)];
sh_to_awk_1([$[,$^,$]|Sh]) -> %This takes careful handling
"\\^" ++ sh_to_awk_1(Sh);
sh_to_awk_1("[^" ++ Sh) -> [$[|sh_to_awk_2(Sh, true)];
sh_to_awk_1("[!" ++ Sh) -> "[^" ++ sh_to_awk_2(Sh, false);
sh_to_awk_1([$[|Sh]) -> [$[|sh_to_awk_2(Sh, false)];
sh_to_awk_1([C|Sh]) ->
%% Unspecialise everything else which is not an escape character.
case special_char(C) of
true -> [$\\,C|sh_to_awk_1(Sh)];
false -> [C|sh_to_awk_1(Sh)]
end;
sh_to_awk_1([]) -> ")$". %Fix the end
sh_to_awk_2([$]|Sh], UpArrow) -> [$]|sh_to_awk_3(Sh, UpArrow)];
sh_to_awk_2(Sh, UpArrow) -> sh_to_awk_3(Sh, UpArrow).
sh_to_awk_3([$]|Sh], true) -> "^]" ++ sh_to_awk_1(Sh);
sh_to_awk_3([$]|Sh], false) -> [$]|sh_to_awk_1(Sh)];
sh_to_awk_3([C|Sh], UpArrow) -> [C|sh_to_awk_3(Sh, UpArrow)];
sh_to_awk_3([], true) -> [$^|sh_to_awk_1([])];
sh_to_awk_3([], false) -> sh_to_awk_1([]).
%% -type special_char(char()) -> bool().
%% Test if a character is a special character.
special_char($|) -> true;
special_char($*) -> true;
special_char($+) -> true;
special_char($?) -> true;
special_char($() -> true;
special_char($)) -> true;
special_char($\\) -> true;
special_char($^) -> true;
special_char($$) -> true;
special_char($.) -> true;
special_char($[) -> true;
special_char($]) -> true;
special_char($") -> true;
special_char(_C) -> false.
%% parse(RegExp) -> {ok,RE} | {error,E}.
%% Parse the regexp described in the string RegExp.
-spec parse(RegExp) -> ParseRes when
RegExp :: string(),
ParseRes :: {ok, RE} | {error, Error},
RE :: regexp(),
Error :: errordesc().
parse(S) ->
case catch reg(S) of
{R,[]} -> {ok,R};
{_R,[C|_]} -> {error,{illegal,[C]}};
{error,E} -> {error,E}
end.
%% format_error(Error) -> String.
-spec format_error(ErrorDescriptor) -> Chars when
ErrorDescriptor :: errordesc(),
Chars :: io_lib:chars().
format_error({illegal,What}) -> ["illegal character `",What,"'"];
format_error({unterminated,What}) -> ["unterminated `",What,"'"];
format_error({char_class,What}) ->
["illegal character class ",io_lib:write_string(What)].
%% -type match(String, RegExp) -> matchres().
%% Find the longest match of RegExp in String.
-spec match(String, RegExp) -> MatchRes when
String :: string(),
RegExp :: string() | regexp(),
MatchRes :: {match, Start, Length} | nomatch | {error, Error},
Start :: pos_integer(),
Length :: pos_integer(),
Error :: errordesc().
match(S, RegExp) when is_list(RegExp) ->
case parse(RegExp) of
{ok,RE} -> match(S, RE);
{error,E} -> {error,E}
end;
match(S, RE) ->
case match(RE, S, 1, 0, -1) of
{Start,Len} when Len >= 0 ->
{match,Start,Len};
{_Start,_Len} -> nomatch
end.
match(RE, S, St, Pos, L) ->
case first_match(RE, S, St) of
{St1,L1} ->
Nst = St1 + 1,
if L1 > L -> match(RE, lists:nthtail(Nst-St, S), Nst, St1, L1);
true -> match(RE, lists:nthtail(Nst-St, S), Nst, Pos, L)
end;
nomatch -> {Pos,L}
end.
%% -type first_match(String, RegExp) -> matchres().
%% Find the first match of RegExp in String.
-spec first_match(String, RegExp) -> MatchRes when
String :: string(),
RegExp :: string() | regexp(),
MatchRes :: {match, Start, Length} | nomatch | {error, Error},
Start :: pos_integer(),
Length :: pos_integer(),
Error :: errordesc().
first_match(S, RegExp) when is_list(RegExp) ->
case parse(RegExp) of
{ok,RE} -> first_match(S, RE);
{error,E} -> {error,E}
end;
first_match(S, RE) ->
case first_match(RE, S, 1) of
{Start,Len} when Len >= 0 ->
{match,Start,Len};
nomatch -> nomatch
end.
first_match(RE, S, St) when S =/= [] ->
case re_apply(S, St, RE) of
{match,P,_Rest} -> {St,P-St};
nomatch -> first_match(RE, tl(S), St+1)
end;
first_match(_RE, [], _St) -> nomatch.
%% -type matches(String, RegExp) -> {match,[{Start,Length}]} | {error,E}.
%% Return the all the non-overlapping matches of RegExp in String.
-spec matches(String, RegExp) -> MatchRes when
String :: string(),
RegExp :: string() | regexp(),
MatchRes :: {match, Matches} | {error, Error},
Matches :: [{Start, Length}],
Start :: pos_integer(),
Length :: pos_integer(),
Error :: errordesc().
matches(S, RegExp) when is_list(RegExp) ->
case parse(RegExp) of
{ok,RE} -> matches(S, RE);
{error,E} -> {error,E}
end;
matches(S, RE) ->
{match,matches(S, RE, 1)}.
matches(S, RE, St) ->
case first_match(RE, S, St) of
{St1,0} -> [{St1,0}|matches(substr(S, St1+2-St), RE, St1+1)];
{St1,L1} -> [{St1,L1}|matches(substr(S, St1+L1+1-St), RE, St1+L1)];
nomatch -> []
end.
%% -type sub(String, RegExp, Replace) -> subsres().
%% Substitute the first match of the regular expression RegExp with
%% the string Replace in String. Accept pre-parsed regular
%% expressions.
-spec sub(String, RegExp, New) -> SubRes when
String :: string(),
RegExp :: string() | regexp(),
New :: string(),
NewString :: string(),
SubRes :: {ok, NewString, RepCount} | {error, Error},
RepCount :: 0 | 1,
Error :: errordesc().
sub(String, RegExp, Rep) when is_list(RegExp) ->
case parse(RegExp) of
{ok,RE} -> sub(String, RE, Rep);
{error,E} -> {error,E}
end;
sub(String, RE, Rep) ->
Ss = sub_match(String, RE, 1),
{ok,sub_repl(Ss, Rep, String, 1),length(Ss)}.
sub_match(S, RE, St) ->
case first_match(RE, S, St) of
{St1,L1} -> [{St1,L1}];
nomatch -> []
end.
sub_repl([{St,L}|Ss], Rep, S, Pos) ->
Rs = sub_repl(Ss, Rep, S, St+L),
substr(S, Pos, St-Pos) ++ sub_repl(Rep, substr(S, St, L), Rs);
sub_repl([], _Rep, S, Pos) -> substr(S, Pos).
sub_repl([$&|Rep], M, Rest) -> M ++ sub_repl(Rep, M, Rest);
sub_repl("\\&" ++ Rep, M, Rest) -> [$&|sub_repl(Rep, M, Rest)];
sub_repl([C|Rep], M, Rest) -> [C|sub_repl(Rep, M, Rest)];
sub_repl([], _M, Rest) -> Rest.
%% -type gsub(String, RegExp, Replace) -> subres().
%% Substitute every match of the regular expression RegExp with the
%% string New in String. Accept pre-parsed regular expressions.
-spec gsub(String, RegExp, New) -> SubRes when
String :: string(),
RegExp :: string() | regexp(),
New :: string(),
NewString :: string(),
SubRes :: {ok, NewString, RepCount} | {error, Error},
RepCount :: non_neg_integer(),
Error :: errordesc().
gsub(String, RegExp, Rep) when is_list(RegExp) ->
case parse(RegExp) of
{ok,RE} -> gsub(String, RE, Rep);
{error,E} -> {error,E}
end;
gsub(String, RE, Rep) ->
Ss = matches(String, RE, 1),
{ok,sub_repl(Ss, Rep, String, 1),length(Ss)}.
%% -type split(String, RegExp) -> splitres().
%% Split a string into substrings where the RegExp describes the
%% field seperator. The RegExp " " is specially treated.
-spec split(String, RegExp) -> SplitRes when
String :: string(),
RegExp :: string() | regexp(),
SplitRes :: {ok, FieldList} | {error, Error},
FieldList :: [string()],
Error :: errordesc().
split(String, " ") -> %This is really special
{ok,RE} = parse("[ \t]+"),
case split_apply(String, RE, true) of
[[]|Ss] -> {ok,Ss};
Ss -> {ok,Ss}
end;
split(String, RegExp) when is_list(RegExp) ->
case parse(RegExp) of
{ok,RE} -> {ok,split_apply(String, RE, false)};
{error,E} -> {error,E}
end;
split(String, RE) -> {ok,split_apply(String, RE, false)}.
split_apply(S, RE, Trim) -> split_apply(S, 1, RE, Trim, []).
split_apply([], _P, _RE, true, []) -> [];
split_apply([], _P, _RE, _T, Sub) -> [reverse(Sub)];
split_apply(S, P, RE, T, Sub) ->
case re_apply(S, P, RE) of
{match,P,_Rest} ->
split_apply(tl(S), P+1, RE, T, [hd(S)|Sub]);
{match,P1,Rest} ->
[reverse(Sub)|split_apply(Rest, P1, RE, T, [])];
nomatch ->
split_apply(tl(S), P+1, RE, T, [hd(S)|Sub])
end.