The R13B03 release.OTP_R13B03

author: Erlang/OTP <[email protected]> 2009-11-20 14:54:40 +0000
committer: Erlang/OTP <[email protected]> 2009-11-20 14:54:40 +0000
commit: 84adefa331c4159d432d22840663c38f155cd4c1 (patch)
tree: bff9a9c66adda4df2106dfd0e5c053ab182a12bd /lib/inets/src/inets_app/inets_regexp.erl
download: otp-84adefa331c4159d432d22840663c38f155cd4c1.tar.gz
otp-84adefa331c4159d432d22840663c38f155cd4c1.tar.bz2
otp-84adefa331c4159d432d22840663c38f155cd4c1.zip
1 files changed, 413 insertions, 0 deletions
diff --git a/lib/inets/src/inets_app/inets_regexp.erl b/lib/inets/src/inets_app/inets_regexp.erl
new file mode 100644
index 0000000000..a065533236
--- /dev/null
+++ b/lib/inets/src/inets_app/inets_regexp.erl
@@ -0,0 +1,413 @@
+%%
+%% %CopyrightBegin%
+%% 
+%% Copyright Ericsson AB 2009. All Rights Reserved.
+%% 
+%% The contents of this file are subject to the Erlang Public License,
+%% Version 1.1, (the "License"); you may not use this file except in
+%% compliance with the License. You should have received a copy of the
+%% Erlang Public License along with this software. If not, it can be
+%% retrieved online at http://www.erlang.org/.
+%% 
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and limitations
+%% under the License.
+%% 
+%% %CopyrightEnd%
+%%
+
+-module(inets_regexp).
+
+-export([parse/1, match/2, first_match/2, split/2, sub/3, gsub/3]).
+
+
+%%%=========================================================================
+%%%  API
+%%%=========================================================================
+
+%% parse(RegExp) -> {ok, RE} | {error, E}.
+%% Parse the regexp described in the string RegExp.
+
+parse(S) ->
+    case (catch reg(S)) of
+	{R, []} -> 
+	    {ok, R};
+	{_R, [C|_]} -> 
+	    {error, {illegal, [C]}};
+	{error, E} -> 
+	    {error, E}
+    end.
+
+
+%% Find the longest match of RegExp in String.
+
+match(S, RegExp) when is_list(RegExp) ->
+    case parse(RegExp) of
+	{ok,RE} -> match(S, RE);
+	{error,E} -> {error,E}
+    end;
+match(S, RE) ->
+    case match(RE, S, 1, 0, -1) of
+	{Start,Len} when Len >= 0 ->
+	    {match, Start, Len};
+	{_Start,_Len} -> 
+	    nomatch
+    end.
+
+%% Find the first match of RegExp in String.
+
+first_match(S, RegExp) when is_list(RegExp) ->
+    case parse(RegExp) of
+	{ok, RE} -> 
+	    first_match(S, RE);
+	{error, E} -> 
+	    {error, E}
+    end;
+first_match(S, RE) ->
+    case first_match(RE, S, 1) of
+	{Start,Len} when Len >= 0 ->
+	    {match, Start,Len};
+	nomatch -> 
+	    nomatch
+    end.
+
+first_match(RE, S, St) when S =/= [] ->
+    case re_apply(S, St, RE) of
+	{match, P, _Rest} -> 
+	    {St, P-St};
+	nomatch -> 
+	    first_match(RE, tl(S), St+1)
+    end;
+first_match(_RE, [], _St) -> 
+    nomatch.
+
+
+match(RE, S, St, Pos, L) ->
+    case first_match(RE, S, St) of
+	{St1, L1} ->
+	    Nst = St1 + 1,
+	    if L1 > L -> 
+		    match(RE, lists:nthtail(Nst-St, S), Nst, St1, L1);
+	       true -> 
+		    match(RE, lists:nthtail(Nst-St, S), Nst, Pos, L)
+	    end;
+	nomatch -> 
+	    {Pos, L}
+    end.
+
+
+%% Split a string into substrings where the RegExp describes the
+%% field seperator. The RegExp " " is specially treated.
+
+split(String, " ") ->				%This is really special
+    {ok, RE} = parse("[ \t]+"),
+    case split_apply(String, RE, true) of
+	[[]|Ss] -> 
+	    {ok,Ss};
+	Ss -> 
+	    {ok,Ss}
+    end;
+split(String, RegExp) when is_list(RegExp) ->
+    case parse(RegExp) of
+	{ok, RE} -> 
+	    {ok, split_apply(String, RE, false)};
+	{error, E} -> 
+	    {error,E}
+    end;
+split(String, RE) -> 
+    {ok, split_apply(String, RE, false)}.
+
+
+%% Substitute the first match of the regular expression RegExp 
+%% with the string Replace in String. Accept pre-parsed regular
+%% expressions.
+
+sub(String, RegExp, Rep) when is_list(RegExp) ->
+    case parse(RegExp) of
+	{ok, RE} -> 
+	    sub(String, RE, Rep);
+	{error, E} -> 
+	    {error, E}
+    end;
+sub(String, RE, Rep) ->
+    Ss = sub_match(String, RE, 1),
+    {ok, sub_repl(Ss, Rep, String, 1), length(Ss)}.
+
+
+%% Substitute every match of the regular expression RegExp with 
+%% the string New in String. Accept pre-parsed regular expressions.
+
+gsub(String, RegExp, Rep) when is_list(RegExp) ->
+    case parse(RegExp) of
+	{ok, RE} -> 
+	    gsub(String, RE, Rep);
+	{error, E} -> 
+	    {error, E}
+    end;
+gsub(String, RE, Rep) ->
+    Ss = matches(String, RE, 1),
+    {ok, sub_repl(Ss, Rep, String, 1), length(Ss)}.
+
+
+%%%========================================================================
+%%% Internal functions
+%%%========================================================================
+
+%% This is the regular expression grammar used. It is equivalent to the
+%% one used in AWK, except that we allow ^ $ to be used anywhere and fail
+%% in the matching.
+%%
+%% reg -> reg1 : '$1'.
+%% reg1 -> reg1 "|" reg2 : {'or','$1','$2'}.
+%% reg1 -> reg2 : '$1'.
+%% reg2 -> reg2 reg3 : {concat,'$1','$2'}.
+%% reg2 -> reg3 : '$1'.
+%% reg3 -> reg3 "*" : {kclosure,'$1'}.
+%% reg3 -> reg3 "+" : {pclosure,'$1'}.
+%% reg3 -> reg3 "?" : {optional,'$1'}.
+%% reg3 -> reg4 : '$1'.
+%% reg4 -> "(" reg ")" : '$2'.
+%% reg4 -> "\\" char : '$2'.
+%% reg4 -> "^" : bos.
+%% reg4 -> "$" : eos.
+%% reg4 -> "." : char.
+%% reg4 -> "[" class "]" : {char_class,char_class('$2')}
+%% reg4 -> "[" "^" class "]" : {comp_class,char_class('$3')}
+%% reg4 -> "\"" chars "\"" : char_string('$2')
+%% reg4 -> char : '$1'.
+%% reg4 -> empty : epsilon.
+%%  The grammar of the current regular expressions. The actual parser
+%%  is a recursive descent implementation of the grammar.
+
+reg(S) -> reg1(S).
+
+%% reg1 -> reg2 reg1'
+%% reg1' -> "|" reg2
+%% reg1' -> empty
+
+reg1(S0) ->
+    {L,S1} = reg2(S0),
+    reg1p(S1, L).
+
+reg1p([$||S0], L) ->
+    {R,S1} = reg2(S0),
+    reg1p(S1, {'or',L,R});
+reg1p(S, L) -> {L,S}.
+
+%% reg2 -> reg3 reg2'
+%% reg2' -> reg3
+%% reg2' -> empty
+
+reg2(S0) ->
+    {L,S1} = reg3(S0),
+    reg2p(S1, L).
+
+reg2p([C|S0], L) when (C =/= $|) andalso (C =/= $)) ->
+    {R,S1} = reg3([C|S0]),
+    reg2p(S1, {concat,L,R});
+reg2p(S, L) -> {L,S}.
+
+%% reg3 -> reg4 reg3'
+%% reg3' -> "*" reg3'
+%% reg3' -> "+" reg3'
+%% reg3' -> "?" reg3'
+%% reg3' -> empty
+
+reg3(S0) ->
+    {L,S1} = reg4(S0),
+    reg3p(S1, L).
+
+reg3p([$*|S], L) -> reg3p(S, {kclosure,L});
+reg3p([$+|S], L) -> reg3p(S, {pclosure,L});
+reg3p([$?|S], L) -> reg3p(S, {optional,L});
+reg3p(S, L) -> {L,S}.
+
+reg4([$(|S0]) ->
+    case reg(S0) of
+	{R,[$)|S1]} -> {R,S1};
+	{_R,_S} -> throw({error,{unterminated,"("}})
+    end;
+reg4([$\\,O1,O2,O3|S]) 
+  when ((O1 >= $0) andalso 
+	(O1 =< $7) andalso 
+	(O2 >= $0) andalso 
+	(O2 =< $7) andalso 
+	(O3 >= $0) andalso 
+	(O3 =< $7)) ->
+    {(O1*8 + O2)*8 + O3 - 73*$0,S};
+reg4([$\\,C|S]) -> 
+    {escape_char(C),S};
+reg4([$\\]) -> 
+    throw({error, {unterminated,"\\"}});
+reg4([$^|S]) -> 
+    {bos,S};
+reg4([$$|S]) -> 
+    {eos,S};
+reg4([$.|S]) -> 
+    {{comp_class,"\n"},S};
+reg4("[^" ++ S0) ->
+    case char_class(S0) of
+	{Cc,[$]|S1]} -> {{comp_class,Cc},S1};
+	{_Cc,_S} -> throw({error,{unterminated,"["}})
+    end;
+reg4([$[|S0]) ->
+    case char_class(S0) of
+	{Cc,[$]|S1]} -> {{char_class,Cc},S1};
+	{_Cc,_S1} -> throw({error,{unterminated,"["}})
+    end;
+reg4([C|S]) 
+  when (C =/= $*) andalso (C =/= $+) andalso (C =/= $?) andalso (C =/= $]) -> 
+    {C, S};
+reg4([C|_S]) -> 
+    throw({error,{illegal,[C]}});
+reg4([]) -> 
+    {epsilon,[]}.
+
+escape_char($n) -> $\n;				%\n = LF
+escape_char($r) -> $\r;				%\r = CR
+escape_char($t) -> $\t;				%\t = TAB
+escape_char($v) -> $\v;				%\v = VT
+escape_char($b) -> $\b;				%\b = BS
+escape_char($f) -> $\f;				%\f = FF
+escape_char($e) -> $\e;				%\e = ESC
+escape_char($s) -> $\s;				%\s = SPACE
+escape_char($d) -> $\d;				%\d = DEL
+escape_char(C) -> C.
+
+char_class([$]|S]) -> char_class(S, [$]]);
+char_class(S) -> char_class(S, []).
+
+char($\\, [O1,O2,O3|S]) when
+  O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 ->
+    {(O1*8 + O2)*8 + O3 - 73*$0,S};
+char($\\, [C|S]) -> {escape_char(C),S};
+char(C, S) -> {C,S}.
+
+char_class([C1|S0], Cc) when C1 =/= $] ->
+    case char(C1, S0) of
+	{Cf,[$-,C2|S1]} when C2 =/= $] ->
+	    case char(C2, S1) of
+		{Cl,S2} when Cf < Cl -> char_class(S2, [{Cf,Cl}|Cc]); 
+		{Cl,_S2} -> throw({error,{char_class,[Cf,$-,Cl]}})
+	    end;
+	{C,S1} -> char_class(S1, [C|Cc])
+    end;
+char_class(S, Cc) -> {Cc,S}.
+
+
+%% re_apply(String, StartPos, RegExp) -> re_app_res().
+%%
+%%  Apply the (parse of the) regular expression RegExp to String.  If
+%%  there is a match return the position of the remaining string and
+%%  the string if else return 'nomatch'. BestMatch specifies if we want
+%%  the longest match, or just a match.
+%%
+%%  StartPos should be the real start position as it is used to decide
+%%  if we ae at the beginning of the string.
+%%
+%%  Pass two functions to re_apply_or so it can decide, on the basis
+%%  of BestMatch, whether to just any take any match or try both to
+%%  find the longest. This is slower but saves duplicatng code.
+
+re_apply(S, St, RE) -> re_apply(RE, [], S, St).
+
+re_apply(epsilon, More, S, P) ->		%This always matches
+    re_apply_more(More, S, P);
+re_apply({'or',RE1,RE2}, More, S, P) ->
+    re_apply_or(re_apply(RE1, More, S, P),
+		re_apply(RE2, More, S, P));
+re_apply({concat,RE1,RE2}, More, S0, P) ->
+    re_apply(RE1, [RE2|More], S0, P);
+re_apply({kclosure,CE}, More, S, P) ->
+    %% Be careful with the recursion, explicitly do one call before
+    %% looping.
+    re_apply_or(re_apply_more(More, S, P),
+		re_apply(CE, [{kclosure,CE}|More], S, P));
+re_apply({pclosure,CE}, More, S, P) ->
+    re_apply(CE, [{kclosure,CE}|More], S, P);
+re_apply({optional,CE}, More, S, P) ->
+    re_apply_or(re_apply_more(More, S, P),
+		re_apply(CE, More, S, P));
+re_apply(bos, More, S, 1) -> re_apply_more(More, S, 1);
+re_apply(eos, More, [$\n|S], P) -> re_apply_more(More, S, P);
+re_apply(eos, More, [], P) -> re_apply_more(More, [], P);
+re_apply({char_class,Cc}, More, [C|S], P) ->
+    case in_char_class(C, Cc) of
+	true -> re_apply_more(More, S, P+1);
+	false -> nomatch
+    end;
+re_apply({comp_class,Cc}, More, [C|S], P) ->
+    case in_char_class(C, Cc) of
+	true -> nomatch;
+	false -> re_apply_more(More, S, P+1)
+    end;
+re_apply(C, More, [C|S], P) when is_integer(C) ->
+    re_apply_more(More, S, P+1);
+re_apply(_RE, _More, _S, _P) -> nomatch.
+
+%% re_apply_more([RegExp], String, Length) -> re_app_res().
+
+re_apply_more([RE|More], S, P) -> re_apply(RE, More, S, P);
+re_apply_more([], S, P) -> {match,P,S}.
+
+%% in_char_class(Char, Class) -> bool().
+
+in_char_class(C, [{C1,C2}|_Cc]) when C >= C1, C =< C2 -> true;
+in_char_class(C, [C|_Cc]) -> true;
+in_char_class(C, [_|Cc]) -> in_char_class(C, Cc);
+in_char_class(_C, []) -> false.
+
+%% re_apply_or(Match1, Match2) -> re_app_res().
+%%  If we want the best match then choose the longest match, else just
+%%  choose one by trying sequentially.
+
+re_apply_or({match,P1,S1},   {match,P2,_S2}) when P1 >= P2 -> {match,P1,S1};
+re_apply_or({match,_P1,_S1}, {match,P2,S2}) -> {match,P2,S2};
+re_apply_or(nomatch, R2) -> R2;
+re_apply_or(R1, nomatch) -> R1.
+
+
+matches(S, RE, St) ->
+    case first_match(RE, S, St) of
+	{St1,0} -> 
+	    [{St1,0}|matches(string:substr(S, St1+2-St), RE, St1+1)];
+	{St1,L1} -> 
+	    [{St1,L1}|matches(string:substr(S, St1+L1+1-St), RE, St1+L1)];
+	nomatch -> 
+	    []
+    end.
+
+sub_match(S, RE, St) ->
+    case first_match(RE, S, St) of
+	{St1,L1} -> [{St1,L1}];
+	nomatch -> []
+    end.
+
+sub_repl([{St,L}|Ss], Rep, S, Pos) ->
+    Rs = sub_repl(Ss, Rep, S, St+L),
+    string:substr(S, Pos, St-Pos) ++ 
+	sub_repl(Rep, string:substr(S, St, L), Rs);
+sub_repl([], _Rep, S, Pos) -> 
+    string:substr(S, Pos).
+
+sub_repl([$&|Rep], M, Rest) -> M ++ sub_repl(Rep, M, Rest);
+sub_repl("\\&" ++ Rep, M, Rest) -> [$&|sub_repl(Rep, M, Rest)];
+sub_repl([C|Rep], M, Rest) -> [C|sub_repl(Rep, M, Rest)];
+sub_repl([], _M, Rest) -> Rest.
+
+split_apply(S, RE, Trim) -> split_apply(S, 1, RE, Trim, []).
+
+split_apply([], _P, _RE, true, []) -> 
+    [];
+split_apply([], _P, _RE, _T, Sub) -> 
+    [lists:reverse(Sub)];
+split_apply(S, P, RE, T, Sub) ->
+    case re_apply(S, P, RE) of
+	{match,P,_Rest} ->
+	    split_apply(tl(S), P+1, RE, T, [hd(S)|Sub]);
+	{match,P1,Rest} ->
+	    [lists:reverse(Sub)|split_apply(Rest, P1, RE, T, [])];
+	nomatch ->
+	    split_apply(tl(S), P+1, RE, T, [hd(S)|Sub])
+    end.
author	Erlang/OTP <[email protected]>	2009-11-20 14:54:40 +0000
committer	Erlang/OTP <[email protected]>	2009-11-20 14:54:40 +0000
commit	84adefa331c4159d432d22840663c38f155cd4c1 (patch)
tree	bff9a9c66adda4df2106dfd0e5c053ab182a12bd /lib/inets/src/inets_app/inets_regexp.erl
download	otp-84adefa331c4159d432d22840663c38f155cd4c1.tar.gz otp-84adefa331c4159d432d22840663c38f155cd4c1.tar.bz2 otp-84adefa331c4159d432d22840663c38f155cd4c1.zip