diff options
author | Patrik Nyblom <[email protected]> | 2013-08-16 11:37:54 +0200 |
---|---|---|
committer | Patrik Nyblom <[email protected]> | 2013-08-16 11:37:54 +0200 |
commit | 5d9a587a8fcc164e02f043959338edec2ff69381 (patch) | |
tree | 554f05a944777622b30031724010f6f31707565b /lib/stdlib/src | |
parent | 23610dbfc1c409f83349e9e293dd3cfc1f74d497 (diff) | |
parent | 52cb62b7930d9c7b9e04a210ff6b02946f27ae79 (diff) | |
download | otp-5d9a587a8fcc164e02f043959338edec2ff69381.tar.gz otp-5d9a587a8fcc164e02f043959338edec2ff69381.tar.bz2 otp-5d9a587a8fcc164e02f043959338edec2ff69381.zip |
Merge branch 'pan/update_pcre_8.33'
* pan/update_pcre_8.33:
Workaround TR gnu/181328, GCC 4.2.1 20070831 on FreeBSD 9.1
Clarify relation between erts_iolist_{size|to_buf}
Fix backslash in titles of manpages
Correct UTF-8 in stdlib's notes.xml
Add more tests for corner error cases in erl_bif_re.c
Add documentation of report_errors and match_limit(_recursion)
Add match_limit and match_limit_recursion options
Add return_errors option to re:run/3
Add README for updating PCRE
Add documentation of extensions to re module
Add new options to Erlang re interface and mend dupnames
Update PCRE doc part of re.xml to PCRE 8.33 state
Integrate new PCRE test suites
Integrate patch for PCRE bug id 1370
Handle CRLF correctly in global regexp
Add erts_prefix to pcre_library and update erl_bif_re
Update to PCRE 8.33, w/o the erts_ prefix added
OTP-11204
OTP-11205
OTP-10285
Diffstat (limited to 'lib/stdlib/src')
-rw-r--r-- | lib/stdlib/src/re.erl | 176 |
1 files changed, 126 insertions, 50 deletions
diff --git a/lib/stdlib/src/re.erl b/lib/stdlib/src/re.erl index c5109ec455..afc63496d0 100644 --- a/lib/stdlib/src/re.erl +++ b/lib/stdlib/src/re.erl @@ -19,20 +19,21 @@ -module(re). -export([grun/3,urun/3,ucompile/2,replace/3,replace/4,split/2,split/3]). -%-opaque mp() :: {re_pattern, _, _, _}. --type mp() :: {re_pattern, _, _, _}. +%-opaque mp() :: {re_pattern, _, _, _, _}. +-type mp() :: {re_pattern, _, _, _, _}. -type nl_spec() :: cr | crlf | lf | anycrlf | any. -type compile_option() :: unicode | anchored | caseless | dollar_endonly | dotall | extended | firstline | multiline | no_auto_capture | dupnames | ungreedy - | {newline, nl_spec()}| bsr_anycrlf - | bsr_unicode. + | {newline, nl_spec()} + | bsr_anycrlf | bsr_unicode + | no_start_optimize | ucp | never_utf. %%% BIFs --export([compile/1, compile/2, run/2, run/3]). +-export([compile/1, compile/2, run/2, run/3, inspect/2]). -spec compile(Regexp) -> {ok, MP} | {error, ErrSpec} when Regexp :: iodata(), @@ -63,17 +64,21 @@ run(_, _) -> -spec run(Subject, RE, Options) -> {match, Captured} | match | - nomatch when + nomatch | + {error, ErrType} when Subject :: iodata() | unicode:charlist(), RE :: mp() | iodata() | unicode:charlist(), Options :: [Option], - Option :: anchored | global | notbol | noteol | notempty + Option :: anchored | global | notbol | noteol | notempty + | notempty_atstart | report_errors | {offset, non_neg_integer()} | + {match_limit, non_neg_integer()} | + {match_limit_recursion, non_neg_integer()} | {newline, NLSpec :: nl_spec()} | bsr_anycrlf | bsr_unicode | {capture, ValueSpec} | {capture, ValueSpec, Type} | CompileOpt, Type :: index | list | binary, - ValueSpec :: all | all_but_first | first | none | ValueList, + ValueSpec :: all | all_but_first | all_names | first | none | ValueList, ValueList :: [ValueID], ValueID :: integer() | string() | atom(), CompileOpt :: compile_option(), @@ -83,11 +88,21 @@ run(_, _) -> | binary(), ListConversionData :: string() | {error, string(), binary()} - | {incomplete, string(), binary()}. + | {incomplete, string(), binary()}, + ErrType :: match_limit | match_limit_recursion | {compile, CompileErr}, + CompileErr :: {ErrString :: string(), Position :: non_neg_integer()}. run(_, _, _) -> erlang:nif_error(undef). +-spec inspect(MP,Item) -> {namelist, [ binary() ]} when + MP :: mp(), + Item :: namelist. + +inspect(_,_) -> + erlang:nif_error(undef). + + %%% End of BIFs -spec split(Subject, RE) -> SplitList when @@ -102,8 +117,10 @@ split(Subject,RE) -> Subject :: iodata() | unicode:charlist(), RE :: mp() | iodata() | unicode:charlist(), Options :: [ Option ], - Option :: anchored | notbol | noteol | notempty + Option :: anchored | notbol | noteol | notempty | notempty_atstart | {offset, non_neg_integer()} | {newline, nl_spec()} + | {match_limit, non_neg_integer()} + | {match_limit_recursion, non_neg_integer()} | bsr_anycrlf | bsr_unicode | {return, ReturnType} | {parts, NumParts} | group | trim | CompileOpt, NumParts :: non_neg_integer() | infinity, @@ -266,7 +283,7 @@ extend_subpatterns([],N) -> extend_subpatterns([H|T],N) -> [H | extend_subpatterns(T,N-1)]. -compile_split({re_pattern,N,_,_} = Comp, Options) -> +compile_split({re_pattern,N,_,_,_} = Comp, Options) -> {Comp,N,Options}; compile_split(Pat,Options0) when not is_tuple(Pat) -> Options = lists:filter(fun(O) -> @@ -275,7 +292,7 @@ compile_split(Pat,Options0) when not is_tuple(Pat) -> case re:compile(Pat,Options) of {error,Err} -> {error,Err}; - {ok, {re_pattern,N,_,_} = Comp} -> + {ok, {re_pattern,N,_,_,_} = Comp} -> NewOpt = lists:filter(fun(OO) -> (not copt(OO)) end, Options0), {Comp,N,NewOpt} end; @@ -295,8 +312,11 @@ replace(Subject,RE,Replacement) -> RE :: mp() | iodata() | unicode:charlist(), Replacement :: iodata() | unicode:charlist(), Options :: [Option], - Option :: anchored | global | notbol | noteol | notempty + Option :: anchored | global | notbol | noteol | notempty + | notempty_atstart | {offset, non_neg_integer()} | {newline, NLSpec} | bsr_anycrlf + | {match_limit, non_neg_integer()} + | {match_limit_recursion, non_neg_integer()} | bsr_unicode | {return, ReturnType} | CompileOpt, ReturnType :: iodata | list | binary, CompileOpt :: compile_option(), @@ -352,6 +372,8 @@ process_repl_params([],Convert,Unicode) -> process_repl_params([unicode|T],C,_U) -> {NT,NC,NU} = process_repl_params(T,C,true), {[unicode|NT],NC,NU}; +process_repl_params([report_errors|_],_,_) -> + throw(badopt); process_repl_params([{capture,_,_}|_],_,_) -> throw(badopt); process_repl_params([{capture,_}|_],_,_) -> @@ -387,6 +409,8 @@ process_split_params([group|T],C,U,L,S,_G) -> process_split_params(T,C,U,L,S,true); process_split_params([global|_],_,_,_,_,_) -> throw(badopt); +process_split_params([report_errors|_],_,_,_,_,_) -> + throw(badopt); process_split_params([{capture,_,_}|_],_,_,_,_,_) -> throw(badopt); process_split_params([{capture,_}|_],_,_,_,_,_) -> @@ -487,17 +511,31 @@ do_replace(Subject,Repl,SubExprs0) -> end || Part <- Repl ]. -check_for_unicode({re_pattern,_,1,_},_) -> +check_for_unicode({re_pattern,_,1,_,_},_) -> true; -check_for_unicode({re_pattern,_,0,_},_) -> +check_for_unicode({re_pattern,_,0,_,_},_) -> false; check_for_unicode(_,L) -> lists:member(unicode,L). + +check_for_crlf({re_pattern,_,_,1,_},_) -> + true; +check_for_crlf({re_pattern,_,_,0,_},_) -> + false; +check_for_crlf(_,L) -> + case lists:keysearch(newline,1,L) of + {value,{newline,any}} -> true; + {value,{newline,crlf}} -> true; + {value,{newline,anycrlf}} -> true; + _ -> false + end. % SelectReturn = false | all | stirpfirst | none % ConvertReturn = index | list | binary % {capture, all} -> all (untouchded) -% {capture, first} -> kept in argumentt list and Select all +% {capture, all_names} -> if names are present: treated as a name {capture, [...]} +% else: same as {capture, []} +% {capture, first} -> kept in argument list and Select all % {capture, all_but_first} -> removed from argument list and selects stripfirst % {capture, none} -> removed from argument list and selects none % {capture, []} -> removed from argument list and selects none @@ -506,23 +544,30 @@ check_for_unicode(_,L) -> % Call as process_parameters([],0,false,index,NeedClean) -process_parameters([],InitialOffset, SelectReturn, ConvertReturn,_) -> +process_parameters([],InitialOffset, SelectReturn, ConvertReturn,_,_) -> {[], InitialOffset, SelectReturn, ConvertReturn}; -process_parameters([{offset, N} | T],_Init0,Select0,Return0,CC) -> - process_parameters(T,N,Select0,Return0,CC); -process_parameters([global | T],Init0,Select0,Return0,CC) -> - process_parameters(T,Init0,Select0,Return0,CC); -process_parameters([{capture,Values,Type}|T],Init0,Select0,_Return0,CC) -> - process_parameters([{capture,Values}|T],Init0,Select0,Type,CC); -process_parameters([{capture,Values}|T],Init0,Select0,Return0,CC) -> +process_parameters([{offset, N} | T],_Init0,Select0,Return0,CC,RE) -> + process_parameters(T,N,Select0,Return0,CC,RE); +process_parameters([global | T],Init0,Select0,Return0,CC,RE) -> + process_parameters(T,Init0,Select0,Return0,CC,RE); +process_parameters([{capture,Values,Type}|T],Init0,Select0,_Return0,CC,RE) -> + process_parameters([{capture,Values}|T],Init0,Select0,Type,CC,RE); +process_parameters([{capture,Values}|T],Init0,Select0,Return0,CC,RE) -> % First process the rest to see if capture was already present {NewTail, Init1, Select1, Return1} = - process_parameters(T,Init0,Select0,Return0,CC), + process_parameters(T,Init0,Select0,Return0,CC,RE), case Select1 of false -> case Values of all -> {[{capture,all} | NewTail], Init1, all, Return0}; + all_names -> + case re:inspect(RE,namelist) of + {namelist, []} -> + {[{capture,first} | NewTail], Init1, none, Return0}; + {namelist, List} -> + {[{capture,[0|List]} | NewTail], Init1, stripfirst, Return0} + end; first -> {[{capture,first} | NewTail], Init1, all, Return0}; all_but_first -> @@ -541,20 +586,20 @@ process_parameters([{capture,Values}|T],Init0,Select0,Return0,CC) -> % Found overriding further down list, ignore this one {NewTail, Init1, Select1, Return1} end; -process_parameters([H|T],Init0,Select0,Return0,true) -> +process_parameters([H|T],Init0,Select0,Return0,true,RE) -> case copt(H) of true -> - process_parameters(T,Init0,Select0,Return0,true); + process_parameters(T,Init0,Select0,Return0,true,RE); false -> {NewT,Init,Select,Return} = - process_parameters(T,Init0,Select0,Return0,true), + process_parameters(T,Init0,Select0,Return0,true,RE), {[H|NewT],Init,Select,Return} end; -process_parameters([H|T],Init0,Select0,Return0,false) -> +process_parameters([H|T],Init0,Select0,Return0,false,RE) -> {NewT,Init,Select,Return} = - process_parameters(T,Init0,Select0,Return0,false), + process_parameters(T,Init0,Select0,Return0,false,RE), {[H|NewT],Init,Select,Return}; -process_parameters(_,_,_,_,_) -> +process_parameters(_,_,_,_,_,_) -> throw(badlist). postprocess({match,[]},_,_,_,_) -> @@ -662,7 +707,7 @@ urun2(Subject0,RE0,Options0) -> RE = case RE0 of BinRE when is_binary(BinRE) -> BinRE; - {re_pattern,_,_,_} = ReCompiled -> + {re_pattern,_,_,_,_} = ReCompiled -> ReCompiled; ListRE -> unicode:characters_to_binary(ListRE,unicode) @@ -703,38 +748,46 @@ grun(Subject,RE,{Options,NeedClean,OrigRE}) -> grun2(Subject,RE,{Options,NeedClean}) -> Unicode = check_for_unicode(RE,Options), + CRLF = check_for_crlf(RE,Options), FlatSubject = to_binary(Subject, Unicode), - do_grun(FlatSubject,Subject,Unicode,RE,{Options,NeedClean}). + do_grun(FlatSubject,Subject,Unicode,CRLF,RE,{Options,NeedClean}). -do_grun(FlatSubject,Subject,Unicode,RE,{Options0,NeedClean}) -> +do_grun(FlatSubject,Subject,Unicode,CRLF,RE,{Options0,NeedClean}) -> {StrippedOptions, InitialOffset, SelectReturn, ConvertReturn} = case (catch - process_parameters(Options0, 0, false, index, NeedClean)) of + process_parameters(Options0, 0, false, index, NeedClean,RE)) of badlist -> erlang:error(badarg,[Subject,RE,Options0]); CorrectReturn -> CorrectReturn end, - postprocess(loopexec(FlatSubject,RE,InitialOffset, - byte_size(FlatSubject), - Unicode,StrippedOptions), - SelectReturn,ConvertReturn,FlatSubject,Unicode). + try + postprocess(loopexec(FlatSubject,RE,InitialOffset, + byte_size(FlatSubject), + Unicode,CRLF,StrippedOptions), + SelectReturn,ConvertReturn,FlatSubject,Unicode) + catch + throw:ErrTuple -> + ErrTuple + end. -loopexec(_,_,X,Y,_,_) when X > Y -> +loopexec(_,_,X,Y,_,_,_) when X > Y -> {match,[]}; -loopexec(Subject,RE,X,Y,Unicode,Options) -> +loopexec(Subject,RE,X,Y,Unicode,CRLF,Options) -> case re:run(Subject,RE,[{offset,X}]++Options) of + {error, Err} -> + throw({error,Err}); nomatch -> {match,[]}; {match,[{A,B}|More]} -> {match,Rest} = case B>0 of true -> - loopexec(Subject,RE,A+B,Y,Unicode,Options); + loopexec(Subject,RE,A+B,Y,Unicode,CRLF,Options); false -> {match,M} = - case re:run(Subject,RE,[{offset,X},notempty, + case re:run(Subject,RE,[{offset,X},notempty_atstart, anchored]++Options) of nomatch -> {match,[]}; @@ -745,10 +798,10 @@ loopexec(Subject,RE,X,Y,Unicode,Options) -> [{_,NStep}|_] when NStep > 0 -> A+NStep; _ -> - forward(Subject,A,1,Unicode) + forward(Subject,A,1,Unicode,CRLF) end, {match,MM} = loopexec(Subject,RE,NewA,Y, - Unicode,Options), + Unicode,CRLF,Options), case M of [] -> {match,MM}; @@ -759,11 +812,22 @@ loopexec(Subject,RE,X,Y,Unicode,Options) -> {match,[[{A,B}|More] | Rest]} end. -forward(_Chal,A,0,_) -> +forward(_Chal,A,0,_,_) -> A; -forward(_Chal,A,N,false) -> - A+N; -forward(Chal,A,N,true) -> +forward(Chal,A,N,U,true) -> + <<_:A/binary,Tl/binary>> = Chal, + case Tl of + <<$\r,$\n,_/binary>> -> + forward(Chal,A+2,N-1,U,true); + _ -> + forward2(Chal,A,N,U,true) + end; +forward(Chal,A,N,U,false) -> + forward2(Chal,A,N,U,false). + +forward2(Chal,A,N,false,CRLF) -> + forward(Chal,A+1,N-1,false,CRLF); +forward2(Chal,A,N,true,CRLF) -> <<_:A/binary,Tl/binary>> = Chal, Forw = case Tl of <<1:1,1:1,0:1,_:5,_/binary>> -> @@ -775,10 +839,16 @@ forward(Chal,A,N,true) -> _ -> 1 end, - forward(Chal,A+Forw,N-1,true). + forward(Chal,A+Forw,N-1,true,CRLF). copt(caseless) -> true; +copt(no_start_optimize) -> + true; +copt(never_utf) -> + true; +copt(ucp) -> + true; copt(dollar_endonly) -> true; copt(dotall) -> @@ -809,6 +879,8 @@ copt(_) -> runopt(notempty) -> true; +runopt(notempty_atstart) -> + true; runopt(notbol) -> true; runopt(noteol) -> @@ -821,6 +893,10 @@ runopt({capture,_}) -> true; runopt(global) -> true; +runopt({match_limit,_}) -> + true; +runopt({match_limit_recursion,_}) -> + true; runopt(_) -> false. |