From 390ee65ba78d571fdc0a8fccb6c456d7346eb9bc Mon Sep 17 00:00:00 2001 From: Patrik Nyblom Date: Fri, 19 Jul 2013 17:51:11 +0200 Subject: Handle CRLF correctly in global regexp --- erts/emulator/beam/erl_bif_re.c | 30 ++++++++++++------- lib/stdlib/src/re.erl | 64 ++++++++++++++++++++++++++++------------- 2 files changed, 64 insertions(+), 30 deletions(-) diff --git a/erts/emulator/beam/erl_bif_re.c b/erts/emulator/beam/erl_bif_re.c index 4289c79ba2..12fc834685 100644 --- a/erts/emulator/beam/erl_bif_re.c +++ b/erts/emulator/beam/erl_bif_re.c @@ -379,6 +379,8 @@ build_compile_result(Process *p, Eterm error_tag, pcre *result, int errcode, con Eterm ret; size_t pattern_size; int capture_count; + int use_crlf; + unsigned long options; if (!result) { /* Return {error_tag, {Code, String, Offset}} */ int elen = sys_strlen(errstr); @@ -393,14 +395,20 @@ build_compile_result(Process *p, Eterm error_tag, pcre *result, int errcode, con } else { erts_pcre_fullinfo(result, NULL, PCRE_INFO_SIZE, &pattern_size); erts_pcre_fullinfo(result, NULL, PCRE_INFO_CAPTURECOUNT, &capture_count); + erts_pcre_fullinfo(result, NULL, PCRE_INFO_OPTIONS, &options); + options &= PCRE_NEWLINE_CR|PCRE_NEWLINE_LF | PCRE_NEWLINE_CRLF | + PCRE_NEWLINE_ANY | PCRE_NEWLINE_ANYCRLF; + use_crlf = (options == PCRE_NEWLINE_ANY || + options == PCRE_NEWLINE_CRLF || + options == PCRE_NEWLINE_ANYCRLF); /* XXX: Optimize - keep in offheap binary to allow this to be kept across traps w/o need of copying */ ret = new_binary(p, (byte *) result, pattern_size); erts_pcre_free(result); - hp = HAlloc(p, (with_ok) ? (3+5) : 5); - ret = TUPLE4(hp,am_re_pattern, make_small(capture_count), make_small(unicode),ret); + hp = HAlloc(p, (with_ok) ? (3+6) : 6); + ret = TUPLE5(hp,am_re_pattern, make_small(capture_count), make_small(unicode),make_small(use_crlf),ret); if (with_ok) { - hp += 5; + hp += 6; ret = TUPLE2(hp,am_ok,ret); } } @@ -875,7 +883,7 @@ re_run(Process *p, Eterm arg1, Eterm arg2, Eterm arg3) is_list_cap = ((pflags & PARSE_FLAG_CAPTURE_OPT) && (capture[CAPSPEC_TYPE] == am_list)); - if (is_not_tuple(arg2) || (arityval(*tuple_val(arg2)) != 4)) { + if (is_not_tuple(arg2) || (arityval(*tuple_val(arg2)) != 5)) { if (is_binary(arg2) || is_list(arg2) || is_nil(arg2)) { /* Compile from textual RE */ ErlDrvSizeT slen; @@ -947,7 +955,8 @@ re_run(Process *p, Eterm arg1, Eterm arg2, Eterm arg3) tp = tuple_val(arg2); if (tp[1] != am_re_pattern || is_not_small(tp[2]) || - is_not_small(tp[3]) || is_not_binary(tp[4])) { + is_not_small(tp[3]) || is_not_small(tp[4]) || + is_not_binary(tp[5])) { BIF_ERROR(p,BADARG); } @@ -967,9 +976,9 @@ re_run(Process *p, Eterm arg1, Eterm arg2, Eterm arg3) } ovsize = 3*(unsigned_val(tp[2])+1); - code_size = binary_size(tp[4]); + code_size = binary_size(tp[5]); if ((code_tmp = (const pcre *) - erts_get_aligned_binary_bytes(tp[4], &temp_alloc)) == NULL) { + erts_get_aligned_binary_bytes(tp[5], &temp_alloc)) == NULL) { erts_free_aligned_binary_bytes(temp_alloc); BIF_ERROR(p, BADARG); } @@ -1055,9 +1064,10 @@ handle_iolist: #ifdef DEBUG loop_count = 0xFFFFFFFF; #endif - - rc = erts_pcre_exec(restart.code, &(restart.extra), restart.subject, slength, startoffset, - options, restart.ovector, ovsize); + + rc = erts_pcre_exec(restart.code, &(restart.extra), restart.subject, + slength, startoffset, + options, restart.ovector, ovsize); ASSERT(loop_count != 0xFFFFFFFF); BUMP_REDS(p, loop_count / LOOP_FACTOR); if (rc == PCRE_ERROR_LOOP_LIMIT) { diff --git a/lib/stdlib/src/re.erl b/lib/stdlib/src/re.erl index c5109ec455..d8d529e6a4 100644 --- a/lib/stdlib/src/re.erl +++ b/lib/stdlib/src/re.erl @@ -19,8 +19,8 @@ -module(re). -export([grun/3,urun/3,ucompile/2,replace/3,replace/4,split/2,split/3]). -%-opaque mp() :: {re_pattern, _, _, _}. --type mp() :: {re_pattern, _, _, _}. +%-opaque mp() :: {re_pattern, _, _, _, _}. +-type mp() :: {re_pattern, _, _, _, _}. -type nl_spec() :: cr | crlf | lf | anycrlf | any. @@ -266,7 +266,7 @@ extend_subpatterns([],N) -> extend_subpatterns([H|T],N) -> [H | extend_subpatterns(T,N-1)]. -compile_split({re_pattern,N,_,_} = Comp, Options) -> +compile_split({re_pattern,N,_,_,_} = Comp, Options) -> {Comp,N,Options}; compile_split(Pat,Options0) when not is_tuple(Pat) -> Options = lists:filter(fun(O) -> @@ -275,7 +275,7 @@ compile_split(Pat,Options0) when not is_tuple(Pat) -> case re:compile(Pat,Options) of {error,Err} -> {error,Err}; - {ok, {re_pattern,N,_,_} = Comp} -> + {ok, {re_pattern,N,_,_,_} = Comp} -> NewOpt = lists:filter(fun(OO) -> (not copt(OO)) end, Options0), {Comp,N,NewOpt} end; @@ -487,12 +487,24 @@ do_replace(Subject,Repl,SubExprs0) -> end || Part <- Repl ]. -check_for_unicode({re_pattern,_,1,_},_) -> +check_for_unicode({re_pattern,_,1,_,_},_) -> true; -check_for_unicode({re_pattern,_,0,_},_) -> +check_for_unicode({re_pattern,_,0,_,_},_) -> false; check_for_unicode(_,L) -> lists:member(unicode,L). + +check_for_crlf({re_pattern,_,_,1,_},_) -> + true; +check_for_crlf({re_pattern,_,_,0,_},_) -> + false; +check_for_crlf(_,L) -> + case lists:keysearch(newline,1,L) of + {value,{newline,any}} -> true; + {value,{newline,crlf}} -> true; + {value,{newline,anycrlf}} -> true; + _ -> false + end. % SelectReturn = false | all | stirpfirst | none % ConvertReturn = index | list | binary @@ -662,7 +674,7 @@ urun2(Subject0,RE0,Options0) -> RE = case RE0 of BinRE when is_binary(BinRE) -> BinRE; - {re_pattern,_,_,_} = ReCompiled -> + {re_pattern,_,_,_,_} = ReCompiled -> ReCompiled; ListRE -> unicode:characters_to_binary(ListRE,unicode) @@ -703,10 +715,11 @@ grun(Subject,RE,{Options,NeedClean,OrigRE}) -> grun2(Subject,RE,{Options,NeedClean}) -> Unicode = check_for_unicode(RE,Options), + CRLF = check_for_crlf(RE,Options), FlatSubject = to_binary(Subject, Unicode), - do_grun(FlatSubject,Subject,Unicode,RE,{Options,NeedClean}). + do_grun(FlatSubject,Subject,Unicode,CRLF,RE,{Options,NeedClean}). -do_grun(FlatSubject,Subject,Unicode,RE,{Options0,NeedClean}) -> +do_grun(FlatSubject,Subject,Unicode,CRLF,RE,{Options0,NeedClean}) -> {StrippedOptions, InitialOffset, SelectReturn, ConvertReturn} = case (catch @@ -718,12 +731,12 @@ do_grun(FlatSubject,Subject,Unicode,RE,{Options0,NeedClean}) -> end, postprocess(loopexec(FlatSubject,RE,InitialOffset, byte_size(FlatSubject), - Unicode,StrippedOptions), + Unicode,CRLF,StrippedOptions), SelectReturn,ConvertReturn,FlatSubject,Unicode). -loopexec(_,_,X,Y,_,_) when X > Y -> +loopexec(_,_,X,Y,_,_,_) when X > Y -> {match,[]}; -loopexec(Subject,RE,X,Y,Unicode,Options) -> +loopexec(Subject,RE,X,Y,Unicode,CRLF,Options) -> case re:run(Subject,RE,[{offset,X}]++Options) of nomatch -> {match,[]}; @@ -731,7 +744,7 @@ loopexec(Subject,RE,X,Y,Unicode,Options) -> {match,Rest} = case B>0 of true -> - loopexec(Subject,RE,A+B,Y,Unicode,Options); + loopexec(Subject,RE,A+B,Y,Unicode,CRLF,Options); false -> {match,M} = case re:run(Subject,RE,[{offset,X},notempty, @@ -745,10 +758,10 @@ loopexec(Subject,RE,X,Y,Unicode,Options) -> [{_,NStep}|_] when NStep > 0 -> A+NStep; _ -> - forward(Subject,A,1,Unicode) + forward(Subject,A,1,Unicode,CRLF) end, {match,MM} = loopexec(Subject,RE,NewA,Y, - Unicode,Options), + Unicode,CRLF,Options), case M of [] -> {match,MM}; @@ -759,11 +772,22 @@ loopexec(Subject,RE,X,Y,Unicode,Options) -> {match,[[{A,B}|More] | Rest]} end. -forward(_Chal,A,0,_) -> +forward(_Chal,A,0,_,_) -> A; -forward(_Chal,A,N,false) -> - A+N; -forward(Chal,A,N,true) -> +forward(Chal,A,N,U,true) -> + <<_:A/binary,Tl/binary>> = Chal, + case Tl of + <<$\r,$\n,_/binary>> -> + forward(Chal,A+2,N-1,U,true); + _ -> + forward2(Chal,A,N,U,true) + end; +forward(Chal,A,N,U,false) -> + forward2(Chal,A,N,U,false). + +forward2(Chal,A,N,false,CRLF) -> + forward(Chal,A+1,N-1,false,CRLF); +forward2(Chal,A,N,true,CRLF) -> <<_:A/binary,Tl/binary>> = Chal, Forw = case Tl of <<1:1,1:1,0:1,_:5,_/binary>> -> @@ -775,7 +799,7 @@ forward(Chal,A,N,true) -> _ -> 1 end, - forward(Chal,A+Forw,N-1,true). + forward(Chal,A+Forw,N-1,true,CRLF). copt(caseless) -> true; -- cgit v1.2.3