diff options
-rw-r--r-- | erts/emulator/beam/bif.tab | 1 | ||||
-rw-r--r-- | erts/emulator/beam/erl_bif_re.c | 30 | ||||
-rw-r--r-- | erts/emulator/pcre/pcre.h | 3 | ||||
-rw-r--r-- | erts/emulator/pcre/pcre_exec.c | 115 | ||||
-rw-r--r-- | erts/emulator/pcre/pcre_internal.h | 11 | ||||
-rw-r--r-- | erts/emulator/pcre/pcre_valid_utf8.c | 73 | ||||
-rw-r--r-- | lib/stdlib/src/re.erl | 52 | ||||
-rw-r--r-- | lib/stdlib/src/stdlib.app.src | 2 | ||||
-rw-r--r-- | lib/stdlib/test/re_SUITE.erl | 57 |
9 files changed, 296 insertions, 48 deletions
diff --git a/erts/emulator/beam/bif.tab b/erts/emulator/beam/bif.tab index db9c258cb7..602db106b1 100644 --- a/erts/emulator/beam/bif.tab +++ b/erts/emulator/beam/bif.tab @@ -413,6 +413,7 @@ bif re:compile/1 bif re:compile/2 bif re:run/2 bif re:run/3 +bif re:internal_run/4 # # Bifs in lists module. diff --git a/erts/emulator/beam/erl_bif_re.c b/erts/emulator/beam/erl_bif_re.c index e0b9202fe7..b3bf1c7ee3 100644 --- a/erts/emulator/beam/erl_bif_re.c +++ b/erts/emulator/beam/erl_bif_re.c @@ -46,7 +46,7 @@ static Export *urun_trap_exportp = NULL; static Export *ucompile_trap_exportp = NULL; static BIF_RETTYPE re_exec_trap(BIF_ALIST_3); -static BIF_RETTYPE re_run(Process *p, Eterm arg1, Eterm arg2, Eterm arg3); +static BIF_RETTYPE re_run(Process *p, Eterm arg1, Eterm arg2, Eterm arg3, int first); static void *erts_erts_pcre_malloc(size_t size) { return erts_alloc(ERTS_ALC_T_RE_HEAP,size); @@ -1094,7 +1094,7 @@ build_capture(Eterm capture_spec[CAPSPEC_SIZE], const pcre *code) * The actual re:run/2,3 BIFs */ static BIF_RETTYPE -re_run(Process *p, Eterm arg1, Eterm arg2, Eterm arg3) +re_run(Process *p, Eterm arg1, Eterm arg2, Eterm arg3, int first) { const pcre *code_tmp; RestartContext restart; @@ -1120,6 +1120,14 @@ re_run(Process *p, Eterm arg1, Eterm arg2, Eterm arg3) < 0) { BIF_ERROR(p,BADARG); } + if (!first) { + /* + * 'first' is false when re:grun() previously has called re:internal_run() + * with the same subject; i.e., no need to do yet another validation of + * the subject regarding utf8 encoding... + */ + options |= PCRE_NO_UTF8_CHECK; + } is_list_cap = ((pflags & PARSE_FLAG_CAPTURE_OPT) && (capture[CAPSPEC_TYPE] == am_list)); @@ -1360,15 +1368,28 @@ handle_iolist: } BIF_RETTYPE +re_internal_run_4(BIF_ALIST_4) +{ + int first; + if (BIF_ARG_4 == am_false) + first = 0; + else if (BIF_ARG_4 == am_true) + first = !0; + else + BIF_ERROR(BIF_P,BADARG); + return re_run(BIF_P,BIF_ARG_1, BIF_ARG_2, BIF_ARG_3, first); +} + +BIF_RETTYPE re_run_3(BIF_ALIST_3) { - return re_run(BIF_P,BIF_ARG_1, BIF_ARG_2, BIF_ARG_3); + return re_run(BIF_P,BIF_ARG_1, BIF_ARG_2, BIF_ARG_3, !0); } BIF_RETTYPE re_run_2(BIF_ALIST_2) { - return re_run(BIF_P,BIF_ARG_1, BIF_ARG_2, NIL); + return re_run(BIF_P,BIF_ARG_1, BIF_ARG_2, NIL, !0); } /* @@ -1407,6 +1428,7 @@ static BIF_RETTYPE re_exec_trap(BIF_ALIST_3) loop_count = 0xFFFFFFFF; #endif rc = erts_pcre_exec(NULL, &(restartp->extra), NULL, 0, 0, 0, NULL, 0); + ASSERT(loop_count != 0xFFFFFFFF); BUMP_REDS(BIF_P, loop_count / LOOP_FACTOR); if (rc == PCRE_ERROR_LOOP_LIMIT) { diff --git a/erts/emulator/pcre/pcre.h b/erts/emulator/pcre/pcre.h index 3563791223..505e2ccce0 100644 --- a/erts/emulator/pcre/pcre.h +++ b/erts/emulator/pcre/pcre.h @@ -240,6 +240,9 @@ with J. */ #define PCRE_UTF8_ERR20 20 #define PCRE_UTF8_ERR21 21 #define PCRE_UTF8_ERR22 22 /* Unused (was non-character) */ +#if defined(ERLANG_INTEGRATION) +#define PCRE_UTF8_YIELD 23 +#endif /* Specific error codes for UTF-16 validity checks */ diff --git a/erts/emulator/pcre/pcre_exec.c b/erts/emulator/pcre/pcre_exec.c index 1946e97a72..55a7b377bf 100644 --- a/erts/emulator/pcre/pcre_exec.c +++ b/erts/emulator/pcre/pcre_exec.c @@ -6642,10 +6642,16 @@ typedef struct { REAL_PCRE *Xre; heapframe Xframe_zero; /* Always NO_RECURSE */ + /* for yield in valid_utf() */ + + struct PRIV(valid_utf_ystate) valid_utf_ystate; + /* Original function parameters that need be saved */ int Xstart_offset; int Xoffsetcount; int *Xoffsets; + int Xlength; + PCRE_SPTR Xsubject; } PcreExecContext; #endif @@ -6675,6 +6681,7 @@ pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data, #endif { #ifndef ERLANG_INTEGRATION +#define ERTS_UPDATE_CONSUMED(X, MD) int rc, ocount, arg_offset_max; int newline; BOOL using_temporary_offsets = FALSE; @@ -6736,6 +6743,8 @@ heapframe frame_zero; start_offset = exec_context->Xstart_offset; \ offsetcount = exec_context->Xoffsetcount; \ offsets = exec_context->Xoffsets; \ + length = exec_context->Xlength; \ + subject = exec_context->Xsubject; \ } while (0) #define SWAPOUT() do { \ @@ -6750,8 +6759,30 @@ heapframe frame_zero; exec_context->Xstart_offset = start_offset; \ exec_context->Xoffsetcount = offsetcount; \ exec_context->Xoffsets = offsets; \ + exec_context->Xlength = length; \ + exec_context->Xsubject = subject; \ } while (0) +#define ERTS_UPDATE_CONSUMED(X, MD) \ +do { \ + if (((X)->flags & PCRE_EXTRA_LOOP_LIMIT) != 0) { \ + unsigned long consumed__; \ + if (!(X)->restart_data) { \ + consumed__ = 0; \ + } \ + else { \ + PcreExecContext *ctx__ = (PcreExecContext *) \ + (*(X)->restart_data); \ + consumed__ = ctx__->valid_utf_ystate.cnt; \ + ctx__->valid_utf_ystate.cnt = 0; \ + } \ + if ((MD)) { \ + match_data *md__ = (MD); \ + consumed__ += (X)->loop_limit - md__->loop_limit; \ + } \ + *((X)->loop_counter_return) = consumed__; \ + } \ +} while (0) PcreExecContext *exec_context; PcreExecContext internal_context; @@ -6776,15 +6807,21 @@ pcre_uchar req_char; /* we are restarting, every initialization is skipped and we jump directly into the loop */ exec_context = (PcreExecContext *) *(extra_data->restart_data); SWAPIN(); - + if (exec_context->valid_utf_ystate.yielded) + goto restart_valid_utf; goto RESTART_INTERRUPTED; } else { if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_LOOP_LIMIT)) { exec_context = (PcreExecContext *) (erts_pcre_malloc)(sizeof(PcreExecContext)); - *(extra_data->restart_data) = (void *) exec_context; + *(extra_data->restart_data) = (void *) exec_context; + exec_context->valid_utf_ystate.yielded = 0; /* need freeing by special routine from client */ } else { +#if defined(ERLANG_INTEGRATION) + fprintf(stderr, "Unexpected execution path\n"); + abort(); +#endif exec_context = &internal_context; } @@ -6865,9 +6902,38 @@ code for an invalid string if a results vector is available. */ if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) { int erroroffset; - int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset); + int errorcode; + +#if !defined(ERLANG_INTEGRATION) + errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length); +#else + struct PRIV(valid_utf_ystate) *ystate; + + if (!extra_data || !extra_data->restart_data) { + ystate = NULL; + } + else if (!(extra_data->flags & PCRE_EXTRA_LOOP_LIMIT)) { + exec_context->valid_utf_ystate.cnt = 10; + ystate = NULL; + } + else { + exec_context->valid_utf_ystate.yielded = 0; + restart_valid_utf: + ystate = &exec_context->valid_utf_ystate; + ystate->cnt = (int) extra_data->loop_limit; + } + errorcode = PRIV(yielding_valid_utf)((PCRE_PUCHAR)subject, length, + &erroroffset, ystate); +#endif if (errorcode != 0) { +#if defined(ERLANG_INTEGRATION) + if (ystate && ystate->yielded) { + ERTS_UPDATE_CONSUMED(extra_data, NULL); + SWAPOUT(); + return PCRE_ERROR_LOOP_LIMIT; + } +#endif if (offsetcount >= 2) { offsets[0] = erroroffset; @@ -6890,6 +6956,11 @@ if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) return PCRE_ERROR_BADUTF8_OFFSET; #endif } +#if defined(ERLANG_INTEGRATION) +else { + exec_context->valid_utf_ystate.cnt = 0; +} +#endif #endif /* If the pattern was successfully studied with JIT support, run the JIT @@ -6950,7 +7021,11 @@ if (extra_data != NULL) #ifdef ERLANG_INTEGRATION if ((flags & PCRE_EXTRA_LOOP_LIMIT) != 0) { - md->loop_limit = extra_data->loop_limit; + md->loop_limit = extra_data->loop_limit; + if (extra_data->restart_data) + md->loop_limit -= extra_data->loop_limit - exec_context->valid_utf_ystate.cnt; + if (md->loop_limit < 10) + md->loop_limit = 10; /* At least do something if we've come this far... */ } #endif } @@ -7266,14 +7341,8 @@ for(;;) #endif if ((start_bits[c/8] & (1 << (c&7))) != 0) { -#ifdef ERLANG_INTEGRATION - if ((extra_data->flags & PCRE_EXTRA_LOOP_LIMIT) != 0) - { - *extra_data->loop_counter_return = - (extra_data->loop_limit - md->loop_limit); - } -#endif - break; + ERTS_UPDATE_CONSUMED(extra_data, md); + break; } start_match++; } @@ -7298,13 +7367,7 @@ for(;;) (pcre_uint32)(end_subject - start_match) < study->minlength) { rc = MATCH_NOMATCH; -#ifdef ERLANG_INTEGRATION - if ((extra_data->flags & PCRE_EXTRA_LOOP_LIMIT) != 0) - { - *extra_data->loop_counter_return = - (extra_data->loop_limit - md->loop_limit); - } -#endif + ERTS_UPDATE_CONSUMED(extra_data, md); break; } @@ -7353,13 +7416,7 @@ for(;;) if (p >= end_subject) { rc = MATCH_NOMATCH; -#ifdef ERLANG_INTEGRATION - if ((extra_data->flags & PCRE_EXTRA_LOOP_LIMIT) != 0) - { - *extra_data->loop_counter_return = - (extra_data->loop_limit - md->loop_limit); - } -#endif + ERTS_UPDATE_CONSUMED(extra_data, md); break; } @@ -7390,11 +7447,7 @@ for(;;) EDEBUGF(("Calling match...")); rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0); #ifdef ERLANG_INTEGRATION - if ((extra_data->flags & PCRE_EXTRA_LOOP_LIMIT) != 0) - { - *extra_data->loop_counter_return = - (extra_data->loop_limit - md->loop_limit); - } + ERTS_UPDATE_CONSUMED(extra_data, md); SWAPOUT(); while(rc == PCRE_ERROR_LOOP_LIMIT) { EDEBUGF(("Loop limit break detected")); diff --git a/erts/emulator/pcre/pcre_internal.h b/erts/emulator/pcre/pcre_internal.h index c84dcb5a38..71f473e86f 100644 --- a/erts/emulator/pcre/pcre_internal.h +++ b/erts/emulator/pcre/pcre_internal.h @@ -2756,6 +2756,17 @@ extern int PRIV(strcmp_uc_c8_utf)(const pcre_uchar *, #endif /* COMPILE_PCRE[8|16|32] */ +#if defined(ERLANG_INTEGRATION) +struct PRIV(valid_utf_ystate) { + unsigned int cnt; + int length; + int yielded; + PCRE_PUCHAR p; +}; +extern int PRIV(yielding_valid_utf)(PCRE_PUCHAR, int, int *, + struct PRIV(valid_utf_ystate) *); +#endif + extern const pcre_uchar *PRIV(find_bracket)(const pcre_uchar *, BOOL, int); extern BOOL PRIV(is_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR, int *, BOOL); diff --git a/erts/emulator/pcre/pcre_valid_utf8.c b/erts/emulator/pcre/pcre_valid_utf8.c index 516d8f4725..1dc1f9ba0c 100644 --- a/erts/emulator/pcre/pcre_valid_utf8.c +++ b/erts/emulator/pcre/pcre_valid_utf8.c @@ -107,19 +107,80 @@ Returns: = 0 if the string is a valid UTF-8 string int PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset) { + +#if defined(ERLANG_INTEGRATION) + return PRIV(yielding_valid_utf)(string, length, erroroffset, NULL); +} + +int +PRIV(yielding_valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset, struct PRIV(valid_utf_ystate) *ystate) +{ +#endif + #ifdef SUPPORT_UTF register PCRE_PUCHAR p; +#if defined(ERLANG_INTEGRATION) +register long cnt; + +if (!ystate) { + cnt = -1; +} +else { + cnt = ystate->cnt; + if (ystate->yielded) { + p = ystate->p; + length = ystate->length; + if (length < 0) + goto restart_length; + else + goto restart_validate; + } +} +#endif + if (length < 0) { - for (p = string; *p != 0; p++); - length = (int)(p - string); + for (p = string; *p != 0; p++) { +#if defined(ERLANG_INTEGRATION) + if (cnt > 0 && --cnt == 0) { + /* + * Return with cnt set to amount consumed; + * i.e. same amount as at start... + */ + ystate->yielded = !0; + ystate->length = length; + ystate->p = p; + return PCRE_UTF8_YIELD; + } + restart_length: + (void) !0; +#endif + } + length = (int)(p - string); } for (p = string; length-- > 0; p++) { register pcre_uchar ab, c, d; +#if defined(ERLANG_INTEGRATION) + + if (cnt > 0 && --cnt == 0) { + /* + * Return with cnt set to amount consumed; + * i.e. same amount as at start... + */ + ystate->yielded = !0; + ystate->length = length; + ystate->p = p; + return PCRE_UTF8_YIELD; + } + + restart_validate: + +#endif + c = *p; if (c < 128) continue; /* ASCII character */ @@ -290,6 +351,14 @@ for (p = string; length-- > 0; p++) } } +#if defined(ERLANG_INTEGRATION) +if (ystate) { + /* Return with cnt set to amount consumed... */ + ystate->cnt -= cnt; + ystate->yielded = 0; +} +#endif + #else /* Not SUPPORT_UTF */ (void)(string); /* Keep picky compilers happy */ (void)(length); diff --git a/lib/stdlib/src/re.erl b/lib/stdlib/src/re.erl index 726b409d4d..197564b895 100644 --- a/lib/stdlib/src/re.erl +++ b/lib/stdlib/src/re.erl @@ -33,6 +33,8 @@ %%% BIFs +-export([internal_run/4]). + -export([version/0, compile/1, compile/2, run/2, run/3, inspect/2]). -spec version() -> binary(). @@ -100,6 +102,40 @@ run(_, _) -> run(_, _, _) -> erlang:nif_error(undef). +-spec internal_run(Subject, RE, Options, FirstCall) -> {match, Captured} | + match | + nomatch | + {error, ErrType} when + Subject :: iodata() | unicode:charlist(), + RE :: mp() | iodata() | unicode:charlist(), + Options :: [Option], + Option :: anchored | global | notbol | noteol | notempty + | notempty_atstart | report_errors + | {offset, non_neg_integer()} | + {match_limit, non_neg_integer()} | + {match_limit_recursion, non_neg_integer()} | + {newline, NLSpec :: nl_spec()} | + bsr_anycrlf | bsr_unicode | {capture, ValueSpec} | + {capture, ValueSpec, Type} | CompileOpt, + Type :: index | list | binary, + ValueSpec :: all | all_but_first | all_names | first | none | ValueList, + ValueList :: [ValueID], + ValueID :: integer() | string() | atom(), + CompileOpt :: compile_option(), + Captured :: [CaptureData] | [[CaptureData]], + CaptureData :: {integer(), integer()} + | ListConversionData + | binary(), + ListConversionData :: string() + | {error, string(), binary()} + | {incomplete, string(), binary()}, + ErrType :: match_limit | match_limit_recursion | {compile, CompileErr}, + CompileErr :: {ErrString :: string(), Position :: non_neg_integer()}, + FirstCall :: boolean(). + +internal_run(_, _, _, _) -> + erlang:nif_error(undef). + -spec inspect(MP,Item) -> {namelist, [ binary() ]} when MP :: mp(), Item :: namelist. @@ -765,17 +801,17 @@ do_grun(FlatSubject,Subject,Unicode,CRLF,RE,{Options0,NeedClean}) -> try postprocess(loopexec(FlatSubject,RE,InitialOffset, byte_size(FlatSubject), - Unicode,CRLF,StrippedOptions), + Unicode,CRLF,StrippedOptions,true), SelectReturn,ConvertReturn,FlatSubject,Unicode) catch throw:ErrTuple -> ErrTuple end. -loopexec(_,_,X,Y,_,_,_) when X > Y -> +loopexec(_,_,X,Y,_,_,_,_) when X > Y -> {match,[]}; -loopexec(Subject,RE,X,Y,Unicode,CRLF,Options) -> - case re:run(Subject,RE,[{offset,X}]++Options) of +loopexec(Subject,RE,X,Y,Unicode,CRLF,Options, First) -> + case re:internal_run(Subject,RE,[{offset,X}]++Options,First) of {error, Err} -> throw({error,Err}); nomatch -> @@ -784,11 +820,11 @@ loopexec(Subject,RE,X,Y,Unicode,CRLF,Options) -> {match,Rest} = case B>0 of true -> - loopexec(Subject,RE,A+B,Y,Unicode,CRLF,Options); + loopexec(Subject,RE,A+B,Y,Unicode,CRLF,Options,false); false -> {match,M} = - case re:run(Subject,RE,[{offset,X},notempty_atstart, - anchored]++Options) of + case re:internal_run(Subject,RE,[{offset,X},notempty_atstart, + anchored]++Options,false) of nomatch -> {match,[]}; {match,Other} -> @@ -801,7 +837,7 @@ loopexec(Subject,RE,X,Y,Unicode,CRLF,Options) -> forward(Subject,A,1,Unicode,CRLF) end, {match,MM} = loopexec(Subject,RE,NewA,Y, - Unicode,CRLF,Options), + Unicode,CRLF,Options,false), case M of [] -> {match,MM}; diff --git a/lib/stdlib/src/stdlib.app.src b/lib/stdlib/src/stdlib.app.src index ecb514e9f3..d7d57941c2 100644 --- a/lib/stdlib/src/stdlib.app.src +++ b/lib/stdlib/src/stdlib.app.src @@ -108,7 +108,7 @@ dets]}, {applications, [kernel]}, {env, []}, - {runtime_dependencies, ["sasl-3.0","kernel-6.0","erts-10.4","crypto-3.3", + {runtime_dependencies, ["sasl-3.0","kernel-6.0","erts-@OTP-15831:OTP-15836@","crypto-3.3", "compiler-5.0"]} ]}. diff --git a/lib/stdlib/test/re_SUITE.erl b/lib/stdlib/test/re_SUITE.erl index c9ef9da990..06d8fe9255 100644 --- a/lib/stdlib/test/re_SUITE.erl +++ b/lib/stdlib/test/re_SUITE.erl @@ -28,7 +28,8 @@ pcre_compile_workspace_overflow/1,re_infinite_loop/1, re_backwards_accented/1,opt_dupnames/1,opt_all_names/1,inspect/1, opt_no_start_optimize/1,opt_never_utf/1,opt_ucp/1, - match_limit/1,sub_binaries/1,copt/1]). + match_limit/1,sub_binaries/1,copt/1,global_unicode_validation/1, + yield_on_subject_validation/1]). -include_lib("common_test/include/ct.hrl"). -include_lib("kernel/include/file.hrl"). @@ -45,7 +46,8 @@ all() -> pcre_compile_workspace_overflow, re_infinite_loop, re_backwards_accented, opt_dupnames, opt_all_names, inspect, opt_no_start_optimize,opt_never_utf,opt_ucp, - match_limit, sub_binaries, re_version]. + match_limit, sub_binaries, re_version, global_unicode_validation, + yield_on_subject_validation]. groups() -> []. @@ -200,7 +202,58 @@ re_version(_Config) -> {match,[Version]} = re:run(Version,"^[0-9]\\.[0-9]{2} 20[0-9]{2}-[0-9]{2}-[0-9]{2}",[{capture,all,binary}]), ok. +global_unicode_validation(Config) when is_list(Config) -> + %% Test that unicode validation of the subject is not done + %% for every match found... + Bin = binary:copy(<<"abc\n">>,100000), + {TimeAscii, _} = take_time(fun () -> + re:run(Bin, <<"b">>, [global]) + end), + {TimeUnicode, _} = take_time(fun () -> + re:run(Bin, <<"b">>, [unicode,global]) + end), + if TimeAscii == 0; TimeUnicode == 0 -> + {comment, "Not good enough resolution to compare results"}; + true -> + %% The time the operations takes should be in the + %% same order of magnitude. If validation of the + %% whole subject occurs for every match, the unicode + %% variant will take way longer time... + true = TimeUnicode div TimeAscii < 10 + end. + +take_time(Fun) -> + Start = erlang:monotonic_time(nanosecond), + Res = Fun(), + End = erlang:monotonic_time(nanosecond), + {End-Start, Res}. + +yield_on_subject_validation(Config) when is_list(Config) -> + Go = make_ref(), + Bin = binary:copy(<<"abc\n">>,100000), + {P, M} = spawn_opt(fun () -> + receive Go -> ok end, + {match,[{1,1}]} = re:run(Bin, <<"b">>, [unicode]) + end, + [link, monitor]), + 1 = erlang:trace(P, true, [running]), + P ! Go, + N = count_re_run_trap_out(P, M), + true = N >= 5, + ok. +count_re_run_trap_out(P, M) when is_reference(M) -> + receive {'DOWN',M,process,P,normal} -> ok end, + TD = erlang:trace_delivered(P), + receive {trace_delivered, P, TD} -> ok end, + count_re_run_trap_out(P, 0); +count_re_run_trap_out(P, N) when is_integer(N) -> + receive + {trace,P,out,{erlang,re_run_trap,3}} -> + count_re_run_trap_out(P, N+1) + after 0 -> + N + end. %% Test compile options given directly to run. combined_options(Config) when is_list(Config) -> |