diff options
author | Rickard Green <[email protected]> | 2017-04-11 15:05:51 +0200 |
---|---|---|
committer | Rickard Green <[email protected]> | 2017-04-11 15:05:51 +0200 |
commit | 92a3b1a37182527d0bf8b6242d54e7fae6f4bbf9 (patch) | |
tree | 133119634987a6eeede055d644fe2f999ea3d318 /erts/emulator/pcre/pcre_exec.c | |
parent | b66834310f1d3f30a7b4d5c02753053d3f9d7ef8 (diff) | |
parent | 6520f78d4b878ff66adcf8d25de331d8ad06dd1d (diff) | |
download | otp-92a3b1a37182527d0bf8b6242d54e7fae6f4bbf9.tar.gz otp-92a3b1a37182527d0bf8b6242d54e7fae6f4bbf9.tar.bz2 otp-92a3b1a37182527d0bf8b6242d54e7fae6f4bbf9.zip |
Merge branch 'rickard/pcre-8.40'
OTP-14331
* rickard/pcre-8.40:
Update documentation
Update README.pcre_update.md
Stack guard for PCRE
Adjust for incompatibility between PCRE 8.40 and perl 5.22.1
Generate re replacement and split tests with perl vsn 5.22.1
Fix re_SUITE:pcre_compile_workspace_overflow/1
Skip line with lockout of modifiers in PCRE tests
Update tests for PCRE version 8.40
Update PCRE to version 8.40
Conflicts:
erts/emulator/beam/beam_debug.c
Diffstat (limited to 'erts/emulator/pcre/pcre_exec.c')
-rw-r--r-- | erts/emulator/pcre/pcre_exec.c | 903 |
1 files changed, 421 insertions, 482 deletions
diff --git a/erts/emulator/pcre/pcre_exec.c b/erts/emulator/pcre/pcre_exec.c index 07e662ff07..0f682d3daf 100644 --- a/erts/emulator/pcre/pcre_exec.c +++ b/erts/emulator/pcre/pcre_exec.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2013 University of Cambridge + Copyright (c) 1997-2014 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -111,8 +111,8 @@ because the offset vector is always a multiple of 3 long. */ /* Min and max values for the common repeats; for the maxima, 0 => infinity */ -static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; -static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; +static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, }; +static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, }; #ifdef PCRE_DEBUG /************************************************* @@ -138,7 +138,7 @@ pcre_uint32 c; BOOL utf = md->utf; if (is_subject && length > md->end_subject - p) length = md->end_subject - p; while (length-- > 0) - if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c); + if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c); } #endif @@ -187,7 +187,7 @@ match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md, { PCRE_PUCHAR eptr_start = eptr; register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset]; -#ifdef SUPPORT_UTF +#if defined SUPPORT_UTF && defined SUPPORT_UCP BOOL utf = md->utf; #endif @@ -215,8 +215,7 @@ ASCII characters. */ if (caseless) { -#ifdef SUPPORT_UTF -#ifdef SUPPORT_UCP +#if defined SUPPORT_UTF && defined SUPPORT_UCP if (utf) { /* Match characters up to the end of the reference. NOTE: the number of @@ -250,7 +249,6 @@ if (caseless) } else #endif -#endif /* The same code works when not in UTF-8 mode and in UTF-8 mode when there is no UCP support. */ @@ -259,8 +257,8 @@ if (caseless) { pcre_uint32 cc, cp; if (eptr >= md->end_subject) return -2; /* Partial match */ - cc = RAWUCHARTEST(eptr); - cp = RAWUCHARTEST(p); + cc = UCHAR21TEST(eptr); + cp = UCHAR21TEST(p); if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1; p++; eptr++; @@ -276,7 +274,7 @@ else while (length-- > 0) { if (eptr >= md->end_subject) return -2; /* Partial match */ - if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1; + if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; } } @@ -332,17 +330,13 @@ enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60, - RM61, RM62, RM63, RM64, RM65, RM66, RM67, RM68 -}; + RM61, RM62, RM63, RM64, RM65, RM66, RM67 }; /* These versions of the macros use the stack, as normal. There are debugging versions and production versions. Note that the "rw" argument of RMATCH isn't actually used in this definition. */ #ifndef NO_RECURSE -#ifdef ERLANG_INTEGRATION -#error ERLANG_INTEGRATION only together with NO_RECURSE -#endif #define REGISTER register #ifdef PCRE_DEBUG @@ -997,7 +991,7 @@ for (;;) /* Continue as from after the group, updating the offsets high water mark, since extracts may have been taken. */ - do ecode += GET(ecode, 1); while (*ecode == OP_ALT); + do ecode += GET(ecode, 1); while (*ecode == OP_ALT); /* LOOP_COUNT: Ok */ offset_top = md->end_offset_top; eptr = md->end_match_ptr; @@ -1192,7 +1186,7 @@ for (;;) const pcre_uchar *scode = ecode; if (*scode != OP_ONCE) /* If not at start, find it */ { - while (*scode == OP_ALT) scode += GET(scode, 1); + while (*scode == OP_ALT) scode += GET(scode, 1); /* LOOP_COUNT: Ok */ scode -= GET(scode, 1); } if (md->once_target == scode) rrc = MATCH_NOMATCH; @@ -1230,87 +1224,81 @@ for (;;) printf("\n"); #endif - if (offset < md->offset_max) - { - matched_once = FALSE; - code_offset = (int)(ecode - md->start_code); - - save_offset1 = md->offset_vector[offset]; - save_offset2 = md->offset_vector[offset+1]; - save_offset3 = md->offset_vector[md->offset_end - number]; - save_capture_last = md->capture_last; + if (offset >= md->offset_max) goto POSSESSIVE_NON_CAPTURE; - DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); + matched_once = FALSE; + code_offset = (int)(ecode - md->start_code); - /* Each time round the loop, save the current subject position for use - when the group matches. For MATCH_MATCH, the group has matched, so we - restart it with a new subject starting position, remembering that we had - at least one match. For MATCH_NOMATCH, carry on with the alternatives, as - usual. If we haven't matched any alternatives in any iteration, check to - see if a previous iteration matched. If so, the group has matched; - continue from afterwards. Otherwise it has failed; restore the previous - capture values before returning NOMATCH. */ + save_offset1 = md->offset_vector[offset]; + save_offset2 = md->offset_vector[offset+1]; + save_offset3 = md->offset_vector[md->offset_end - number]; + save_capture_last = md->capture_last; - for (;;) /* LOOP_COUNT: Ok */ - { - md->offset_vector[md->offset_end - number] = - (int)(eptr - md->start_subject); - if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; - RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, - eptrb, RM63); - if (rrc == MATCH_KETRPOS) - { - offset_top = md->end_offset_top; - eptr = md->end_match_ptr; - ecode = md->start_code + code_offset; - save_capture_last = md->capture_last; - matched_once = TRUE; - continue; - } + DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); - /* See comment in the code for capturing groups above about handling - THEN. */ + /* Each time round the loop, save the current subject position for use + when the group matches. For MATCH_MATCH, the group has matched, so we + restart it with a new subject starting position, remembering that we had + at least one match. For MATCH_NOMATCH, carry on with the alternatives, as + usual. If we haven't matched any alternatives in any iteration, check to + see if a previous iteration matched. If so, the group has matched; + continue from afterwards. Otherwise it has failed; restore the previous + capture values before returning NOMATCH. */ - if (rrc == MATCH_THEN) + for (;;) /* LOOP_COUNT: Ok */ + { + md->offset_vector[md->offset_end - number] = + (int)(eptr - md->start_subject); + if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; + RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, + eptrb, RM63); + if (rrc == MATCH_KETRPOS) + { + offset_top = md->end_offset_top; + ecode = md->start_code + code_offset; + save_capture_last = md->capture_last; + matched_once = TRUE; + mstart = md->start_match_ptr; /* In case \K changed it */ + if (eptr == md->end_match_ptr) /* Matched an empty string */ { - next = ecode + GET(ecode,1); - if (md->start_match_ptr < next && - (*ecode == OP_ALT || *next == OP_ALT)) - rrc = MATCH_NOMATCH; + do ecode += GET(ecode, 1); while (*ecode == OP_ALT); /* LOOP_COUNT: Ok */ + break; } - - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - md->capture_last = save_capture_last; - ecode += GET(ecode, 1); - if (*ecode != OP_ALT) break; + eptr = md->end_match_ptr; + continue; } - if (!matched_once) - { - md->offset_vector[offset] = save_offset1; - md->offset_vector[offset+1] = save_offset2; - md->offset_vector[md->offset_end - number] = save_offset3; - } + /* See comment in the code for capturing groups above about handling + THEN. */ - if (allow_zero || matched_once) + if (rrc == MATCH_THEN) { - ecode += 1 + LINK_SIZE; - break; + next = ecode + GET(ecode,1); + if (md->start_match_ptr < next && + (*ecode == OP_ALT || *next == OP_ALT)) + rrc = MATCH_NOMATCH; } - RRETURN(MATCH_NOMATCH); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + md->capture_last = save_capture_last; + ecode += GET(ecode, 1); + if (*ecode != OP_ALT) break; } - /* FALL THROUGH ... Insufficient room for saving captured contents. Treat - as a non-capturing bracket. */ - - /* VVVVVVVVVVVVVVVVVVVVVVVVV */ - /* VVVVVVVVVVVVVVVVVVVVVVVVV */ + if (!matched_once) + { + md->offset_vector[offset] = save_offset1; + md->offset_vector[offset+1] = save_offset2; + md->offset_vector[md->offset_end - number] = save_offset3; + } - DPRINTF(("insufficient capture room: treat as non-capturing\n")); + if (allow_zero || matched_once) + { + ecode += 1 + LINK_SIZE; + break; + } - /* VVVVVVVVVVVVVVVVVVVVVVVVV */ - /* VVVVVVVVVVVVVVVVVVVVVVVVV */ + RRETURN(MATCH_NOMATCH); /* Non-capturing possessive bracket with unlimited repeat. We come here from BRAZERO with allow_zero = TRUE. The code is similar to the above, @@ -1334,9 +1322,15 @@ for (;;) if (rrc == MATCH_KETRPOS) { offset_top = md->end_offset_top; - eptr = md->end_match_ptr; ecode = md->start_code + code_offset; matched_once = TRUE; + mstart = md->start_match_ptr; /* In case \K reset it */ + if (eptr == md->end_match_ptr) /* Matched an empty string */ + { + do ecode += GET(ecode, 1); while (*ecode == OP_ALT); /* LOOP_COUNT: Ok */ + break; + } + eptr = md->end_match_ptr; continue; } @@ -1366,25 +1360,32 @@ for (;;) /* Control never reaches here. */ - /* Conditional group: compilation checked that there are no more than - two branches. If the condition is false, skipping the first branch takes us - past the end if there is only one branch, but that's OK because that is - exactly what going to the ket would do. */ + /* Conditional group: compilation checked that there are no more than two + branches. If the condition is false, skipping the first branch takes us + past the end of the item if there is only one branch, but that's exactly + what we want. */ case OP_COND: case OP_SCOND: - codelink = GET(ecode, 1); + + /* The variable codelink will be added to ecode when the condition is + false, to get to the second branch. Setting it to the offset to the ALT + or KET, then incrementing ecode achieves this effect. We now have ecode + pointing to the condition or callout. */ + + codelink = GET(ecode, 1); /* Offset to the second branch */ + ecode += 1 + LINK_SIZE; /* From this opcode */ /* Because of the way auto-callout works during compile, a callout item is inserted between OP_COND and an assertion condition. */ - if (ecode[LINK_SIZE+1] == OP_CALLOUT) + if (*ecode == OP_CALLOUT) { if (PUBL(callout) != NULL) { PUBL(callout_block) cb; cb.version = 2; /* Version 1 of the callout block */ - cb.callout_number = ecode[LINK_SIZE+2]; + cb.callout_number = ecode[1]; cb.offset_vector = md->offset_vector; #if defined COMPILE_PCRE8 cb.subject = (PCRE_SPTR)md->start_subject; @@ -1396,8 +1397,8 @@ for (;;) cb.subject_length = (int)(md->end_subject - md->start_subject); cb.start_match = (int)(mstart - md->start_subject); cb.current_position = (int)(eptr - md->start_subject); - cb.pattern_position = GET(ecode, LINK_SIZE + 3); - cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE); + cb.pattern_position = GET(ecode, 2); + cb.next_item_length = GET(ecode, 2 + LINK_SIZE); cb.capture_top = offset_top/2; cb.capture_last = md->capture_last & CAPLMASK; /* Internal change requires this for API compatibility. */ @@ -1407,213 +1408,125 @@ for (;;) if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); if (rrc < 0) RRETURN(rrc); } + + /* Advance ecode past the callout, so it now points to the condition. We + must adjust codelink so that the value of ecode+codelink is unchanged. */ + ecode += PRIV(OP_lengths)[OP_CALLOUT]; codelink -= PRIV(OP_lengths)[OP_CALLOUT]; } - condcode = ecode[LINK_SIZE+1]; + /* Test the various possible conditions */ - /* Now see what the actual condition is */ - - if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */ + condition = FALSE; + switch(condcode = *ecode) { - if (md->recursive == NULL) /* Not recursing => FALSE */ + case OP_RREF: /* Numbered group recursion test */ + if (md->recursive != NULL) /* Not recursing => FALSE */ { - condition = FALSE; - ecode += GET(ecode, 1); - } - else - { /* LOOP_COUNT: Warning, No CHK in this block */ - unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ + unsigned int recno = GET2(ecode, 1); /* Recursion group number*/ condition = (recno == RREF_ANY || recno == md->recursive->group_num); + } + break; - /* If the test is for recursion into a specific subpattern, and it is - false, but the test was set up by name, scan the table to see if the - name refers to any other numbers, and test them. The condition is true - if any one is set. */ - - if (!condition && condcode == OP_NRREF) + case OP_DNRREF: /* Duplicate named group recursion test */ + if (md->recursive != NULL) + { + int count = GET2(ecode, 1 + IMM2_SIZE); + pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; + while (count-- > 0) /* LOOP_COUNT: COST */ { - pcre_uchar *slotA = md->name_table; - for (i = 0; i < md->name_count; i++)/* LOOP_COUNT: COST */ - { - if (GET2(slotA, 0) == recno) break; - slotA += md->name_entry_size; - COST(1); - } - - /* Found a name for the number - there can be only one; duplicate - names for different numbers are allowed, but not vice versa. First - scan down for duplicates. */ - - if (i < md->name_count) - { - pcre_uchar *slotB = slotA; - while (slotB > md->name_table) /* LOOP_COUNT: COST */ - { - slotB -= md->name_entry_size; - if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) - { - condition = GET2(slotB, 0) == md->recursive->group_num; - if (condition) break; - } - else break; - COST(1); - } - - /* Scan up for duplicates */ - - if (!condition) - { - slotB = slotA; - for (i++; i < md->name_count; i++)/* LOOP_COUNT: COST */ - { - slotB += md->name_entry_size; - if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) - { - condition = GET2(slotB, 0) == md->recursive->group_num; - if (condition) break; - } - else break; - COST(1); - } - } - } + unsigned int recno = GET2(slot, 0); + condition = recno == md->recursive->group_num; + if (condition) break; + slot += md->name_entry_size; + COST(1); } - - /* Chose branch according to the condition */ - - ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1); } - } + break; - else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */ - { - offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ + case OP_CREF: /* Numbered group used test */ + offset = GET2(ecode, 1) << 1; /* Doubled ref number */ condition = offset < offset_top && md->offset_vector[offset] >= 0; + break; - /* If the numbered capture is unset, but the reference was by name, - scan the table to see if the name refers to any other numbers, and test - them. The condition is true if any one is set. This is tediously similar - to the code above, but not close enough to try to amalgamate. */ - - if (!condition && condcode == OP_NCREF) + case OP_DNCREF: /* Duplicate named group used test */ { - unsigned int refno = offset >> 1; - pcre_uchar *slotA = md->name_table;/* LOOP_COUNT: Warning, no CHK in this block */ - - for (i = 0; i < md->name_count; i++) /* LOOP_COUNT: COST */ + int count = GET2(ecode, 1 + IMM2_SIZE); + pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; + while (count-- > 0) /* LOOP_COUNT: COST */ { - if (GET2(slotA, 0) == refno) break; - slotA += md->name_entry_size; + offset = GET2(slot, 0) << 1; + condition = offset < offset_top && md->offset_vector[offset] >= 0; + if (condition) break; + slot += md->name_entry_size; COST(1); } - - /* Found a name for the number - there can be only one; duplicate names - for different numbers are allowed, but not vice versa. First scan down - for duplicates. */ - - if (i < md->name_count) - { - pcre_uchar *slotB = slotA; - while (slotB > md->name_table) /* LOOP_COUNT: COST */ - { - slotB -= md->name_entry_size; - if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) - { - offset = GET2(slotB, 0) << 1; - condition = offset < offset_top && - md->offset_vector[offset] >= 0; - if (condition) break; - } - else break; - COST(1); - } - - /* Scan up for duplicates */ - - if (!condition) - { - slotB = slotA; - for (i++; i < md->name_count; i++) /* LOOP_COUNT: COST */ - { - slotB += md->name_entry_size; - if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) - { - offset = GET2(slotB, 0) << 1; - condition = offset < offset_top && - md->offset_vector[offset] >= 0; - if (condition) break; - } - else break; - COST(1); - } - } - } } + break; - /* Chose branch according to the condition */ - - ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1); - } - - else if (condcode == OP_DEF) /* DEFINE - always false */ - { - condition = FALSE; - ecode += GET(ecode, 1); - } + case OP_DEF: /* DEFINE - always false */ + case OP_FAIL: /* From optimized (?!) condition */ + break; - /* The condition is an assertion. Call match() to evaluate it - setting - md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of - an assertion. */ + /* The condition is an assertion. Call match() to evaluate it - setting + md->match_function_type to MATCH_CONDASSERT causes it to stop at the end + of an assertion. */ - else - { + default: md->match_function_type = MATCH_CONDASSERT; - RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3); + RMATCH(eptr, ecode, offset_top, md, NULL, RM3); if (rrc == MATCH_MATCH) { if (md->end_offset_top > offset_top) offset_top = md->end_offset_top; /* Captures may have happened */ condition = TRUE; - ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); + + /* Advance ecode past the assertion to the start of the first branch, + but adjust it so that the general choosing code below works. If the + assertion has a quantifier that allows zero repeats we must skip over + the BRAZERO. This is a lunatic thing to do, but somebody did! */ + + if (*ecode == OP_BRAZERO) ecode++; + ecode += GET(ecode, 1); while (*ecode == OP_ALT) ecode += GET(ecode, 1); /* LOOP_COUNT: Ok */ + ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode]; } /* PCRE doesn't allow the effect of (*THEN) to escape beyond an - assertion; it is therefore treated as NOMATCH. */ + assertion; it is therefore treated as NOMATCH. Any other return is an + error. */ else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) { RRETURN(rrc); /* Need braces because of following else */ } - else - { - condition = FALSE; - ecode += codelink; - } + break; } - /* We are now at the branch that is to be obeyed. As there is only one, can - use tail recursion to avoid using another stack frame, except when there is - unlimited repeat of a possibly empty group. In the latter case, a recursive - call to match() is always required, unless the second alternative doesn't - exist, in which case we can just plough on. Note that, for compatibility - with Perl, the | in a conditional group is NOT treated as creating two - alternatives. If a THEN is encountered in the branch, it propagates out to - the enclosing alternative (unless nested in a deeper set of alternatives, - of course). */ - - if (condition || *ecode == OP_ALT) + /* Choose branch according to the condition */ + + ecode += condition? PRIV(OP_lengths)[condcode] : codelink; + + /* We are now at the branch that is to be obeyed. As there is only one, we + can use tail recursion to avoid using another stack frame, except when + there is unlimited repeat of a possibly empty group. In the latter case, a + recursive call to match() is always required, unless the second alternative + doesn't exist, in which case we can just plough on. Note that, for + compatibility with Perl, the | in a conditional group is NOT treated as + creating two alternatives. If a THEN is encountered in the branch, it + propagates out to the enclosing alternative (unless nested in a deeper set + of alternatives, of course). */ + + if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT) { if (op != OP_SCOND) { - ecode += 1 + LINK_SIZE; goto TAIL_RECURSE; } md->match_function_type = MATCH_CBEGROUP; - RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49); + RMATCH(eptr, ecode, offset_top, md, eptrb, RM49); RRETURN(rrc); } @@ -1621,7 +1534,6 @@ for (;;) else { - ecode += 1 + LINK_SIZE; } break; @@ -1644,7 +1556,22 @@ for (;;) md->offset_vector[offset] = md->offset_vector[md->offset_end - number]; md->offset_vector[offset+1] = (int)(eptr - md->start_subject); - if (offset_top <= offset) offset_top = offset + 2; + + /* If this group is at or above the current highwater mark, ensure that + any groups between the current high water mark and this group are marked + unset and then update the high water mark. */ + + if (offset >= offset_top) + { + register int *iptr = md->offset_vector + offset_top; + register int *iend = md->offset_vector + offset; + if (iptr < iend) + { + COST(iend - iptr); + while (iptr < iend) *iptr++ = -1; /* LOOP_COUNT: COST */ + } + offset_top = offset + 2; + } } ecode += 1 + IMM2_SIZE; break; @@ -2000,7 +1927,11 @@ for (;;) are defined in a range that can be tested for. */ if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX) + { + if (new_recursive.offset_save != stacksave) + (PUBL(free))(new_recursive.offset_save); RRETURN(MATCH_NOMATCH); + } /* Any return code other than NOMATCH is an error. */ @@ -2151,7 +2082,11 @@ for (;;) { register int *iptr = md->offset_vector + offset_top; register int *iend = md->offset_vector + offset; - while (iptr < iend) *iptr++ = -1; /* LOOP_COUNT: CHK */ + if (iptr < iend) + { + COST(iend - iptr); + while (iptr < iend) *iptr++ = -1; /* LOOP_COUNT: COST */ + } } /* Now make the extraction */ @@ -2163,6 +2098,19 @@ for (;;) } } + /* OP_KETRPOS is a possessive repeating ket. Remember the current position, + and return the MATCH_KETRPOS. This makes it possible to do the repeats one + at a time from the outer level, thus saving stack. This must precede the + empty string test - in this case that test is done at the outer level. */ + + if (*ecode == OP_KETRPOS) + { + md->start_match_ptr = mstart; /* In case \K reset it */ + md->end_match_ptr = eptr; + md->end_offset_top = offset_top; + RRETURN(MATCH_KETRPOS); + } + /* For an ordinary non-repeating ket, just continue at this level. This also happens for a repeating ket if no characters were matched in the group. This is the forcible breaking of infinite loops as implemented in @@ -2185,17 +2133,6 @@ for (;;) break; } - /* OP_KETRPOS is a possessive repeating ket. Remember the current position, - and return the MATCH_KETRPOS. This makes it possible to do the repeats one - at a time from the outer level, thus saving stack. */ - - if (*ecode == OP_KETRPOS) - { - md->end_match_ptr = eptr; - md->end_offset_top = offset_top; - RRETURN(MATCH_KETRPOS); - } - /* The normal repeating kets try the rest of the pattern or restart from the preceding bracket, in the appropriate order. In the second case, we can use tail recursion to avoid using another stack frame, unless we have an @@ -2286,7 +2223,7 @@ for (;;) eptr + 1 >= md->end_subject && NLBLOCK->nltype == NLTYPE_FIXED && NLBLOCK->nllen == 2 && - RAWUCHARTEST(eptr) == NLBLOCK->nl[0]) + UCHAR21TEST(eptr) == NLBLOCK->nl[0]) { md->hitend = TRUE; if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); @@ -2330,7 +2267,7 @@ for (;;) eptr + 1 >= md->end_subject && NLBLOCK->nltype == NLTYPE_FIXED && NLBLOCK->nllen == 2 && - RAWUCHARTEST(eptr) == NLBLOCK->nl[0]) + UCHAR21TEST(eptr) == NLBLOCK->nl[0]) { md->hitend = TRUE; if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); @@ -2473,7 +2410,7 @@ for (;;) eptr + 1 >= md->end_subject && NLBLOCK->nltype == NLTYPE_FIXED && NLBLOCK->nllen == 2 && - RAWUCHARTEST(eptr) == NLBLOCK->nl[0]) + UCHAR21TEST(eptr) == NLBLOCK->nl[0]) { md->hitend = TRUE; if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); @@ -2627,7 +2564,7 @@ for (;;) { SCHECK_PARTIAL(); } - else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++; + else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++; break; case CHAR_LF: @@ -2758,19 +2695,24 @@ for (;;) RRETURN(MATCH_NOMATCH); break; - case PT_SPACE: /* Perl space */ - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || - c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) - == (op == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - break; + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + case PT_SPACE: /* Perl space */ case PT_PXSPACE: /* POSIX space */ - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || - c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || - c == CHAR_FF || c == CHAR_CR) - == (op == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); + break; + + default: + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == + (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); + break; + } break; case PT_WORD: @@ -2848,15 +2790,7 @@ for (;;) similar code to character type repeats - written out again for speed. However, if the referenced string is the empty string, always treat it as matched, any number of times (otherwise there could be infinite - loops). */ - - case OP_REF: - case OP_REFI: - caseless = op == OP_REFI; - offset = GET2(ecode, 1) << 1; /* Doubled ref number */ - ecode += 1 + IMM2_SIZE; - - /* If the reference is unset, there are two possibilities: + loops). If the reference is unset, there are two possibilities: (a) In the default, Perl-compatible state, set the length negative; this ensures that every attempt at a match fails. We can't just fail @@ -2866,8 +2800,46 @@ for (;;) so that the back reference matches an empty string. Otherwise, set the length to the length of what was matched by the - referenced subpattern. */ + referenced subpattern. + + The OP_REF and OP_REFI opcodes are used for a reference to a numbered group + or to a non-duplicated named group. For a duplicated named group, OP_DNREF + and OP_DNREFI are used. In this case we must scan the list of groups to + which the name refers, and use the first one that is set. */ + + case OP_DNREF: + case OP_DNREFI: + caseless = op == OP_DNREFI; + { + int count = GET2(ecode, 1+IMM2_SIZE); + pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size; + ecode += 1 + 2*IMM2_SIZE; + + /* Setting the default length first and initializing 'offset' avoids + compiler warnings in the REF_REPEAT code. */ + + length = (md->jscript_compat)? 0 : -1; + offset = 0; + while (count-- > 0) /* LOOP_COUNT: COST */ + { + offset = GET2(slot, 0) << 1; + if (offset < offset_top && md->offset_vector[offset] >= 0) + { + length = md->offset_vector[offset+1] - md->offset_vector[offset]; + break; + } + slot += md->name_entry_size; + } + COST(1); + } + goto REF_REPEAT; + + case OP_REF: + case OP_REFI: + caseless = op == OP_REFI; + offset = GET2(ecode, 1) << 1; /* Doubled ref number */ + ecode += 1 + IMM2_SIZE; if (offset >= offset_top || md->offset_vector[offset] < 0) length = (md->jscript_compat)? 0 : -1; else @@ -2875,6 +2847,7 @@ for (;;) /* Set up for repetition, or handle the non-repeated case */ + REF_REPEAT: switch (*ecode) { case OP_CRSTAR: @@ -3027,8 +3000,12 @@ for (;;) case OP_CRMINPLUS: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSPLUS: + case OP_CRPOSQUERY: c = *ecode++ - OP_CRSTAR; - minimize = (c & 1) != 0; + if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; + else possessive = TRUE; min = rep_min[c]; /* Pick up values from tables; */ max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; @@ -3036,7 +3013,9 @@ for (;;) case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: minimize = (*ecode == OP_CRMINRANGE); + possessive = (*ecode == OP_CRPOSRANGE); min = GET2(ecode, 1); max = GET2(ecode, 1 + IMM2_SIZE); if (max == 0) max = INT_MAX; @@ -3181,6 +3160,9 @@ for (;;) eptr += len; COST_CHK(1); } + + if (possessive) continue; /* No backtracking */ + for (;;) /* LOOP_COUNT: Ok */ { RMATCH(eptr, ecode, offset_top, md, eptrb, RM18); @@ -3212,7 +3194,10 @@ for (;;) COST_CHK(1); eptr++; } - while (eptr >= pp) /* LOOP_COUNT: Ok */ + + if (possessive) continue; /* No backtracking */ + + while (eptr >= pp) /* LOOP_COUNT: Ok */ { RMATCH(eptr, ecode, offset_top, md, eptrb, RM19); if (rrc != MATCH_NOMATCH) RRETURN(rrc); @@ -3227,9 +3212,10 @@ for (;;) /* Control never gets here */ - /* Match an extended character class. This opcode is encountered only - when UTF-8 mode mode is supported. Nevertheless, we may not be in UTF-8 - mode, because Unicode properties are supported in non-UTF-8 mode. */ + /* Match an extended character class. In the 8-bit library, this opcode is + encountered only when UTF-8 mode mode is supported. In the 16-bit and + 32-bit libraries, codepoints greater than 255 may be encountered even when + UTF is not supported. */ #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 case OP_XCLASS: @@ -3245,8 +3231,12 @@ for (;;) case OP_CRMINPLUS: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSPLUS: + case OP_CRPOSQUERY: c = *ecode++ - OP_CRSTAR; - minimize = (c & 1) != 0; + if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; + else possessive = TRUE; min = rep_min[c]; /* Pick up values from tables; */ max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; @@ -3254,7 +3244,9 @@ for (;;) case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: minimize = (*ecode == OP_CRMINRANGE); + possessive = (*ecode == OP_CRPOSRANGE); min = GET2(ecode, 1); max = GET2(ecode, 1 + IMM2_SIZE); if (max == 0) max = INT_MAX; @@ -3327,6 +3319,9 @@ for (;;) eptr += len; COST_CHK(1); } + + if (possessive) continue; /* No backtracking */ + for(;;) /* LOOP_COUNT: Ok */ { RMATCH(eptr, ecode, offset_top, md, eptrb, RM21); @@ -3357,7 +3352,7 @@ for (;;) CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ RRETURN(MATCH_NOMATCH); } - while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH); /* LOOP_COUNT: Ok */ + while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH); /* LOOP_COUNT: Ok */ } else #endif @@ -3398,7 +3393,7 @@ for (;;) if (fc < 128) { - pcre_uint32 cc = RAWUCHAR(eptr); + pcre_uint32 cc = UCHAR21(eptr); if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH); ecode++; eptr++; @@ -3606,7 +3601,7 @@ for (;;) if (possessive) continue; /* No backtracking */ for(;;) /* LOOP_COUNT: Ok */ { - if (eptr == pp) goto TAIL_RECURSE; + if (eptr <= pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM23); if (rrc != MATCH_NOMATCH) RRETURN(rrc); #ifdef SUPPORT_UCP @@ -3668,7 +3663,7 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - cc = RAWUCHARTEST(eptr); + cc = UCHAR21TEST(eptr); if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH); eptr++; COST_CHK(1); @@ -3687,7 +3682,7 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - cc = RAWUCHARTEST(eptr); + cc = UCHAR21TEST(eptr); if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH); eptr++; } @@ -3704,12 +3699,11 @@ for (;;) SCHECK_PARTIAL(); break; } - cc = RAWUCHARTEST(eptr); + cc = UCHAR21TEST(eptr); if (fc != cc && foc != cc) break; eptr++; COST_CHK(1); } - if (possessive) continue; /* No backtracking */ for (;;) /* LOOP_COUNT: Ok */ { @@ -3718,9 +3712,8 @@ for (;;) eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); } - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ } - /* Control never gets here */ } /* Caseful comparisons (includes all multi-byte characters) */ @@ -3735,7 +3728,7 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH); + if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH); } if (min == max) continue; @@ -3752,7 +3745,7 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH); + if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ -3766,7 +3759,7 @@ for (;;) SCHECK_PARTIAL(); break; } - if (fc != RAWUCHARTEST(eptr)) break; + if (fc != UCHAR21TEST(eptr)) break; eptr++; COST_CHK(1); } @@ -3778,7 +3771,7 @@ for (;;) eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); } - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ } } /* Control never gets here */ @@ -4036,7 +4029,7 @@ for (;;) if (possessive) continue; /* No backtracking */ for(;;) /* LOOP_COUNT: Ok */ { - if (eptr == pp) goto TAIL_RECURSE; + if (eptr <= pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM30); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; @@ -4067,10 +4060,8 @@ for (;;) eptr--; } } - - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ } - /* Control never gets here */ } /* Caseful comparisons */ @@ -4098,7 +4089,7 @@ for (;;) /* Not UTF mode */ { COST(min); - for (i = 1; i <= min; i++) /* LOOP_COUNT: Ok */ + for (i = 1; i <= min; i++) /* LOOP_COUNT: Cost */ { if (eptr >= md->end_subject) { @@ -4177,7 +4168,7 @@ for (;;) if (possessive) continue; /* No backtracking */ for(;;) /* LOOP_COUNT: Ok */ { - if (eptr == pp) goto TAIL_RECURSE; + if (eptr <= pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM34); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; @@ -4208,8 +4199,7 @@ for (;;) eptr--; } } - - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ } } /* Control never gets here */ @@ -4392,7 +4382,12 @@ for (;;) } break; + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ for (i = 1; i <= min; i++) /* LOOP_COUNT: COST (above) */ { if (eptr >= md->end_subject) @@ -4401,26 +4396,18 @@ for (;;) RRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_FF || c == CHAR_CR) - == prop_fail_result) - RRETURN(MATCH_NOMATCH); - } - break; - - case PT_PXSPACE: /* POSIX space */ - for (i = 1; i <= min; i++) /* LOOP_COUNT: COST (above) */ - { - if (eptr >= md->end_subject) + switch(c) { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); + HSPACE_CASES: + VSPACE_CASES: + if (prop_fail_result) RRETURN(MATCH_NOMATCH); + break; + + default: + if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + break; } - GETCHARINCTEST(c, eptr); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) - == prop_fail_result) - RRETURN(MATCH_NOMATCH); } break; @@ -4543,7 +4530,7 @@ for (;;) eptr + 1 >= md->end_subject && NLBLOCK->nltype == NLTYPE_FIXED && NLBLOCK->nllen == 2 && - RAWUCHAR(eptr) == NLBLOCK->nl[0]) + UCHAR21(eptr) == NLBLOCK->nl[0]) { md->hitend = TRUE; if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); @@ -4587,7 +4574,7 @@ for (;;) default: RRETURN(MATCH_NOMATCH); case CHAR_CR: - if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++; + if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++; break; case CHAR_LF: @@ -4703,7 +4690,7 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - cc = RAWUCHAR(eptr); + cc = UCHAR21(eptr); if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); eptr++; @@ -4721,7 +4708,7 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - cc = RAWUCHAR(eptr); + cc = UCHAR21(eptr); if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); eptr++; @@ -4739,7 +4726,7 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - cc = RAWUCHAR(eptr); + cc = UCHAR21(eptr); if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); eptr++; @@ -4757,7 +4744,7 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - cc = RAWUCHAR(eptr); + cc = UCHAR21(eptr); if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); eptr++; @@ -4775,7 +4762,7 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - cc = RAWUCHAR(eptr); + cc = UCHAR21(eptr); if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); eptr++; @@ -5170,25 +5157,11 @@ for (;;) } /* Control never gets here */ - case PT_SPACE: /* Perl space */ - for (fi = min;; fi++) /* LOOP_COUNT: Ok */ - { - RMATCH(eptr, ecode, offset_top, md, eptrb, RM60); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) RRETURN(MATCH_NOMATCH); - if (eptr >= md->end_subject) - { - SCHECK_PARTIAL(); - RRETURN(MATCH_NOMATCH); - } - GETCHARINCTEST(c, eptr); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_FF || c == CHAR_CR) - == prop_fail_result) - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + case PT_SPACE: /* Perl space */ case PT_PXSPACE: /* POSIX space */ for (fi = min;; fi++) /* LOOP_COUNT: Ok */ { @@ -5201,10 +5174,18 @@ for (;;) RRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) - == prop_fail_result) - RRETURN(MATCH_NOMATCH); + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + if (prop_fail_result) RRETURN(MATCH_NOMATCH); + break; + + default: + if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + break; + } } /* Control never gets here */ @@ -5258,7 +5239,7 @@ for (;;) case PT_UCNC: for (fi = min;; fi++) /* LOOP_COUNT: Ok */ { - RMATCH(eptr, ecode, offset_top, md, eptrb, RM68); + RMATCH(eptr, ecode, offset_top, md, eptrb, RM60); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max) RRETURN(MATCH_NOMATCH); if (eptr >= md->end_subject) @@ -5358,7 +5339,7 @@ for (;;) { default: RRETURN(MATCH_NOMATCH); case CHAR_CR: - if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++; + if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++; break; case CHAR_LF: @@ -5698,7 +5679,12 @@ for (;;) } break; + /* Perl space used to exclude VT, but from Perl 5.18 it is included, + which means that Perl space and POSIX space are now identical. PCRE + was changed at release 8.34. */ + case PT_SPACE: /* Perl space */ + case PT_PXSPACE: /* POSIX space */ for (i = min; i < max; i++) /* LOOP_COUNT: CHK */ { int len = 1; @@ -5708,32 +5694,22 @@ for (;;) break; } GETCHARLENTEST(c, eptr, len); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_FF || c == CHAR_CR) - == prop_fail_result) + switch(c) + { + HSPACE_CASES: + VSPACE_CASES: + if (prop_fail_result) goto ENDLOOP99; /* Break the loop */ break; - eptr+= len; - COST_CHK(1); - } - break; - case PT_PXSPACE: /* POSIX space */ - for (i = min; i < max; i++) /* LOOP_COUNT: CHK */ - { - int len = 1; - if (eptr >= md->end_subject) - { - SCHECK_PARTIAL(); + default: + if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result) + goto ENDLOOP99; /* Break the loop */ break; } - GETCHARLENTEST(c, eptr, len); - if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || - c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) - == prop_fail_result) - break; eptr+= len; COST_CHK(1); } + ENDLOOP99: break; case PT_WORD: @@ -5741,7 +5717,7 @@ for (;;) { int category; int len = 1; - if (eptr >= md->end_subject) + if (eptr >= md->end_subject) { SCHECK_PARTIAL(); break; @@ -5810,7 +5786,7 @@ for (;;) if (possessive) continue; /* No backtracking */ for(;;) /* LOOP_COUNT: Ok */ { - if (eptr == pp) goto TAIL_RECURSE; + if (eptr <= pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM44); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; @@ -5855,22 +5831,27 @@ for (;;) /* eptr is now past the end of the maximum run */ if (possessive) continue; /* No backtracking */ - + + /* We use <= pp rather than == pp to detect the start of the run while + backtracking because the use of \C in UTF mode can cause BACKCHAR to + move back past pp. This is just palliative; the use of \C in UTF mode + is fraught with danger. */ + for(;;) /* LOOP_COUNT: Ok */ { #ifndef ERLANG_INTEGRATION - int lgb, rgb; + int lgb, rgb; #endif PCRE_PUCHAR fptr; - - if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */ + + if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */ RMATCH(eptr, ecode, offset_top, md, eptrb, RM45); if (rrc != MATCH_NOMATCH) RRETURN(rrc); /* Backtracking over an extended grapheme cluster involves inspecting the previous two characters (if present) to see if a break is permitted between them. */ - + eptr--; if (!utf) c = *eptr; else { @@ -5881,14 +5862,14 @@ for (;;) for (;;) /* LOOP_COUNT: COST */ { - if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */ + if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */ fptr = eptr - 1; if (!utf) c = *fptr; else { BACKCHAR(fptr); GETCHAR(c, fptr); } - lgb = UCD_GRAPHBREAK(c); + lgb = UCD_GRAPHBREAK(c); if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; eptr = fptr; rgb = lgb; @@ -5906,56 +5887,26 @@ for (;;) switch(ctype) { case OP_ANY: - if (max < INT_MAX) + for (i = min; i < max; i++) /* LOOP_COUNT: CHK */ { - for (i = min; i < max; i++) /* LOOP_COUNT: CHK */ + if (eptr >= md->end_subject) { - if (eptr >= md->end_subject) - { - SCHECK_PARTIAL(); - break; - } - if (IS_NEWLINE(eptr)) break; - if (md->partial != 0 && /* Take care with CRLF partial */ - eptr + 1 >= md->end_subject && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - RAWUCHAR(eptr) == NLBLOCK->nl[0]) - { - md->hitend = TRUE; - if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); - } - eptr++; - ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); - COST_CHK(1); + SCHECK_PARTIAL(); + break; } - } - - /* Handle unlimited UTF-8 repeat */ - - else - { - for (i = min; i < max; i++) /* LOOP_COUNT: CHK */ + if (IS_NEWLINE(eptr)) break; + if (md->partial != 0 && /* Take care with CRLF partial */ + eptr + 1 >= md->end_subject && + NLBLOCK->nltype == NLTYPE_FIXED && + NLBLOCK->nllen == 2 && + UCHAR21(eptr) == NLBLOCK->nl[0]) { - if (eptr >= md->end_subject) - { - SCHECK_PARTIAL(); - break; - } - if (IS_NEWLINE(eptr)) break; - if (md->partial != 0 && /* Take care with CRLF partial */ - eptr + 1 >= md->end_subject && - NLBLOCK->nltype == NLTYPE_FIXED && - NLBLOCK->nllen == 2 && - RAWUCHAR(eptr) == NLBLOCK->nl[0]) - { - md->hitend = TRUE; - if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); - } - eptr++; - ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); - COST_CHK(1); + md->hitend = TRUE; + if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); } + eptr++; + ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); + COST_CHK(1); } break; @@ -6006,7 +5957,7 @@ for (;;) if (c == CHAR_CR) { if (++eptr >= md->end_subject) break; - if (RAWUCHAR(eptr) == CHAR_LF) eptr++; + if (UCHAR21(eptr) == CHAR_LF) eptr++; } else { @@ -6173,13 +6124,13 @@ for (;;) if (possessive) continue; /* No backtracking */ for(;;) /* LOOP_COUNT: Ok */ { - if (eptr == pp) goto TAIL_RECURSE; + if (eptr <= pp) goto TAIL_RECURSE; RMATCH(eptr, ecode, offset_top, md, eptrb, RM46); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; BACKCHAR(eptr); - if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL && - RAWUCHAR(eptr - 1) == CHAR_CR) eptr--; + if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL && + UCHAR21(eptr - 1) == CHAR_CR) eptr--; } } else @@ -6438,11 +6389,8 @@ for (;;) } } - /* Get here if we can't make it match with any permitted repetitions */ - - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ } - /* Control never gets here */ /* There's been some horrible disaster. Arrival here can only mean there is something seriously wrong in the code above or the OP_xxx definitions. */ @@ -6476,15 +6424,15 @@ switch (frame->Xwhere) LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64) LBL(65) LBL(66) #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 - LBL(21) + LBL(20) LBL(21) #endif #ifdef SUPPORT_UTF - LBL(16) LBL(18) LBL(20) + LBL(16) LBL(18) LBL(22) LBL(23) LBL(28) LBL(30) LBL(32) LBL(34) LBL(42) LBL(46) #ifdef SUPPORT_UCP LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) - LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) LBL(68) + LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) #endif /* SUPPORT_UCP */ #endif /* SUPPORT_UTF */ default: @@ -6747,7 +6695,7 @@ const pcre_uint8 *start_bits = NULL; PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset; PCRE_PUCHAR end_subject; PCRE_PUCHAR start_partial = NULL; -PCRE_PUCHAR match_partial; +PCRE_PUCHAR match_partial = NULL; PCRE_PUCHAR req_char_ptr = start_match - 1; const pcre_study_data *study; @@ -6852,6 +6800,7 @@ pcre_uchar req_char; start_bits = NULL; start_match = (PCRE_PUCHAR)subject + start_offset; start_partial = NULL; + match_partial = NULL; req_char_ptr = start_match - 1; re = (REAL_PCRE *)argument_re; @@ -6988,7 +6937,7 @@ tables = re->tables; if (extra_data != NULL) { - register unsigned int flags = extra_data->flags; + unsigned long int flags = extra_data->flags; if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) study = (const pcre_study_data *)extra_data->study_data; if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) @@ -7166,7 +7115,8 @@ if (md->offset_vector != NULL) register int *iend = iptr - re->top_bracket; if (iend < md->offset_vector + 2) iend = md->offset_vector + 2; while (--iptr >= iend) *iptr = -1; - md->offset_vector[0] = md->offset_vector[1] = -1; + if (offsetcount > 0) md->offset_vector[0] = -1; + if (offsetcount > 1) md->offset_vector[1] = -1; } /* Set up the first character to match, if available. The first_char value is @@ -7264,10 +7214,10 @@ for(;;) if (first_char != first_char2) while (start_match < end_subject && - (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2) + (smc = UCHAR21TEST(start_match)) != first_char && smc != first_char2) start_match++; else - while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char) + while (start_match < end_subject && UCHAR21TEST(start_match) != first_char) start_match++; } @@ -7299,7 +7249,7 @@ for(;;) if (start_match[-1] == CHAR_CR && (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && start_match < end_subject && - RAWUCHARTEST(start_match) == CHAR_NL) + UCHAR21TEST(start_match) == CHAR_NL) start_match++; } } @@ -7310,33 +7260,22 @@ for(;;) { while (start_match < end_subject) { - register pcre_uint32 c = RAWUCHARTEST(start_match); + register pcre_uint32 c = UCHAR21TEST(start_match); #ifndef COMPILE_PCRE8 if (c > 255) c = 255; #endif - if ((start_bits[c/8] & (1 << (c&7))) == 0) - { - start_match++; -#if defined SUPPORT_UTF && defined COMPILE_PCRE8 - /* In non 8-bit mode, the iteration will stop for - characters > 255 at the beginning or not stop at all. */ - if (utf) - ACROSSCHAR(start_match < end_subject, *start_match, - start_match++); -#endif - } -#ifdef ERLANG_INTEGRATION - else { - if ((extra_data->flags & PCRE_EXTRA_LOOP_LIMIT) != 0) + if ((start_bits[c/8] & (1 << (c&7))) != 0) { - *extra_data->loop_counter_return = - (extra_data->loop_limit - md->loop_limit); - } - break; - } -#else - else break; +#ifdef ERLANG_INTEGRATION + if ((extra_data->flags & PCRE_EXTRA_LOOP_LIMIT) != 0) + { + *extra_data->loop_counter_return = + (extra_data->loop_limit - md->loop_limit); + } #endif + break; + } + start_match++; } } } /* Starting optimizations */ @@ -7396,7 +7335,7 @@ for(;;) { while (p < end_subject) { - register pcre_uint32 pp = RAWUCHARINCTEST(p); + register pcre_uint32 pp = UCHAR21INCTEST(p); if (pp == req_char || pp == req_char2) { p--; break; } } } @@ -7404,7 +7343,7 @@ for(;;) { while (p < end_subject) { - if (RAWUCHARINCTEST(p) == req_char) { p--; break; } + if (UCHAR21INCTEST(p) == req_char) { p--; break; } } } @@ -7684,7 +7623,7 @@ if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL) /* Handle partial matches - disable any mark data */ -if (start_partial != NULL) +if (match_partial != NULL) { DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); md->mark = NULL; |