aboutsummaryrefslogtreecommitdiffstats
path: root/erts/emulator/pcre/pcre_dfa_exec.c
diff options
context:
space:
mode:
Diffstat (limited to 'erts/emulator/pcre/pcre_dfa_exec.c')
-rw-r--r--erts/emulator/pcre/pcre_dfa_exec.c230
1 files changed, 148 insertions, 82 deletions
diff --git a/erts/emulator/pcre/pcre_dfa_exec.c b/erts/emulator/pcre/pcre_dfa_exec.c
index f5718a3b33..529f40685b 100644
--- a/erts/emulator/pcre/pcre_dfa_exec.c
+++ b/erts/emulator/pcre/pcre_dfa_exec.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language (but see
below for why this module is different).
Written by Philip Hazel
- Copyright (c) 1997-2013 University of Cambridge
+ Copyright (c) 1997-2014 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -121,7 +121,7 @@ static const pcre_uint8 coptable[] = {
0, 0, /* \P, \p */
0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
0, /* \X */
- 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
+ 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
1, /* Char */
1, /* Chari */
1, /* not */
@@ -152,11 +152,14 @@ static const pcre_uint8 coptable[] = {
/* Character class & ref repeats */
0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
0, 0, /* CRRANGE, CRMINRANGE */
+ 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */
0, /* CLASS */
0, /* NCLASS */
0, /* XCLASS - variable length */
0, /* REF */
0, /* REFI */
+ 0, /* DNREF */
+ 0, /* DNREFI */
0, /* RECURSE */
0, /* CALLOUT */
0, /* Alt */
@@ -172,8 +175,8 @@ static const pcre_uint8 coptable[] = {
0, 0, /* ONCE, ONCE_NC */
0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
- 0, 0, /* CREF, NCREF */
- 0, 0, /* RREF, NRREF */
+ 0, 0, /* CREF, DNCREF */
+ 0, 0, /* RREF, DNRREF */
0, /* DEF */
0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
@@ -195,7 +198,7 @@ static const pcre_uint8 poptable[] = {
1, 1, /* \P, \p */
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
1, /* \X */
- 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
+ 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
1, /* Char */
1, /* Chari */
1, /* not */
@@ -221,11 +224,14 @@ static const pcre_uint8 poptable[] = {
/* Character class & ref repeats */
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
1, 1, /* CRRANGE, CRMINRANGE */
+ 1, 1, 1, 1, /* Possessive *+, ++, ?+, CRPOSRANGE */
1, /* CLASS */
1, /* NCLASS */
1, /* XCLASS - variable length */
0, /* REF */
0, /* REFI */
+ 0, /* DNREF */
+ 0, /* DNREFI */
0, /* RECURSE */
0, /* CALLOUT */
0, /* Alt */
@@ -241,8 +247,8 @@ static const pcre_uint8 poptable[] = {
0, 0, /* ONCE, ONCE_NC */
0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
- 0, 0, /* CREF, NCREF */
- 0, 0, /* RREF, NRREF */
+ 0, 0, /* CREF, DNCREF */
+ 0, 0, /* RREF, DNRREF */
0, /* DEF */
0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
@@ -1095,15 +1101,23 @@ for (;;)
PRIV(ucp_gentype)[prop->chartype] == ucp_N;
break;
- case PT_SPACE: /* Perl space */
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
- c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
- break;
+ /* Perl space used to exclude VT, but from Perl 5.18 it is included,
+ which means that Perl space and POSIX space are now identical. PCRE
+ was changed at release 8.34. */
+ case PT_SPACE: /* Perl space */
case PT_PXSPACE: /* POSIX space */
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
- c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
- c == CHAR_FF || c == CHAR_CR;
+ switch(c)
+ {
+ HSPACE_CASES:
+ VSPACE_CASES:
+ OK = TRUE;
+ break;
+
+ default:
+ OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
+ break;
+ }
break;
case PT_WORD:
@@ -1345,15 +1359,23 @@ for (;;)
PRIV(ucp_gentype)[prop->chartype] == ucp_N;
break;
- case PT_SPACE: /* Perl space */
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
- c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
- break;
+ /* Perl space used to exclude VT, but from Perl 5.18 it is included,
+ which means that Perl space and POSIX space are now identical. PCRE
+ was changed at release 8.34. */
+ case PT_SPACE: /* Perl space */
case PT_PXSPACE: /* POSIX space */
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
- c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
- c == CHAR_FF || c == CHAR_CR;
+ switch(c)
+ {
+ HSPACE_CASES:
+ VSPACE_CASES:
+ OK = TRUE;
+ break;
+
+ default:
+ OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
+ break;
+ }
break;
case PT_WORD:
@@ -1452,7 +1474,7 @@ for (;;)
goto ANYNL01;
case CHAR_CR:
- if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
+ if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
/* Fall through */
ANYNL01:
@@ -1589,15 +1611,23 @@ for (;;)
PRIV(ucp_gentype)[prop->chartype] == ucp_N;
break;
- case PT_SPACE: /* Perl space */
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
- c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
- break;
+ /* Perl space used to exclude VT, but from Perl 5.18 it is included,
+ which means that Perl space and POSIX space are now identical. PCRE
+ was changed at release 8.34. */
+ case PT_SPACE: /* Perl space */
case PT_PXSPACE: /* POSIX space */
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
- c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
- c == CHAR_FF || c == CHAR_CR;
+ switch(c)
+ {
+ HSPACE_CASES:
+ VSPACE_CASES:
+ OK = TRUE;
+ break;
+
+ default:
+ OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
+ break;
+ }
break;
case PT_WORD:
@@ -1713,7 +1743,7 @@ for (;;)
goto ANYNL02;
case CHAR_CR:
- if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
+ if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
/* Fall through */
ANYNL02:
@@ -1858,15 +1888,23 @@ for (;;)
PRIV(ucp_gentype)[prop->chartype] == ucp_N;
break;
- case PT_SPACE: /* Perl space */
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
- c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
- break;
+ /* Perl space used to exclude VT, but from Perl 5.18 it is included,
+ which means that Perl space and POSIX space are now identical. PCRE
+ was changed at release 8.34. */
+ case PT_SPACE: /* Perl space */
case PT_PXSPACE: /* POSIX space */
- OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
- c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
- c == CHAR_FF || c == CHAR_CR;
+ switch(c)
+ {
+ HSPACE_CASES:
+ VSPACE_CASES:
+ OK = TRUE;
+ break;
+
+ default:
+ OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
+ break;
+ }
break;
case PT_WORD:
@@ -1975,7 +2013,7 @@ for (;;)
goto ANYNL03;
case CHAR_CR:
- if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
+ if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
/* Fall through */
ANYNL03:
@@ -2173,7 +2211,7 @@ for (;;)
if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
reset_could_continue = TRUE;
}
- else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
+ else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
{
ADD_NEW_DATA(-(state_offset + 1), 0, 1);
}
@@ -2534,31 +2572,65 @@ for (;;)
{
case OP_CRSTAR:
case OP_CRMINSTAR:
+ case OP_CRPOSSTAR:
ADD_ACTIVE(next_state_offset + 1, 0);
- if (isinclass) { ADD_NEW(state_offset, 0); }
+ if (isinclass)
+ {
+ if (*ecode == OP_CRPOSSTAR)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ ADD_NEW(state_offset, 0);
+ }
break;
case OP_CRPLUS:
case OP_CRMINPLUS:
+ case OP_CRPOSPLUS:
count = current_state->count; /* Already matched */
if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
- if (isinclass) { count++; ADD_NEW(state_offset, count); }
+ if (isinclass)
+ {
+ if (count > 0 && *ecode == OP_CRPOSPLUS)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ count++;
+ ADD_NEW(state_offset, count);
+ }
break;
case OP_CRQUERY:
case OP_CRMINQUERY:
+ case OP_CRPOSQUERY:
ADD_ACTIVE(next_state_offset + 1, 0);
- if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
+ if (isinclass)
+ {
+ if (*ecode == OP_CRPOSQUERY)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
+ ADD_NEW(next_state_offset + 1, 0);
+ }
break;
case OP_CRRANGE:
case OP_CRMINRANGE:
+ case OP_CRPOSRANGE:
count = current_state->count; /* Already matched */
if (count >= (int)GET2(ecode, 1))
{ ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
if (isinclass)
{
int max = (int)GET2(ecode, 1 + IMM2_SIZE);
+ if (*ecode == OP_CRPOSRANGE)
+ {
+ active_count--; /* Remove non-match possibility */
+ next_active_state--;
+ }
if (++count >= max && max != 0) /* Max 0 => no limit */
{ ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
else
@@ -2658,21 +2730,24 @@ for (;;)
condcode = code[LINK_SIZE+1];
- /* Back reference conditions are not supported */
+ /* Back reference conditions and duplicate named recursion conditions
+ are not supported */
- if (condcode == OP_CREF || condcode == OP_NCREF)
+ if (condcode == OP_CREF || condcode == OP_DNCREF ||
+ condcode == OP_DNRREF)
return PCRE_ERROR_DFA_UCOND;
- /* The DEFINE condition is always false */
+ /* The DEFINE condition is always false, and the assertion (?!) is
+ converted to OP_FAIL. */
- if (condcode == OP_DEF)
+ if (condcode == OP_DEF || condcode == OP_FAIL)
{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
/* The only supported version of OP_RREF is for the value RREF_ANY,
which means "test if in any recursion". We can't test for specifically
recursed groups. */
- else if (condcode == OP_RREF || condcode == OP_NRREF)
+ else if (condcode == OP_RREF)
{
int value = GET2(code, LINK_SIZE + 2);
if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
@@ -3176,7 +3251,7 @@ md->callout_data = NULL;
if (extra_data != NULL)
{
- unsigned int flags = extra_data->flags;
+ unsigned long int flags = extra_data->flags;
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
study = (const pcre_study_data *)extra_data->study_data;
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
@@ -3400,7 +3475,7 @@ for (;;)
if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
{
- /* Advance to a known first char. */
+ /* Advance to a known first pcre_uchar (i.e. data item) */
if (has_first_char)
{
@@ -3408,12 +3483,12 @@ for (;;)
{
pcre_uchar csc;
while (current_subject < end_subject &&
- (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)
+ (csc = UCHAR21TEST(current_subject)) != first_char && csc != first_char2)
current_subject++;
}
else
while (current_subject < end_subject &&
- RAWUCHARTEST(current_subject) != first_char)
+ UCHAR21TEST(current_subject) != first_char)
current_subject++;
}
@@ -3443,36 +3518,26 @@ for (;;)
ANYCRLF, and we are now at a LF, advance the match position by one
more character. */
- if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
+ if (UCHAR21TEST(current_subject - 1) == CHAR_CR &&
(md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
current_subject < end_subject &&
- RAWUCHARTEST(current_subject) == CHAR_NL)
+ UCHAR21TEST(current_subject) == CHAR_NL)
current_subject++;
}
}
- /* Or to a non-unique first char after study */
+ /* Advance to a non-unique first pcre_uchar after study */
else if (start_bits != NULL)
{
while (current_subject < end_subject)
{
- register pcre_uint32 c = RAWUCHARTEST(current_subject);
+ register pcre_uint32 c = UCHAR21TEST(current_subject);
#ifndef COMPILE_PCRE8
if (c > 255) c = 255;
#endif
- if ((start_bits[c/8] & (1 << (c&7))) == 0)
- {
- current_subject++;
-#if defined SUPPORT_UTF && defined COMPILE_PCRE8
- /* In non 8-bit mode, the iteration will stop for
- characters > 255 at the beginning or not stop at all. */
- if (utf)
- ACROSSCHAR(current_subject < end_subject, *current_subject,
- current_subject++);
-#endif
- }
- else break;
+ if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
+ current_subject++;
}
}
}
@@ -3491,19 +3556,20 @@ for (;;)
/* If the pattern was studied, a minimum subject length may be set. This
is a lower bound; no actual string of that length may actually match the
pattern. Although the value is, strictly, in characters, we treat it as
- bytes to avoid spending too much time in this optimization. */
+ in pcre_uchar units to avoid spending too much time in this optimization.
+ */
if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
(pcre_uint32)(end_subject - current_subject) < study->minlength)
return PCRE_ERROR_NOMATCH;
- /* If req_char is set, we know that that character must appear in the
- subject for the match to succeed. If the first character is set, req_char
- must be later in the subject; otherwise the test starts at the match
- point. This optimization can save a huge amount of work in patterns with
- nested unlimited repeats that aren't going to match. Writing separate
- code for cased/caseless versions makes it go faster, as does using an
- autoincrement and backing off on a match.
+ /* If req_char is set, we know that that pcre_uchar must appear in the
+ subject for the match to succeed. If the first pcre_uchar is set,
+ req_char must be later in the subject; otherwise the test starts at the
+ match point. This optimization can save a huge amount of work in patterns
+ with nested unlimited repeats that aren't going to match. Writing
+ separate code for cased/caseless versions makes it go faster, as does
+ using an autoincrement and backing off on a match.
HOWEVER: when the subject string is very, very long, searching to its end
can take a long time, and give bad performance on quite ordinary
@@ -3523,7 +3589,7 @@ for (;;)
{
while (p < end_subject)
{
- register pcre_uint32 pp = RAWUCHARINCTEST(p);
+ register pcre_uint32 pp = UCHAR21INCTEST(p);
if (pp == req_char || pp == req_char2) { p--; break; }
}
}
@@ -3531,18 +3597,18 @@ for (;;)
{
while (p < end_subject)
{
- if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
+ if (UCHAR21INCTEST(p) == req_char) { p--; break; }
}
}
- /* If we can't find the required character, break the matching loop,
+ /* If we can't find the required pcre_uchar, break the matching loop,
which will cause a return or PCRE_ERROR_NOMATCH. */
if (p >= end_subject) break;
- /* If we have found the required character, save the point where we
+ /* If we have found the required pcre_uchar, save the point where we
found it, so that we don't search again next time round the loop if
- the start hasn't passed this character yet. */
+ the start hasn't passed this point yet. */
req_char_ptr = p;
}
@@ -3599,9 +3665,9 @@ for (;;)
not contain any explicit matches for \r or \n, and the newline option is CRLF
or ANY or ANYCRLF, advance the match position by one more character. */
- if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
+ if (UCHAR21TEST(current_subject - 1) == CHAR_CR &&
current_subject < end_subject &&
- RAWUCHARTEST(current_subject) == CHAR_NL &&
+ UCHAR21TEST(current_subject) == CHAR_NL &&
(re->flags & PCRE_HASCRORLF) == 0 &&
(md->nltype == NLTYPE_ANY ||
md->nltype == NLTYPE_ANYCRLF ||