aboutsummaryrefslogtreecommitdiffstats
path: root/erts/emulator/pcre/pcre_xclass.c
diff options
context:
space:
mode:
Diffstat (limited to 'erts/emulator/pcre/pcre_xclass.c')
-rw-r--r--erts/emulator/pcre/pcre_xclass.c104
1 files changed, 84 insertions, 20 deletions
diff --git a/erts/emulator/pcre/pcre_xclass.c b/erts/emulator/pcre/pcre_xclass.c
index 1172cd17ac..cf07451a10 100644
--- a/erts/emulator/pcre/pcre_xclass.c
+++ b/erts/emulator/pcre/pcre_xclass.c
@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2008 University of Cambridge
+ Copyright (c) 1997-2013 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -39,8 +39,7 @@ POSSIBILITY OF SUCH DAMAGE.
/* This module contains an internal function that is used to match an extended
-class (one that contains characters whose values are > 255). It is used by both
-pcre_exec() and pcre_def_exec(). */
+class. It is used by both pcre_exec() and pcre_def_exec(). */
/* %ExternalCopyright% */
@@ -56,7 +55,7 @@ pcre_exec() and pcre_def_exec(). */
*************************************************/
/* This function is called to match a character against an extended class that
-might contain values > 255.
+might contain values > 255 and/or Unicode properties.
Arguments:
c the character
@@ -66,47 +65,70 @@ Returns: TRUE if character matches, else FALSE
*/
BOOL
-_erts_pcre_xclass(int c, const uschar *data)
+PRIV(xclass)(pcre_uint32 c, const pcre_uchar *data, BOOL utf)
{
-int t;
+pcre_uchar t;
BOOL negated = (*data & XCL_NOT) != 0;
+(void)utf;
+#ifdef COMPILE_PCRE8
+/* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
+utf = TRUE;
+#endif
+
/* Character values < 256 are matched against a bitmap, if one is present. If
not, we still carry on, because there may be ranges that start below 256 in the
additional data. */
if (c < 256)
{
- if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
- return !negated; /* char found */
+ if ((*data & XCL_MAP) != 0 &&
+ (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0)
+ return !negated; /* char found */
}
/* First skip the bit map if present. Then match against the list of Unicode
properties or large chars or ranges that end with a large char. We won't ever
encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
-if ((*data++ & XCL_MAP) != 0) data += 32;
+if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar);
while ((t = *data++) != XCL_END)
{
- int x, y;
+ pcre_uint32 x, y;
if (t == XCL_SINGLE)
{
- GETCHARINC(x, data);
+#ifdef SUPPORT_UTF
+ if (utf)
+ {
+ GETCHARINC(x, data); /* macro generates multiple statements */
+ }
+ else
+#endif
+ x = *data++;
if (c == x) return !negated;
}
else if (t == XCL_RANGE)
{
- GETCHARINC(x, data);
- GETCHARINC(y, data);
+#ifdef SUPPORT_UTF
+ if (utf)
+ {
+ GETCHARINC(x, data); /* macro generates multiple statements */
+ GETCHARINC(y, data); /* macro generates multiple statements */
+ }
+ else
+#endif
+ {
+ x = *data++;
+ y = *data++;
+ }
if (c >= x && c <= y) return !negated;
}
#ifdef SUPPORT_UCP
else /* XCL_PROP & XCL_NOTPROP */
{
- int chartype, script;
- int category = _erts_pcre_ucp_findprop(c, &chartype, &script);
+ const ucd_record *prop = GET_UCD(c);
switch(*data)
{
@@ -115,20 +137,62 @@ while ((t = *data++) != XCL_END)
break;
case PT_LAMP:
- if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt) ==
- (t == XCL_PROP)) return !negated;
+ if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
+ prop->chartype == ucp_Lt) == (t == XCL_PROP)) return !negated;
break;
case PT_GC:
- if ((data[1] == category) == (t == XCL_PROP)) return !negated;
+ if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == (t == XCL_PROP))
+ return !negated;
break;
case PT_PC:
- if ((data[1] == chartype) == (t == XCL_PROP)) return !negated;
+ if ((data[1] == prop->chartype) == (t == XCL_PROP)) return !negated;
break;
case PT_SC:
- if ((data[1] == script) == (t == XCL_PROP)) return !negated;
+ if ((data[1] == prop->script) == (t == XCL_PROP)) return !negated;
+ break;
+
+ case PT_ALNUM:
+ if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
+ PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (t == XCL_PROP))
+ return !negated;
+ break;
+
+ case PT_SPACE: /* Perl space */
+ if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
+ == (t == XCL_PROP))
+ return !negated;
+ break;
+
+ case PT_PXSPACE: /* POSIX space */
+ if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP))
+ return !negated;
+ break;
+
+ case PT_WORD:
+ if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
+ PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
+ == (t == XCL_PROP))
+ return !negated;
+ break;
+
+ case PT_UCNC:
+ if (c < 0xa0)
+ {
+ if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
+ c == CHAR_GRAVE_ACCENT) == (t == XCL_PROP))
+ return !negated;
+ }
+ else
+ {
+ if ((c < 0xd800 || c > 0xdfff) == (t == XCL_PROP))
+ return !negated;
+ }
break;
/* This should never occur, but compilers may mutter if there is no