diff options
Diffstat (limited to 'erts/emulator/pcre/pcre_xclass.c')
-rw-r--r-- | erts/emulator/pcre/pcre_xclass.c | 104 |
1 files changed, 84 insertions, 20 deletions
diff --git a/erts/emulator/pcre/pcre_xclass.c b/erts/emulator/pcre/pcre_xclass.c index 1172cd17ac..cf07451a10 100644 --- a/erts/emulator/pcre/pcre_xclass.c +++ b/erts/emulator/pcre/pcre_xclass.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2008 University of Cambridge + Copyright (c) 1997-2013 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -39,8 +39,7 @@ POSSIBILITY OF SUCH DAMAGE. /* This module contains an internal function that is used to match an extended -class (one that contains characters whose values are > 255). It is used by both -pcre_exec() and pcre_def_exec(). */ +class. It is used by both pcre_exec() and pcre_def_exec(). */ /* %ExternalCopyright% */ @@ -56,7 +55,7 @@ pcre_exec() and pcre_def_exec(). */ *************************************************/ /* This function is called to match a character against an extended class that -might contain values > 255. +might contain values > 255 and/or Unicode properties. Arguments: c the character @@ -66,47 +65,70 @@ Returns: TRUE if character matches, else FALSE */ BOOL -_erts_pcre_xclass(int c, const uschar *data) +PRIV(xclass)(pcre_uint32 c, const pcre_uchar *data, BOOL utf) { -int t; +pcre_uchar t; BOOL negated = (*data & XCL_NOT) != 0; +(void)utf; +#ifdef COMPILE_PCRE8 +/* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */ +utf = TRUE; +#endif + /* Character values < 256 are matched against a bitmap, if one is present. If not, we still carry on, because there may be ranges that start below 256 in the additional data. */ if (c < 256) { - if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0) - return !negated; /* char found */ + if ((*data & XCL_MAP) != 0 && + (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0) + return !negated; /* char found */ } /* First skip the bit map if present. Then match against the list of Unicode properties or large chars or ranges that end with a large char. We won't ever encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */ -if ((*data++ & XCL_MAP) != 0) data += 32; +if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar); while ((t = *data++) != XCL_END) { - int x, y; + pcre_uint32 x, y; if (t == XCL_SINGLE) { - GETCHARINC(x, data); +#ifdef SUPPORT_UTF + if (utf) + { + GETCHARINC(x, data); /* macro generates multiple statements */ + } + else +#endif + x = *data++; if (c == x) return !negated; } else if (t == XCL_RANGE) { - GETCHARINC(x, data); - GETCHARINC(y, data); +#ifdef SUPPORT_UTF + if (utf) + { + GETCHARINC(x, data); /* macro generates multiple statements */ + GETCHARINC(y, data); /* macro generates multiple statements */ + } + else +#endif + { + x = *data++; + y = *data++; + } if (c >= x && c <= y) return !negated; } #ifdef SUPPORT_UCP else /* XCL_PROP & XCL_NOTPROP */ { - int chartype, script; - int category = _erts_pcre_ucp_findprop(c, &chartype, &script); + const ucd_record *prop = GET_UCD(c); switch(*data) { @@ -115,20 +137,62 @@ while ((t = *data++) != XCL_END) break; case PT_LAMP: - if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt) == - (t == XCL_PROP)) return !negated; + if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || + prop->chartype == ucp_Lt) == (t == XCL_PROP)) return !negated; break; case PT_GC: - if ((data[1] == category) == (t == XCL_PROP)) return !negated; + if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == (t == XCL_PROP)) + return !negated; break; case PT_PC: - if ((data[1] == chartype) == (t == XCL_PROP)) return !negated; + if ((data[1] == prop->chartype) == (t == XCL_PROP)) return !negated; break; case PT_SC: - if ((data[1] == script) == (t == XCL_PROP)) return !negated; + if ((data[1] == prop->script) == (t == XCL_PROP)) return !negated; + break; + + case PT_ALNUM: + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (t == XCL_PROP)) + return !negated; + break; + + case PT_SPACE: /* Perl space */ + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) + == (t == XCL_PROP)) + return !negated; + break; + + case PT_PXSPACE: /* POSIX space */ + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || + c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || + c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP)) + return !negated; + break; + + case PT_WORD: + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE) + == (t == XCL_PROP)) + return !negated; + break; + + case PT_UCNC: + if (c < 0xa0) + { + if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT || + c == CHAR_GRAVE_ACCENT) == (t == XCL_PROP)) + return !negated; + } + else + { + if ((c < 0xd800 || c > 0xdfff) == (t == XCL_PROP)) + return !negated; + } break; /* This should never occur, but compilers may mutter if there is no |