diff options
Diffstat (limited to 'erts/emulator/pcre/pcre_tables.c')
-rw-r--r-- | erts/emulator/pcre/pcre_tables.c | 319 |
1 files changed, 319 insertions, 0 deletions
diff --git a/erts/emulator/pcre/pcre_tables.c b/erts/emulator/pcre/pcre_tables.c new file mode 100644 index 0000000000..72772de1dc --- /dev/null +++ b/erts/emulator/pcre/pcre_tables.c @@ -0,0 +1,319 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Copyright (c) 1997-2008 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains some fixed tables that are used by more than one of the +PCRE code modules. The tables are also #included by the pcretest program, which +uses macros to change their names from _erts_pcre_xxx to xxxx, thereby avoiding name +clashes with the library. */ + +/* %ExternalCopyright% */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "pcre_internal.h" + + +/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that +the definition is next to the definition of the opcodes in pcre_internal.h. */ + +const uschar _erts_pcre_OP_lengths[] = { OP_LENGTHS }; + + + +/************************************************* +* Tables for UTF-8 support * +*************************************************/ + +/* These are the breakpoints for different numbers of bytes in a UTF-8 +character. */ + +#ifdef SUPPORT_UTF8 + +const int _erts_pcre_utf8_table1[] = + { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff}; + +const int _erts_pcre_utf8_table1_size = sizeof(_erts_pcre_utf8_table1)/sizeof(int); + +/* These are the indicator bits and the mask for the data bits to set in the +first byte of a character, indexed by the number of additional bytes. */ + +const int _erts_pcre_utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; +const int _erts_pcre_utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; + +/* Table of the number of extra bytes, indexed by the first byte masked with +0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */ + +const uschar _erts_pcre_utf8_table4[] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; + +/* The pcre_utt[] table below translates Unicode property names into type and +code values. It is searched by binary chop, so must be in collating sequence of +name. Originally, the table contained pointers to the name strings in the first +field of each entry. However, that leads to a large number of relocations when +a shared library is dynamically loaded. A significant reduction is made by +putting all the names into a single, large string and then using offsets in the +table itself. Maintenance is more error-prone, but frequent changes to this +data is unlikely. */ + +const char _erts_pcre_utt_names[] = + "Any\0" + "Arabic\0" + "Armenian\0" + "Balinese\0" + "Bengali\0" + "Bopomofo\0" + "Braille\0" + "Buginese\0" + "Buhid\0" + "C\0" + "Canadian_Aboriginal\0" + "Cc\0" + "Cf\0" + "Cherokee\0" + "Cn\0" + "Co\0" + "Common\0" + "Coptic\0" + "Cs\0" + "Cuneiform\0" + "Cypriot\0" + "Cyrillic\0" + "Deseret\0" + "Devanagari\0" + "Ethiopic\0" + "Georgian\0" + "Glagolitic\0" + "Gothic\0" + "Greek\0" + "Gujarati\0" + "Gurmukhi\0" + "Han\0" + "Hangul\0" + "Hanunoo\0" + "Hebrew\0" + "Hiragana\0" + "Inherited\0" + "Kannada\0" + "Katakana\0" + "Kharoshthi\0" + "Khmer\0" + "L\0" + "L&\0" + "Lao\0" + "Latin\0" + "Limbu\0" + "Linear_B\0" + "Ll\0" + "Lm\0" + "Lo\0" + "Lt\0" + "Lu\0" + "M\0" + "Malayalam\0" + "Mc\0" + "Me\0" + "Mn\0" + "Mongolian\0" + "Myanmar\0" + "N\0" + "Nd\0" + "New_Tai_Lue\0" + "Nko\0" + "Nl\0" + "No\0" + "Ogham\0" + "Old_Italic\0" + "Old_Persian\0" + "Oriya\0" + "Osmanya\0" + "P\0" + "Pc\0" + "Pd\0" + "Pe\0" + "Pf\0" + "Phags_Pa\0" + "Phoenician\0" + "Pi\0" + "Po\0" + "Ps\0" + "Runic\0" + "S\0" + "Sc\0" + "Shavian\0" + "Sinhala\0" + "Sk\0" + "Sm\0" + "So\0" + "Syloti_Nagri\0" + "Syriac\0" + "Tagalog\0" + "Tagbanwa\0" + "Tai_Le\0" + "Tamil\0" + "Telugu\0" + "Thaana\0" + "Thai\0" + "Tibetan\0" + "Tifinagh\0" + "Ugaritic\0" + "Yi\0" + "Z\0" + "Zl\0" + "Zp\0" + "Zs\0"; + +const ucp_type_table _erts_pcre_utt[] = { + { 0, PT_ANY, 0 }, + { 4, PT_SC, ucp_Arabic }, + { 11, PT_SC, ucp_Armenian }, + { 20, PT_SC, ucp_Balinese }, + { 29, PT_SC, ucp_Bengali }, + { 37, PT_SC, ucp_Bopomofo }, + { 46, PT_SC, ucp_Braille }, + { 54, PT_SC, ucp_Buginese }, + { 63, PT_SC, ucp_Buhid }, + { 69, PT_GC, ucp_C }, + { 71, PT_SC, ucp_Canadian_Aboriginal }, + { 91, PT_PC, ucp_Cc }, + { 94, PT_PC, ucp_Cf }, + { 97, PT_SC, ucp_Cherokee }, + { 106, PT_PC, ucp_Cn }, + { 109, PT_PC, ucp_Co }, + { 112, PT_SC, ucp_Common }, + { 119, PT_SC, ucp_Coptic }, + { 126, PT_PC, ucp_Cs }, + { 129, PT_SC, ucp_Cuneiform }, + { 139, PT_SC, ucp_Cypriot }, + { 147, PT_SC, ucp_Cyrillic }, + { 156, PT_SC, ucp_Deseret }, + { 164, PT_SC, ucp_Devanagari }, + { 175, PT_SC, ucp_Ethiopic }, + { 184, PT_SC, ucp_Georgian }, + { 193, PT_SC, ucp_Glagolitic }, + { 204, PT_SC, ucp_Gothic }, + { 211, PT_SC, ucp_Greek }, + { 217, PT_SC, ucp_Gujarati }, + { 226, PT_SC, ucp_Gurmukhi }, + { 235, PT_SC, ucp_Han }, + { 239, PT_SC, ucp_Hangul }, + { 246, PT_SC, ucp_Hanunoo }, + { 254, PT_SC, ucp_Hebrew }, + { 261, PT_SC, ucp_Hiragana }, + { 270, PT_SC, ucp_Inherited }, + { 280, PT_SC, ucp_Kannada }, + { 288, PT_SC, ucp_Katakana }, + { 297, PT_SC, ucp_Kharoshthi }, + { 308, PT_SC, ucp_Khmer }, + { 314, PT_GC, ucp_L }, + { 316, PT_LAMP, 0 }, + { 319, PT_SC, ucp_Lao }, + { 323, PT_SC, ucp_Latin }, + { 329, PT_SC, ucp_Limbu }, + { 335, PT_SC, ucp_Linear_B }, + { 344, PT_PC, ucp_Ll }, + { 347, PT_PC, ucp_Lm }, + { 350, PT_PC, ucp_Lo }, + { 353, PT_PC, ucp_Lt }, + { 356, PT_PC, ucp_Lu }, + { 359, PT_GC, ucp_M }, + { 361, PT_SC, ucp_Malayalam }, + { 371, PT_PC, ucp_Mc }, + { 374, PT_PC, ucp_Me }, + { 377, PT_PC, ucp_Mn }, + { 380, PT_SC, ucp_Mongolian }, + { 390, PT_SC, ucp_Myanmar }, + { 398, PT_GC, ucp_N }, + { 400, PT_PC, ucp_Nd }, + { 403, PT_SC, ucp_New_Tai_Lue }, + { 415, PT_SC, ucp_Nko }, + { 419, PT_PC, ucp_Nl }, + { 422, PT_PC, ucp_No }, + { 425, PT_SC, ucp_Ogham }, + { 431, PT_SC, ucp_Old_Italic }, + { 442, PT_SC, ucp_Old_Persian }, + { 454, PT_SC, ucp_Oriya }, + { 460, PT_SC, ucp_Osmanya }, + { 468, PT_GC, ucp_P }, + { 470, PT_PC, ucp_Pc }, + { 473, PT_PC, ucp_Pd }, + { 476, PT_PC, ucp_Pe }, + { 479, PT_PC, ucp_Pf }, + { 482, PT_SC, ucp_Phags_Pa }, + { 491, PT_SC, ucp_Phoenician }, + { 502, PT_PC, ucp_Pi }, + { 505, PT_PC, ucp_Po }, + { 508, PT_PC, ucp_Ps }, + { 511, PT_SC, ucp_Runic }, + { 517, PT_GC, ucp_S }, + { 519, PT_PC, ucp_Sc }, + { 522, PT_SC, ucp_Shavian }, + { 530, PT_SC, ucp_Sinhala }, + { 538, PT_PC, ucp_Sk }, + { 541, PT_PC, ucp_Sm }, + { 544, PT_PC, ucp_So }, + { 547, PT_SC, ucp_Syloti_Nagri }, + { 560, PT_SC, ucp_Syriac }, + { 567, PT_SC, ucp_Tagalog }, + { 575, PT_SC, ucp_Tagbanwa }, + { 584, PT_SC, ucp_Tai_Le }, + { 591, PT_SC, ucp_Tamil }, + { 597, PT_SC, ucp_Telugu }, + { 604, PT_SC, ucp_Thaana }, + { 611, PT_SC, ucp_Thai }, + { 616, PT_SC, ucp_Tibetan }, + { 624, PT_SC, ucp_Tifinagh }, + { 633, PT_SC, ucp_Ugaritic }, + { 642, PT_SC, ucp_Yi }, + { 645, PT_GC, ucp_Z }, + { 647, PT_PC, ucp_Zl }, + { 650, PT_PC, ucp_Zp }, + { 653, PT_PC, ucp_Zs } +}; + +const int _erts_pcre_utt_size = sizeof(_erts_pcre_utt)/sizeof(ucp_type_table); + +#endif /* SUPPORT_UTF8 */ + +/* End of pcre_tables.c */ |