aboutsummaryrefslogtreecommitdiffstats
path: root/erts/emulator/pcre/ucpinternal.h
diff options
context:
space:
mode:
Diffstat (limited to 'erts/emulator/pcre/ucpinternal.h')
-rw-r--r--erts/emulator/pcre/ucpinternal.h94
1 files changed, 94 insertions, 0 deletions
diff --git a/erts/emulator/pcre/ucpinternal.h b/erts/emulator/pcre/ucpinternal.h
new file mode 100644
index 0000000000..9893d39672
--- /dev/null
+++ b/erts/emulator/pcre/ucpinternal.h
@@ -0,0 +1,94 @@
+/*************************************************
+* Unicode Property Table handler *
+*************************************************/
+
+/* %ExternalCopyright% */
+
+#ifndef _UCPINTERNAL_H
+#define _UCPINTERNAL_H
+
+/* Internal header file defining the layout of the bits in each pair of 32-bit
+words that form a data item in the table. */
+
+typedef struct cnode {
+ pcre_uint32 f0;
+ pcre_uint32 f1;
+} cnode;
+
+/* Things for the f0 field */
+
+#define f0_scriptmask 0xff000000 /* Mask for script field */
+#define f0_scriptshift 24 /* Shift for script value */
+#define f0_rangeflag 0x00f00000 /* Flag for a range item */
+#define f0_charmask 0x001fffff /* Mask for code point value */
+
+/* Things for the f1 field */
+
+#define f1_typemask 0xfc000000 /* Mask for char type field */
+#define f1_typeshift 26 /* Shift for the type field */
+#define f1_rangemask 0x0000ffff /* Mask for a range offset */
+#define f1_casemask 0x0000ffff /* Mask for a case offset */
+#define f1_caseneg 0xffff8000 /* Bits for negation */
+
+/* The data consists of a vector of structures of type cnode. The two unsigned
+32-bit integers are used as follows:
+
+(f0) (1) The most significant byte holds the script number. The numbers are
+ defined by the enum in ucp.h.
+
+ (2) The 0x00800000 bit is set if this entry defines a range of characters.
+ It is not set if this entry defines a single character
+
+ (3) The 0x00600000 bits are spare.
+
+ (4) The 0x001fffff bits contain the code point. No Unicode code point will
+ ever be greater than 0x0010ffff, so this should be OK for ever.
+
+(f1) (1) The 0xfc000000 bits contain the character type number. The numbers are
+ defined by an enum in ucp.h.
+
+ (2) The 0x03ff0000 bits are spare.
+
+ (3) The 0x0000ffff bits contain EITHER the unsigned offset to the top of
+ range if this entry defines a range, OR the *signed* offset to the
+ character's "other case" partner if this entry defines a single
+ character. There is no partner if the value is zero.
+
+-------------------------------------------------------------------------------
+| script (8) |.|.|.| codepoint (21) || type (6) |.|.| spare (8) | offset (16) |
+-------------------------------------------------------------------------------
+ | | | | |
+ | | |-> spare | |-> spare
+ | | |
+ | |-> spare |-> spare
+ |
+ |-> range flag
+
+The upper/lower casing information is set only for characters that come in
+pairs. The non-one-to-one mappings in the Unicode data are ignored.
+
+When searching the data, proceed as follows:
+
+(1) Set up for a binary chop search.
+
+(2) If the top is not greater than the bottom, the character is not in the
+ table. Its type must therefore be "Cn" ("Undefined").
+
+(3) Find the middle vector element.
+
+(4) Extract the code point and compare. If equal, we are done.
+
+(5) If the test character is smaller, set the top to the current point, and
+ goto (2).
+
+(6) If the current entry defines a range, compute the last character by adding
+ the offset, and see if the test character is within the range. If it is,
+ we are done.
+
+(7) Otherwise, set the bottom to one element past the current point and goto
+ (2).
+*/
+
+#endif /* _UCPINTERNAL_H */
+
+/* End of ucpinternal.h */