mux/src/pcre.cpp

Go to the documentation of this file.
00001 /*************************************************
00002 *      Perl-Compatible Regular Expressions       *
00003 *************************************************/
00004 
00005 
00006 /* This is a library of functions to support regular expressions whose syntax
00007 and semantics are as close as possible to those of the Perl 5 language. See
00008 the file Tech.Notes for some information on the internals.
00009 
00010 Written by: Philip Hazel <ph10@cam.ac.uk>
00011 
00012            Copyright (c) 1997-2003 University of Cambridge
00013 
00014 -----------------------------------------------------------------------------
00015 Permission is granted to anyone to use this software for any purpose on any
00016 computer system, and to redistribute it freely, subject to the following
00017 restrictions:
00018 
00019 1. This software is distributed in the hope that it will be useful,
00020    but WITHOUT ANY WARRANTY; without even the implied warranty of
00021    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
00022 
00023 2. The origin of this software must not be misrepresented, either by
00024    explicit claim or by omission.
00025 
00026 3. Altered versions must be plainly marked as such, and must not be
00027    misrepresented as being the original software.
00028 
00029 4. If PCRE is embedded in any software that is released under the GNU
00030    General Purpose Licence (GPL), then the terms of that licence shall
00031    supersede any condition above with which it is incompatible.
00032 -----------------------------------------------------------------------------
00033 */
00034 
00035 /* Modified by Shawn Wagner for MUX to fit in one file and remove
00036    things we don't use, like a bunch of API functions and utf-8
00037    support. If you want the full thing, see http://www.pcre.org.
00038 
00039    Patched by Alierak to protect against integer overflow in repeat
00040    counts.
00041  */
00042 
00043 #include "autoconf.h"
00044 #include "config.h"
00045 
00046 #include <limits.h>
00047 #include <string.h>
00048 #include <ctype.h>
00049 #include <stdlib.h>
00050 #include <stddef.h>
00051 #include "pcre.h"
00052 
00053 #include "externs.h"
00054 #include "timeutil.h"
00055 
00056 /* Bits of PCRE's config.h */
#define LINK_SIZE    2         /* Bytes used for each offset stored in compiled code */
#define MATCH_LIMIT  100000    /* NOTE(review): presumably the default for match_data.match_limit - confirm */
#define NEWLINE      '\n'      /* Value that ESC_n takes unless it is already defined */
00060 
00061 /* Bits of internal.h */
00062 /* This header contains definitions that are shared between the different
00063 modules, but which are not relevant to the outside. */
00064 
00065 
00066 /* PCRE keeps offsets in its compiled code as 2-byte quantities by default.
00067 These are used, for example, to link from the start of a subpattern to its
00068 alternatives and its end. The use of 2 bytes per offset limits the size of the
00069 compiled regex to around 64K, which is big enough for almost everybody.
00070 However, I received a request for an even bigger limit. For this reason, and
00071 also to make the code easier to maintain, the storing and loading of offsets
00072 from the byte string is now handled by the macros that are defined here.
00073 
00074 The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
00075 the config.h file, but can be overridden by using -D on the command line. This
00076 is automated on Unix systems via the "configure" command. */
00077 
/* PUT stores the offset d in two bytes, most significant byte first, at a[n]
and a[n+1]. Every use of the array argument is parenthesized, so an expression
such as "code + 1" may be passed, and the whole expansion is one parenthesized
comma expression, so the macro can be used wherever an expression is allowed.
The arguments a, n and d are each evaluated twice and must therefore be free
of side effects. */

#define PUT(a,n,d)   \
  (((a)[n] = (d) >> 8), \
   ((a)[(n)+1] = (d) & 255))

/* GET reads back the two-byte, most-significant-first offset at a[n]. */

#define GET(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

/* Size limit for a compiled pattern when offsets are two bytes wide. */

#define MAX_PATTERN_SIZE (1 << 16)


/* Convenience macro defined in terms of the others: store the offset with
PUT and then advance the pointer a (which must be a modifiable lvalue) by
LINK_SIZE. The value of the expression is the advanced pointer. */

#define PUTINC(a,n,d)   (PUT(a,n,d), (a) += LINK_SIZE)
00091 
00092 
00093 /* PCRE uses some other 2-byte quantities that do not change when the size of
00094 offsets changes. These are used for repeat counts and for other things such as
00095 capturing parenthesis numbers in back references. */
00096 
/* PUT2 stores the 16-bit value d, most significant byte first, at a[n] and
a[n+1]. It expands to ONE parenthesized comma expression rather than two
separate statements, so that "if (cond) PUT2(a,n,d);" guards both byte stores.
The arguments are evaluated more than once and must not have side effects. */

#define PUT2(a,n,d)   \
  (((a)[n] = (d) >> 8), \
   ((a)[(n)+1] = (d) & 255))

/* GET2 reads back the two-byte, most-significant-first value at a[n]. */

#define GET2(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

/* PUT2INC stores the value with PUT2 and then advances the pointer a (a
modifiable lvalue) past the two bytes; the expression yields the new pointer. */

#define PUT2INC(a,n,d)  (PUT2(a,n,d), (a) += 2)
00105 
00106 /* These are the public options that can change during matching. */
00107 
#define PCRE_IMS (PCRE_CASELESS | PCRE_MULTILINE | PCRE_DOTALL)

/* Private option flags are allocated downwards from the most significant end
of the four-byte options word. The top bit itself is left unused so that plain
ints can hold the options without any trouble from negative values. The public
options in pcre.h grow upwards from the least significant end; the two sets
must not overlap, although with four bytes there is plenty of room. */

#define PCRE_FIRSTSET      (1 << 30)   /* first_byte is set */
#define PCRE_REQCHSET      (1 << 29)   /* req_byte is set */
#define PCRE_STARTLINE     (1 << 28)   /* start after \n for multiline */
#define PCRE_ICHANGED      (1 << 27)   /* i option changes within regex */

/* Options for the "extra" block produced by pcre_study(). */

#define PCRE_STUDY_MAPPED   0x01     /* a map of starting chars exists */

/* Masks of the public options that are accepted at compile time, at run time
and at study time, respectively. */

#define PUBLIC_OPTIONS      \
  (PCRE_CASELESS          | \
   PCRE_EXTENDED          | \
   PCRE_ANCHORED          | \
   PCRE_MULTILINE         | \
   PCRE_DOTALL            | \
   PCRE_DOLLAR_ENDONLY    | \
   PCRE_EXTRA             | \
   PCRE_UNGREEDY          | \
   PCRE_UTF8              | \
   PCRE_NO_AUTO_CAPTURE   | \
   PCRE_NO_UTF8_CHECK)

#define PUBLIC_EXEC_OPTIONS \
  (PCRE_ANCHORED          | \
   PCRE_NOTBOL            | \
   PCRE_NOTEOL            | \
   PCRE_NOTEMPTY          | \
   PCRE_NO_UTF8_CHECK)

#define PUBLIC_STUDY_OPTIONS 0   /* None defined */
00137 
00138 /* Magic number to provide a small check against being handed junk. */
00139 
#define MAGIC_NUMBER  0x50435245UL   /* the bytes 'P' 'C' 'R' 'E' */

/* Negative values for the firstchar and reqchar variables */

#define REQ_NONE  (-1)
#define REQ_UNSET (-2)

/* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
variable-length repeat, or anything other than literal characters. */

#define REQ_CASELESS (1 << 8)    /* indicates caselessness */
#define REQ_VARY     (1 << 9)    /* reqbyte followed non-literal item */
00152 
00153 /* Miscellaneous definitions */
00154 
00155 /* Escape items that are just an encoding of a particular data value. Note that
00156 ESC_n is defined as yet another macro, which is set in config.h to either \n
00157 (the default) or \r (which some people want). */
00158 
#if !defined(ESC_e)
#define ESC_e 27
#endif

#if !defined(ESC_f)
#define ESC_f '\f'
#endif

#if !defined(ESC_n)
#define ESC_n NEWLINE
#endif

#if !defined(ESC_r)
#define ESC_r '\r'
#endif

/* The name ESC_t cannot officially be used: it is a POSIX reserved identifier
(presumably because of all the others like size_t), hence ESC_tee. */

#if !defined(ESC_tee)
#define ESC_tee '\t'
#endif
00181 
00182 /* These are escaped items that aren't just an encoding of a particular data
00183 value such as \n. They must have non-zero values, as check_escape() returns
00184 their negation. Also, they must appear in the same order as in the opcode
00185 definitions below, up to ESC_z. There's a dummy for OP_ANY because it
00186 corresponds to "." rather than an escape sequence. The final one must be
00187 ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
00188 tests in the code for an escape greater than ESC_b and less than ESC_Z to
00189 detect the types that may be repeated. These are the types that consume a
00190 character. If any new escapes are put in between that don't consume a
00191 character, that code will have to change. */
00192 
enum {
  ESC_A    = 1,    /* \A */
  ESC_G    = 2,    /* \G */
  ESC_B    = 3,    /* \B */
  ESC_b    = 4,    /* \b */
  ESC_D    = 5,    /* \D */
  ESC_d    = 6,    /* \d */
  ESC_S    = 7,    /* \S */
  ESC_s    = 8,    /* \s */
  ESC_W    = 9,    /* \W */
  ESC_w    = 10,   /* \w */
  ESC_dum1 = 11,   /* dummy that lines up with OP_ANY (".") */
  ESC_C    = 12,   /* \C */
  ESC_Z    = 13,   /* \Z */
  ESC_z    = 14,   /* \z */
  ESC_E    = 15,   /* \E */
  ESC_Q    = 16,   /* \Q */
  ESC_REF  = 17    /* must stay last: later values stand for \1, \2, \3, ... */
};
00195 
00196 /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
00197 contain UTF-8 characters with values greater than 255. */
00198 
/* Flag bits */
#define XCL_NOT    (1 << 0)    /* Flag: this is a negative class */
#define XCL_MAP    (1 << 1)    /* Flag: a 32-byte map is present */

/* Item types */
#define XCL_END       0    /* Marks end of individual items */
#define XCL_SINGLE    1    /* Single item (one multibyte char) follows */
#define XCL_RANGE     2    /* A range (two multibyte chars) follows */
00205 
00206 
00207 /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
00208 that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
00209 OP_EOD must correspond in order to the list of escapes immediately above.
00210 Note that whenever this list is updated, the two macro definitions that follow
00211 must also be updated to match. */
00212 
enum {
  OP_END               = 0,   /* End of pattern */

  /* Values corresponding to backslashed metacharacters */

  OP_SOD               = 1,   /* \A: start of data */
  OP_SOM               = 2,   /* \G: start of match (subject + offset) */
  OP_NOT_WORD_BOUNDARY = 3,   /* \B */
  OP_WORD_BOUNDARY     = 4,   /* \b */
  OP_NOT_DIGIT         = 5,   /* \D */
  OP_DIGIT             = 6,   /* \d */
  OP_NOT_WHITESPACE    = 7,   /* \S */
  OP_WHITESPACE        = 8,   /* \s */
  OP_NOT_WORDCHAR      = 9,   /* \W */
  OP_WORDCHAR          = 10,  /* \w */
  OP_ANY               = 11,  /* Match any character */
  OP_ANYBYTE           = 12,  /* \C: match any byte; different to OP_ANY for UTF-8 */
  OP_EODN              = 13,  /* \Z: end of data or \n at end of data */
  OP_EOD               = 14,  /* \z: end of data */

  OP_OPT               = 15,  /* Set runtime options */
  OP_CIRC              = 16,  /* Start of line - varies with multiline switch */
  OP_DOLL              = 17,  /* End of line - varies with multiline switch */
  OP_CHARS             = 18,  /* Match string of characters */
  OP_NOT               = 19,  /* Match anything but the following char */

  /* Repeats of a single character. In this and the following repeat sets the
  maximizing and minimizing versions of each opcode must come in pairs, with
  the minimizing one second. */

  OP_STAR              = 20,
  OP_MINSTAR           = 21,
  OP_PLUS              = 22,
  OP_MINPLUS           = 23,
  OP_QUERY             = 24,
  OP_MINQUERY          = 25,
  OP_UPTO              = 26,  /* From 0 to n matches */
  OP_MINUPTO           = 27,
  OP_EXACT             = 28,  /* Exactly n matches */

  /* Repeats of "not" a single character; same pairing rule. */

  OP_NOTSTAR           = 29,
  OP_NOTMINSTAR        = 30,
  OP_NOTPLUS           = 31,
  OP_NOTMINPLUS        = 32,
  OP_NOTQUERY          = 33,
  OP_NOTMINQUERY       = 34,
  OP_NOTUPTO           = 35,  /* From 0 to n matches */
  OP_NOTMINUPTO        = 36,
  OP_NOTEXACT          = 37,  /* Exactly n matches */

  /* Repeats of character types such as \d; same pairing rule, and these codes
  must be in exactly the same order as those above. */

  OP_TYPESTAR          = 38,
  OP_TYPEMINSTAR       = 39,
  OP_TYPEPLUS          = 40,
  OP_TYPEMINPLUS       = 41,
  OP_TYPEQUERY         = 42,
  OP_TYPEMINQUERY      = 43,
  OP_TYPEUPTO          = 44,  /* From 0 to n matches */
  OP_TYPEMINUPTO       = 45,
  OP_TYPEEXACT         = 46,  /* Exactly n matches */

  /* Repeats of character classes and back references; same pairing rule, and
  the first six must be in exactly the same order as those above. The two
  range opcodes are different to the three sets above. */

  OP_CRSTAR            = 47,
  OP_CRMINSTAR         = 48,
  OP_CRPLUS            = 49,
  OP_CRMINPLUS         = 50,
  OP_CRQUERY           = 51,
  OP_CRMINQUERY        = 52,
  OP_CRRANGE           = 53,
  OP_CRMINRANGE        = 54,

  OP_CLASS             = 55,  /* Match a character class, chars < 256 only */
  OP_NCLASS            = 56,  /* Same, but the bitmap was created from a negative
                                 class - the difference is relevant only when a
                                 UTF-8 character > 255 is encountered. */

  OP_XCLASS            = 57,  /* Extended class for handling UTF-8 chars within
                                 the class. This does both positive and
                                 negative. */

  OP_REF               = 58,  /* Match a back reference */
  OP_RECURSE           = 59,  /* Match a numbered subpattern (possibly recursive) */
  OP_CALLOUT           = 60,  /* Call out to external function if provided */

  OP_ALT               = 61,  /* Start of alternation */
  OP_KET               = 62,  /* End of group that doesn't have an unbounded repeat */
  OP_KETRMAX           = 63,  /* These two must remain together and in this */
  OP_KETRMIN           = 64,  /* order. They are for groups that repeat for ever. */

  /* The assertions must come before ONCE and COND */

  OP_ASSERT            = 65,  /* Positive lookahead */
  OP_ASSERT_NOT        = 66,  /* Negative lookahead */
  OP_ASSERTBACK        = 67,  /* Positive lookbehind */
  OP_ASSERTBACK_NOT    = 68,  /* Negative lookbehind */
  OP_REVERSE           = 69,  /* Move pointer back - used in lookbehind assertions */

  /* ONCE and COND must come after the assertions, with ONCE first, as there's
  a test for >= ONCE for a subpattern that isn't an assertion. */

  OP_ONCE              = 70,  /* Once matched, don't back up into the subpattern */
  OP_COND              = 71,  /* Conditional group */
  OP_CREF              = 72,  /* Used to hold an extraction string number (cond ref) */

  OP_BRAZERO           = 73,  /* These two must remain together and in this */
  OP_BRAMINZERO        = 74,  /* order. */

  OP_BRANUMBER         = 75,  /* Used for extracting brackets whose number is
                                 greater than can fit into an opcode. */

  OP_BRA               = 76   /* This and greater values are used for brackets
                                 that extract substrings up to a basic limit.
                                 After that, use is made of OP_BRANUMBER. */
};
00320 
00321 /* WARNING: There is an implicit assumption in study.c that all opcodes are
00322 less than 128 in value. This makes handling UTF-8 character sequences easier.
00323 */
00324 
00325 
00326 /* This macro defines textual names for all the opcodes. These are used only
00327 for debugging, in pcre.c when DEBUG is defined, and also in pcretest.c. The
00328 macro is referenced only in printint.c. */
00329 
#define OP_NAME_LIST \
  /* End of pattern                          */ \
  "End",                                        \
  /* Backslashed metacharacters and "."      */ \
  "\\A", "\\G", "\\B", "\\b", "\\D", "\\d",     \
  "\\S", "\\s", "\\W", "\\w",                   \
  "Any", "Anybyte", "\\Z", "\\z",               \
  /* Options, anchors, literals              */ \
  "Opt", "^", "$", "chars", "not",              \
  /* Single-character repeats                */ \
  "*", "*?", "+", "+?", "?", "??",              \
  "{", "{", "{",                                \
  /* "Not" single-character repeats          */ \
  "*", "*?", "+", "+?", "?", "??",              \
  "{", "{", "{",                                \
  /* Character type repeats                  */ \
  "*", "*?", "+", "+?", "?", "??",              \
  "{", "{", "{",                                \
  /* Class and back reference repeats        */ \
  "*", "*?", "+", "+?", "?", "??",              \
  "{", "{",                                     \
  /* Classes, references, recursion, callout */ \
  "class", "nclass", "xclass",                  \
  "Ref", "Recurse", "Callout",                  \
  /* Group structure                         */ \
  "Alt", "Ket", "KetRmax", "KetRmin",           \
  /* Assertions                              */ \
  "Assert", "Assert not",                       \
  "AssertB", "AssertB not", "Reverse",          \
  /* Once-only, conditions, brackets         */ \
  "Once", "Cond", "Cond ref",                   \
  "Brazero", "Braminzero", "Branumber", "Bra"
00342 
00343 
00344 /* This macro defines the length of fixed length operations in the compiled
00345 regex. The lengths are used when searching for specific things, and also in the
00346 debugging printing of a compiled regex. We use a macro so that it can be
00347 incorporated both into pcre.c and pcretest.c without being publicly exposed.
00348 
00349 As things have been extended, some of these are no longer fixed lengths, but are
00350 minima instead. For example, the length of a single-character repeat may vary
00351 in UTF-8 mode. The code that uses this table must know about such things. */
00352 
/* One length per opcode, in opcode order from OP_END to OP_BRA. The last line
deliberately has no continuation backslash, so that whatever follows this
definition in the file can never be absorbed into the macro. */

#define OP_LENGTHS \
  1,                             /* End                                    */ \
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ \
  1, 1, 1, 1, 2, 1, 1,           /* Any, Anybyte, \Z, \z, Opt, ^, $        */ \
  2,                             /* Chars - the minimum length             */ \
  2,                             /* not                                    */ \
  /* Positive single-char repeats                            ** These are  */ \
  2, 2, 2, 2, 2, 2,              /* *, *?, +, +?, ?, ??      ** minima in  */ \
  4, 4, 4,                       /* upto, minupto, exact     ** UTF-8 mode */ \
  /* Negative single-char repeats - only for chars < 256                   */ \
  2, 2, 2, 2, 2, 2,              /* NOT *, *?, +, +?, ?, ??                */ \
  4, 4, 4,                       /* NOT upto, minupto, exact               */ \
  /* Positive type repeats                                                 */ \
  2, 2, 2, 2, 2, 2,              /* Type *, *?, +, +?, ?, ??               */ \
  4, 4, 4,                       /* Type upto, minupto, exact              */ \
  /* Character class & ref repeats                                         */ \
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */ \
  5, 5,                          /* CRRANGE, CRMINRANGE                    */ \
 33,                             /* CLASS                                  */ \
 33,                             /* NCLASS                                 */ \
  0,                             /* XCLASS - variable length               */ \
  3,                             /* REF                                    */ \
  1+LINK_SIZE,                   /* RECURSE                                */ \
  2,                             /* CALLOUT                                */ \
  1+LINK_SIZE,                   /* Alt                                    */ \
  1+LINK_SIZE,                   /* Ket                                    */ \
  1+LINK_SIZE,                   /* KetRmax                                */ \
  1+LINK_SIZE,                   /* KetRmin                                */ \
  1+LINK_SIZE,                   /* Assert                                 */ \
  1+LINK_SIZE,                   /* Assert not                             */ \
  1+LINK_SIZE,                   /* Assert behind                          */ \
  1+LINK_SIZE,                   /* Assert behind not                      */ \
  1+LINK_SIZE,                   /* Reverse                                */ \
  1+LINK_SIZE,                   /* Once                                   */ \
  1+LINK_SIZE,                   /* COND                                   */ \
  3,                             /* CREF                                   */ \
  1, 1,                          /* BRAZERO, BRAMINZERO                    */ \
  3,                             /* BRANUMBER                              */ \
  1+LINK_SIZE                    /* BRA                                    */
00392 
00393 
00394 /* The highest extraction number before we have to start using additional
00395 bytes. (Originally PCRE didn't have support for extraction counts higher than
00396 this number.) The value is limited by the number of opcodes left after OP_BRA,
00397 i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
00398 opcodes. */
00399 
#define EXTRACT_BASIC_MAX  150      /* highest extraction number encoded in the opcode itself */

/* Magic OP_CREF value that stands for the "in recursion" condition. */

#define CREF_RECURSE       0xffff
00405 
00406 /* The texts of compile-time error messages are defined as macros here so that
00407 they can be accessed by the POSIX wrapper and converted into error codes.  Yes,
00408 I could have used error codes in the first place, but didn't feel like changing
00409 just to accommodate the POSIX wrapper. */
00410 
/* Compile-time error texts, ERR1 to ERR44. The number in each macro name
identifies the message; the texts themselves must not be altered. */

#define ERR1 "\\ at end of pattern"
#define ERR2 "\\c at end of pattern"
#define ERR3 "unrecognized character follows \\"
#define ERR4 "numbers out of order in {} quantifier"
#define ERR5 "number too big in {} quantifier"
#define ERR6 "missing terminating ] for character class"
#define ERR7 "invalid escape sequence in character class"
#define ERR8 "range out of order in character class"
#define ERR9 "nothing to repeat"
#define ERR10 \
  "operand of unlimited repeat could match the empty string"
#define ERR11 "internal error: unexpected repeat"
#define ERR12 "unrecognized character after (?"
#define ERR13 \
  "POSIX named classes are supported only within a class"
#define ERR14 "missing )"
#define ERR15 "reference to non-existent subpattern"
#define ERR16 "erroffset passed as NULL"
#define ERR17 "unknown option bit(s) set"
#define ERR18 "missing ) after comment"
#define ERR19 "parentheses nested too deeply"
#define ERR20 "regular expression too large"
#define ERR21 "failed to get memory"
#define ERR22 "unmatched parentheses"
#define ERR23 "internal error: code overflow"
#define ERR24 "unrecognized character after (?<"
#define ERR25 "lookbehind assertion is not fixed length"
#define ERR26 "malformed number after (?("
#define ERR27 \
  "conditional group contains more than two branches"
#define ERR28 "assertion expected after (?("
#define ERR29 "(?R or (?digits must be followed by )"
#define ERR30 "unknown POSIX class name"
#define ERR31 "POSIX collating elements are not supported"
#define ERR32 \
  "this version of PCRE is not compiled with PCRE_UTF8 support"
#define ERR33 "spare error"
#define ERR34 \
  "character value in \\x{...} sequence is too large"
#define ERR35 "invalid condition (?(0)"
#define ERR36 "\\C not allowed in lookbehind assertion"
#define ERR37 \
  "PCRE does not support \\L, \\l, \\N, \\P, \\p, \\U, \\u, or \\X"
#define ERR38 "number after (?C is > 255"
#define ERR39 "closing ) for (?C expected"
#define ERR40 "recursive call could loop indefinitely"
#define ERR41 "unrecognized character after (?P"
#define ERR42 "syntax error after (?P"
#define ERR43 "two named groups have the same name"
#define ERR44 "invalid UTF-8 string"
00455 
/* Every character is handled as an unsigned value; otherwise there are
problems with top-bit-set characters and with functions such as isspace().
The interface to the outside world nevertheless stays as char *, because that
should make things easier for callers. "uschar" is a short name for unsigned
char that saves lots of typing. The name "uchar" was tried first, but it
causes problems on Digital Unix, where it is defined in sys/types. */

typedef unsigned char uschar;
00464 
00465 /* The real format of the start of the pcre block; the index of names and the
00466 code vector run on as long as necessary after the end. */
00467 
typedef struct real_pcre {
  unsigned long  magic_number;     /* NOTE(review): presumably holds MAGIC_NUMBER - confirm */
  size_t         size;             /* Total that was malloced */
  const unsigned char *tables;     /* Pointer to tables */
  unsigned long  options;
  unsigned short top_bracket;
  unsigned short top_backref;
  unsigned short first_byte;
  unsigned short req_byte;
  unsigned short name_entry_size;  /* Size of any name items; 0 => none */
  unsigned short name_count;       /* Number of name items */
} real_pcre;
00480 
00481 /* The format of the block used to store data from pcre_study(). */
00482 
00483 typedef struct pcre_study_data {
00484   size_t size;                        /* Total that was malloced */
00485   uschar options;
00486   uschar start_bits[32];
00487 } pcre_study_data;
00488 
00489 /* Structure for passing "static" information around between the functions
00490 doing the compiling, so that they are thread-safe. */
00491 
00492 typedef struct compile_data {
00493   const uschar *lcc;            /* Points to lower casing table */
00494   const uschar *fcc;            /* Points to case-flipping table */
00495   const uschar *cbits;          /* Points to character type table */
00496   const uschar *ctypes;         /* Points to table of type maps */
00497   const uschar *start_code;     /* The start of the compiled code */
00498   uschar *name_table;           /* The name/number table */
00499   int  names_found;             /* Number of entries so far */
00500   int  name_entry_size;         /* Size of each entry */
00501   int  top_backref;             /* Maximum back reference */
00502   unsigned int backref_map;     /* Bitmap of low back refs */
00503   int  req_varyopt;             /* "After variable item" flag for reqbyte */
00504 } compile_data;
00505 
00506 /* Structure for maintaining a chain of pointers to the currently incomplete
00507 branches, for testing for left recursion. */
00508 
00509 typedef struct branch_chain {
00510   struct branch_chain *outer;
00511   uschar *current;
00512 } branch_chain;
00513 
00514 /* Structure for items in a linked list that represents an explicit recursive
00515 call within the pattern. */
00516 
00517 typedef struct recursion_info {
00518   struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
00519   int group_num;                /* Number of group that was called */
00520   const uschar *after_call;     /* "Return value": points after the call in the expr */
00521   const uschar *save_start;     /* Old value of md->start_match */
00522   int *offset_save;             /* Pointer to start of saved offsets */
00523   int saved_max;                /* Number of saved offsets */
00524 } recursion_info;
00525 
00526 /* When compiling in a mode that doesn't use recursive calls to match(),
00527 a structure is used to remember local variables on the heap. It is defined in
00528 pcre.c, close to the match() function, so that it is easy to keep it in step
00529 with any changes of local variable. However, the pointer to the current frame
00530 must be saved in some "static" place over a longjmp(). We declare the
00531 structure here so that we can put a pointer in the match_data structure.
00532 NOTE: This isn't used for a "normal" compilation of pcre. */
00533 
00534 /* Structure for passing "static" information around between the functions
00535 doing the matching, so that they are thread-safe. */
00536 
00537 typedef struct match_data {
00538   unsigned long int match_call_count; /* As it says */
00539   unsigned long int match_limit;/* As it says */
00540   int   *offset_vector;         /* Offset vector */
00541   int    offset_end;            /* One past the end */
00542   int    offset_max;            /* The maximum usable for return data */
00543   const uschar *lcc;            /* Points to lower casing table */
00544   const uschar *ctypes;         /* Points to table of type maps */
00545   bool   offset_overflow;       /* Set if too many extractions */
00546   bool   notbol;                /* NOTBOL flag */
00547   bool   noteol;                /* NOTEOL flag */
00548   bool   utf8;                  /* UTF8 flag */
00549   bool   endonly;               /* Dollar not before final \n */
00550   bool   notempty;              /* Empty string match not wanted */
00551   const uschar *start_code;     /* For use when recursing */
00552   const uschar *start_subject;  /* Start of the subject string */
00553   const uschar *end_subject;    /* End of the subject string */
00554   const uschar *start_match;    /* Start of this match attempt */
00555   const uschar *end_match_ptr;  /* Subject position at end match */
00556   int    end_offset_top;        /* Highwater mark at end of match */
00557   int    capture_last;          /* Most recent capture number */
00558   int    start_offset;          /* The start offset value */
00559   recursion_info *recursive;    /* Linked list of recursion data */
00560   void  *callout_data;          /* To pass back to callouts */
00561 } match_data;
00562 
00563 /* Bit definitions for entries in the pcre_ctypes table. */
00564 
#define ctype_space   (1 << 0)
#define ctype_letter  (1 << 1)
#define ctype_digit   (1 << 2)
#define ctype_xdigit  (1 << 3)
#define ctype_word    (1 << 4)   /* alphameric or '_' */
#define ctype_meta    (1 << 7)   /* regexp meta char or zero (end pattern) */
00571 
00572 /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
00573 of bits for a class map. Some classes are built by combining these tables. */
00574 
#define cbit_space   ( 0 * 32)   /* [:space:] or \s */
#define cbit_xdigit  ( 1 * 32)   /* [:xdigit:] */
#define cbit_digit   ( 2 * 32)   /* [:digit:] or \d */
#define cbit_upper   ( 3 * 32)   /* [:upper:] */
#define cbit_lower   ( 4 * 32)   /* [:lower:] */
#define cbit_word    ( 5 * 32)   /* [:word:] or \w */
#define cbit_graph   ( 6 * 32)   /* [:graph:] */
#define cbit_print   ( 7 * 32)   /* [:print:] */
#define cbit_punct   ( 8 * 32)   /* [:punct:] */
#define cbit_cntrl   ( 9 * 32)   /* [:cntrl:] */
#define cbit_length  (10 * 32)   /* Length of the cbits table */

/* Where each table starts, relative to the base tables pointer, followed by
the total length of all the tables. */

#define lcc_offset      0
#define fcc_offset      256
#define cbits_offset    512
#define ctypes_offset   (cbits_offset + cbit_length)
#define tables_length   (ctypes_offset + 256)
00595 
00596 /* End of internal.h */
00597 /* chartables.c */
00598 /*************************************************
00599 *      Perl-Compatible Regular Expressions       *
00600 *************************************************/
00601 
00602 /* This file is automatically written by the dftables auxiliary
00603 program. If you edit it by hand, you might like to edit the Makefile to
00604 prevent its ever being regenerated.
00605 
00606 This file is #included in the compilation of pcre.c to build the default
00607 character tables which are used when no tables are passed to the compile
00608 function. */
00609 
00610 static unsigned char pcre_default_tables[] = {
00611 
00612 /* This table is a lower casing table. */
00613 
00614     0,  1,  2,  3,  4,  5,  6,  7,
00615     8,  9, 10, 11, 12, 13, 14, 15,
00616    16, 17, 18, 19, 20, 21, 22, 23,
00617    24, 25, 26, 27, 28, 29, 30, 31,
00618    32, 33, 34, 35, 36, 37, 38, 39,
00619    40, 41, 42, 43, 44, 45, 46, 47,
00620    48, 49, 50, 51, 52, 53, 54, 55,
00621    56, 57, 58, 59, 60, 61, 62, 63,
00622    64, 97, 98, 99,100,101,102,103,
00623   104,105,106,107,108,109,110,111,
00624   112,113,114,115,116,117,118,119,
00625   120,121,122, 91, 92, 93, 94, 95,
00626    96, 97, 98, 99,100,101,102,103,
00627   104,105,106,107,108,109,110,111,
00628   112,113,114,115,116,117,118,119,
00629   120,121,122,123,124,125,126,127,
00630   128,129,130,131,132,133,134,135,
00631   136,137,138,139,140,141,142,143,
00632   144,145,146,147,148,149,150,151,
00633   152,153,154,155,156,157,158,159,
00634   160,161,162,163,164,165,166,167,
00635   168,169,170,171,172,173,174,175,
00636   176,177,178,179,180,181,182,183,
00637   184,185,186,187,188,189,190,191,
00638   192,193,194,195,196,197,198,199,
00639   200,201,202,203,204,205,206,207,
00640   208,209,210,211,212,213,214,215,
00641   216,217,218,219,220,221,222,223,
00642   224,225,226,227,228,229,230,231,
00643   232,233,234,235,236,237,238,239,
00644   240,241,242,243,244,245,246,247,
00645   248,249,250,251,252,253,254,255,
00646 
00647 /* This table is a case flipping table. */
00648 
00649     0,  1,  2,  3,  4,  5,  6,  7,
00650     8,  9, 10, 11, 12, 13, 14, 15,
00651    16, 17, 18, 19, 20, 21, 22, 23,
00652    24, 25, 26, 27, 28, 29, 30, 31,
00653    32, 33, 34, 35, 36, 37, 38, 39,
00654    40, 41, 42, 43, 44, 45, 46, 47,
00655    48, 49, 50, 51, 52, 53, 54, 55,
00656    56, 57, 58, 59, 60, 61, 62, 63,
00657    64, 97, 98, 99,100,101,102,103,
00658   104,105,106,107,108,109,110,111,
00659   112,113,114,115,116,117,118,119,
00660   120,121,122, 91, 92, 93, 94, 95,
00661    96, 65, 66, 67, 68, 69, 70, 71,
00662    72, 73, 74, 75, 76, 77, 78, 79,
00663    80, 81, 82, 83, 84, 85, 86, 87,
00664    88, 89, 90,123,124,125,126,127,
00665   128,129,130,131,132,133,134,135,
00666   136,137,138,139,140,141,142,143,
00667   144,145,146,147,148,149,150,151,
00668   152,153,154,155,156,157,158,159,
00669   160,161,162,163,164,165,166,167,
00670   168,169,170,171,172,173,174,175,
00671   176,177,178,179,180,181,182,183,
00672   184,185,186,187,188,189,190,191,
00673   192,193,194,195,196,197,198,199,
00674   200,201,202,203,204,205,206,207,
00675   208,209,210,211,212,213,214,215,
00676   216,217,218,219,220,221,222,223,
00677   224,225,226,227,228,229,230,231,
00678   232,233,234,235,236,237,238,239,
00679   240,241,242,243,244,245,246,247,
00680   248,249,250,251,252,253,254,255,
00681 
00682 /* This table contains bit maps for various character classes.
00683 Each map is 32 bytes long and the bits run from the least
00684 significant end of each byte. The classes that have their own
00685 maps are: space, xdigit, digit, upper, lower, word, graph,
00686 print, punct, and cntrl. Other classes are built from combinations. */
00687 
00688   0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
00689   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00690   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00691   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00692 
00693   0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
00694   0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
00695   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00696   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00697 
00698   0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
00699   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00700   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00701   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00702 
00703   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00704   0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00,
00705   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00706   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00707 
00708   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00709   0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07,
00710   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00711   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00712 
00713   0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
00714   0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
00715   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00716   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00717 
00718   0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff,
00719   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
00720   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00721   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00722 
00723   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,
00724   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
00725   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00726   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00727 
00728   0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc,
00729   0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78,
00730   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00731   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00732 
00733   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,
00734   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,
00735   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00736   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
00737 
00738 /* This table identifies various classes of character by individual bits:
00739   0x01   white space character
00740   0x02   letter
00741   0x04   decimal digit
00742   0x08   hexadecimal digit
00743   0x10   alphanumeric or '_'
00744   0x80   regular expression metacharacter or binary zero
00745 */
00746 
00747   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
00748   0x00,0x01,0x01,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
00749   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
00750   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
00751   0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /*    - '  */
00752   0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /*  ( - /  */
00753   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
00754   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /*  8 - ?  */
00755   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  @ - G  */
00756   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  H - O  */
00757   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  P - W  */
00758   0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /*  X - _  */
00759   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  ` - g  */
00760   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  h - o  */
00761   0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  p - w  */
00762   0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /*  x -127 */
00763   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
00764   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
00765   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
00766   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
00767   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
00768   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
00769   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
00770   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
00771   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
00772   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
00773   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
00774   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
00775   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
00776   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
00777   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
00778   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
00779 
00780 /* End of chartables.c */
00781 /* get.c */
00782 /*************************************************
00783 *      Copy captured string to given buffer      *
00784 *************************************************/
00785 
00786 /* This function copies a single captured substring into a given buffer.
00787 Note that we use memcpy() rather than strncpy() in case there are binary zeros
00788 in the string.
00789 
00790 Arguments:
00791   subject        the subject string that was matched
00792   ovector        pointer to the offsets table
00793   stringcount    the number of substrings that were captured
00794                    (i.e. the yield of the pcre_exec call, unless
00795                    that was zero, in which case it should be 1/3
00796                    of the offset table size)
00797   stringnumber   the number of the required substring
00798   buffer         where to put the substring
00799   size           the size of the buffer
00800 
00801 Returns:         if successful:
00802                    the length of the copied string, not including the zero
00803                    that is put on the end; can be zero
00804                  if not successful:
00805                    PCRE_ERROR_NOMEMORY (-6) buffer too small
00806                    PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
00807 */
00808 
00809 int
00810 pcre_copy_substring(const char *subject, int *ovector, int stringcount,
00811   int stringnumber, char *buffer, int size)
00812 {
00813 int yield;
00814 if (stringnumber < 0 || stringnumber >= stringcount)
00815   return PCRE_ERROR_NOSUBSTRING;
00816 stringnumber *= 2;
00817 yield = ovector[stringnumber+1] - ovector[stringnumber];
00818 if (size < yield + 1) return PCRE_ERROR_NOMEMORY;
00819 memcpy(buffer, subject + ovector[stringnumber], yield);
00820 buffer[yield] = 0;
00821 return yield;
00822 }
00823 
00824 /* End of get.c */
00825 /* maketables.c */
00826 /*************************************************
00827 *           Create PCRE character tables         *
00828 *************************************************/
00829 
00830 /* This function builds a set of character tables for use by PCRE and returns
00831 a pointer to them. They are build using the ctype functions, and consequently
00832 their contents will depend upon the current locale setting. When compiled as
00833 part of the library, the store is obtained via malloc(), but when compiled
00834 inside dftables, use malloc().
00835 
00836 Arguments:   none
00837 Returns:     pointer to the contiguous block of data
00838 */
00839 
00840 const unsigned char *
00841 pcre_maketables(void)
00842 {
00843 unsigned char *yield, *p;
00844 int i;
00845 
00846 yield = static_cast<unsigned char*>(malloc(tables_length));
00847 
00848 if (yield == NULL) return NULL;
00849 p = yield;
00850 
00851 /* First comes the lower casing table */
00852 
00853 for (i = 0; i < 256; i++) *p++ = tolower(i);
00854 
00855 /* Next the case-flipping table */
00856 
00857 for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i);
00858 
00859 /* Then the character class tables. Don't try to be clever and save effort
00860 on exclusive ones - in some locales things may be different. Note that the
00861 table for "space" includes everything "isspace" gives, including VT in the
00862 default locale. This makes it work for the POSIX class [:space:]. */
00863 
00864 memset(p, 0, cbit_length);
00865 for (i = 0; i < 256; i++)
00866   {
00867   if (isdigit(i))
00868     {
00869     p[cbit_digit  + i/8] |= 1 << (i&7);
00870     p[cbit_word   + i/8] |= 1 << (i&7);
00871     }
00872   if (isupper(i))
00873     {
00874     p[cbit_upper  + i/8] |= 1 << (i&7);
00875     p[cbit_word   + i/8] |= 1 << (i&7);
00876     }
00877   if (islower(i))
00878     {
00879     p[cbit_lower  + i/8] |= 1 << (i&7);
00880     p[cbit_word   + i/8] |= 1 << (i&7);
00881     }
00882   if (i == '_')   p[cbit_word   + i/8] |= 1 << (i&7);
00883   if (isspace(i)) p[cbit_space  + i/8] |= 1 << (i&7);
00884   if (isxdigit(i))p[cbit_xdigit + i/8] |= 1 << (i&7);
00885   if (isgraph(i)) p[cbit_graph  + i/8] |= 1 << (i&7);
00886   if (isprint(i)) p[cbit_print  + i/8] |= 1 << (i&7);
00887   if (ispunct(i)) p[cbit_punct  + i/8] |= 1 << (i&7);
00888   if (iscntrl(i)) p[cbit_cntrl  + i/8] |= 1 << (i&7);
00889   }
00890 p += cbit_length;
00891 
00892 /* Finally, the character type table. In this, we exclude VT from the white
00893 space chars, because Perl doesn't recognize it as such for \s and for comments
00894 within regexes. */
00895 
00896 for (i = 0; i < 256; i++)
00897   {
00898   int x = 0;
00899   if (i != 0x0b && isspace(i)) x += ctype_space;
00900   if (isalpha(i)) x += ctype_letter;
00901   if (isdigit(i)) x += ctype_digit;
00902   if (isxdigit(i)) x += ctype_xdigit;
00903   if (isalnum(i) || i == '_') x += ctype_word;
00904   if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta;
00905   *p++ = x;
00906   }
00907 
00908 return yield;
00909 }
00910 
00911 /* End of maketables.c */
00912 /* study.c */
00913 
00914 /*************************************************
00915 *      Set a bit and maybe its alternate case    *
00916 *************************************************/
00917 
00918 /* Given a character, set its bit in the table, and also the bit for the other
00919 version of a letter if we are caseless.
00920 
00921 Arguments:
00922   start_bits    points to the bit map
00923   c             is the character
00924   caseless      the caseless flag
00925   cd            the block with char table pointers
00926 
00927 Returns:        nothing
00928 */
00929 
00930 static void
00931 set_bit(uschar *start_bits, int c, bool caseless, compile_data *cd)
00932 {
00933 start_bits[c/8] |= (1 << (c&7));
00934 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
00935   start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
00936 }
00937 
00938 
00939 
00940 /*************************************************
00941 *          Create bitmap of starting chars       *
00942 *************************************************/
00943 
00944 /* This function scans a compiled unanchored expression and attempts to build a
00945 bitmap of the set of initial characters. If it can't, it returns false. As time
00946 goes by, we may be able to get more clever at doing this.
00947 
00948 Arguments:
00949   code         points to an expression
00950   start_bits   points to a 32-byte table, initialized to 0
00951   caseless     the current state of the caseless flag
00952   utf8         true if in UTF-8 mode
00953   cd           the block with char table pointers
00954 
00955 Returns:       true if table built, false otherwise
00956 */
00957 
00958 static bool
00959 set_start_bits(const uschar *code, uschar *start_bits, bool caseless,
00960   bool utf8, compile_data *cd)
00961 {
00962 register int c;
00963 
00964 /* This next statement and the later reference to dummy are here in order to
00965 trick the optimizer of the IBM C compiler for OS/2 into generating correct
00966 code. Apparently IBM isn't going to fix the problem, and we would rather not
00967 disable optimization (in this module it actually makes a big difference, and
00968 the pcre module can use all the optimization it can get). */
00969 
00970 volatile int dummy;
00971 
00972 do
00973   {
00974   const uschar *tcode = code + 1 + LINK_SIZE;
00975   bool try_next = true;
00976 
00977   while (try_next)
00978     {
00979     /* If a branch starts with a bracket or a positive lookahead assertion,
00980     recurse to set bits from within them. That's all for this branch. */
00981 
00982     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
00983       {
00984       if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
00985         return false;
00986       try_next = false;
00987       }
00988 
00989     else switch(*tcode)
00990       {
00991       default:
00992       return false;
00993 
00994       /* Skip over callout */
00995 
00996       case OP_CALLOUT:
00997       tcode += 2;
00998       break;
00999 
01000       /* Skip over extended extraction bracket number */
01001 
01002       case OP_BRANUMBER:
01003       tcode += 3;
01004       break;
01005 
01006       /* Skip over lookbehind and negative lookahead assertions */
01007 
01008       case OP_ASSERT_NOT:
01009       case OP_ASSERTBACK:
01010       case OP_ASSERTBACK_NOT:
01011       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
01012       tcode += 1+LINK_SIZE;
01013       break;
01014 
01015       /* Skip over an option setting, changing the caseless flag */
01016 
01017       case OP_OPT:
01018       caseless = (tcode[1] & PCRE_CASELESS) != 0;
01019       tcode += 2;
01020       break;
01021 
01022       /* BRAZERO does the bracket, but carries on. */
01023 
01024       case OP_BRAZERO:
01025       case OP_BRAMINZERO:
01026       if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
01027         return false;
01028       dummy = 1;
01029       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
01030       tcode += 1+LINK_SIZE;
01031       break;
01032 
01033       /* Single-char * or ? sets the bit and tries the next item */
01034 
01035       case OP_STAR:
01036       case OP_MINSTAR:
01037       case OP_QUERY:
01038       case OP_MINQUERY:
01039       set_bit(start_bits, tcode[1], caseless, cd);
01040       tcode += 2;
01041       break;
01042 
01043       /* Single-char upto sets the bit and tries the next */
01044 
01045       case OP_UPTO:
01046       case OP_MINUPTO:
01047       set_bit(start_bits, tcode[3], caseless, cd);
01048       tcode += 4;
01049       break;
01050 
01051       /* At least one single char sets the bit and stops */
01052 
01053       case OP_EXACT:       /* Fall through */
01054       tcode++;
01055 
01056       case OP_CHARS:       /* Fall through */
01057       tcode++;
01058 
01059       case OP_PLUS:
01060       case OP_MINPLUS:
01061       set_bit(start_bits, tcode[1], caseless, cd);
01062       try_next = false;
01063       break;
01064 
01065       /* Single character type sets the bits and stops */
01066 
01067       case OP_NOT_DIGIT:
01068       for (c = 0; c < 32; c++)
01069         start_bits[c] |= ~cd->cbits[c+cbit_digit];
01070       try_next = false;
01071       break;
01072 
01073       case OP_DIGIT:
01074       for (c = 0; c < 32; c++)
01075         start_bits[c] |= cd->cbits[c+cbit_digit];
01076       try_next = false;
01077       break;
01078 
01079       case OP_NOT_WHITESPACE:
01080       for (c = 0; c < 32; c++)
01081         start_bits[c] |= ~cd->cbits[c+cbit_space];
01082       try_next = false;
01083       break;
01084 
01085       case OP_WHITESPACE:
01086       for (c = 0; c < 32; c++)
01087         start_bits[c] |= cd->cbits[c+cbit_space];
01088       try_next = false;
01089       break;
01090 
01091       case OP_NOT_WORDCHAR:
01092       for (c = 0; c < 32; c++)
01093         start_bits[c] |= ~cd->cbits[c+cbit_word];
01094       try_next = false;
01095       break;
01096 
01097       case OP_WORDCHAR:
01098       for (c = 0; c < 32; c++)
01099         start_bits[c] |= cd->cbits[c+cbit_word];
01100       try_next = false;
01101       break;
01102 
01103       /* One or more character type fudges the pointer and restarts, knowing
01104       it will hit a single character type and stop there. */
01105 
01106       case OP_TYPEPLUS:
01107       case OP_TYPEMINPLUS:
01108       tcode++;
01109       break;
01110 
01111       case OP_TYPEEXACT:
01112       tcode += 3;
01113       break;
01114 
01115       /* Zero or more repeats of character types set the bits and then
01116       try again. */
01117 
01118       case OP_TYPEUPTO:
01119       case OP_TYPEMINUPTO:
01120       tcode += 2;               /* Fall through */
01121 
01122       case OP_TYPESTAR:
01123       case OP_TYPEMINSTAR:
01124       case OP_TYPEQUERY:
01125       case OP_TYPEMINQUERY:
01126       switch(tcode[1])
01127         {
01128         case OP_NOT_DIGIT:
01129         for (c = 0; c < 32; c++)
01130           start_bits[c] |= ~cd->cbits[c+cbit_digit];
01131         break;
01132 
01133         case OP_DIGIT:
01134         for (c = 0; c < 32; c++)
01135           start_bits[c] |= cd->cbits[c+cbit_digit];
01136         break;
01137 
01138         case OP_NOT_WHITESPACE:
01139         for (c = 0; c < 32; c++)
01140           start_bits[c] |= ~cd->cbits[c+cbit_space];
01141         break;
01142 
01143         case OP_WHITESPACE:
01144         for (c = 0; c < 32; c++)
01145           start_bits[c] |= cd->cbits[c+cbit_space];
01146         break;
01147 
01148         case OP_NOT_WORDCHAR:
01149         for (c = 0; c < 32; c++)
01150           start_bits[c] |= ~cd->cbits[c+cbit_word];
01151         break;
01152 
01153         case OP_WORDCHAR:
01154         for (c = 0; c < 32; c++)
01155           start_bits[c] |= cd->cbits[c+cbit_word];
01156         break;
01157         }
01158 
01159       tcode += 2;
01160       break;
01161 
01162       /* Character class where all the information is in a bit map: set the
01163       bits and either carry on or not, according to the repeat count. If it was
01164       a negative class, and we are operating with UTF-8 characters, any byte
01165       with the top-bit set is a potentially valid starter because it may start
01166       a character with a value > 255. (This is sub-optimal in that the
01167       character may be in the range 128-255, and those characters might be
01168       unwanted, but that's as far as we go for the moment.) */
01169 
01170       case OP_NCLASS:
01171       if (utf8) memset(start_bits+16, 0xff, 16);
01172       /* Fall through */
01173 
01174       case OP_CLASS:
01175         {
01176         tcode++;
01177         for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
01178         tcode += 32;
01179         switch (*tcode)
01180           {
01181           case OP_CRSTAR:
01182           case OP_CRMINSTAR:
01183           case OP_CRQUERY:
01184           case OP_CRMINQUERY:
01185           tcode++;
01186           break;
01187 
01188           case OP_CRRANGE:
01189           case OP_CRMINRANGE:
01190           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
01191             else try_next = false;
01192           break;
01193 
01194           default:
01195           try_next = false;
01196           break;
01197           }
01198         }
01199       break; /* End of bitmap class handling */
01200 
01201       }      /* End of switch */
01202     }        /* End of try_next loop */
01203 
01204   code += GET(code, 1);   /* Advance to next branch */
01205   }
01206 while (*code == OP_ALT);
01207 return true;
01208 }
01209 
01210 
01211 
01212 /*************************************************
01213 *          Study a compiled expression           *
01214 *************************************************/
01215 
01216 /* This function is handed a compiled expression that it must study to produce
01217 information that will speed up the matching. It returns a pcre_extra block
01218 which then gets handed back to pcre_exec().
01219 
01220 Arguments:
01221   re        points to the compiled expression
01222   options   contains option bits
01223   errorptr  points to where to place error messages;
01224             set NULL unless error
01225 
01226 Returns:    pointer to a pcre_extra block, with study_data filled in and the
01227               appropriate flag set;
01228             NULL on error or if no optimization possible
01229 */
01230 
01231 pcre_extra *
01232 pcre_study(const pcre *external_re, int options, const char **errorptr)
01233 {
01234 uschar start_bits[32];
01235 pcre_extra *extra;
01236 pcre_study_data *study;
01237 const real_pcre *re = (const real_pcre *)external_re;
01238 uschar *code = (uschar *)re + sizeof(real_pcre) +
01239   (re->name_count * re->name_entry_size);
01240 compile_data compile_block;
01241 
01242 *errorptr = NULL;
01243 
01244 if (re == NULL || re->magic_number != MAGIC_NUMBER)
01245   {
01246   *errorptr = "argument is not a compiled regular expression";
01247   return NULL;
01248   }
01249 
01250 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
01251   {
01252   *errorptr = "unknown or incorrect option bit(s) set";
01253   return NULL;
01254   }
01255 
01256 /* For an anchored pattern, or an unanchored pattern that has a first char, or
01257 a multiline pattern that matches only at "line starts", no further processing
01258 at present. */
01259 
01260 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
01261   return NULL;
01262 
01263 /* Set the character tables in the block which is passed around */
01264 
01265 compile_block.lcc = re->tables + lcc_offset;
01266 compile_block.fcc = re->tables + fcc_offset;
01267 compile_block.cbits = re->tables + cbits_offset;
01268 compile_block.ctypes = re->tables + ctypes_offset;
01269 
01270 /* See if we can find a fixed set of initial characters for the pattern. */
01271 
01272 memset(start_bits, 0, 32 * sizeof(uschar));
01273 if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
01274   (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
01275 
01276 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
01277 the latter, which is pointed to by the former, which may also get additional
01278 data set later by the calling program. At the moment, the size of
01279 pcre_study_data is fixed. We nevertheless save it in a field for returning via
01280 the pcre_fullinfo() function so that if it becomes variable in the future, we
01281 don't have to change that code. */
01282 
01283 extra = static_cast<pcre_extra *>(malloc(sizeof(pcre_extra) + sizeof(pcre_study_data)));
01284 
01285 if (extra == NULL)
01286   {
01287   *errorptr = "failed to get memory";
01288   return NULL;
01289   }
01290 
01291 // Hmm.
01292 study = reinterpret_cast<pcre_study_data *>(reinterpret_cast<char*>(extra) + sizeof(pcre_extra));
01293 extra->flags = PCRE_EXTRA_STUDY_DATA;
01294 extra->study_data = study;
01295 
01296 study->size = sizeof(pcre_study_data);
01297 study->options = PCRE_STUDY_MAPPED;
01298 memcpy(study->start_bits, start_bits, sizeof(start_bits));
01299 
01300 return extra;
01301 }
01302 
01303 /* End of study.c */
01304 /* pcre.c */
01305 #define DPRINTF(p) /*nothing*/
01306 
01307 /* Maximum number of items on the nested bracket stacks at compile time. This
01308 applies to the nesting of all kinds of parentheses. It does not limit
01309 un-nested, non-capturing parentheses. This number can be made bigger if
01310 necessary - it is used to dimension one int and one unsigned char vector at
01311 compile time. */
01312 
01313 #define BRASTACK_SIZE 200
01314 
01315 
01316 /* Maximum number of ints of offset to save on the stack for recursive calls.
01317 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
01318 because the offset vector is always a multiple of 3 long. */
01319 
01320 #define REC_STACK_SAVE_MAX 30
01321 
01322 
01323 /* The number of bytes in a literal character string above which we can't add
01324 any more is set at 250 in order to allow for UTF-8 characters. (In theory it
01325 could be 255 when UTF-8 support is excluded, but that means that some of the
01326 test output would be different, which just complicates things.) */
01327 
01328 #define MAXLIT 250
01329 
01330 
01331 /* The maximum remaining length of subject we are prepared to search for a
01332 req_byte match. */
01333 
01334 #define REQ_BYTE_MAX 1000
01335 
01336 
01337 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
01338 the definition is next to the definition of the opcodes in internal.h. */
01339 
01340 static const uschar OP_lengths[] = { OP_LENGTHS };
01341 
01342 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
01343 
01344 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };   /* minimum repeat count for each entry */
01345 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };   /* parallel to rep_min: the pairs are {0,inf} twice, {1,inf} twice, {0,1} twice */
01346 
01347 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
01348 are simple data values; negative values are for special things like \d and so
01349 on. Zero means further processing is needed (for things like \x), or the escape
01350 is invalid. */
01351 
01352 static const short int escapes[] = {   /* first entry is for '0'; e.g. the entry for 'a' is 7 */
01353     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
01354     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
01355   '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
01356     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
01357     0, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
01358     0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
01359   '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
01360     0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */
01361     0,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
01362     0,      0, -ESC_z                                            /* x - z */
01363 };
01364 
01365 /* Tables of names of POSIX character classes and their lengths. The length
01366 table is terminated by a zero entry. The first three must be alpha, lower, upper,
01367 as this is assumed for handling case independence. */
01368 
01369 static const char *const posix_names[] = {
01370   "alpha", "lower", "upper",
01371   "alnum", "ascii", "blank", "cntrl", "digit", "graph",
01372   "print", "punct", "space", "word",  "xdigit" };
01373 
01374 static const uschar posix_name_lengths[] = {
01375   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };   /* length of each name above, in the same order, then the 0 terminator */
01376 
01377 /* Table of class bit maps for each POSIX class; up to three may be combined
01378 to form the class. The table for [:blank:] is dynamically modified to remove
01379 the vertical space characters. */
01380 
01381 static const int posix_class_maps[] = {   /* three slots per class, in posix_names order; -1 marks an unused slot */
01382   cbit_lower, cbit_upper, -1,             /* alpha */
01383   cbit_lower, -1,         -1,             /* lower */
01384   cbit_upper, -1,         -1,             /* upper */
01385   cbit_digit, cbit_lower, cbit_upper,     /* alnum */
01386   cbit_print, cbit_cntrl, -1,             /* ascii */
01387   cbit_space, -1,         -1,             /* blank - a GNU extension */
01388   cbit_cntrl, -1,         -1,             /* cntrl */
01389   cbit_digit, -1,         -1,             /* digit */
01390   cbit_graph, -1,         -1,             /* graph */
01391   cbit_print, -1,         -1,             /* print */
01392   cbit_punct, -1,         -1,             /* punct */
01393   cbit_space, -1,         -1,             /* space */
01394   cbit_word,  -1,         -1,             /* word - a Perl extension */
01395   cbit_xdigit,-1,         -1              /* xdigit */
01396 };
01397 
01398 /* Table to identify digits and hex digits. This is used when compiling
01399 patterns. Note that the tables in chartables are dependent on the locale, and
01400 may mark arbitrary characters as digits - but the PCRE compiling code expects
01401 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
01402 a private table here. It costs 256 bytes, but it is a lot faster than doing
01403 character value tests (at least in some simple cases I timed), and in some
01404 applications one wants PCRE to compile efficiently as well as match
01405 efficiently.
01406 
01407 For convenience, we use the same bit definitions as in chartables:
01408 
01409   0x04   decimal digit
01410   0x08   hexadecimal digit
01411 
01412 Then we can use ctype_digit and ctype_xdigit in the code. */
01413 
/* Byte-indexed lookup table, 256 entries laid out sixteen to a row. The
entries for '0'-'9' hold 0x0c (the 0x04 decimal-digit bit plus the 0x08
hexadecimal-digit bit); the entries for 'A'-'F' and 'a'-'f' hold 0x08; every
other entry is zero. */

static const unsigned char digitab[] = {
  /*   0- 15 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /*  16- 31 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /*  32- 47 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* '0'-'?' */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0,0,0,0,0,0,
  /* '@'-'O' */ 0,0x08,0x08,0x08,0x08,0x08,0x08,0,0,0,0,0,0,0,0,0,
  /* 'P'-'_' */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* '`'-'o' */ 0,0x08,0x08,0x08,0x08,0x08,0x08,0,0,0,0,0,0,0,0,0,
  /* 'p'-127 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 128-143 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 144-159 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 160-175 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 176-191 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 192-207 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 208-223 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 224-239 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 240-255 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
01448 
01449 /* Definition to allow mutual recursion */
01450 
01451 static bool
01452   compile_regex(int, int, int *, uschar **, const uschar **, const char **,
01453     bool, int, int *, int *, branch_chain *, compile_data *);
01454 
01455 /* Structure for building a chain of data that actually lives on the
01456 stack, for holding the values of the subject pointer at the start of each
01457 subpattern, so as to detect when an empty string has been matched by a
01458 subpattern - to break infinite loops. */
01459 
01460 typedef struct eptrblock {
01461   struct eptrblock *epb_prev;
01462   const uschar *epb_saved_eptr;
01463 } eptrblock;
01464 
/* Bit flags that can be passed to the match() function. */

#define match_condassert   (1 << 0)  /* Called to check a condition assertion */
#define match_isgroup      (1 << 1)  /* Set if start of bracketed group */

/* Results that match() returns when no error occurred. Errors are reported
with the externally defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_NOMATCH      0
#define MATCH_MATCH        1
01475 
01476 
01477 
01478 /*************************************************
01479 *               Global variables                 *
01480 *************************************************/
01481 
01482 /* PCRE is thread-clean and doesn't use any global variables in the normal
01483 sense. However, it calls memory allocation and free functions via the four
01484 indirections below, and it can optionally do callouts. These values can be
01485 changed by the caller, but are shared between all threads. However, when
01486 compiling for Virtual Pascal, things are done differently (see pcre.in). */
01487 
01488 int   (*pcre_callout)(pcre_callout_block *) = NULL;
01489 
01490 
01491 /*************************************************
01492 *    Macros and tables for character handling    *
01493 *************************************************/
01494 
/* Character fetch macros. In this build each one reads a single byte through
the pointer; the INC forms also advance the pointer by one. GETCHARLEN ignores
its len argument and BACKCHAR expands to nothing. The trailing semicolons are
part of the expansions. */

#define GETCHAR(c, eptr)          (c) = *(eptr);
#define GETCHARINC(c, eptr)       (c) = *(eptr)++;
#define GETCHARINCTEST(c, eptr)   (c) = *(eptr)++;
#define GETCHARLEN(c, eptr, len)  (c) = *(eptr);
#define BACKCHAR(eptr)
01500 
01501 /*************************************************
01502 *            Handle escapes                      *
01503 *************************************************/
01504 
01505 /* This function is called when a \ has been encountered. It either returns a
01506 positive value for a simple escape such as \n, or a negative value which
01507 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
01508 a positive value greater than 255 may be returned. On entry, ptr is pointing at
01509 the \. On exit, it is on the final character of the escape sequence.
01510 
01511 Arguments:
01512   ptrptr     points to the pattern position pointer
01513   errorptr   points to the pointer to the error message
01514   bracount   number of previous extracting brackets
01515   options    the options bits
01516   isclass    true if inside a character class
01517 
01518 Returns:     zero or positive => a data character
01519              negative => a special escape sequence
01520              on error, errorptr is set
01521 */
01522 
01523 static int
01524 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
01525   int options, bool isclass)
01526 {
01527 const uschar *ptr = *ptrptr;
01528 int c, i;
01529 
01530 /* If backslash is at the end of the pattern, it's an error. */
01531 
01532 c = *(++ptr);
01533 if (c == 0) *errorptr = ERR1;
01534 
01535 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
01536 a table. A non-zero result is something that can be returned immediately.
01537 Otherwise further processing may be required. */
01538 
01539 else if (c < '0' || c > 'z') {}                           /* Not alphameric */
01540 else if ((i = escapes[c - '0']) != 0) c = i;
01541 
01542 /* Escapes that need further processing, or are illegal. */
01543 
01544 else
01545   {
01546   const uschar *oldptr;
01547   switch (c)
01548     {
01549     /* A number of Perl escapes are not handled by PCRE. We give an explicit
01550     error. */
01551 
01552     case 'l':
01553     case 'L':
01554     case 'N':
01555     case 'p':
01556     case 'P':
01557     case 'u':
01558     case 'U':
01559     case 'X':
01560     *errorptr = ERR37;
01561     break;
01562 
01563     /* The handling of escape sequences consisting of a string of digits
01564     starting with one that is not zero is not straightforward. By experiment,
01565     the way Perl works seems to be as follows:
01566 
01567     Outside a character class, the digits are read as a decimal number. If the
01568     number is less than 10, or if there are that many previous extracting
01569     left brackets, then it is a back reference. Otherwise, up to three octal
01570     digits are read to form an escaped byte. Thus \123 is likely to be octal
01571     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
01572     value is greater than 377, the least significant 8 bits are taken. Inside a
01573     character class, \ followed by a digit is always an octal number. */
01574 
01575     case '1': case '2': case '3': case '4': case '5':
01576     case '6': case '7': case '8': case '9':
01577 
01578     if (!isclass)
01579       {
01580       oldptr = ptr;
01581       c -= '0';
01582       while ((digitab[ptr[1]] & ctype_digit) != 0)
01583         c = c * 10 + *(++ptr) - '0';
01584       if (c < 10 || c <= bracount)
01585         {
01586         c = -(ESC_REF + c);
01587         break;
01588         }
01589       ptr = oldptr;      /* Put the pointer back and fall through */
01590       }
01591 
01592     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
01593     generates a binary zero byte and treats the digit as a following literal.
01594     Thus we have to pull back the pointer by one. */
01595 
01596     if ((c = *ptr) >= '8')
01597       {
01598       ptr--;
01599       c = 0;
01600       break;
01601       }
01602 
01603     /* \0 always starts an octal number, but we may drop through to here with a
01604     larger first octal digit. */
01605 
01606     case '0':
01607     c -= '0';
01608     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
01609         c = c * 8 + *(++ptr) - '0';
01610     c &= 255;     /* Take least significant 8 bits */
01611     break;
01612 
01613     /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
01614     which can be greater than 0xff, but only if the ddd are hex digits. */
01615 
01616     case 'x':
01617 
01618     /* Read just a single hex char */
01619 
01620     c = 0;
01621     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
01622       {
01623       int cc;                               /* Some compilers don't like ++ */
01624       cc = *(++ptr);                        /* in initializers */
01625       if (cc >= 'a') cc -= 32;              /* Convert to upper case */
01626       c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
01627       }
01628     break;
01629 
01630     /* Other special escapes not starting with a digit are straightforward */
01631 
01632     case 'c':
01633     c = *(++ptr);
01634     if (c == 0)
01635       {
01636       *errorptr = ERR2;
01637       return 0;
01638       }
01639 
01640     /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
01641     is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
01642 
01643     if (c >= 'a' && c <= 'z') c -= 32;
01644     c ^= 0x40;
01645     break;
01646 
01647     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
01648     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
01649     for Perl compatibility, it is a literal. This code looks a bit odd, but
01650     there used to be some cases other than the default, and there may be again
01651     in future, so I haven't "optimized" it. */
01652 
01653     default:
01654     if ((options & PCRE_EXTRA) != 0)
01655       {
01656       *errorptr = ERR3;
01657       }
01658     break;
01659     }
01660   }
01661 
01662 *ptrptr = ptr;
01663 return c;
01664 }
01665 
01666 
01667 
01668 /*************************************************
01669 *            Check for counted repeat            *
01670 *************************************************/
01671 
01672 /* This function is called when a '{' is encountered in a place where it might
01673 start a quantifier. It looks ahead to see if it really is a quantifier or not.
01674 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
01675 where the ddds are digits.
01676 
01677 Arguments:
01678   p         pointer to the first char after '{'
01679 
01680 Returns:    true or false
01681 */
01682 
01683 static bool
01684 is_counted_repeat(const uschar *p)
01685 {
01686 if ((digitab[*p++] & ctype_digit) == 0) return false;
01687 while ((digitab[*p] & ctype_digit) != 0) p++;
01688 if (*p == '}') return true;
01689 
01690 if (*p++ != ',') return false;
01691 if (*p == '}') return true;
01692 
01693 if ((digitab[*p++] & ctype_digit) == 0) return false;
01694 while ((digitab[*p] & ctype_digit) != 0) p++;
01695 
01696 return (*p == '}');
01697 }
01698 
01699 
01700 
01701 /*************************************************
01702 *         Read repeat counts                     *
01703 *************************************************/
01704 
01705 /* Read an item of the form {n,m} and return the values. This is called only
01706 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
01707 so the syntax is guaranteed to be correct, but we need to check the values.
01708 
01709 Arguments:
01710   p          pointer to first char after '{'
01711   minp       pointer to int for min
01712   maxp       pointer to int for max
01713              returned as -1 if no max
01714   errorptr   points to pointer to error message
01715 
01716 Returns:     pointer to '}' on success;
01717              current ptr on error, with errorptr set
01718 */
01719 
01720 static const uschar *
01721 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
01722 {
01723 int min = 0;
01724 int max = -1;
01725 
01726 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
01727 
01728 if (*p == '}') max = min; else
01729   {
01730   if (*(++p) != '}')
01731     {
01732     max = 0;
01733     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
01734     if (max < min)
01735       {
01736       *errorptr = ERR4;
01737       return p;
01738       }
01739     }
01740   }
01741 
01742 /* Do paranoid checks, then fill in the required variables, and pass back the
01743 pointer to the terminating '}'. */
01744 
01745 if (min < 0 || 65535 < min ||
01746     max < -1 || 65535 < max)
01747   *errorptr = ERR5;
01748 else
01749   {
01750   *minp = min;
01751   *maxp = max;
01752   }
01753 return p;
01754 }
01755 
01756 
01757 
01758 /*************************************************
01759 *      Find first significant op code            *
01760 *************************************************/
01761 
01762 /* This is called by several functions that scan a compiled expression looking
01763 for a fixed first character, or an anchoring op code etc. It skips over things
01764 that do not influence this. For some calls, a change of option is important.
01765 
01766 Arguments:
01767   code       pointer to the start of the group
01768   options    pointer to external options
01769   optbit     the option bit whose changing is significant, or
01770                zero if none are
01771 
01772 Returns:     pointer to the first significant opcode
01773 */
01774 
01775 static const uschar*
01776 first_significant_code(const uschar *code, int *options, int optbit)
01777 {
01778 for (;;)
01779   {
01780   switch ((int)*code)
01781     {
01782     case OP_OPT:
01783     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
01784       *options = (int)code[1];
01785     code += 2;
01786     break;
01787 
01788     case OP_ASSERT_NOT:
01789     case OP_ASSERTBACK:
01790     case OP_ASSERTBACK_NOT:
01791     do code += GET(code, 1); while (*code == OP_ALT);
01792     /* Fall through */
01793 
01794     case OP_CALLOUT:
01795     case OP_CREF:
01796     case OP_BRANUMBER:
01797     case OP_WORD_BOUNDARY:
01798     case OP_NOT_WORD_BOUNDARY:
01799     code += OP_lengths[*code];
01800     break;
01801 
01802     default:
01803     return code;
01804     }
01805   }
01806 /* Control never reaches here */
01807 }
01808 
01809 
01810 
01811 
01812 /*************************************************
01813 *        Find the fixed length of a pattern      *
01814 *************************************************/
01815 
01816 /* Scan a pattern and compute the fixed length of subject that will match it,
01817 if the length is fixed. This is needed for dealing with backward assertions.
01818 In UTF8 mode, the result is in characters rather than bytes.
01819 
01820 Arguments:
01821   code     points to the start of the pattern (the bracket)
01822   options  the compiling options
01823 
01824 Returns:   the fixed length, or -1 if there is no fixed length,
01825              or -2 if \C was encountered
01826 */
01827 
01828 static int
01829 find_fixedlength(uschar *code, int options)
01830 {
01831 int length = -1;
01832 
01833 register int branchlength = 0;
01834 register uschar *cc = code + 1 + LINK_SIZE;
01835 
01836 /* Scan along the opcodes for this branch. If we get to the end of the
01837 branch, check the length against that of the other branches. */
01838 
01839 for (;;)
01840   {
01841   int d;
01842   register int op = *cc;
01843   if (op >= OP_BRA) op = OP_BRA;
01844 
01845   switch (op)
01846     {
01847     case OP_BRA:
01848     case OP_ONCE:
01849     case OP_COND:
01850     d = find_fixedlength(cc, options);
01851     if (d < 0) return d;
01852     branchlength += d;
01853     do cc += GET(cc, 1); while (*cc == OP_ALT);
01854     cc += 1 + LINK_SIZE;
01855     break;
01856 
01857     /* Reached end of a branch; if it's a ket it is the end of a nested
01858     call. If it's ALT it is an alternation in a nested call. If it is
01859     END it's the end of the outer call. All can be handled by the same code. */
01860 
01861     case OP_ALT:
01862     case OP_KET:
01863     case OP_KETRMAX:
01864     case OP_KETRMIN:
01865     case OP_END:
01866     if (length < 0) length = branchlength;
01867       else if (length != branchlength) return -1;
01868     if (*cc != OP_ALT) return length;
01869     cc += 1 + LINK_SIZE;
01870     branchlength = 0;
01871     break;
01872 
01873     /* Skip over assertive subpatterns */
01874 
01875     case OP_ASSERT:
01876     case OP_ASSERT_NOT:
01877     case OP_ASSERTBACK:
01878     case OP_ASSERTBACK_NOT:
01879     do cc += GET(cc, 1); while (*cc == OP_ALT);
01880     /* Fall through */
01881 
01882     /* Skip over things that don't match chars */
01883 
01884     case OP_REVERSE:
01885     case OP_BRANUMBER:
01886     case OP_CREF:
01887     case OP_OPT:
01888     case OP_CALLOUT:
01889     case OP_SOD:
01890     case OP_SOM:
01891     case OP_EOD:
01892     case OP_EODN:
01893     case OP_CIRC:
01894     case OP_DOLL:
01895     case OP_NOT_WORD_BOUNDARY:
01896     case OP_WORD_BOUNDARY:
01897     cc += OP_lengths[*cc];
01898     break;
01899 
01900     /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
01901     This requires a scan of the string, unfortunately. We assume valid UTF-8
01902     strings, so all we do is reduce the length by one for every byte whose bits
01903     are 10xxxxxx. */
01904 
01905     case OP_CHARS:
01906     branchlength += *(++cc);
01907     cc += *cc + 1;
01908     break;
01909 
01910     /* Handle exact repetitions. The count is already in characters, but we
01911     need to skip over a multibyte character in UTF8 mode.  */
01912 
01913     case OP_EXACT:
01914     branchlength += GET2(cc,1);
01915     cc += 4;
01916     break;
01917 
01918     case OP_TYPEEXACT:
01919     branchlength += GET2(cc,1);
01920     cc += 4;
01921     break;
01922 
01923     /* Handle single-char matchers */
01924 
01925     case OP_NOT_DIGIT:
01926     case OP_DIGIT:
01927     case OP_NOT_WHITESPACE:
01928     case OP_WHITESPACE:
01929     case OP_NOT_WORDCHAR:
01930     case OP_WORDCHAR:
01931     case OP_ANY:
01932     branchlength++;
01933     cc++;
01934     break;
01935 
01936     /* The single-byte matcher isn't allowed */
01937 
01938     case OP_ANYBYTE:
01939     return -2;
01940 
01941     /* Check a class for variable quantification */
01942 
01943 
01944     case OP_CLASS:
01945     case OP_NCLASS:
01946     cc += 33;
01947 
01948     switch (*cc)
01949       {
01950       case OP_CRSTAR:
01951       case OP_CRMINSTAR:
01952       case OP_CRQUERY:
01953       case OP_CRMINQUERY:
01954       return -1;
01955 
01956       case OP_CRRANGE:
01957       case OP_CRMINRANGE:
01958       if (GET2(cc,1) != GET2(cc,3)) return -1;
01959       branchlength += GET2(cc,1);
01960       cc += 5;
01961       break;
01962 
01963       default:
01964       branchlength++;
01965       }
01966     break;
01967 
01968     /* Anything else is variable length */
01969 
01970     default:
01971     return -1;
01972     }
01973   }
01974 /* Control never gets here */
01975 }
01976 
01977 
01978 
01979 
01980 /*************************************************
01981 *    Scan compiled regex for numbered bracket    *
01982 *************************************************/
01983 
01984 /* This little function scans through a compiled pattern until it finds a
01985 capturing bracket with the given number.
01986 
01987 Arguments:
01988   code        points to start of expression
01989   utf8        true in UTF-8 mode
01990   number      the required bracket number
01991 
01992 Returns:      pointer to the opcode for the bracket, or NULL if not found
01993 */
01994 
01995 static const uschar *
01996 find_bracket(const uschar *code, int number)
01997 {
01998 
01999 for (;;)
02000   {
02001   register int c = *code;
02002   if (c == OP_END) return NULL;
02003   else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
02004   else if (c > OP_BRA)
02005     {
02006     int n = c - OP_BRA;
02007     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
02008     if (n == number) return (uschar *)code;
02009     code += OP_lengths[OP_BRA];
02010     }
02011   else
02012     {
02013     code += OP_lengths[c];
02014 
02015     }
02016   }
02017 }
02018 
02019 
02020 
02021 /*************************************************
02022 *   Scan compiled regex for recursion reference  *
02023 *************************************************/
02024 
02025 /* This little function scans through a compiled pattern until it finds an
02026 instance of OP_RECURSE.
02027 
02028 Arguments:
02029   code        points to start of expression
02030   utf8        true in UTF-8 mode
02031 
02032 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
02033 */
02034 
02035 static const uschar *
02036 find_recurse(const uschar *code, bool utf8)
02037 {
02038 utf8 = utf8;               /* Stop pedantic compilers complaining */
02039 
02040 for (;;)
02041   {
02042   register int c = *code;
02043   if (c == OP_END) return NULL;
02044   else if (c == OP_RECURSE) return code;
02045   else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
02046   else if (c > OP_BRA)
02047     {
02048     code += OP_lengths[OP_BRA];
02049     }
02050   else
02051     {
02052     code += OP_lengths[c];
02053 
02054     }
02055   }
02056 }
02057 
02058 
02059 
02060 /*************************************************
02061 *    Scan compiled branch for non-emptiness      *
02062 *************************************************/
02063 
02064 /* This function scans through a branch of a compiled pattern to see whether it
02065 can match the empty string or not. It is called only from could_be_empty()
02066 below. Note that first_significant_code() skips over assertions. If we hit an
02067 unclosed bracket, we return "empty" - this means we've struck an inner bracket
02068 whose current branch will already have been scanned.
02069 
02070 Arguments:
02071   code        points to start of search
02072   endcode     points to where to stop
02073   utf8        true if in UTF8 mode
02074 
02075 Returns:      true if what is matched could be empty
02076 */
02077 
02078 static bool
02079 could_be_empty_branch(const uschar *code, const uschar *endcode, bool utf8)
02080 {
02081 register int c;
02082 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
02083      code < endcode;
02084      code = first_significant_code(code + OP_lengths[c], NULL, 0))
02085   {
02086   const uschar *ccode;
02087 
02088   c = *code;
02089 
02090   if (c >= OP_BRA)
02091     {
02092     bool empty_branch;
02093     if (GET(code, 1) == 0) return true;    /* Hit unclosed bracket */
02094 
02095     /* Scan a closed bracket */
02096 
02097     empty_branch = false;
02098     do
02099       {
02100       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
02101         empty_branch = true;
02102       code += GET(code, 1);
02103       }
02104     while (*code == OP_ALT);
02105     if (!empty_branch) return false;   /* All branches are non-empty */
02106     code += 1 + LINK_SIZE;
02107     c = *code;
02108     }
02109 
02110   else switch (c)
02111     {
02112     /* Check for quantifiers after a class */
02113 
02114 
02115     case OP_CLASS:
02116     case OP_NCLASS:
02117     ccode = code + 33;
02118 
02119 
02120     switch (*ccode)
02121       {
02122       case OP_CRSTAR:            /* These could be empty; continue */
02123       case OP_CRMINSTAR:
02124       case OP_CRQUERY:
02125       case OP_CRMINQUERY:
02126       break;
02127 
02128       default:                   /* Non-repeat => class must match */
02129       case OP_CRPLUS:            /* These repeats aren't empty */
02130       case OP_CRMINPLUS:
02131       return false;
02132 
02133       case OP_CRRANGE:
02134       case OP_CRMINRANGE:
02135       if (GET2(ccode, 1) > 0) return false;  /* Minimum > 0 */
02136       break;
02137       }
02138     break;
02139 
02140     /* Opcodes that must match a character */
02141 
02142     case OP_NOT_DIGIT:
02143     case OP_DIGIT:
02144     case OP_NOT_WHITESPACE:
02145     case OP_WHITESPACE:
02146     case OP_NOT_WORDCHAR:
02147     case OP_WORDCHAR:
02148     case OP_ANY:
02149     case OP_ANYBYTE:
02150     case OP_CHARS:
02151     case OP_NOT:
02152     case OP_PLUS:
02153     case OP_MINPLUS:
02154     case OP_EXACT:
02155     case OP_NOTPLUS:
02156     case OP_NOTMINPLUS:
02157     case OP_NOTEXACT:
02158     case OP_TYPEPLUS:
02159     case OP_TYPEMINPLUS:
02160     case OP_TYPEEXACT:
02161     return false;
02162 
02163     /* End of branch */
02164 
02165     case OP_KET:
02166     case OP_KETRMAX:
02167     case OP_KETRMIN:
02168     case OP_ALT:
02169     return true;
02170 
02171     }
02172   }
02173 
02174 return true;
02175 }
02176 
02177 
02178 
02179 /*************************************************
02180 *    Scan compiled regex for non-emptiness       *
02181 *************************************************/
02182 
02183 /* This function is called to check for left recursive calls. We want to check
02184 the current branch of the current pattern to see if it could match the empty
02185 string. If it could, we must look outwards for branches at other levels,
02186 stopping when we pass beyond the bracket which is the subject of the recursion.
02187 
02188 Arguments:
02189   code        points to start of the recursion
02190   endcode     points to where to stop (current RECURSE item)
02191   bcptr       points to the chain of current (unclosed) branch starts
02192   utf8        true if in UTF-8 mode
02193 
02194 Returns:      true if what is matched could be empty
02195 */
02196 
02197 static bool
02198 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
02199   bool utf8)
02200 {
02201 while (bcptr != NULL && bcptr->current >= code)
02202   {
02203   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return false;
02204   bcptr = bcptr->outer;
02205   }
02206 return true;
02207 }
02208 
02209 
02210 
02211 /*************************************************
02212 *           Check for POSIX class syntax         *
02213 *************************************************/
02214 
02215 /* This function is called when the sequence "[:" or "[." or "[=" is
02216 encountered in a character class. It checks whether this is followed by an
02217 optional ^ and then a sequence of letters, terminated by a matching ":]" or
02218 ".]" or "=]".
02219 
02220 Argument:
02221   ptr      pointer to the initial [
02222   endptr   where to return the end pointer
02223   cd       pointer to compile data
02224 
02225 Returns:   true or false
02226 */
02227 
02228 static bool
02229 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
02230 {
02231 int terminator;          /* Don't combine these lines; the Solaris cc */
02232 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
02233 if (*(++ptr) == '^') ptr++;
02234 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
02235 if (*ptr == terminator && ptr[1] == ']')
02236   {
02237   *endptr = ptr;
02238   return true;
02239   }
02240 return false;
02241 }
02242 
02243 
02244 
02245 
02246 /*************************************************
02247 *          Check POSIX class name                *
02248 *************************************************/
02249 
02250 /* This function is called to check the name given in a POSIX-style class entry
02251 such as [:alnum:].
02252 
02253 Arguments:
02254   ptr        points to the first letter
02255   len        the length of the name
02256 
02257 Returns:     a value representing the name, or -1 if unknown
02258 */
02259 
02260 static int
02261 check_posix_name(const uschar *ptr, int len)
02262 {
02263 register int yield = 0;
02264 while (posix_name_lengths[yield] != 0)
02265   {
02266   if (len == posix_name_lengths[yield] &&
02267     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
02268   yield++;
02269   }
02270 return -1;
02271 }
02272 
02273 
02274 /*************************************************
02275 *    Adjust OP_RECURSE items in repeated group   *
02276 *************************************************/
02277 
02278 /* OP_RECURSE items contain an offset from the start of the regex to the group
02279 that is referenced. This means that groups can be replicated for fixed
02280 repetition simply by copying (because the recursion is allowed to refer to
02281 earlier groups that are outside the current group). However, when a group is
02282 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
02283 it, after it has been compiled. This means that any OP_RECURSE items within it
02284 that refer to the group itself or any contained groups have to have their
02285 offsets adjusted. That is the job of this function. Before it is called, the
02286 partially compiled regex must be temporarily terminated with OP_END.
02287 
02288 Arguments:
02289   group      points to the start of the group
02290   adjust     the amount by which the group is to be moved
02291   utf8       true in UTF-8 mode
02292   cd         contains pointers to tables etc.
02293 
02294 Returns:     nothing
02295 */
02296 
02297 static void
02298 adjust_recurse(uschar *group, int adjust, bool utf8, compile_data *cd)
02299 {
02300 uschar *ptr = group;
02301 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
02302   {
02303   int offset = GET(ptr, 1);
02304   if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
02305   ptr += 1 + LINK_SIZE;
02306   }
02307 }
02308 
02309 
02310 
02311 /*************************************************
02312 *           Compile one branch                   *
02313 *************************************************/
02314 
02315 /* Scan the pattern, compiling it into the code vector. If the options are
02316 changed during the branch, the pointer is used to change the external options
02317 bits.
02318 
02319 Arguments:
02320   optionsptr     pointer to the option bits
02321   brackets       points to number of extracting brackets used
02322   code           points to the pointer to the current code point
02323   ptrptr         points to the current pattern pointer
02324   errorptr       points to pointer to error message
02325   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
02326   reqbyteptr     set to the last literal character required, else < 0
02327   bcptr          points to current branch chain
02328   cd             contains pointers to tables etc.
02329 
02330 Returns:         true on success
02331                  false, with *errorptr set on error
02332 */
02333 
02334 static bool
02335 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
02336   const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
02337   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
02338 {
02339 int repeat_type, op_type;
02340 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
02341 int bravalue = 0;
02342 int length;
02343 int greedy_default, greedy_non_default;
02344 int firstbyte, reqbyte;
02345 int zeroreqbyte, zerofirstbyte;
02346 int req_caseopt, reqvary, tempreqvary;
02347 int condcount = 0;
02348 int options = *optionsptr;
02349 register int c;
02350 register uschar *code = *codeptr;
02351 uschar *tempcode;
02352 bool inescq = false;
02353 bool groupsetfirstbyte = false;
02354 const uschar *ptr = *ptrptr;
02355 const uschar *tempptr;
02356 uschar *previous = NULL;
02357 uschar classa[32];
02358 
02359 bool utf8 = false;
02360 
02361 /* Set up the default and non-default settings for greediness */
02362 
02363 greedy_default = ((options & PCRE_UNGREEDY) != 0);
02364 greedy_non_default = greedy_default ^ 1;
02365 
02366 /* Initialize no first char, no required char. REQ_UNSET means "no char
02367 matching encountered yet". It gets changed to REQ_NONE if we hit something that
02368 matches a non-fixed char first char; reqbyte just remains unset if we never
02369 find one.
02370 
02371 When we hit a repeat whose minimum is zero, we may have to adjust these values
02372 to take the zero repeat into account. This is implemented by setting them to
02373 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
02374 item types that can be repeated set these backoff variables appropriately. */
02375 
02376 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
02377 
02378 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
02379 according to the current setting of the caseless flag. REQ_CASELESS is a bit
02380 value > 255. It is added into the firstbyte or reqbyte variables to record the
02381 case status of the value. */
02382 
02383 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
02384 
02385 /* Switch on next character until the end of the branch */
02386 
02387 for (;; ptr++)
02388   {
02389   bool negate_class;
02390   bool possessive_quantifier;
02391   int class_charcount;
02392   int class_lastchar;
02393   int newoptions;
02394   int recno;
02395   int skipbytes;
02396   int subreqbyte;
02397   int subfirstbyte;
02398 
02399   c = *ptr;
02400   if (inescq && c != 0) goto NORMAL_CHAR;
02401 
02402   if ((options & PCRE_EXTENDED) != 0)
02403     {
02404     if ((cd->ctypes[c] & ctype_space) != 0) continue;
02405     if (c == '#')
02406       {
02407       /* The space before the ; is to avoid a warning on a silly compiler
02408       on the Macintosh. */
02409       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
02410       if (c != 0) continue;   /* Else fall through to handle end of string */
02411       }
02412     }
02413 
02414   switch(c)
02415     {
02416     /* The branch terminates at end of string, |, or ). */
02417 
02418     case 0:
02419     case '|':
02420     case ')':
02421     *firstbyteptr = firstbyte;
02422     *reqbyteptr = reqbyte;
02423     *codeptr = code;
02424     *ptrptr = ptr;
02425     return true;
02426 
02427     /* Handle single-character metacharacters. In multiline mode, ^ disables
02428     the setting of any following char as a first character. */
02429 
02430     case '^':
02431     if ((options & PCRE_MULTILINE) != 0)
02432       {
02433       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
02434       }
02435     previous = NULL;
02436     *code++ = OP_CIRC;
02437     break;
02438 
02439     case '$':
02440     previous = NULL;
02441     *code++ = OP_DOLL;
02442     break;
02443 
02444     /* There can never be a first char if '.' is first, whatever happens about
02445     repeats. The value of reqbyte doesn't change either. */
02446 
02447     case '.':
02448     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
02449     zerofirstbyte = firstbyte;
02450     zeroreqbyte = reqbyte;
02451     previous = code;
02452     *code++ = OP_ANY;
02453     break;
02454 
02455     /* Character classes. If the included characters are all < 255 in value, we
02456     build a 32-byte bitmap of the permitted characters, except in the special
02457     case where there is only one such character. For negated classes, we build
02458     the map as usual, then invert it at the end. However, we use a different
02459     opcode so that data characters > 255 can be handled correctly.
02460 
02461     If the class contains characters outside the 0-255 range, a different
02462     opcode is compiled. It may optionally have a bit map for characters < 256,
02463     but those above are explicitly listed afterwards. A flag byte tells
02464     whether the bitmap is present, and whether this is a negated class or not.
02465     */
02466 
02467     case '[':
02468     previous = code;
02469 
02470     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
02471     they are encountered at the top level, so we'll do that too. */
02472 
02473     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
02474         check_posix_syntax(ptr, &tempptr, cd))
02475       {
02476       *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
02477       goto FAILED;
02478       }
02479 
02480     /* If the first character is '^', set the negation flag and skip it. */
02481 
02482     if ((c = *(++ptr)) == '^')
02483       {
02484       negate_class = true;
02485       c = *(++ptr);
02486       }
02487     else
02488       {
02489       negate_class = false;
02490       }
02491 
02492     /* Keep a count of chars with values < 256 so that we can optimize the case
02493     of just a single character (as long as it's < 256). For higher valued UTF-8
02494     characters, we don't yet do any optimization. */
02495 
02496     class_charcount = 0;
02497     class_lastchar = -1;
02498 
02499 
02500     /* Initialize the 32-char bit map to all zeros. We have to build the
02501     map in a temporary bit of store, in case the class contains only 1
02502     character (< 256), because in that case the compiled code doesn't use the
02503     bit map. */
02504 
02505     memset(classa, 0, 32 * sizeof(uschar));
02506 
02507     /* Process characters until ] is reached. By writing this as a "do" it
02508     means that an initial ] is taken as a data character. The first pass
02509     through the regex checked the overall syntax, so we don't need to be very
02510     strict here. At the start of the loop, c contains the first byte of the
02511     character. */
02512 
02513     do
02514       {
02515 
02516       /* Inside \Q...\E everything is literal except \E */
02517 
02518       if (inescq)
02519         {
02520         if (c == '\\' && ptr[1] == 'E')
02521           {
02522           inescq = false;
02523           ptr++;
02524           continue;
02525           }
02526         else goto LONE_SINGLE_CHARACTER;
02527         }
02528 
02529       /* Handle POSIX class names. Perl allows a negation extension of the
02530       form [:^name:]. A square bracket that doesn't match the syntax is
02531       treated as a literal. We also recognize the POSIX constructions
02532       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
02533       5.6 and 5.8 do. */
02534 
02535       if (c == '[' &&
02536           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
02537           check_posix_syntax(ptr, &tempptr, cd))
02538         {
02539         bool local_negate = false;
02540         int posix_class, i;
02541         register const uschar *cbits = cd->cbits;
02542 
02543         if (ptr[1] != ':')
02544           {
02545           *errorptr = ERR31;
02546           goto FAILED;
02547           }
02548 
02549         ptr += 2;
02550         if (*ptr == '^')
02551           {
02552           local_negate = true;
02553           ptr++;
02554           }
02555 
02556         posix_class = check_posix_name(ptr, tempptr - ptr);
02557         if (posix_class < 0)
02558           {
02559           *errorptr = ERR30;
02560           goto FAILED;
02561           }
02562 
02563         /* If matching is caseless, upper and lower are converted to
02564         alpha. This relies on the fact that the class table starts with
02565         alpha, lower, upper as the first 3 entries. */
02566 
02567         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
02568           posix_class = 0;
02569 
02570         /* Or into the map we are building up to 3 of the static class
02571         tables, or their negations. The [:blank:] class sets up the same
02572         chars as the [:space:] class (all white space). We remove the vertical
02573         white space chars afterwards. */
02574 
02575         posix_class *= 3;
02576         for (i = 0; i < 3; i++)
02577           {
02578           bool blankclass = strncmp((char *)ptr, "blank", 5) == 0;
02579           int taboffset = posix_class_maps[posix_class + i];
02580           if (taboffset < 0) break;
02581           if (local_negate)
02582             {
02583             for (c = 0; c < 32; c++) classa[c] |= ~cbits[c+taboffset];
02584             if (blankclass) classa[1] |= 0x3c;
02585             }
02586           else
02587             {
02588             for (c = 0; c < 32; c++) classa[c] |= cbits[c+taboffset];
02589             if (blankclass) classa[1] &= ~0x3c;
02590             }
02591           }
02592 
02593         ptr = tempptr + 1;
02594         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
02595         continue;    /* End of POSIX syntax handling */
02596         }
02597 
02598       /* Backslash may introduce a single character, or it may introduce one
02599       of the specials, which just set a flag. Escaped items are checked for
02600       validity in the pre-compiling pass. The sequence \b is a special case.
02601       Inside a class (and only there) it is treated as backspace. Elsewhere
02602       it marks a word boundary. Other escapes have preset maps ready to
02603       or into the one we are building. We assume they have more than one
02604       character in them, so set class_charcount bigger than one. */
02605 
02606       if (c == '\\')
02607         {
02608         c = check_escape(&ptr, errorptr, *brackets, options, true);
02609         if (-c == ESC_b) c = '\b';  /* \b is backspace in a class */
02610 
02611         if (-c == ESC_Q)            /* Handle start of quoted string */
02612           {
02613           if (ptr[1] == '\\' && ptr[2] == 'E')
02614             {
02615             ptr += 2; /* avoid empty string */
02616             }
02617           else inescq = true;
02618           continue;
02619           }
02620 
02621         else if (c < 0)
02622           {
02623           register const uschar *cbits = cd->cbits;
02624           class_charcount = 10;     /* Greater than 1 is what matters */
02625           switch (-c)
02626             {
02627             case ESC_d:
02628             for (c = 0; c < 32; c++) classa[c] |= cbits[c+cbit_digit];
02629             continue;
02630 
02631             case ESC_D:
02632             for (c = 0; c < 32; c++) classa[c] |= ~cbits[c+cbit_digit];
02633             continue;
02634 
02635             case ESC_w:
02636             for (c = 0; c < 32; c++) classa[c] |= cbits[c+cbit_word];
02637             continue;
02638 
02639             case ESC_W:
02640             for (c = 0; c < 32; c++) classa[c] |= ~cbits[c+cbit_word];
02641             continue;
02642 
02643             case ESC_s:
02644             for (c = 0; c < 32; c++) classa[c] |= cbits[c+cbit_space];
02645             classa[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
02646             continue;
02647 
02648             case ESC_S:
02649             for (c = 0; c < 32; c++) classa[c] |= ~cbits[c+cbit_space];
02650             classa[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
02651             continue;
02652 
02653             /* Unrecognized escapes are faulted if PCRE is running in its
02654             strict mode. By default, for compatibility with Perl, they are
02655             treated as literals. */
02656 
02657             default:
02658             if ((options & PCRE_EXTRA) != 0)
02659               {
02660               *errorptr = ERR7;
02661               goto FAILED;
02662               }
02663             c = *ptr;    /* The final character */
02664             }
02665           }
02666 
02667         /* Fall through if we have a single character (c >= 0). This may be
02668         > 256 in UTF-8 mode. */
02669 
02670         }   /* End of backslash handling */
02671 
02672       /* A single character may be followed by '-' to form a range. However,
02673       Perl does not permit ']' to be the end of the range. A '-' character
02674       here is treated as a literal. */
02675 
02676       if (ptr[1] == '-' && ptr[2] != ']')
02677         {
02678         int d;
02679         ptr += 2;
02680 
02681         d = *ptr;
02682 
02683         /* The second part of a range can be a single-character escape, but
02684         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
02685         in such circumstances. */
02686 
02687         if (d == '\\')
02688           {
02689           const uschar *oldptr = ptr;
02690           d = check_escape(&ptr, errorptr, *brackets, options, true);
02691 
02692           /* \b is backspace; any other special means the '-' was literal */
02693 
02694           if (d < 0)
02695             {
02696             if (d == -ESC_b) d = '\b'; else
02697               {
02698               ptr = oldptr - 2;
02699               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
02700               }
02701             }
02702           }
02703 
02704         /* Check that the two values are in the correct order */
02705 
02706         if (d < c)
02707           {
02708           *errorptr = ERR8;
02709           goto FAILED;
02710           }
02711 
02712         /* If d is greater than 255, we can't just use the bit map, so set up
02713         for the UTF-8 supporting class type. If we are not caseless, we can
02714         just set up a single range. If we are caseless, the characters < 256
02715         are handled with a bitmap, in order to get the case-insensitive
02716         handling. */
02717 
02718         /* We use the bit map if the range is entirely < 255, or if part of it
02719         is < 255 and matching is caseless. */
02720 
02721         for (; c <= d; c++)
02722           {
02723           classa[c/8] |= (1 << (c&7));
02724           if ((options & PCRE_CASELESS) != 0)
02725             {
02726             int uc = cd->fcc[c];           /* flip case */
02727             classa[uc/8] |= (1 << (uc&7));
02728             }
02729           class_charcount++;                /* in case a one-char range */
02730           class_lastchar = c;
02731           }
02732 
02733         continue;   /* Go get the next char in the class */
02734         }
02735 
02736       /* Handle a lone single character - we can get here for a normal
02737       non-escape char, or after \ that introduces a single character. */
02738 
02739       LONE_SINGLE_CHARACTER:
02740 
02741       /* Handle a single-byte character */
02742         {
02743         classa[c/8] |= (1 << (c&7));
02744         if ((options & PCRE_CASELESS) != 0)
02745           {
02746           c = cd->fcc[c];   /* flip case */
02747           classa[c/8] |= (1 << (c&7));
02748           }
02749         class_charcount++;
02750         class_lastchar = c;
02751         }
02752       }
02753 
02754     /* Loop until ']' reached; the check for end of string happens inside the
02755     loop. This "while" is the end of the "do" above. */
02756 
02757     while ((c = *(++ptr)) != ']' || inescq);
02758 
02759     /* If class_charcount is 1, we saw precisely one character with a value <
02760     256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
02761     the one character is < 128. In non-UTF-8 mode we can always optimize.
02762 
02763     The optimization throws away the bit map. We turn the item into a
02764     1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
02765     that OP_NOT does not support multibyte characters. In the positive case, it
02766     can cause firstbyte to be set. Otherwise, there can be no first char if
02767     this item is first, whatever repeat count may follow. In the case of
02768     reqbyte, save the previous value for reinstating. */
02769 
02770     if (class_charcount == 1)
02771       {
02772       zeroreqbyte = reqbyte;
02773       if (negate_class)
02774         {
02775         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
02776         zerofirstbyte = firstbyte;
02777         *code++ = OP_NOT;
02778         }
02779       else
02780         {
02781         if (firstbyte == REQ_UNSET)
02782           {
02783           zerofirstbyte = REQ_NONE;
02784           firstbyte = class_lastchar | req_caseopt;
02785           }
02786         else
02787           {
02788           zerofirstbyte = firstbyte;
02789           reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
02790           }
02791         *code++ = OP_CHARS;
02792         *code++ = 1;
02793         }
02794       *code++ = class_lastchar;
02795       break;  /* End of class handling */
02796       }       /* End of 1-byte optimization */
02797 
02798     /* Otherwise, if this is the first thing in the branch, there can be no
02799     first char setting, whatever the repeat count. Any reqbyte setting must
02800     remain unchanged after any kind of repeat. */
02801 
02802     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
02803     zerofirstbyte = firstbyte;
02804     zeroreqbyte = reqbyte;
02805 
02806     /* If there are characters with values > 255, we have to compile an
02807     extended class, with its own opcode. If there are no characters < 256,
02808     we can omit the bitmap. */
02809 
02810 
02811     /* If there are no characters > 255, negate the 32-byte map if necessary,
02812     and copy it into the code vector. If this is the first thing in the branch,
02813     there can be no first char setting, whatever the repeat count. Any reqbyte
02814     setting must remain unchanged after any kind of repeat. */
02815 
02816     if (negate_class)
02817       {
02818       *code++ = OP_NCLASS;
02819       for (c = 0; c < 32; c++) code[c] = ~classa[c];
02820       }
02821     else
02822       {
02823       *code++ = OP_CLASS;
02824       memcpy(code, classa, 32);
02825       }
02826     code += 32;
02827     break;
02828 
02829     /* Various kinds of repeat */
02830 
02831     case '{':
02832     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
02833     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
02834     if (*errorptr != NULL) goto FAILED;
02835     goto REPEAT;
02836 
02837     case '*':
02838     repeat_min = 0;
02839     repeat_max = -1;
02840     goto REPEAT;
02841 
02842     case '+':
02843     repeat_min = 1;
02844     repeat_max = -1;
02845     goto REPEAT;
02846 
02847     case '?':
02848     repeat_min = 0;
02849     repeat_max = 1;
02850 
02851     REPEAT:
02852     if (previous == NULL)
02853       {
02854       *errorptr = ERR9;
02855       goto FAILED;
02856       }
02857 
02858     if (repeat_min == 0)
02859       {
02860       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
02861       reqbyte = zeroreqbyte;        /* Ditto */
02862       }
02863 
02864     /* Remember whether this is a variable length repeat */
02865 
02866     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
02867 
02868     op_type = 0;                    /* Default single-char op codes */
02869     possessive_quantifier = false;  /* Default not possessive quantifier */
02870 
02871     /* Save start of previous item, in case we have to move it up to make space
02872     for an inserted OP_ONCE for the additional '+' extension. */
02873 
02874     tempcode = previous;
02875 
02876     /* If the next character is '+', we have a possessive quantifier. This
02877     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
02878     If the next character is '?' this is a minimizing repeat, by default,
02879     but if PCRE_UNGREEDY is set, it works the other way round. We change the
02880     repeat type to the non-default. */
02881 
02882     if (ptr[1] == '+')
02883       {
02884       repeat_type = 0;                  /* Force greedy */
02885       possessive_quantifier = true;
02886       ptr++;
02887       }
02888     else if (ptr[1] == '?')
02889       {
02890       repeat_type = greedy_non_default;
02891       ptr++;
02892       }
02893     else repeat_type = greedy_default;
02894 
02895     /* If previous was a recursion, we need to wrap it inside brackets so that
02896     it can be replicated if necessary. */
02897 
02898     if (*previous == OP_RECURSE)
02899       {
02900       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
02901       code += 1 + LINK_SIZE;
02902       *previous = OP_BRA;
02903       PUT(previous, 1, code - previous);
02904       *code = OP_KET;
02905       PUT(code, 1, code - previous);
02906       code += 1 + LINK_SIZE;
02907       }
02908 
02909     /* If previous was a string of characters, chop off the last one and use it
02910     as the subject of the repeat. If there was only one character, we can
02911     abolish the previous item altogether. If a one-char item has a minimum of
02912     more than one, ensure that it is set in reqbyte - it might not be if a
02913     sequence such as x{3} is the first thing in a branch because the x will
02914     have gone into firstbyte instead.  */
02915 
02916     if (*previous == OP_CHARS)
02917       {
02918       /* Deal with UTF-8 characters that take up more than one byte. It's
02919       easier to write this out separately than try to macrify it. Use c to
02920       hold the length of the character in bytes, plus 0x80 to flag that it's a
02921       length rather than a small character. */
02922 
02923 
02924       /* Handle the case of a single byte - either with no UTF8 support, or
02925       with UTF-8 disabled, or for a UTF-8 character < 128. */
02926 
02927         {
02928         c = *(--code);
02929         if (code == previous + 2)   /* There was only one character */
02930           {
02931           code = previous;              /* Abolish the previous item */
02932           if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
02933           }
02934         else
02935           {
02936           previous[1]--;             /* adjust length */
02937           tempcode = code;           /* Adjust position to be moved for '+' */
02938           }
02939         }
02940 
02941       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
02942       }
02943 
02944     /* If previous was a single negated character ([^a] or similar), we use
02945     one of the special opcodes, replacing it. The code is shared with single-
02946     character repeats by setting op_type to add a suitable offset into
02947     repeat_type. OP_NOT is currently used only for single-byte chars. */
02948 
02949     else if (*previous == OP_NOT)
02950       {
02951       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
02952       c = previous[1];
02953       code = previous;
02954       goto OUTPUT_SINGLE_REPEAT;
02955       }
02956 
02957     /* If previous was a character type match (\d or similar), abolish it and
02958     create a suitable repeat item. The code is shared with single-character
02959     repeats by setting op_type to add a suitable offset into repeat_type. */
02960 
02961     else if (*previous < OP_EODN)
02962       {
02963       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
02964       c = *previous;
02965       code = previous;
02966 
02967       OUTPUT_SINGLE_REPEAT:
02968 
02969       /* If the maximum is zero then the minimum must also be zero; Perl allows
02970       this case, so we do too - by simply omitting the item altogether. */
02971 
02972       if (repeat_max == 0) goto END_REPEAT;
02973 
02974       /* Combine the op_type with the repeat_type */
02975 
02976       repeat_type += op_type;
02977 
02978       /* A minimum of zero is handled either as the special case * or ?, or as
02979       an UPTO, with the maximum given. */
02980 
02981       if (repeat_min == 0)
02982         {
02983         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
02984           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
02985         else
02986           {
02987           *code++ = OP_UPTO + repeat_type;
02988           PUT2INC(code, 0, repeat_max);
02989           }
02990         }
02991 
02992       /* The case {1,} is handled as the special case + */
02993 
02994       else if (repeat_min == 1 && repeat_max == -1)
02995         *code++ = OP_PLUS + repeat_type;
02996 
02997       /* The case {n,n} is just an EXACT, while the general case {n,m} is
02998       handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
02999 
03000       else
03001         {
03002         if (repeat_min != 1)
03003           {
03004           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
03005           PUT2INC(code, 0, repeat_min);
03006           }
03007 
03008         /* If the minimum is 1 and the previous item was a character string,
03009         we either have to put back the item that got cancelled if the string
03010         length was 1, or add the character back onto the end of a longer
03011         string. For a character type nothing need be done; it will just get
03012         put back naturally. Note that the final character is always going to
03013         get added below, so we leave code ready for its insertion. */
03014 
03015         else if (*previous == OP_CHARS)
03016           {
03017           if (code == previous) code += 2; else
03018 
03019           /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
03020           bit set as a flag. The length will always be between 2 and 6. */
03021 
03022           previous[1]++;
03023           }
03024 
03025         /*  For a single negated character we also have to put back the
03026         item that got cancelled. At present this applies only to single byte
03027         characters in any mode. */
03028 
03029         else if (*previous == OP_NOT) code++;
03030 
03031         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
03032         we have to insert the character for the previous code. In UTF-8 mode,
03033         long characters have their length in c, with the 0x80 bit as a flag. */
03034 
03035         if (repeat_max < 0)
03036           {
03037           *code++ = c;
03038           *code++ = OP_STAR + repeat_type;
03039           }
03040 
03041         /* Else insert an UPTO if the max is greater than the min, again
03042         preceded by the character, for the previously inserted code. */
03043 
03044         else if (repeat_max != repeat_min)
03045           {
03046           *code++ = c;
03047           repeat_max -= repeat_min;
03048           *code++ = OP_UPTO + repeat_type;
03049           PUT2INC(code, 0, repeat_max);
03050           }
03051         }
03052 
03053       /* The character or character type itself comes last in all cases. */
03054 
03055 
03056       *code++ = c;
03057       }
03058 
03059     /* If previous was a character class or a back reference, we put the repeat
03060     stuff after it, but just skip the item if the repeat was {0,0}. */
03061 
03062     else if (*previous == OP_CLASS ||
03063              *previous == OP_NCLASS ||
03064              *previous == OP_REF)
03065       {
03066       if (repeat_max == 0)
03067         {
03068         code = previous;
03069         goto END_REPEAT;
03070         }
03071       if (repeat_min == 0 && repeat_max == -1)
03072         *code++ = OP_CRSTAR + repeat_type;
03073       else if (repeat_min == 1 && repeat_max == -1)
03074         *code++ = OP_CRPLUS + repeat_type;
03075       else if (repeat_min == 0 && repeat_max == 1)
03076         *code++ = OP_CRQUERY + repeat_type;
03077       else
03078         {
03079         *code++ = OP_CRRANGE + repeat_type;
03080         PUT2INC(code, 0, repeat_min);
03081         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
03082         PUT2INC(code, 0, repeat_max);
03083         }
03084       }
03085 
03086     /* If previous was a bracket group, we may have to replicate it in certain
03087     cases. */
03088 
03089     else if (*previous >= OP_BRA || *previous == OP_ONCE ||
03090              *previous == OP_COND)
03091       {
03092       register int i;
03093       int ketoffset = 0;
03094       int len = code - previous;
03095       uschar *bralink = NULL;
03096 
03097       /* If the maximum repeat count is unlimited, find the end of the bracket
03098       by scanning through from the start, and compute the offset back to it
03099       from the current code pointer. There may be an OP_OPT setting following
03100       the final KET, so we can't find the end just by going back from the code
03101       pointer. */
03102 
03103       if (repeat_max == -1)
03104         {
03105         register uschar *ket = previous;
03106         do ket += GET(ket, 1); while (*ket != OP_KET);
03107         ketoffset = code - ket;
03108         }
03109 
03110       /* The case of a zero minimum is special because of the need to stick
03111       OP_BRAZERO in front of it, and because the group appears once in the
03112       data, whereas in other cases it appears the minimum number of times. For
03113       this reason, it is simplest to treat this case separately, as otherwise
03114       the code gets far too messy. There are several special subcases when the
03115       minimum is zero. */
03116 
03117       if (repeat_min == 0)
03118         {
03119         /* If the maximum is also zero, we just omit the group from the output
03120         altogether. */
03121 
03122         if (repeat_max == 0)
03123           {
03124           code = previous;
03125           goto END_REPEAT;
03126           }
03127 
03128         /* If the maximum is 1 or unlimited, we just have to stick in the
03129         BRAZERO and do no more at this point. However, we do need to adjust
03130         any OP_RECURSE calls inside the group that refer to the group itself or
03131         any internal group, because the offset is from the start of the whole
03132         regex. Temporarily terminate the pattern while doing this. */
03133 
03134         if (repeat_max <= 1)
03135           {
03136           *code = OP_END;
03137           adjust_recurse(previous, 1, utf8, cd);
03138           memmove(previous+1, previous, len);
03139           code++;
03140           *previous++ = OP_BRAZERO + repeat_type;
03141           }
03142 
03143         /* If the maximum is greater than 1 and limited, we have to replicate
03144         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
03145         The first one has to be handled carefully because it's the original
03146         copy, which has to be moved up. The remainder can be handled by code
03147         that is common with the non-zero minimum case below. We have to
03148           adjust the value of repeat_max, since one less copy is required. Once
03149         again, we may have to adjust any OP_RECURSE calls inside the group. */
03150 
03151         else
03152           {
03153           int offset;
03154           *code = OP_END;
03155           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
03156           memmove(previous + 2 + LINK_SIZE, previous, len);
03157           code += 2 + LINK_SIZE;
03158           *previous++ = OP_BRAZERO + repeat_type;
03159           *previous++ = OP_BRA;
03160 
03161           /* We chain together the bracket offset fields that have to be
03162           filled in later when the ends of the brackets are reached. */
03163 
03164           offset = (bralink == NULL)? 0 : previous - bralink;
03165           bralink = previous;
03166           PUTINC(previous, 0, offset);
03167           }
03168 
03169         repeat_max--;
03170         }
03171 
03172       /* If the minimum is greater than zero, replicate the group as many
03173       times as necessary, and adjust the maximum to the number of subsequent
03174       copies that we need. If we set a first char from the group, and didn't
03175       set a required char, copy the latter from the former. */
03176 
03177       else
03178         {
03179         if (repeat_min > 1)
03180           {
03181           if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
03182           for (i = 1; i < repeat_min; i++)
03183             {
03184             memcpy(code, previous, len);
03185             code += len;
03186             }
03187           }
03188         if (repeat_max > 0) repeat_max -= repeat_min;
03189         }
03190 
03191       /* This code is common to both the zero and non-zero minimum cases. If
03192       the maximum is limited, it replicates the group in a nested fashion,
03193       remembering the bracket starts on a stack. In the case of a zero minimum,
03194       the first one was set up above. In all cases the repeat_max now specifies
03195       the number of additional copies needed. */
03196 
03197       if (repeat_max >= 0)
03198         {
03199         for (i = repeat_max - 1; i >= 0; i--)
03200           {
03201           *code++ = OP_BRAZERO + repeat_type;
03202 
03203           /* All but the final copy start a new nesting, maintaining the
03204           chain of brackets outstanding. */
03205 
03206           if (i != 0)
03207             {
03208             int offset;
03209             *code++ = OP_BRA;
03210             offset = (bralink == NULL)? 0 : code - bralink;
03211             bralink = code;
03212             PUTINC(code, 0, offset);
03213             }
03214 
03215           memcpy(code, previous, len);
03216           code += len;
03217           }
03218 
03219         /* Now chain through the pending brackets, and fill in their length
03220         fields (which are holding the chain links pro tem). */
03221 
03222         while (bralink != NULL)
03223           {
03224           int oldlinkoffset;
03225           int offset = code - bralink + 1;
03226           uschar *bra = code - offset;
03227           oldlinkoffset = GET(bra, 1);
03228           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
03229           *code++ = OP_KET;
03230           PUTINC(code, 0, offset);
03231           PUT(bra, 1, offset);
03232           }
03233         }
03234 
03235       /* If the maximum is unlimited, set a repeater in the final copy. We
03236       can't just offset backwards from the current code point, because we
03237       don't know if there's been an options resetting after the ket. The
03238       correct offset was computed above. */
03239 
03240       else code[-ketoffset] = OP_KETRMAX + repeat_type;
03241       }
03242 
03243     /* Else there's some kind of shambles */
03244 
03245     else
03246       {
03247       *errorptr = ERR11;
03248       goto FAILED;
03249       }
03250 
03251     /* If the character following a repeat is '+', we wrap the entire repeated
03252     item inside OP_ONCE brackets. This is just syntactic sugar, taken from
03253     Sun's Java package. The repeated item starts at tempcode, not at previous,
03254     which might be the first part of a string whose (former) last char we
03255     repeated. However, we don't support '+' after a greediness '?'. */
03256 
03257     if (possessive_quantifier)
03258       {
03259       int len = code - tempcode;
03260       memmove(tempcode + 1+LINK_SIZE, tempcode, len);
03261       code += 1 + LINK_SIZE;
03262       len += 1 + LINK_SIZE;
03263       tempcode[0] = OP_ONCE;
03264       *code++ = OP_KET;
03265       PUTINC(code, 0, len);
03266       PUT(tempcode, 1, len);
03267       }
03268 
03269     /* In all cases we no longer have a previous item. We also set the
03270     "follows varying string" flag for subsequently encountered reqbytes if
03271     it isn't already set and we have just passed a varying length item. */
03272 
03273     END_REPEAT:
03274     previous = NULL;
03275     cd->req_varyopt |= reqvary;
03276     break;
03277 
03278 
03279     /* Start of nested bracket sub-expression, or comment or lookahead or
03280     lookbehind or option setting or condition. First deal with special things
03281     that can come after a bracket; all are introduced by ?, and the appearance
03282     of any of them means that this is not a referencing group. They were
03283     checked for validity in the first pass over the string, so we don't have to
03284     check for syntax errors here.  */
03285 
03286     case '(':
03287     newoptions = options;
03288     skipbytes = 0;
03289 
03290     if (*(++ptr) == '?')
03291       {
03292       int set, unset;
03293       int *optset;
03294 
03295       switch (*(++ptr))
03296         {
03297         case '#':                 /* Comment; skip to ket */
03298         ptr++;
03299         while (*ptr != ')') ptr++;
03300         continue;
03301 
03302         case ':':                 /* Non-extracting bracket */
03303         bravalue = OP_BRA;
03304         ptr++;
03305         break;
03306 
03307         case '(':
03308         bravalue = OP_COND;       /* Conditional group */
03309 
03310         /* Condition to test for recursion */
03311 
03312         if (ptr[1] == 'R')
03313           {
03314           code[1+LINK_SIZE] = OP_CREF;
03315           PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
03316           skipbytes = 3;
03317           ptr += 3;
03318           }
03319 
03320         /* Condition to test for a numbered subpattern match. We know that
03321         if a digit follows ( then there will just be digits until ) because
03322         the syntax was checked in the first pass. */
03323 
03324         else if ((digitab[ptr[1]] && ctype_digit) != 0)
03325           {
03326           int condref;                 /* Don't amalgamate; some compilers */
03327           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */
03328           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
03329           if (condref == 0)
03330             {
03331             *errorptr = ERR35;
03332             goto FAILED;
03333             }
03334           ptr++;
03335           code[1+LINK_SIZE] = OP_CREF;
03336           PUT2(code, 2+LINK_SIZE, condref);
03337           skipbytes = 3;
03338           }
03339         /* For conditions that are assertions, we just fall through, having
03340         set bravalue above. */
03341         break;
03342 
03343         case '=':                 /* Positive lookahead */
03344         bravalue = OP_ASSERT;
03345         ptr++;
03346         break;
03347 
03348         case '!':                 /* Negative lookahead */
03349         bravalue = OP_ASSERT_NOT;
03350         ptr++;
03351         break;
03352 
03353         case '<':                 /* Lookbehinds */
03354         switch (*(++ptr))
03355           {
03356           case '=':               /* Positive lookbehind */
03357           bravalue = OP_ASSERTBACK;
03358           ptr++;
03359           break;
03360 
03361           case '!':               /* Negative lookbehind */
03362           bravalue = OP_ASSERTBACK_NOT;
03363           ptr++;
03364           break;
03365           }
03366         break;
03367 
03368         case '>':                 /* One-time brackets */
03369         bravalue = OP_ONCE;
03370         ptr++;
03371         break;
03372 
03373         case 'C':                 /* Callout - may be followed by digits */
03374         *code++ = OP_CALLOUT;
03375           {
03376           int n = 0;
03377           while ((digitab[*(++ptr)] & ctype_digit) != 0)
03378             n = n * 10 + *ptr - '0';
03379           if (n > 255)
03380             {
03381             *errorptr = ERR38;
03382             goto FAILED;
03383             }
03384           *code++ = n;
03385           }
03386         previous = NULL;
03387         continue;
03388 
03389         case 'P':                 /* Named subpattern handling */
03390         if (*(++ptr) == '<')      /* Definition */
03391           {
03392           int i, namelen;
03393           uschar *slot = cd->name_table;
03394           const uschar *name;     /* Don't amalgamate; some compilers */
03395           name = ++ptr;           /* grumble at autoincrement in declaration */
03396 
03397           while (*ptr++ != '>');
03398           namelen = ptr - name - 1;
03399 
03400           for (i = 0; i < cd->names_found; i++)
03401             {
03402             int crc = memcmp(name, slot+2, namelen);
03403             if (crc == 0)
03404               {
03405               if (slot[2+namelen] == 0)
03406                 {
03407                 *errorptr = ERR43;
03408                 goto FAILED;
03409                 }
03410               crc = -1;             /* Current name is substring */
03411               }
03412             if (crc < 0)
03413               {
03414               memmove(slot + cd->name_entry_size, slot,
03415                 (cd->names_found - i) * cd->name_entry_size);
03416               break;
03417               }
03418             slot += cd->name_entry_size;
03419             }
03420 
03421           PUT2(slot, 0, *brackets + 1);
03422           memcpy(slot + 2, name, namelen);
03423           slot[2+namelen] = 0;
03424           cd->names_found++;
03425           goto NUMBERED_GROUP;
03426           }
03427 
03428         if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */
03429           {
03430           int i, namelen;
03431           int type = *ptr++;
03432           const uschar *name = ptr;
03433           uschar *slot = cd->name_table;
03434 
03435           while (*ptr != ')') ptr++;
03436           namelen = ptr - name;
03437 
03438           for (i = 0; i < cd->names_found; i++)
03439             {
03440             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
03441             slot += cd->name_entry_size;
03442             }
03443           if (i >= cd->names_found)
03444             {
03445             *errorptr = ERR15;
03446             goto FAILED;
03447             }
03448 
03449           recno = GET2(slot, 0);
03450 
03451           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */
03452 
03453           /* Back reference */
03454 
03455           previous = code;
03456           *code++ = OP_REF;
03457           PUT2INC(code, 0, recno);
03458           cd->backref_map |= (recno < 32)? (1 << recno) : 1;
03459           if (recno > cd->top_backref) cd->top_backref = recno;
03460           continue;
03461           }
03462 
03463         /* Should never happen */
03464         break;
03465 
03466         case 'R':                 /* Pattern recursion */
03467         ptr++;                    /* Same as (?0)      */
03468         /* Fall through */
03469 
03470         /* Recursion or "subroutine" call */
03471 
03472         case '0': case '1': case '2': case '3': case '4':
03473         case '5': case '6': case '7': case '8': case '9':
03474           {
03475           const uschar *called;
03476           recno = 0;
03477           while((digitab[*ptr] & ctype_digit) != 0)
03478             recno = recno * 10 + *ptr++ - '0';
03479 
03480           /* Come here from code above that handles a named recursion */
03481 
03482           HANDLE_RECURSION:
03483 
03484           previous = code;
03485 
03486           /* Find the bracket that is being referenced. Temporarily end the
03487           regex in case it doesn't exist. */
03488 
03489           *code = OP_END;
03490           called = (recno == 0)?
03491             cd->start_code : find_bracket(cd->start_code, recno);
03492 
03493           if (called == NULL)
03494             {
03495             *errorptr = ERR15;
03496             goto FAILED;
03497             }
03498 
03499           /* If the subpattern is still open, this is a recursive call. We
03500           check to see if this is a left recursion that could loop for ever,
03501           and diagnose that case. */
03502 
03503           if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
03504             {
03505             *errorptr = ERR40;
03506             goto FAILED;
03507             }
03508 
03509           /* Insert the recursion/subroutine item */
03510 
03511           *code = OP_RECURSE;
03512           PUT(code, 1, called - cd->start_code);
03513           code += 1 + LINK_SIZE;
03514           }
03515         continue;
03516 
03517         /* Character after (? not specially recognized */
03518 
03519         default:                  /* Option setting */
03520         set = unset = 0;
03521         optset = &set;
03522 
03523         while (*ptr != ')' && *ptr != ':')
03524           {
03525           switch (*ptr++)
03526             {
03527             case '-': optset = &unset; break;
03528 
03529             case 'i': *optset |= PCRE_CASELESS; break;
03530             case 'm': *optset |= PCRE_MULTILINE; break;
03531             case 's': *optset |= PCRE_DOTALL; break;
03532             case 'x': *optset |= PCRE_EXTENDED; break;
03533             case 'U': *optset |= PCRE_UNGREEDY; break;
03534             case 'X': *optset |= PCRE_EXTRA; break;
03535             }
03536           }
03537 
03538         /* Set up the changed option bits, but don't change anything yet. */
03539 
03540         newoptions = (options | set) & (~unset);
03541 
03542         /* If the options ended with ')' this is not the start of a nested
03543         group with option changes, so the options change at this level. Compile
03544         code to change the ims options if this setting actually changes any of
03545         them. We also pass the new setting back so that it can be put at the
03546         start of any following branches, and when this group ends (if we are in
03547         a group), a resetting item can be compiled.
03548 
03549         Note that if this item is right at the start of the pattern, the
03550         options will have been abstracted and made global, so there will be no
03551         change to compile. */
03552 
03553         if (*ptr == ')')
03554           {
03555           if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
03556             {
03557             *code++ = OP_OPT;
03558             *code++ = newoptions & PCRE_IMS;
03559             }
03560 
03561           /* Change options at this level, and pass them back for use
03562           in subsequent branches. Reset the greedy defaults and the case
03563           value for firstbyte and reqbyte. */
03564 
03565           *optionsptr = options = newoptions;
03566           greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
03567           greedy_non_default = greedy_default ^ 1;
03568           req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
03569 
03570           previous = NULL;       /* This item can't be repeated */
03571           continue;              /* It is complete */
03572           }
03573 
03574         /* If the options ended with ':' we are heading into a nested group
03575         with possible change of options. Such groups are non-capturing and are
03576         not assertions of any kind. All we need to do is skip over the ':';
03577         the newoptions value is handled below. */
03578 
03579         bravalue = OP_BRA;
03580         ptr++;
03581         }
03582       }
03583 
03584     /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
03585     non-capturing and behave like (?:...) brackets */
03586 
03587     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
03588       {
03589       bravalue = OP_BRA;
03590       }
03591 
03592     /* Else we have a referencing group; adjust the opcode. If the bracket
03593     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
03594     arrange for the true number to follow later, in an OP_BRANUMBER item. */
03595 
03596     else
03597       {
03598       NUMBERED_GROUP:
03599       if (++(*brackets) > EXTRACT_BASIC_MAX)
03600         {
03601         bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
03602         code[1+LINK_SIZE] = OP_BRANUMBER;
03603         PUT2(code, 2+LINK_SIZE, *brackets);
03604         skipbytes = 3;
03605         }
03606       else bravalue = OP_BRA + *brackets;
03607       }
03608 
03609     /* Process nested bracketed re. Assertions may not be repeated, but other
03610     kinds can be. We copy code into a non-register variable in order to be able
03611     to pass its address because some compilers complain otherwise. Pass in a
03612     new setting for the ims options if they have changed. */
03613 
03614     previous = (bravalue >= OP_ONCE)? code : NULL;
03615     *code = bravalue;
03616     tempcode = code;
03617     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
03618 
03619     if (!compile_regex(
03620          newoptions,                   /* The complete new option state */
03621          options & PCRE_IMS,           /* The previous ims option state */
03622          brackets,                     /* Extracting bracket count */
03623          &tempcode,                    /* Where to put code (updated) */
03624          &ptr,                         /* Input pointer (updated) */
03625          errorptr,                     /* Where to put an error message */
03626          (bravalue == OP_ASSERTBACK ||
03627           bravalue == OP_ASSERTBACK_NOT), /* true if back assert */
03628          skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
03629          &subfirstbyte,                /* For possible first char */
03630          &subreqbyte,                  /* For possible last char */
03631          bcptr,                        /* Current branch chain */
03632          cd))                          /* Tables block */
03633       goto FAILED;
03634 
03635     /* At the end of compiling, code is still pointing to the start of the
03636     group, while tempcode has been updated to point past the end of the group
03637     and any option resetting that may follow it. The pattern pointer (ptr)
03638     is on the bracket. */
03639 
03640     /* If this is a conditional bracket, check that there are no more than
03641     two branches in the group. */
03642 
03643     else if (bravalue == OP_COND)
03644       {
03645       uschar *tc = code;
03646       condcount = 0;
03647 
03648       do {
03649          condcount++;
03650          tc += GET(tc,1);
03651          }
03652       while (*tc != OP_KET);
03653 
03654       if (condcount > 2)
03655         {
03656         *errorptr = ERR27;
03657         goto FAILED;
03658         }
03659 
03660       /* If there is just one branch, we must not make use of its firstbyte or
03661       reqbyte, because this is equivalent to an empty second branch. */
03662 
03663       if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
03664       }
03665 
03666     /* Handle updating of the required and first characters. Update for normal
03667     brackets of all kinds, and conditions with two branches (see code above).
03668     If the bracket is followed by a quantifier with zero repeat, we have to
03669     back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
03670     main loop so that they can be accessed for the back off. */
03671 
03672     zeroreqbyte = reqbyte;
03673     zerofirstbyte = firstbyte;
03674     groupsetfirstbyte = false;
03675 
03676     if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
03677       {
03678       /* If we have not yet set a firstbyte in this branch, take it from the
03679       subpattern, remembering that it was set here so that a repeat of more
03680       than one can replicate it as reqbyte if necessary. If the subpattern has
03681       no firstbyte, set "none" for the whole branch. In both cases, a zero
03682       repeat forces firstbyte to "none". */
03683 
03684       if (firstbyte == REQ_UNSET)
03685         {
03686         if (subfirstbyte >= 0)
03687           {
03688           firstbyte = subfirstbyte;
03689           groupsetfirstbyte = true;
03690           }
03691         else firstbyte = REQ_NONE;
03692         zerofirstbyte = REQ_NONE;
03693         }
03694 
03695       /* If firstbyte was previously set, convert the subpattern's firstbyte
03696       into reqbyte if there wasn't one, using the vary flag that was in
03697       existence beforehand. */
03698 
03699       else if (subfirstbyte >= 0 && subreqbyte < 0)
03700         subreqbyte = subfirstbyte | tempreqvary;
03701 
03702       /* If the subpattern set a required byte (or set a first byte that isn't
03703       really the first byte - see above), set it. */
03704 
03705       if (subreqbyte >= 0) reqbyte = subreqbyte;
03706       }
03707 
03708     /* For a forward assertion, we take the reqbyte, if set. This can be
03709     helpful if the pattern that follows the assertion doesn't set a different
03710     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
03711     for an assertion, however because it leads to incorrect effect for patterns
03712     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
03713     of a firstbyte. This is overcome by a scan at the end if there's no
03714     firstbyte, looking for an asserted first char. */
03715 
03716     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
03717 
03718     /* Now update the main code pointer to the end of the group. */
03719 
03720     code = tempcode;
03721 
03722     /* Error if hit end of pattern */
03723 
03724     if (*ptr != ')')
03725       {
03726       *errorptr = ERR14;
03727       goto FAILED;
03728       }
03729     break;
03730 
03731     /* Check \ for being a real metacharacter; if not, fall through and handle
03732     it as a data character at the start of a string. Escape items are checked
03733     for validity in the pre-compiling pass. */
03734 
03735     case '\\':
03736     tempptr = ptr;
03737     c = check_escape(&ptr, errorptr, *brackets, options, false);
03738 
03739     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
03740     are arranged to be the negation of the corresponding OP_values. For the
03741     back references, the values are ESC_REF plus the reference number. Only
03742     back references and those types that consume a character may be repeated.
03743     We can test for values between ESC_b and ESC_Z for the latter; this may
03744     have to change if any new ones are ever created. */
03745 
03746     if (c < 0)
03747       {
03748       if (-c == ESC_Q)            /* Handle start of quoted string */
03749         {
03750         if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
03751           else inescq = true;
03752         continue;
03753         }
03754 
03755       /* For metasequences that actually match a character, we disable the
03756       setting of a first character if it hasn't already been set. */
03757 
03758       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
03759         firstbyte = REQ_NONE;
03760 
03761       /* Set values to reset to if this is followed by a zero repeat. */
03762 
03763       zerofirstbyte = firstbyte;
03764       zeroreqbyte = reqbyte;
03765 
03766       /* Back references are handled specially */
03767 
03768       if (-c >= ESC_REF)
03769         {
03770         int number = -c - ESC_REF;
03771         previous = code;
03772         *code++ = OP_REF;
03773         PUT2INC(code, 0, number);
03774         }
03775       else
03776         {
03777         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
03778         *code++ = -c;
03779         }
03780       continue;
03781       }
03782 
03783     /* Data character: reset and fall through */
03784 
03785     ptr = tempptr;
03786     c = '\\';
03787 
03788     /* Handle a run of data characters until a metacharacter is encountered.
03789     The first character is guaranteed not to be whitespace or # when the
03790     extended flag is set. */
03791 
03792     NORMAL_CHAR:
03793     default:
03794     previous = code;
03795     *code = OP_CHARS;
03796     code += 2;
03797     length = 0;
03798 
03799     do
03800       {
03801       /* If in \Q...\E, check for the end; if not, we always have a literal */
03802 
03803       if (inescq)
03804         {
03805         if (c == '\\' && ptr[1] == 'E')
03806           {
03807           inescq = false;
03808           ptr++;
03809           }
03810         else
03811           {
03812           *code++ = c;
03813           length++;
03814           }
03815         continue;
03816         }
03817 
03818       /* Skip white space and comments for /x patterns */
03819 
03820       if ((options & PCRE_EXTENDED) != 0)
03821         {
03822         if ((cd->ctypes[c] & ctype_space) != 0) continue;
03823         if (c == '#')
03824           {
03825           /* The space before the ; is to avoid a warning on a silly compiler
03826           on the Macintosh. */
03827           while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
03828           if (c == 0) break;
03829           continue;
03830           }
03831         }
03832 
03833       /* Backslash may introduce a data char or a metacharacter. Escaped items
03834       are checked for validity in the pre-compiling pass. Stop the string
03835       before a metaitem. */
03836 
03837       if (c == '\\')
03838         {
03839         tempptr = ptr;
03840         c = check_escape(&ptr, errorptr, *brackets, options, false);
03841         if (c < 0) { ptr = tempptr; break; }
03842 
03843         /* If a character is > 127 in UTF-8 mode, we have to turn it into
03844         two or more bytes in the UTF-8 encoding. */
03845 
03846         }
03847 
03848       /* Ordinary character or single-char escape */
03849 
03850       *code++ = c;
03851       length++;
03852       }
03853 
03854     /* This "while" is the end of the "do" above. */
03855 
03856     while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
03857 
03858     /* Update the first and last requirements. These are always bytes, even in
03859     UTF-8 mode. However, there is a special case to be considered when there
03860     are only one or two characters. Because this gets messy in UTF-8 mode, the
03861     code is kept separate. When we get here "length" contains the number of
03862     bytes. */
03863 
03864 
03865     /* This is the code for non-UTF-8 operation, either without UTF-8 support,
03866     or when UTF-8 is not enabled. */
03867 
03868       {
03869       /* firstbyte was not previously set; take it from this string */
03870 
03871       if (firstbyte == REQ_UNSET)
03872         {
03873         if (length == 1)
03874           {
03875           zerofirstbyte = REQ_NONE;
03876           firstbyte = previous[2] | req_caseopt;
03877           zeroreqbyte = reqbyte;
03878           }
03879         else
03880           {
03881           zerofirstbyte = firstbyte = previous[2] | req_caseopt;
03882           zeroreqbyte = (length > 2)?
03883             (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
03884           reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
03885           }
03886         }
03887 
03888       /* firstbyte was previously set */
03889 
03890       else
03891         {
03892         zerofirstbyte = firstbyte;
03893         zeroreqbyte = (length == 1)? reqbyte :
03894           code[-2] | req_caseopt | cd->req_varyopt;
03895         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
03896         }
03897       }
03898 
03899     /* Set the length in the data vector, and advance to the next state. */
03900 
03901     previous[1] = length;
03902     if (length < MAXLIT) ptr--;
03903     break;
03904     }
03905   }                   /* end of big loop */
03906 
03907 /* Control never reaches here by falling through, only by a goto for all the
03908 error states. Pass back the position in the pattern so that it can be displayed
03909 to the user for diagnosing the error. */
03910 
03911 FAILED:
03912 *ptrptr = ptr;
03913 return false;
03914 }
03915 
03916 
03917 
03918 
03919 /*************************************************
03920 *     Compile sequence of alternatives           *
03921 *************************************************/
03922 
03923 /* On entry, ptr is pointing past the bracket character, but on return
03924 it points to the closing bracket, or vertical bar, or end of string.
03925 The code variable is pointing at the byte into which the BRA operator has been
03926 stored. If the ims options are changed at the start (for a (?ims: group) or
03927 during any branch, we need to insert an OP_OPT item at the start of every
03928 following branch to ensure they get set correctly at run time, and also pass
03929 the new options into every subsequent branch compile.
03930 
03931 Argument:
03932   options        option bits, including any changes for this subpattern
03933   oldims         previous settings of ims option bits
03934   brackets       -> int containing the number of extracting brackets used
03935   codeptr        -> the address of the current code pointer
03936   ptrptr         -> the address of the current pattern pointer
03937   errorptr       -> pointer to error message
03938   lookbehind     true if this is a lookbehind assertion
03939   skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)
03940   firstbyteptr   place to put the first required character, or a negative number
03941   reqbyteptr     place to put the last required character, or a negative number
03942   bcptr          pointer to the chain of currently open branches
03943   cd             points to the data block with tables pointers etc.
03944 
03945 Returns:      true on success
03946 */
03947 
03948 static bool
03949 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
03950   const uschar **ptrptr, const char **errorptr, bool lookbehind, int skipbytes,
03951   int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
03952 {
03953 const uschar *ptr = *ptrptr;
03954 uschar *code = *codeptr;
03955 uschar *last_branch = code;
03956 uschar *start_bracket = code;
03957 uschar *reverse_count = NULL;
03958 int firstbyte, reqbyte;
03959 int branchfirstbyte, branchreqbyte;
03960 branch_chain bc;
03961 
03962 bc.outer = bcptr;
03963 bc.current = code;
03964 
03965 firstbyte = reqbyte = REQ_UNSET;
03966 
03967 /* Offset is set zero to mark that this bracket is still open */
03968 
03969 PUT(code, 1, 0);
03970 code += 1 + LINK_SIZE + skipbytes;
03971 
03972 /* Loop for each alternative branch */
03973 
03974 for (;!MuxAlarm.bAlarmed;)
03975   {
03976   /* Handle a change of ims options at the start of the branch */
03977 
03978   if ((options & PCRE_IMS) != oldims)
03979     {
03980     *code++ = OP_OPT;
03981     *code++ = options & PCRE_IMS;
03982     }
03983 
03984   /* Set up dummy OP_REVERSE if lookbehind assertion */
03985 
03986   if (lookbehind)
03987     {
03988     *code++ = OP_REVERSE;
03989     reverse_count = code;
03990     PUTINC(code, 0, 0);
03991     }
03992 
03993   /* Now compile the branch */
03994 
03995   if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
03996         &branchfirstbyte, &branchreqbyte, &bc, cd))
03997     {
03998     *ptrptr = ptr;
03999     return false;
04000     }
04001 
04002   /* If this is the first branch, the firstbyte and reqbyte values for the
04003   branch become the values for the regex. */
04004 
04005   if (*last_branch != OP_ALT)
04006     {
04007     firstbyte = branchfirstbyte;
04008     reqbyte = branchreqbyte;
04009     }
04010 
04011   /* If this is not the first branch, the first char and reqbyte have to
04012   match the values from all the previous branches, except that if the previous
04013   value for reqbyte didn't have REQ_VARY set, it can still match, and we set
04014   REQ_VARY for the regex. */
04015 
04016   else
04017     {
04018     /* If we previously had a firstbyte, but it doesn't match the new branch,
04019     we have to abandon the firstbyte for the regex, but if there was previously
04020     no reqbyte, it takes on the value of the old firstbyte. */
04021 
04022     if (firstbyte >= 0 && firstbyte != branchfirstbyte)
04023       {
04024       if (reqbyte < 0) reqbyte = firstbyte;
04025       firstbyte = REQ_NONE;
04026       }
04027 
04028     /* If we (now or from before) have no firstbyte, a firstbyte from the
04029     branch becomes a reqbyte if there isn't a branch reqbyte. */
04030 
04031     if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
04032         branchreqbyte = branchfirstbyte;
04033 
04034     /* Now ensure that the reqbytes match */
04035 
04036     if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
04037       reqbyte = REQ_NONE;
04038     else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
04039     }
04040 
04041   /* If lookbehind, check that this branch matches a fixed-length string,
04042   and put the length into the OP_REVERSE item. Temporarily mark the end of
04043   the branch with OP_END. */
04044 
04045   if (lookbehind)
04046     {
04047     int length;
04048     *code = OP_END;
04049     length = find_fixedlength(last_branch, options);
04050     DPRINTF(("fixed length = %d\n", length));
04051     if (length < 0)
04052       {
04053       *errorptr = (length == -2)? ERR36 : ERR25;
04054       *ptrptr = ptr;
04055       return false;
04056       }
04057     PUT(reverse_count, 0, length);
04058     }
04059 
04060   /* Reached end of expression, either ')' or end of pattern. Go back through
04061   the alternative branches and reverse the chain of offsets, with the field in
04062   the BRA item now becoming an offset to the first alternative. If there are
04063   no alternatives, it points to the end of the group. The length in the
04064   terminating ket is always the length of the whole bracketed item. If any of
04065   the ims options were changed inside the group, compile a resetting op-code
04066   following, except at the very end of the pattern. Return leaving the pointer
04067   at the terminating char. */
04068 
04069   if (*ptr != '|')
04070     {
04071     int length = code - last_branch;
04072     do
04073       {
04074       int prev_length = GET(last_branch, 1);
04075       PUT(last_branch, 1, length);
04076       length = prev_length;
04077       last_branch -= length;
04078       }
04079     while (length > 0);
04080 
04081     /* Fill in the ket */
04082 
04083     *code = OP_KET;
04084     PUT(code, 1, code - start_bracket);
04085     code += 1 + LINK_SIZE;
04086 
04087     /* Resetting option if needed */
04088 
04089     if ((options & PCRE_IMS) != oldims && *ptr == ')')
04090       {
04091       *code++ = OP_OPT;
04092       *code++ = oldims;
04093       }
04094 
04095     /* Set values to pass back */
04096 
04097     *codeptr = code;
04098     *ptrptr = ptr;
04099     *firstbyteptr = firstbyte;
04100     *reqbyteptr = reqbyte;
04101     return true;
04102     }
04103 
04104   /* Another branch follows; insert an "or" node. Its length field points back
04105   to the previous branch while the bracket remains open. At the end the chain
04106   is reversed. It's done like this so that the start of the bracket has a
04107   zero offset until it is closed, making it possible to detect recursion. */
04108 
04109   *code = OP_ALT;
04110   PUT(code, 1, code - last_branch);
04111   bc.current = last_branch = code;
04112   code += 1 + LINK_SIZE;
04113   ptr++;
04114   }
04115 return false;
04116 }
04117 
04118 
04119 
04120 
04121 /*************************************************
04122 *          Check for anchored expression         *
04123 *************************************************/
04124 
04125 /* Try to find out if this is an anchored regular expression. Consider each
04126 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
04127 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
04128 it's anchored. However, if this is a multiline pattern, then only OP_SOD
04129 counts, since OP_CIRC can match in the middle.
04130 
04131 We can also consider a regex to be anchored if OP_SOM starts all its branches.
04132 This is the code for \G, which means "match at start of match position, taking
04133 into account the match offset".
04134 
04135 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
04136 because that will try the rest of the pattern at all possible matching points,
04137 so there is no point trying again.... er ....
04138 
04139 .... except when the .* appears inside capturing parentheses, and there is a
04140 subsequent back reference to those parentheses. We haven't enough information
04141 to catch that case precisely.
04142 
04143 At first, the best we could do was to detect when .* was in capturing brackets
04144 and the highest back reference was greater than or equal to that level.
04145 However, by keeping a bitmap of the first 31 back references, we can catch some
04146 of the more common cases more precisely.
04147 
04148 Arguments:
04149   code           points to start of expression (the bracket)
04150   options        points to the options setting
04151   bracket_map    a bitmap of which brackets we are inside while testing; this
04152                   handles up to substring 31; after that we just have to take
04153                   the less precise approach
04154   backref_map    the back reference bitmap
04155 
04156 Returns:     true or false
04157 */
04158 
04159 static bool
04160 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
04161   unsigned int backref_map)
04162 {
04163 do {
04164    const uschar *scode =
04165      first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
04166    register int op = *scode;
04167 
04168    /* Capturing brackets */
04169 
04170    if (op > OP_BRA)
04171      {
04172      int new_map;
04173      op -= OP_BRA;
04174      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
04175      new_map = bracket_map | ((op < 32)? (1 << op) : 1);
04176      if (!is_anchored(scode, options, new_map, backref_map)) return false;
04177      }
04178 
04179    /* Other brackets */
04180 
04181    else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
04182      {
04183      if (!is_anchored(scode, options, bracket_map, backref_map)) return false;
04184      }
04185 
04186    /* .* is not anchored unless DOTALL is set and it isn't in brackets that
04187    are or may be referenced. */
04188 
04189    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
04190             (*options & PCRE_DOTALL) != 0)
04191      {
04192      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return false;
04193      }
04194 
04195    /* Check for explicit anchoring */
04196 
04197    else if (op != OP_SOD && op != OP_SOM &&
04198            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
04199      return false;
04200    code += GET(code, 1);
04201    }
04202 while (*code == OP_ALT);   /* Loop for each alternative */
04203 return true;
04204 }
04205 
04206 
04207 
04208 /*************************************************
04209 *         Check for starting with ^ or .*        *
04210 *************************************************/
04211 
04212 /* This is called to find out if every branch starts with ^ or .* so that
04213 "first char" processing can be done to speed things up in multiline
04214 matching and for non-DOTALL patterns that start with .* (which must start at
04215 the beginning or after \n). As in the case of is_anchored() (see above), we
04216 have to take account of back references to capturing brackets that contain .*
04217 because in that case we can't make the assumption.
04218 
04219 Arguments:
04220   code           points to start of expression (the bracket)
04221   bracket_map    a bitmap of which brackets we are inside while testing; this
04222                   handles up to substring 31; after that we just have to take
04223                   the less precise approach
04224   backref_map    the back reference bitmap
04225 
04226 Returns:         true or false
04227 */
04228 
04229 static bool
04230 is_startline(const uschar *code, unsigned int bracket_map,
04231   unsigned int backref_map)
04232 {
04233 do {
04234    const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
04235    register int op = *scode;
04236 
04237    /* Capturing brackets */
04238 
04239    if (op > OP_BRA)
04240      {
04241      int new_map;
04242      op -= OP_BRA;
04243      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
04244      new_map = bracket_map | ((op < 32)? (1 << op) : 1);
04245      if (!is_startline(scode, new_map, backref_map)) return false;
04246      }
04247 
04248    /* Other brackets */
04249 
04250    else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
04251      { if (!is_startline(scode, bracket_map, backref_map)) return false; }
04252 
04253    /* .* is not anchored unless DOTALL is set and it isn't in brackets that
04254    may be referenced. */
04255 
04256    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
04257      {
04258      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return false;
04259      }
04260 
04261    /* Check for explicit circumflex */
04262 
04263    else if (op != OP_CIRC) return false;
04264    code += GET(code, 1);
04265    }
04266 while (*code == OP_ALT);  /* Loop for each alternative */
04267 return true;
04268 }
04269 
04270 
04271 
04272 /*************************************************
04273 *       Check for asserted fixed first char      *
04274 *************************************************/
04275 
04276 /* During compilation, the "first char" settings from forward assertions are
04277 discarded, because they can cause conflicts with actual literals that follow.
04278 However, if we end up without a first char setting for an unanchored pattern,
04279 it is worth scanning the regex to see if there is an initial asserted first
04280 char. If all branches start with the same asserted char, or with a bracket all
04281 of whose alternatives start with the same asserted char (recurse ad lib), then
04282 we return that char, otherwise -1.
04283 
04284 Arguments:
04285   code       points to start of expression (the bracket)
04286   options    pointer to the options (used to check casing changes)
04287   inassert   true if in an assertion
04288 
04289 Returns:     -1 or the fixed first char
04290 */
04291 
04292 static int
04293 find_firstassertedchar(const uschar *code, int *options, bool inassert)
04294 {
04295 register int c = -1;
04296 do {
04297    int d;
04298    const uschar *scode =
04299      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
04300    register int op = *scode;
04301 
04302    if (op >= OP_BRA) op = OP_BRA;
04303 
04304    switch(op)
04305      {
04306      default:
04307      return -1;
04308 
04309      case OP_BRA:
04310      case OP_ASSERT:
04311      case OP_ONCE:
04312      case OP_COND:
04313      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
04314        return -1;
04315      if (c < 0) c = d; else if (c != d) return -1;
04316      break;
04317 
04318      case OP_EXACT:       /* Fall through */
04319      scode++;
04320 
04321      case OP_CHARS:       /* Fall through */
04322      scode++;
04323 
04324      case OP_PLUS:
04325      case OP_MINPLUS:
04326      if (!inassert) return -1;
04327      if (c < 0)
04328        {
04329        c = scode[1];
04330        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
04331        }
04332      else if (c != scode[1]) return -1;
04333      break;
04334      }
04335 
04336    code += GET(code, 1);
04337    }
04338 while (*code == OP_ALT);
04339 return c;
04340 }
04341 
04342 
04343 
04344 
04345 /*************************************************
04346 *        Compile a Regular Expression            *
04347 *************************************************/
04348 
04349 /* This function takes a string and returns a pointer to a block of store
04350 holding a compiled version of the expression.
04351 
04352 Arguments:
04353   pattern      the regular expression
04354   options      various option bits
04355   errorptr     pointer to pointer to error text
04356   erroroffset  ptr offset in pattern where error was detected
04357   tables       pointer to character tables or NULL
04358 
04359 Returns:       pointer to compiled data block, or NULL on error,
04360                with errorptr and erroroffset set
04361 */
04362 
04363 pcre *
04364 pcre_compile(const char *pattern, int options, const char **errorptr,
04365   int *erroroffset, const unsigned char *tables)
04366 {
04367 real_pcre *re;
04368 int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
04369 int runlength;
04370 int c, firstbyte, reqbyte;
04371 int bracount = 0;
04372 int branch_extra = 0;
04373 int branch_newextra;
04374 int item_count = -1;
04375 int name_count = 0;
04376 int max_name_size = 0;
04377 bool inescq = false;
04378 unsigned int brastackptr = 0;
04379 size_t size;
04380 uschar *code;
04381 const uschar *codestart;
04382 const uschar *ptr;
04383 compile_data compile_block;
04384 int brastack[BRASTACK_SIZE];
04385 uschar bralenstack[BRASTACK_SIZE];
04386 
04387 /* We can't pass back an error message if errorptr is NULL; I guess the best we
04388 can do is just return NULL. */
04389 
04390 if (errorptr == NULL) return NULL;
04391 *errorptr = NULL;
04392 
04393 /* However, we can give a message for this error */
04394 
04395 if (erroroffset == NULL)
04396   {
04397   *errorptr = ERR16;
04398   return NULL;
04399   }
04400 *erroroffset = 0;
04401 
04402 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
04403 
04404 if ((options & PCRE_UTF8) != 0)
04405   {
04406   *errorptr = ERR32;
04407   return NULL;
04408   }
04409 
04410 if ((options & ~PUBLIC_OPTIONS) != 0)
04411   {
04412   *errorptr = ERR17;
04413   return NULL;
04414   }
04415 
04416 /* Set up pointers to the individual character tables */
04417 
04418 if (tables == NULL) tables = pcre_default_tables;
04419 compile_block.lcc = tables + lcc_offset;
04420 compile_block.fcc = tables + fcc_offset;
04421 compile_block.cbits = tables + cbits_offset;
04422 compile_block.ctypes = tables + ctypes_offset;
04423 
04424 /* Maximum back reference and backref bitmap. This is updated for numeric
04425 references during the first pass, but for named references during the actual
04426 compile pass. The bitmap records up to 31 back references to help in deciding
04427 whether (.*) can be treated as anchored or not. */
04428 
04429 compile_block.top_backref = 0;
04430 compile_block.backref_map = 0;
04431 
04432 /* Reflect pattern for debugging output */
04433 
04434 DPRINTF(("------------------------------------------------------------------\n"));
04435 DPRINTF(("%s\n", pattern));
04436 
04437 /* The first thing to do is to make a pass over the pattern to compute the
04438 amount of store required to hold the compiled code. This does not have to be
04439 perfect as long as errors are overestimates. At the same time we can detect any
04440 flag settings right at the start, and extract them. Make an attempt to correct
04441 for any counted white space if an "extended" flag setting appears late in the
04442 pattern. We can't be so clever for #-comments. */
04443 
04444 ptr = (const uschar *)(pattern - 1);
04445 while ((c = *(++ptr)) != 0)
04446   {
04447   int min, max;
04448 #if defined(WIN32) && (_MSC_VER == 1200) && defined(_M_IX86) && !defined(__INTEL_COMPILER)
04449   // The addition of 'volatile' works around a bug in Version 12.0 of
04450   // Microsoft's Visual C/C++ compiler (part of Visual Studio 6.0). Without
04451   // volatile, class_optcount is calculated properly, but the compiler
04452   // clobbers the EAX register before tests it as class_optcount.
04453   //
04454   // This is not a problem with the Intel Compiler.
04455   //
04456   volatile int class_optcount;
04457 #else
04458   int class_optcount;
04459 #endif
04460   int bracket_length;
04461   int duplength;
04462 
04463   /* If we are inside a \Q...\E sequence, all chars are literal */
04464 
04465   if (inescq) goto NORMAL_CHAR;
04466 
04467   /* Otherwise, first check for ignored whitespace and comments */
04468 
04469   if ((options & PCRE_EXTENDED) != 0)
04470     {
04471     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
04472     if (c == '#')
04473       {
04474       /* The space before the ; is to avoid a warning on a silly compiler
04475       on the Macintosh. */
04476       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
04477       if (c == 0) break;
04478       continue;
04479       }
04480     }
04481 
04482   item_count++;    /* Is zero for the first non-comment item */
04483 
04484   switch(c)
04485     {
04486     /* A backslashed item may be an escaped "normal" character or a
04487     character type. For a "normal" character, put the pointers and
04488     character back so that tests for whitespace etc. in the input
04489     are done correctly. */
04490 
04491     case '\\':
04492       {
04493       const uschar *save_ptr = ptr;
04494       c = check_escape(&ptr, errorptr, bracount, options, false);
04495       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04496       if (c >= 0)
04497         {
04498         ptr = save_ptr;
04499         c = '\\';
04500         goto NORMAL_CHAR;
04501         }
04502       }
04503 
04504     /* If \Q, enter "literal" mode */
04505 
04506     if (-c == ESC_Q)
04507       {
04508       inescq = true;
04509       continue;
04510       }
04511 
04512     /* Other escapes need one byte, and are of length one for repeats */
04513 
04514     length++;
04515 
04516     /* A back reference needs an additional 2 bytes, plus either one or 5
04517     bytes for a repeat. We also need to keep the value of the highest
04518     back reference. */
04519 
04520     if (c <= -ESC_REF)
04521       {
04522       int refnum = -c - ESC_REF;
04523       compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
04524       if (refnum > compile_block.top_backref)
04525         compile_block.top_backref = refnum;
04526       length += 2;   /* For single back reference */
04527       if (ptr[1] == '{' && is_counted_repeat(ptr+2))
04528         {
04529         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
04530         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04531         if ((min == 0 && (max == 1 || max == -1)) ||
04532           (min == 1 && max == -1))
04533             length++;
04534         else length += 5;
04535         if (ptr[1] == '?') ptr++;
04536         }
04537       }
04538     continue;
04539 
04540     case '^':     /* Single-byte metacharacters */
04541     case '.':
04542     case '$':
04543     length++;
04544     continue;
04545 
04546     case '*':            /* These repeats won't be after brackets; */
04547     case '+':            /* those are handled separately */
04548     case '?':
04549     length++;
04550     goto POSESSIVE;      /* A few lines below */
04551 
04552     /* This covers the cases of braced repeats after a single char, metachar,
04553     class, or back reference. */
04554 
04555     case '{':
04556     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
04557     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
04558     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04559 
04560     /* These special cases just insert one extra opcode */
04561 
04562     if ((min == 0 && (max == 1 || max == -1)) ||
04563       (min == 1 && max == -1))
04564         length++;
04565 
04566     /* These cases might insert additional copies of a preceding character. */
04567 
04568     else
04569       {
04570 
04571       /* Not UTF-8 mode: all characters are one byte */
04572         {
04573         if (min != 1)
04574           {
04575           length--;   /* Uncount the original char or metachar */
04576           if (min > 0) length += 4;
04577           }
04578 
04579         length += (max > 0)? 4 : 2;
04580         }
04581       }
04582 
04583     if (ptr[1] == '?') ptr++;      /* Needs no extra length */
04584 
04585     POSESSIVE:                     /* Test for possessive quantifier */
04586     if (ptr[1] == '+')
04587       {
04588       ptr++;
04589       length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
04590       }
04591     continue;
04592 
04593     /* An alternation contains an offset to the next branch or ket. If any ims
04594     options changed in the previous branch(es), and/or if we are in a
04595     lookbehind assertion, extra space will be needed at the start of the
04596     branch. This is handled by branch_extra. */
04597 
04598     case '|':
04599     length += 1 + LINK_SIZE + branch_extra;
04600     continue;
04601 
04602     /* A character class uses 33 characters provided that all the character
04603     values are less than 256. Otherwise, it uses a bit map for low valued
04604     characters, and individual items for others. Don't worry about character
04605     types that aren't allowed in classes - they'll get picked up during the
04606     compile. A character class that contains only one single-byte character
04607     uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
04608     where we can. (In UTF-8 mode we can do this only for chars < 128.) */
04609 
04610     case '[':
04611     class_optcount = 0;
04612 
04613     if (*(++ptr) == '^') ptr++;
04614 
04615     /* Written as a "do" so that an initial ']' is taken as data */
04616 
04617     if (*ptr != 0) do
04618       {
04619       /* Inside \Q...\E everything is literal except \E */
04620 
04621       if (inescq)
04622         {
04623         if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
04624         inescq = false;
04625         ptr += 1;
04626         continue;
04627         }
04628 
04629       /* Outside \Q...\E, check for escapes */
04630 
04631       if (*ptr == '\\')
04632         {
04633         int ch = check_escape(&ptr, errorptr, bracount, options, true);
04634         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04635 
04636         /* \b is backspace inside a class */
04637 
04638         if (-ch == ESC_b) ch = '\b';
04639 
04640         /* \Q enters quoting mode */
04641 
04642         if (-ch == ESC_Q)
04643           {
04644           inescq = true;
04645           continue;
04646           }
04647 
04648         /* Handle escapes that turn into characters */
04649 
04650         if (ch >= 0)
04651           {
04652           class_optcount++;            /* for possible optimization */
04653           }
04654         else class_optcount = 10;      /* \d, \s etc; make sure > 1 */
04655         }
04656 
04657       /* Check the syntax for POSIX stuff. The bits we actually handle are
04658       checked during the real compile phase. */
04659 
04660       else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
04661         {
04662         ptr++;
04663         class_optcount = 10;    /* Make sure > 1 */
04664         }
04665 
04666       /* Anything else just increments the possible optimization count. If
04667       there are wide characters, we are going to have to use an XCLASS. */
04668 
04669       else
04670         {
04671         NON_SPECIAL_CHARACTER:
04672         class_optcount++;
04673 
04674         }
04675       }
04676     while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
04677 
04678     if (*ptr == 0)                          /* Missing terminating ']' */
04679       {
04680       *errorptr = ERR6;
04681       goto PCRE_ERROR_RETURN;
04682       }
04683 
04684     /* We can optimize when there was only one optimizable character. Repeats
04685     for positive and negated single one-byte chars are handled by the general
04686     code. Here, we handle repeats for the class opcodes. */
04687 
04688     if (class_optcount == 1) length += 3; else
04689       {
04690       length += 33;
04691 
04692       /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
04693       we also need extra for wrapping the whole thing in a sub-pattern. */
04694 
04695       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
04696         {
04697         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
04698         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04699         if ((min == 0 && (max == 1 || max == -1)) ||
04700           (min == 1 && max == -1))
04701             length++;
04702         else length += 5;
04703         if (ptr[1] == '+')
04704           {
04705           ptr++;
04706           length += 2 + 2*LINK_SIZE;
04707           }
04708         else if (ptr[1] == '?') ptr++;
04709         }
04710       }
04711     continue;
04712 
04713     /* Brackets may be genuine groups or special things */
04714 
04715     case '(':
04716     branch_newextra = 0;
04717     bracket_length = 1 + LINK_SIZE;
04718 
04719     /* Handle special forms of bracket, which all start (? */
04720 
04721     if (ptr[1] == '?')
04722       {
04723       int set, unset;
04724       int *optset;
04725 
04726       switch (c = ptr[2])
04727         {
04728         /* Skip over comments entirely */
04729         case '#':
04730         ptr += 3;
04731         while (*ptr != 0 && *ptr != ')') ptr++;
04732         if (*ptr == 0)
04733           {
04734           *errorptr = ERR18;
04735           goto PCRE_ERROR_RETURN;
04736           }
04737         continue;
04738 
04739         /* Non-referencing groups and lookaheads just move the pointer on, and
04740         then behave like a non-special bracket, except that they don't increment
04741         the count of extracting brackets. Ditto for the "once only" bracket,
04742         which is in Perl from version 5.005. */
04743 
04744         case ':':
04745         case '=':
04746         case '!':
04747         case '>':
04748         ptr += 2;
04749         break;
04750 
04751         /* (?R) specifies a recursive call to the regex, which is an extension
04752         to provide the facility which can be obtained by (?p{perl-code}) in
04753         Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
04754 
04755         From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
04756         the appropriate numbered brackets. This includes both recursive and
04757         non-recursive calls. (?R) is now synonymous with (?0). */
04758 
04759         case 'R':
04760         ptr++;
04761 
04762         case '0': case '1': case '2': case '3': case '4':
04763         case '5': case '6': case '7': case '8': case '9':
04764         ptr += 2;
04765         if (c != 'R')
04766           while ((digitab[*(++ptr)] & ctype_digit) != 0);
04767         if (*ptr != ')')
04768           {
04769           *errorptr = ERR29;
04770           goto PCRE_ERROR_RETURN;
04771           }
04772         length += 1 + LINK_SIZE;
04773 
04774         /* If this item is quantified, it will get wrapped inside brackets so
04775         as to use the code for quantified brackets. We jump down and use the
04776         code that handles this for real brackets. */
04777 
04778         if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
04779           {
04780           length += 2 + 2 * LINK_SIZE;       /* to make bracketed */
04781           duplength = 5 + 3 * LINK_SIZE;
04782           goto HANDLE_QUANTIFIED_BRACKETS;
04783           }
04784         continue;
04785 
04786         /* (?C) is an extension which provides "callout" - to provide a bit of
04787         the functionality of the Perl (?{...}) feature. An optional number may
04788         follow (default is zero). */
04789 
04790         case 'C':
04791         ptr += 2;
04792         while ((digitab[*(++ptr)] & ctype_digit) != 0);
04793         if (*ptr != ')')
04794           {
04795           *errorptr = ERR39;
04796           goto PCRE_ERROR_RETURN;
04797           }
04798         length += 2;
04799         continue;
04800 
04801         /* Named subpatterns are an extension copied from Python */
04802 
04803         case 'P':
04804         ptr += 3;
04805         if (*ptr == '<')
04806           {
04807           const uschar *p;    /* Don't amalgamate; some compilers */
04808           p = ++ptr;          /* grumble at autoincrement in declaration */
04809           while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
04810           if (*ptr != '>')
04811             {
04812             *errorptr = ERR42;
04813             goto PCRE_ERROR_RETURN;
04814             }
04815           name_count++;
04816           if (ptr - p > max_name_size) max_name_size = (ptr - p);
04817           break;
04818           }
04819 
04820         if (*ptr == '=' || *ptr == '>')
04821           {
04822           while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
04823           if (*ptr != ')')
04824             {
04825             *errorptr = ERR42;
04826             goto PCRE_ERROR_RETURN;
04827             }
04828           break;
04829           }
04830 
04831         /* Unknown character after (?P */
04832 
04833         *errorptr = ERR41;
04834         goto PCRE_ERROR_RETURN;
04835 
04836         /* Lookbehinds are in Perl from version 5.005 */
04837 
04838         case '<':
04839         ptr += 3;
04840         if (*ptr == '=' || *ptr == '!')
04841           {
04842           branch_newextra = 1 + LINK_SIZE;
04843           length += 1 + LINK_SIZE;         /* For the first branch */
04844           break;
04845           }
04846         *errorptr = ERR24;
04847         goto PCRE_ERROR_RETURN;
04848 
04849         /* Conditionals are in Perl from version 5.005. The bracket must either
04850         be followed by a number (for bracket reference) or by an assertion
04851         group, or (a PCRE extension) by 'R' for a recursion test. */
04852 
04853         case '(':
04854         if (ptr[3] == 'R' && ptr[4] == ')')
04855           {
04856           ptr += 4;
04857           length += 3;
04858           }
04859         else if ((digitab[ptr[3]] & ctype_digit) != 0)
04860           {
04861           ptr += 4;
04862           length += 3;
04863           while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
04864           if (*ptr != ')')
04865             {
04866             *errorptr = ERR26;
04867             goto PCRE_ERROR_RETURN;
04868             }
04869           }
04870         else   /* An assertion must follow */
04871           {
04872           ptr++;   /* Can treat like ':' as far as spacing is concerned */
04873           if (ptr[2] != '?' ||
04874              (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
04875             {
04876             ptr += 2;    /* To get right offset in message */
04877             *errorptr = ERR28;
04878             goto PCRE_ERROR_RETURN;
04879             }
04880           }
04881         break;
04882 
04883         /* Else loop checking valid options until ) is met. Anything else is an
04884         error. If we are without any brackets, i.e. at top level, the settings
04885         act as if specified in the options, so massage the options immediately.
04886         This is for backward compatibility with Perl 5.004. */
04887 
04888         default:
04889         set = unset = 0;
04890         optset = &set;
04891         ptr += 2;
04892 
04893         for (;; ptr++)
04894           {
04895           c = *ptr;
04896           switch (c)
04897             {
04898             case 'i':
04899             *optset |= PCRE_CASELESS;
04900             continue;
04901 
04902             case 'm':
04903             *optset |= PCRE_MULTILINE;
04904             continue;
04905 
04906             case 's':
04907             *optset |= PCRE_DOTALL;
04908             continue;
04909 
04910             case 'x':
04911             *optset |= PCRE_EXTENDED;
04912             continue;
04913 
04914             case 'X':
04915             *optset |= PCRE_EXTRA;
04916             continue;
04917 
04918             case 'U':
04919             *optset |= PCRE_UNGREEDY;
04920             continue;
04921 
04922             case '-':
04923             optset = &unset;
04924             continue;
04925 
04926             /* A termination by ')' indicates an options-setting-only item; if
04927             this is at the very start of the pattern (indicated by item_count
04928             being zero), we use it to set the global options. This is helpful
04929             when analyzing the pattern for first characters, etc. Otherwise
04930             nothing is done here and it is handled during the compiling
04931             process.
04932 
04933             [Historical note: Up to Perl 5.8, options settings at top level
04934             were always global settings, wherever they appeared in the pattern.
04935             That is, they were equivalent to an external setting. From 5.8
04936             onwards, they apply only to what follows (which is what you might
04937             expect).] */
04938 
04939             case ')':
04940             if (item_count == 0)
04941               {
04942               options = (options | set) & (~unset);
04943               set = unset = 0;     /* To save length */
04944               item_count--;        /* To allow for several */
04945               }
04946 
04947             /* Fall through */
04948 
04949             /* A termination by ':' indicates the start of a nested group with
04950             the given options set. This is again handled at compile time, but
04951             we must allow for compiled space if any of the ims options are
04952             set. We also have to allow for resetting space at the end of
04953             the group, which is why 4 is added to the length and not just 2.
04954             If there are several changes of options within the same group, this
04955             will lead to an over-estimate on the length, but this shouldn't
04956             matter very much. We also have to allow for resetting options at
04957             the start of any alternations, which we do by setting
04958             branch_newextra to 2. Finally, we record whether the case-dependent
04959             flag ever changes within the regex. This is used by the "required
04960             character" code. */
04961 
04962             case ':':
04963             if (((set|unset) & PCRE_IMS) != 0)
04964               {
04965               length += 4;
04966               branch_newextra = 2;
04967               if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
04968               }
04969             goto END_OPTIONS;
04970 
04971             /* Unrecognized option character */
04972 
04973             default:
04974             *errorptr = ERR12;
04975             goto PCRE_ERROR_RETURN;
04976             }
04977           }
04978 
04979         /* If we hit a closing bracket, that's it - this is a freestanding
04980         option-setting. We need to ensure that branch_extra is updated if
04981         necessary. The only values branch_newextra can have here are 0 or 2.
04982         If the value is 2, then branch_extra must either be 2 or 5, depending
04983         on whether this is a lookbehind group or not. */
04984 
04985         END_OPTIONS:
04986         if (c == ')')
04987           {
04988           if (branch_newextra == 2 &&
04989               (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
04990             branch_extra += branch_newextra;
04991           continue;
04992           }
04993 
04994         /* If options were terminated by ':' control comes here. Fall through
04995         to handle the group below. */
04996         }
04997       }
04998 
04999     /* Extracting brackets must be counted so we can process escapes in a
05000     Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
05001     need an additional 3 bytes of store per extracting bracket. However, if
05002     PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
05003     must leave the count alone (it will always be zero). */
05004 
05005     else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
05006       {
05007       bracount++;
05008       if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
05009       }
05010 
05011     /* Save length for computing whole length at end if there's a repeat that
05012     requires duplication of the group. Also save the current value of
05013     branch_extra, and start the new group with the new value. If non-zero, this
05014     will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
05015 
05016     if (brastackptr >= sizeof(brastack)/sizeof(int))
05017       {
05018       *errorptr = ERR19;
05019       goto PCRE_ERROR_RETURN;
05020       }
05021 
05022     bralenstack[brastackptr] = branch_extra;
05023     branch_extra = branch_newextra;
05024 
05025     brastack[brastackptr++] = length;
05026     length += bracket_length;
05027     continue;
05028 
05029     /* Handle ket. Look for subsequent max/min; for certain sets of values we
05030     have to replicate this bracket up to that many times. If brastackptr is
05031     0 this is an unmatched bracket which will generate an error, but take care
05032     not to try to access brastack[-1] when computing the length and restoring
05033     the branch_extra value. */
05034 
05035     case ')':
05036     length += 1 + LINK_SIZE;
05037     if (brastackptr > 0)
05038       {
05039       duplength = length - brastack[--brastackptr];
05040       branch_extra = bralenstack[brastackptr];
05041       }
05042     else duplength = 0;
05043 
05044     /* The following code is also used when a recursion such as (?3) is
05045     followed by a quantifier, because in that case, it has to be wrapped inside
05046     brackets so that the quantifier works. The value of duplength must be
05047     set before arrival. */
05048 
05049     HANDLE_QUANTIFIED_BRACKETS:
05050 
05051     /* Leave ptr at the final char; for read_repeat_counts this happens
05052     automatically; for the others we need an increment. */
05053 
05054     if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
05055       {
05056       ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
05057       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
05058       }
05059     else if (c == '*') { min = 0; max = -1; ptr++; }
05060     else if (c == '+') { min = 1; max = -1; ptr++; }
05061     else if (c == '?') { min = 0; max = 1;  ptr++; }
05062     else { min = 1; max = 1; }
05063 
05064     /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
05065     group, and if the maximum is greater than zero, we have to replicate
05066     maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
05067     bracket set. */
05068 
05069     if (min == 0)
05070       {
05071       length++;
05072       if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
05073       }
05074 
05075     /* When the minimum is greater than zero, we have to replicate up to
05076     minval-1 times, with no additions required in the copies. Then, if there
05077     is a limited maximum we have to replicate up to maxval-1 times allowing
05078     for a BRAZERO item before each optional copy and nesting brackets for all
05079     but one of the optional copies. */
05080 
05081     else
05082       {
05083       length += (min - 1) * duplength;
05084       if (max > min)   /* Need this test as max=-1 means no limit */
05085         length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
05086           - (2 + 2*LINK_SIZE);
05087       }
05088 
05089     /* Allow space for once brackets for "possessive quantifier" */
05090 
05091     if (ptr[1] == '+')
05092       {
05093       ptr++;
05094       length += 2 + 2*LINK_SIZE;
05095       }
05096     continue;
05097 
05098     /* Non-special character. For a run of such characters the length required
05099     is the number of characters + 2, except that the maximum run length is
05100     MAXLIT. We won't get a skipped space or a non-data escape or the start of a
05101     # comment as the first character, so the length can't be zero. */
05102 
05103     NORMAL_CHAR:
05104     default:
05105     length += 2;
05106     runlength = 0;
05107     do
05108       {
05109 
05110       /* If in a \Q...\E sequence, check for end; otherwise it's a literal */
05111       if (inescq)
05112         {
05113         if (c == '\\' && ptr[1] == 'E')
05114           {
05115           inescq = false;
05116           ptr++;
05117           }
05118         else runlength++;
05119         continue;
05120         }
05121 
05122       /* Skip whitespace and comments for /x */
05123 
05124       if ((options & PCRE_EXTENDED) != 0)
05125         {
05126         if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
05127         if (c == '#')
05128           {
05129           /* The space before the ; is to avoid a warning on a silly compiler
05130           on the Macintosh. */
05131           while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
05132           continue;
05133           }
05134         }
05135 
05136       /* Backslash may introduce a data char or a metacharacter; stop the
05137       string before the latter. */
05138 
05139       if (c == '\\')
05140         {
05141         const uschar *saveptr = ptr;
05142         c = check_escape(&ptr, errorptr, bracount, options, false);
05143         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
05144         if (c < 0) { ptr = saveptr; break; }
05145 
05146         /* In UTF-8 mode, add on the number of additional bytes needed to
05147         encode this character, and save the total length in case this is a
05148         final char that is repeated. */
05149 
05150         }
05151 
05152       /* Ordinary character or single-char escape */
05153 
05154       runlength++;
05155       }
05156 
05157     /* This "while" is the end of the "do" above. */
05158 
05159     while (runlength < MAXLIT &&
05160       (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
05161 
05162     /* If we hit a meta-character, back off to point to it */
05163 
05164     if (runlength < MAXLIT) ptr--;
05165 
05166     /* If the last char in the string is a UTF-8 multibyte character, we must
05167     set lastcharlength correctly. If it was specified as an escape, this will
05168     already have been done above. However, we also have to support in-line
05169     UTF-8 characters, so check backwards from where we are. */
05170 
05171 
05172     length += runlength;
05173     continue;
05174     }
05175   }
05176 
05177 length += 2 + LINK_SIZE;    /* For final KET and END */
05178 
05179 if (length > MAX_PATTERN_SIZE)
05180   {
05181   *errorptr = ERR20;
05182   return NULL;
05183   }
05184 
05185 /* Compute the size of data block needed and get it, either from malloc or
05186 externally provided function. */
05187 
05188 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
05189 re = static_cast<real_pcre *>(malloc(size));
05190 
05191 if (re == NULL)
05192   {
05193   *errorptr = ERR21;
05194   return NULL;
05195   }
05196 
05197 /* Put in the magic number, and save the size, options, and table pointer */
05198 
05199 re->magic_number = MAGIC_NUMBER;
05200 re->size = size;
05201 re->options = options;
05202 re->tables = tables;
05203 re->name_entry_size = max_name_size + 3;
05204 re->name_count = name_count;
05205 
05206 /* The starting points of the name/number translation table and of the code are
05207 passed around in the compile data block. */
05208 
05209 compile_block.names_found = 0;
05210 compile_block.name_entry_size = max_name_size + 3;
05211 compile_block.name_table = (uschar *)re + sizeof(real_pcre);
05212 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
05213 compile_block.start_code = codestart;
05214 compile_block.req_varyopt = 0;
05215 
05216 /* Set up a starting, non-extracting bracket, then compile the expression. On
05217 error, *errorptr will be set non-NULL, so we don't need to look at the result
05218 of the function here. */
05219 
05220 ptr = (const uschar *)pattern;
05221 code = (uschar *)codestart;
05222 *code = OP_BRA;
05223 bracount = 0;
05224 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
05225   errorptr, false, 0, &firstbyte, &reqbyte, NULL, &compile_block);
05226 re->top_bracket = bracount;
05227 re->top_backref = compile_block.top_backref;
05228 
05229 /* If not reached end of pattern on success, there's an excess bracket. */
05230 
05231 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
05232 
05233 /* Fill in the terminating state and check for disastrous overflow, but
05234 if debugging, leave the test till after things are printed out. */
05235 
05236 *code++ = OP_END;
05237 
05238 if (code - codestart > length) *errorptr = ERR23;
05239 
05240 /* Give an error if there's back reference to a non-existent capturing
05241 subpattern. */
05242 
05243 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
05244 
05245 /* Failed to compile, or error while post-processing */
05246 
05247 if (*errorptr != NULL)
05248   {
05249   free(re);
05250   PCRE_ERROR_RETURN:
05251   *erroroffset = ptr - (const uschar *)pattern;
05252   return NULL;
05253   }
05254 
05255 /* If the anchored option was not passed, set the flag if we can determine that
05256 the pattern is anchored by virtue of ^ characters or \A or anything else (such
05257 as starting with .* when DOTALL is set).
05258 
05259 Otherwise, if we know what the first character has to be, save it, because that
05260 speeds up unanchored matches no end. If not, see if we can set the
05261 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
05262 start with ^. and also when all branches start with .* for non-DOTALL matches.
05263 */
05264 
05265 if ((options & PCRE_ANCHORED) == 0)
05266   {
05267   int temp_options = options;
05268   if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
05269     re->options |= PCRE_ANCHORED;
05270   else
05271     {
05272     if (firstbyte < 0)
05273       firstbyte = find_firstassertedchar(codestart, &temp_options, false);
05274     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
05275       {
05276       int ch = firstbyte & 255;
05277       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
05278          compile_block.fcc[ch] == ch)? ch : firstbyte;
05279       re->options |= PCRE_FIRSTSET;
05280       }
05281     else if (is_startline(codestart, 0, compile_block.backref_map))
05282       re->options |= PCRE_STARTLINE;
05283     }
05284   }
05285 
05286 /* For an anchored pattern, we use the "required byte" only if it follows a
05287 variable length item in the regex. Remove the caseless flag for non-caseable
05288 chars. */
05289 
05290 if (reqbyte >= 0 &&
05291      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
05292   {
05293   int ch = reqbyte & 255;
05294   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
05295     compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
05296   re->options |= PCRE_REQCHSET;
05297   }
05298 
05299 return (pcre *)re;
05300 }
05301 
05302 
05303 
05304 /*************************************************
05305 *          Match a back-reference                *
05306 *************************************************/
05307 
05308 /* If a back reference hasn't been set, the length that is passed is greater
05309 than the number of characters left in the string, so the match fails.
05310 
05311 Arguments:
05312   offset      index into the offset vector
05313   eptr        points into the subject
05314   length      length to be matched
05315   md          points to match data block
05316   ims         the ims flags
05317 
05318 Returns:      true if matched
05319 */
05320 
05321 static bool
05322 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
05323   unsigned long int ims)
05324 {
05325 const uschar *p = md->start_subject + md->offset_vector[offset];
05326 
05327 /* Always fail if not enough characters left */
05328 
05329 if (length > md->end_subject - eptr) return false;
05330 
05331 /* Separate the caselesss case for speed */
05332 
05333 if ((ims & PCRE_CASELESS) != 0)
05334   {
05335   while (length-- > 0)
05336     if (md->lcc[*p++] != md->lcc[*eptr++]) return false;
05337   }
05338 else
05339   { while (length-- > 0) if (*p++ != *eptr++) return false; }
05340 
05341 return true;
05342 }
05343 
05344 /***************************************************************************
05345 ****************************************************************************
05346                    RECURSION IN THE match() FUNCTION
05347 
05348 The match() function is highly recursive. Some regular expressions can cause
05349 it to recurse thousands of times. I was writing for Unix, so I just let it
05350 call itself recursively. This uses the stack for saving everything that has
05351 to be saved for a recursive call. On Unix, the stack can be large, and this
05352 works fine.
05353 
05354 It turns out that on non-Unix systems there are problems with programs that
05355 use a lot of stack. (This despite the fact that every last chip has oodles
05356 of memory these days, and techniques for extending the stack have been known
05357 for decades.) So....
05358 
05359 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
05360 calls by keeping local variables that need to be preserved in blocks of memory
05361 obtained from malloc instead of on the stack. Macros are used to
05362 achieve this so that the actual code doesn't look very different to what it
05363 always used to.
05364 ****************************************************************************
05365 ***************************************************************************/
05366 
05367 
05368 /* These versions of the macros use the stack, as normal */
05369 
/* Stack-based forms: REGISTER expands to the "register" storage class,
RRETURN is a plain return of its argument, and RMATCH stores the result of a
direct call to match() in its first argument. */

#define REGISTER register
#define RRETURN(ra) return ra
#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
  rx = match(ra,rb,rc,rd,re,rf,rg)
05373 
05374 /***************************************************************************
05375 ***************************************************************************/
05376 
05377 
05378 /*************************************************
05379 *         Match from current position            *
05380 *************************************************/
05381 
05382 /* On entry ecode points to the first opcode, and eptr to the first character
05383 in the subject string, while eptrb holds the value of eptr at the start of the
05384 last bracketed group - used for breaking infinite loops matching zero-length
05385 strings. This function is called recursively in many circumstances. Whenever it
05386 returns a negative (error) response, the outer incarnation must also return the
05387 same response.
05388 
05389 Performance note: It might be tempting to extract commonly used fields from the
05390 md structure (e.g. utf8, end_subject) into individual variables to improve
05391 performance. Tests using gcc on a SPARC disproved this; in the first case, it
05392 made performance worse.
05393 
05394 Arguments:
05395    eptr        pointer in subject
05396    ecode       position in code
05397    offset_top  current top pointer
05398    md          pointer to "static" info for the match
05399    ims         current /i, /m, and /s options
05400    eptrb       pointer to chain of blocks containing eptr at start of
05401                  brackets - for testing for empty matches
05402    flags       can contain
05403                  match_condassert - this is an assertion condition
05404                  match_isgroup - this is the start of a bracketed group
05405 
05406 Returns:       MATCH_MATCH if matched            )  these values are >= 0
05407                MATCH_NOMATCH if failed to match  )
05408                a negative PCRE_ERROR_xxx value if aborted by an error condition
05409                  (e.g. stopped by recursion limit)
05410 */
05411 
05412 static int
05413 match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
05414   int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
05415   int flags)
05416 {
05417 /* These variables do not need to be preserved over recursion in this function,
05418 so they can be ordinary variables in all cases. Mark them with "register"
05419 because they are used a lot in loops. */
05420 
05421 register int rrc;    /* Returns from recursive calls */
05422 register int i;      /* Used for loops not involving calls to RMATCH() */
05423 register int c;      /* Character values not kept over RMATCH() calls */
05424 
05425 /* When recursion is not being used, all "local" variables that have to be
05426 preserved over calls to RMATCH() are part of a "frame" which is obtained from
05427 heap storage. Set up the top-level frame here; others are obtained from the
05428 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
05429 
05430 #define fi i
05431 #define fc c
05432 
05433 const uschar *callpat;             /* Many of these variables are used ony */
05434                                    /* small blocks of the code. My normal  */
05435 const uschar *data;                /* style of coding would have declared  */
05436                                    /* them within each of those blocks.    */
05437 const uschar *next;                /* However, in order to accommodate the */
05438 const uschar *pp;                  /* version of this code that uses an    */
05439 const uschar *prev;                /* external "stack" implemented on the  */
05440 const uschar *saved_eptr;          /* heap, it is easier to declare them   */
05441                                    /* all here, so the declarations can    */
05442 recursion_info new_recursive;      /* be cut out in a block. The only      */
05443                                    /* declarations within blocks below are */
05444 bool cur_is_word;                  /* for variables that do not have to    */
05445 bool condition;                    /* be preserved over a recursive call   */
05446 bool minimize;                     /* to RMATCH().                         */
05447 bool prev_is_word;
05448 
05449 unsigned long int original_ims;
05450 
05451 int ctype;
05452 int length;
05453 int max;
05454 int min;
05455 int number;
05456 int offset;
05457 int op;
05458 int save_capture_last;
05459 int save_offset1, save_offset2, save_offset3;
05460 int stacksave[REC_STACK_SAVE_MAX];
05461 
05462 eptrblock newptrb;
05463 
05464 /* OK, now we can get on with the real code of the function. Recursion is
05465 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
05466 these just turn into a recursive call to match() and a "return", respectively.
05467 However, RMATCH isn't like a function call because it's quite a complicated
05468 macro. It has to be used in one particular way. This shouldn't, however, impact
05469 performance when true recursion is being used. */
05470 
05471 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
05472 
05473 original_ims = ims;    /* Save for resetting on ')' */
05474 
05475 /* At the start of a bracketed group, add the current subject pointer to the
05476 stack of such pointers, to be re-instated at the end of the group when we hit
05477 the closing ket. When match() is called in other circumstances, we don't add to
05478 this stack. */
05479 
05480 if ((flags & match_isgroup) != 0)
05481   {
05482   newptrb.epb_prev = eptrb;
05483   newptrb.epb_saved_eptr = eptr;
05484   eptrb = &newptrb;
05485   }
05486 
05487 /* Now start processing the operations. */
05488 
05489 for (;!MuxAlarm.bAlarmed;)
05490   {
05491   op = *ecode;
05492   minimize = false;
05493 
05494   /* Opening capturing bracket. If there is space in the offset vector, save
05495   the current subject position in the working slot at the top of the vector. We
05496   mustn't change the current values of the data slot, because they may be set
05497   from a previous iteration of this group, and be referred to by a reference
05498   inside the group.
05499 
05500   If the bracket fails to match, we need to restore this value and also the
05501   values of the final offsets, in case they were set by a previous iteration of
05502   the same bracket.
05503 
05504   If there isn't enough space in the offset vector, treat this as if it were a
05505   non-capturing bracket. Don't worry about setting the flag for the error case
05506   here; that is handled in the code for KET. */
05507 
05508   if (op > OP_BRA)
05509     {
05510     number = op - OP_BRA;
05511 
05512     /* For extended extraction brackets (large number), we have to fish out the
05513     number from a dummy opcode at the start. */
05514 
05515     if (number > EXTRACT_BASIC_MAX)
05516       number = GET2(ecode, 2+LINK_SIZE);
05517     offset = number << 1;
05518 
05519     if (offset < md->offset_max)
05520       {
05521       save_offset1 = md->offset_vector[offset];
05522       save_offset2 = md->offset_vector[offset+1];
05523       save_offset3 = md->offset_vector[md->offset_end - number];
05524       save_capture_last = md->capture_last;
05525 
05526       DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
05527       md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
05528 
05529       do
05530         {
05531         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
05532           match_isgroup);
05533         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05534         md->capture_last = save_capture_last;
05535         ecode += GET(ecode, 1);
05536         }
05537       while (*ecode == OP_ALT);
05538 
05539       DPRINTF(("bracket %d failed\n", number));
05540 
05541       md->offset_vector[offset] = save_offset1;
05542       md->offset_vector[offset+1] = save_offset2;
05543       md->offset_vector[md->offset_end - number] = save_offset3;
05544 
05545       RRETURN(MATCH_NOMATCH);
05546       }
05547 
05548     /* Insufficient room for saving captured contents */
05549 
05550     else op = OP_BRA;
05551     }
05552 
05553   /* Other types of node can be handled by a switch */
05554 
05555   switch(op)
05556     {
05557     case OP_BRA:     /* Non-capturing bracket: optimized */
05558     DPRINTF(("start bracket 0\n"));
05559     do
05560       {
05561       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
05562         match_isgroup);
05563       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05564       ecode += GET(ecode, 1);
05565       }
05566     while (*ecode == OP_ALT);
05567     DPRINTF(("bracket 0 failed\n"));
05568     RRETURN(MATCH_NOMATCH);
05569 
05570     /* Conditional group: compilation checked that there are no more than
05571     two branches. If the condition is false, skipping the first branch takes us
05572     past the end if there is only one branch, but that's OK because that is
05573     exactly what going to the ket would do. */
05574 
05575     case OP_COND:
05576     if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
05577       {
05578       offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
05579       condition = (offset == CREF_RECURSE * 2)?
05580         (md->recursive != NULL) :
05581         (offset < offset_top && md->offset_vector[offset] >= 0);
05582       RMATCH(rrc, eptr, ecode + (condition?
05583         (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
05584         offset_top, md, ims, eptrb, match_isgroup);
05585       RRETURN(rrc);
05586       }
05587 
05588     /* The condition is an assertion. Call match() to evaluate it - setting
05589     the final argument true causes it to stop at the end of an assertion. */
05590 
05591     else
05592       {
05593       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
05594           match_condassert | match_isgroup);
05595       if (rrc == MATCH_MATCH)
05596         {
05597         ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
05598         while (*ecode == OP_ALT) ecode += GET(ecode, 1);
05599         }
05600       else if (rrc != MATCH_NOMATCH)
05601         {
05602         RRETURN(rrc);         /* Need braces because of following else */
05603         }
05604       else ecode += GET(ecode, 1);
05605       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
05606         match_isgroup);
05607       RRETURN(rrc);
05608       }
05609     /* Control never reaches here */
05610 
05611     /* Skip over conditional reference or large extraction number data if
05612     encountered. */
05613 
05614     case OP_CREF:
05615     case OP_BRANUMBER:
05616     ecode += 3;
05617     break;
05618 
05619     /* End of the pattern. If we are in a recursion, we should restore the
05620     offsets appropriately and continue from after the call. */
05621 
05622     case OP_END:
05623     if (md->recursive != NULL && md->recursive->group_num == 0)
05624       {
05625       recursion_info *rec = md->recursive;
05626       DPRINTF(("Hit the end in a (?0) recursion\n"));
05627       md->recursive = rec->prevrec;
05628       memmove(md->offset_vector, rec->offset_save,
05629         rec->saved_max * sizeof(int));
05630       md->start_match = rec->save_start;
05631       ims = original_ims;
05632       ecode = rec->after_call;
05633       break;
05634       }
05635 
05636     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
05637     string - backtracking will then try other alternatives, if any. */
05638 
05639     if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
05640     md->end_match_ptr = eptr;          /* Record where we ended */
05641     md->end_offset_top = offset_top;   /* and how many extracts were taken */
05642     RRETURN(MATCH_MATCH);
05643 
05644     /* Change option settings */
05645 
05646     case OP_OPT:
05647     ims = ecode[1];
05648     ecode += 2;
05649     DPRINTF(("ims set to %02lx\n", ims));
05650     break;
05651 
05652     /* Assertion brackets. Check the alternative branches in turn - the
05653     matching won't pass the KET for an assertion. If any one branch matches,
05654     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
05655     start of each branch to move the current point backwards, so the code at
05656     this level is identical to the lookahead case. */
05657 
05658     case OP_ASSERT:
05659     case OP_ASSERTBACK:
05660     do
05661       {
05662       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
05663         match_isgroup);
05664       if (rrc == MATCH_MATCH) break;
05665       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05666       ecode += GET(ecode, 1);
05667       }
05668     while (*ecode == OP_ALT);
05669     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
05670 
05671     /* If checking an assertion for a condition, return MATCH_MATCH. */
05672 
05673     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
05674 
05675     /* Continue from after the assertion, updating the offsets high water
05676     mark, since extracts may have been taken during the assertion. */
05677 
05678     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
05679     ecode += 1 + LINK_SIZE;
05680     offset_top = md->end_offset_top;
05681     continue;
05682 
05683     /* Negative assertion: all branches must fail to match */
05684 
05685     case OP_ASSERT_NOT:
05686     case OP_ASSERTBACK_NOT:
05687     do
05688       {
05689       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
05690         match_isgroup);
05691       if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
05692       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05693       ecode += GET(ecode,1);
05694       }
05695     while (*ecode == OP_ALT);
05696 
05697     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
05698 
05699     ecode += 1 + LINK_SIZE;
05700     continue;
05701 
05702     /* Move the subject pointer back. This occurs only at the start of
05703     each branch of a lookbehind assertion. If we are too close to the start to
05704     move back, this match function fails. When working with UTF-8 we move
05705     back a number of characters, not bytes. */
05706 
05707     case OP_REVERSE:
05708 
05709     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
05710 
05711       {
05712       eptr -= GET(ecode,1);
05713       if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
05714       }
05715 
05716     /* Skip to next op code */
05717 
05718     ecode += 1 + LINK_SIZE;
05719     break;
05720 
05721     /* The callout item calls an external function, if one is provided, passing
05722     details of the match so far. This is mainly for debugging, though the
05723     function is able to force a failure. */
05724 
05725     case OP_CALLOUT:
05726     if (pcre_callout != NULL)
05727       {
05728       pcre_callout_block cb;
05729       cb.version          = 0;   /* Version 0 of the callout block */
05730       cb.callout_number   = ecode[1];
05731       cb.offset_vector    = md->offset_vector;
05732       cb.subject          = (const char *)md->start_subject;
05733       cb.subject_length   = md->end_subject - md->start_subject;
05734       cb.start_match      = md->start_match - md->start_subject;
05735       cb.current_position = eptr - md->start_subject;
05736       cb.capture_top      = offset_top/2;
05737       cb.capture_last     = md->capture_last;
05738       cb.callout_data     = md->callout_data;
05739       if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
05740       if (rrc < 0) RRETURN(rrc);
05741       }
05742     ecode += 2;
05743     break;
05744 
05745     /* Recursion either matches the current regex, or some subexpression. The
05746     offset data is the offset to the starting bracket from the start of the
05747     whole pattern. (This is so that it works from duplicated subpatterns.)
05748 
05749     If there are any capturing brackets started but not finished, we have to
05750     save their starting points and reinstate them after the recursion. However,
05751     we don't know how many such there are (offset_top records the completed
05752     total) so we just have to save all the potential data. There may be up to
05753     65535 such values, which is too large to put on the stack, but using malloc
05754     for small numbers seems expensive. As a compromise, the stack is used when
05755     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
05756     is used. If the malloc fails, the saved data cannot be kept, so the
05757     match is abandoned at this point and PCRE_ERROR_NOMEMORY is returned
05758     instead of carrying on with offset values that may be wrong.
05759 
05760     There are also other values that have to be saved. We use a chained
05761     sequence of blocks that actually live on the stack. Thanks to Robin Houston
05762     for the original version of this logic. */
05763 
05764     case OP_RECURSE:
05765       {
05766       callpat = md->start_code + GET(ecode, 1);
05767       new_recursive.group_num = *callpat - OP_BRA;
05768 
05769       /* For extended extraction brackets (large number), we have to fish out
05770       the number from a dummy opcode at the start. */
05771 
05772       if (new_recursive.group_num > EXTRACT_BASIC_MAX)
05773         new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
05774 
05775       /* Add to "recursing stack" */
05776 
05777       new_recursive.prevrec = md->recursive;
05778       md->recursive = &new_recursive;
05779 
05780       /* Find where to continue from afterwards */
05781 
05782       ecode += 1 + LINK_SIZE;
05783       new_recursive.after_call = ecode;
05784 
05785       /* Now save the offset data. */
05786 
05787       new_recursive.saved_max = md->offset_end;
05788       if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
05789         new_recursive.offset_save = stacksave;
05790       else
05791         {
05792         new_recursive.offset_save =
05793           static_cast<int *>(malloc(new_recursive.saved_max * sizeof(int)));
05794         if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
05795         }
05796 
05797       memcpy(new_recursive.offset_save, md->offset_vector,
05798             new_recursive.saved_max * sizeof(int));
05799       new_recursive.save_start = md->start_match;
05800       md->start_match = eptr;
05801 
05802       /* OK, now we can do the recursion. For each top-level alternative we
05803       restore the offset and recursion data. */
05804 
05805       DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
05806       do
05807         {
05808         RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
05809             eptrb, match_isgroup);
05810         if (rrc == MATCH_MATCH)
05811           {
05812           md->recursive = new_recursive.prevrec;
05813           if (new_recursive.offset_save != stacksave)
05814             free(new_recursive.offset_save);
05815           RRETURN(MATCH_MATCH);
05816           }
05817         else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05818 
05819         md->recursive = &new_recursive;
05820         memcpy(md->offset_vector, new_recursive.offset_save,
05821             new_recursive.saved_max * sizeof(int));
05822         callpat += GET(callpat, 1);
05823         }
05824       while (*callpat == OP_ALT);
05825 
05826       DPRINTF(("Recursion didn't match\n"));
05827       md->recursive = new_recursive.prevrec;
05828       if (new_recursive.offset_save != stacksave)
05829         free(new_recursive.offset_save);
05830       RRETURN(MATCH_NOMATCH);
05831       }
05832     /* Control never reaches here */
05833 
05834     /* "Once" brackets are like assertion brackets except that after a match,
05835     the point in the subject string is not moved back. Thus there can never be
05836     a move back into the brackets. Friedl calls these "atomic" subpatterns.
05837     Check the alternative branches in turn - the matching won't pass the KET
05838     for this kind of subpattern. If any one branch matches, we carry on as at
05839     the end of a normal bracket, leaving the subject pointer. */
05840 
05841     case OP_ONCE:
05842       {
05843       prev = ecode;
05844       saved_eptr = eptr;
05845 
05846       do
05847         {
05848         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
05849           eptrb, match_isgroup);
05850         if (rrc == MATCH_MATCH) break;
05851         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05852         ecode += GET(ecode,1);
05853         }
05854       while (*ecode == OP_ALT);
05855 
05856       /* If hit the end of the group (which could be repeated), fail */
05857 
05858       if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
05859 
05860       /* Continue as from after the assertion, updating the offsets high water
05861       mark, since extracts may have been taken. */
05862 
05863       do ecode += GET(ecode,1); while (*ecode == OP_ALT);
05864 
05865       offset_top = md->end_offset_top;
05866       eptr = md->end_match_ptr;
05867 
05868       /* For a non-repeating ket, just continue at this level. This also
05869       happens for a repeating ket if no characters were matched in the group.
05870       This is the forcible breaking of infinite loops as implemented in Perl
05871       5.005. If there is an options reset, it will get obeyed in the normal
05872       course of events. */
05873 
05874       if (*ecode == OP_KET || eptr == saved_eptr)
05875         {
05876         ecode += 1+LINK_SIZE;
05877         break;
05878         }
05879 
05880       /* The repeating kets try the rest of the pattern or restart from the
05881       preceding bracket, in the appropriate order. We need to reset any options
05882       that changed within the bracket before re-running it, so check the next
05883       opcode. */
05884 
05885       if (ecode[1+LINK_SIZE] == OP_OPT)
05886         {
05887         ims = (ims & ~PCRE_IMS) | ecode[4];
05888         DPRINTF(("ims set to %02lx at group repeat\n", ims));
05889         }
05890 
05891       if (*ecode == OP_KETRMIN)
05892         {
05893         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
05894         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05895         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
05896         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05897         }
05898       else  /* OP_KETRMAX */
05899         {
05900         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
05901         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05902         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
05903         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05904         }
05905       }
05906     RRETURN(MATCH_NOMATCH);
05907 
05908     /* An alternation is the end of a branch; scan along to find the end of the
05909     bracketed group and go to there. */
05910 
05911     case OP_ALT:
05912     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
05913     break;
05914 
05915     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
05916     that it may occur zero times. It may repeat infinitely, or not at all -
05917     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
05918     repeat limits are compiled as a number of copies, with the optional ones
05919     preceded by BRAZERO or BRAMINZERO. */
05920 
05921     case OP_BRAZERO:
05922       {
05923       next = ecode+1;
05924       RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
05925       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05926       do next += GET(next,1); while (*next == OP_ALT);
05927       ecode = next + 1+LINK_SIZE;
05928       }
05929     break;
05930 
05931     case OP_BRAMINZERO:
05932       {
05933       next = ecode+1;
05934       do next += GET(next,1); while (*next == OP_ALT);
05935       RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
05936         match_isgroup);
05937       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05938       ecode++;
05939       }
05940     break;
05941 
05942     /* End of a group, repeated or non-repeating. If we are at the end of
05943     an assertion "group", stop matching and return MATCH_MATCH, but record the
05944     current high water mark for use by positive assertions. Do this also
05945     for the "once" (atomic, i.e. never backed up into) groups. */
05946 
05947     case OP_KET:
05948     case OP_KETRMIN:
05949     case OP_KETRMAX:
05950       {
05951       prev = ecode - GET(ecode, 1);
05952       saved_eptr = eptrb->epb_saved_eptr;
05953 
05954       /* Back up the stack of bracket start pointers. */
05955 
05956       eptrb = eptrb->epb_prev;
05957 
05958       if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
05959           *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
05960           *prev == OP_ONCE)
05961         {
05962         md->end_match_ptr = eptr;      /* For ONCE */
05963         md->end_offset_top = offset_top;
05964         RRETURN(MATCH_MATCH);
05965         }
05966 
05967       /* In all other cases except a conditional group we have to check the
05968       group number back at the start and if necessary complete handling an
05969       extraction by setting the offsets and bumping the high water mark. */
05970 
05971       if (*prev != OP_COND)
05972         {
05973         number = *prev - OP_BRA;
05974 
05975         /* For extended extraction brackets (large number), we have to fish out
05976         the number from a dummy opcode at the start. */
05977 
05978         if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
05979         offset = number << 1;
05980 
05981         /* Test for a numbered group. This includes groups called as a result
05982         of recursion. Note that whole-pattern recursion is coded as a recurse
05983         into group 0, so it won't be picked up here. Instead, we catch it when
05984         the OP_END is reached. */
05985 
05986         if (number > 0)
05987           {
05988           md->capture_last = number;
05989           if (offset >= md->offset_max) md->offset_overflow = true; else
05990             {
05991             md->offset_vector[offset] =
05992               md->offset_vector[md->offset_end - number];
05993             md->offset_vector[offset+1] = eptr - md->start_subject;
05994             if (offset_top <= offset) offset_top = offset + 2;
05995             }
05996 
05997           /* Handle a recursively called group. Restore the offsets
05998           appropriately and continue from after the call. */
05999 
06000           if (md->recursive != NULL && md->recursive->group_num == number)
06001             {
06002             recursion_info *rec = md->recursive;
06003             DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
06004             md->recursive = rec->prevrec;
06005             md->start_match = rec->save_start;
06006             memcpy(md->offset_vector, rec->offset_save,
06007               rec->saved_max * sizeof(int));
06008             ecode = rec->after_call;
06009             ims = original_ims;
06010             break;
06011             }
06012           }
06013         }
06014 
06015       /* Reset the value of the ims flags, in case they got changed during
06016       the group. */
06017 
06018       ims = original_ims;
06019       DPRINTF(("ims reset to %02lx\n", ims));
06020 
06021       /* For a non-repeating ket, just continue at this level. This also
06022       happens for a repeating ket if no characters were matched in the group.
06023       This is the forcible breaking of infinite loops as implemented in Perl
06024       5.005. If there is an options reset, it will get obeyed in the normal
06025       course of events. */
06026 
06027       if (*ecode == OP_KET || eptr == saved_eptr)
06028         {
06029         ecode += 1 + LINK_SIZE;
06030         break;
06031         }
06032 
06033       /* The repeating kets try the rest of the pattern or restart from the
06034       preceding bracket, in the appropriate order. */
06035 
06036       if (*ecode == OP_KETRMIN)
06037         {
06038         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
06039         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06040         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
06041         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06042         }
06043       else  /* OP_KETRMAX */
06044         {
06045         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
06046         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06047         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
06048         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06049         }
06050       }
06051 
06052     RRETURN(MATCH_NOMATCH);
06053 
06054     /* Start of subject unless notbol, or after internal newline if multiline */
06055 
06056     case OP_CIRC:
06057     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
06058     if ((ims & PCRE_MULTILINE) != 0)
06059       {
06060       if (eptr != md->start_subject && eptr[-1] != NEWLINE)
06061         RRETURN(MATCH_NOMATCH);
06062       ecode++;
06063       break;
06064       }
06065     /* ... else fall through */
06066 
06067     /* Start of subject assertion */
06068 
06069     case OP_SOD:
06070     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
06071     ecode++;
06072     break;
06073 
06074     /* Start of match assertion */
06075 
06076     case OP_SOM:
06077     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
06078     ecode++;
06079     break;
06080 
06081     /* Assert before internal newline if multiline, or before a terminating
06082     newline unless endonly is set, else end of subject unless noteol is set. */
06083 
06084     case OP_DOLL:
06085     if ((ims & PCRE_MULTILINE) != 0)
06086       {
06087       if (eptr < md->end_subject)
06088         { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
06089       else
06090         { if (md->noteol) RRETURN(MATCH_NOMATCH); }
06091       ecode++;
06092       break;
06093       }
06094     else
06095       {
06096       if (md->noteol) RRETURN(MATCH_NOMATCH);
06097       if (!md->endonly)
06098         {
06099         if (eptr < md->end_subject - 1 ||
06100            (eptr == md->end_subject - 1 && *eptr != NEWLINE))
06101           RRETURN(MATCH_NOMATCH);
06102         ecode++;
06103         break;
06104         }
06105       }
06106     /* ... else fall through */
06107 
06108     /* End of subject assertion (\z) */
06109 
06110     case OP_EOD:
06111     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
06112     ecode++;
06113     break;
06114 
06115     /* End of subject or ending \n assertion (\Z) */
06116 
06117     case OP_EODN:
06118     if (eptr < md->end_subject - 1 ||
06119        (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
06120     ecode++;
06121     break;
06122 
06123     /* Word boundary assertions */
06124 
06125     case OP_NOT_WORD_BOUNDARY:
06126     case OP_WORD_BOUNDARY:
06127       {
06128 
06129       /* Find out if the previous and current characters are "word" characters.
06130       It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
06131       be "non-word" characters. */
06132 
06133 
06134       /* More streamlined when not in UTF-8 mode */
06135 
06136         {
06137         prev_is_word = (eptr != md->start_subject) &&
06138           ((md->ctypes[eptr[-1]] & ctype_word) != 0);
06139         cur_is_word = (eptr < md->end_subject) &&
06140           ((md->ctypes[*eptr] & ctype_word) != 0);
06141         }
06142 
06143       /* Now see if the situation is what we want */
06144 
06145       if ((*ecode++ == OP_WORD_BOUNDARY)?
06146            cur_is_word == prev_is_word : cur_is_word != prev_is_word)
06147         RRETURN(MATCH_NOMATCH);
06148       }
06149     break;
06150 
06151     /* Match a single character type; inline for speed */
06152 
06153     case OP_ANY:
06154     if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
06155       RRETURN(MATCH_NOMATCH);
06156     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
06157     ecode++;
06158     break;
06159 
06160     /* Match a single byte, even in UTF-8 mode. This opcode really does match
06161     any byte, even newline, independent of the setting of PCRE_DOTALL. */
06162 
06163     case OP_ANYBYTE:
06164     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
06165     ecode++;
06166     break;
06167 
06168     case OP_NOT_DIGIT:
06169     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06170     GETCHARINCTEST(c, eptr);
06171     if (
06172        (md->ctypes[c] & ctype_digit) != 0
06173        )
06174       RRETURN(MATCH_NOMATCH);
06175     ecode++;
06176     break;
06177 
06178     case OP_DIGIT:
06179     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06180     GETCHARINCTEST(c, eptr);
06181     if (
06182        (md->ctypes[c] & ctype_digit) == 0
06183        )
06184       RRETURN(MATCH_NOMATCH);
06185     ecode++;
06186     break;
06187 
06188     case OP_NOT_WHITESPACE:
06189     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06190     GETCHARINCTEST(c, eptr);
06191     if (
06192        (md->ctypes[c] & ctype_space) != 0
06193        )
06194       RRETURN(MATCH_NOMATCH);
06195     ecode++;
06196     break;
06197 
06198     case OP_WHITESPACE:
06199     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06200     GETCHARINCTEST(c, eptr);
06201     if (
06202        (md->ctypes[c] & ctype_space) == 0
06203        )
06204       RRETURN(MATCH_NOMATCH);
06205     ecode++;
06206     break;
06207 
06208     case OP_NOT_WORDCHAR:
06209     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06210     GETCHARINCTEST(c, eptr);
06211     if (
06212        (md->ctypes[c] & ctype_word) != 0
06213        )
06214       RRETURN(MATCH_NOMATCH);
06215     ecode++;
06216     break;
06217 
06218     case OP_WORDCHAR:
06219     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06220     GETCHARINCTEST(c, eptr);
06221     if (
06222        (md->ctypes[c] & ctype_word) == 0
06223        )
06224       RRETURN(MATCH_NOMATCH);
06225     ecode++;
06226     break;
06227 
06228     /* Match a back reference, possibly repeatedly. Look past the end of the
06229     item to see if there is repeat information following. The code is similar
06230     to that for character classes, but repeated for efficiency. Then obey
06231     similar code to character type repeats - written out again for speed.
06232     However, if the referenced string is the empty string, always treat
06233     it as matched, any number of times (otherwise there could be infinite
06234     loops). */
06235 
06236     case OP_REF:
06237       {
06238       offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
06239       ecode += 3;                                     /* Advance past item */
06240 
06241       /* If the reference is unset, set the length to be longer than the amount
06242       of subject left; this ensures that every attempt at a match fails. We
06243       can't just fail here, because of the possibility of quantifiers with zero
06244       minima. */
06245 
06246       length = (offset >= offset_top || md->offset_vector[offset] < 0)?
06247         md->end_subject - eptr + 1 :
06248         md->offset_vector[offset+1] - md->offset_vector[offset];
06249 
06250       /* Set up for repetition, or handle the non-repeated case */
06251 
06252       switch (*ecode)
06253         {
06254         case OP_CRSTAR:
06255         case OP_CRMINSTAR:
06256         case OP_CRPLUS:
06257         case OP_CRMINPLUS:
06258         case OP_CRQUERY:
06259         case OP_CRMINQUERY:
06260         c = *ecode++ - OP_CRSTAR;
06261         minimize = (c & 1) != 0;
06262         min = rep_min[c];                 /* Pick up values from tables; */
06263         max = rep_max[c];                 /* zero for max => infinity */
06264         if (max == 0) max = INT_MAX;
06265         break;
06266 
06267         case OP_CRRANGE:
06268         case OP_CRMINRANGE:
06269         minimize = (*ecode == OP_CRMINRANGE);
06270         min = GET2(ecode, 1);
06271         max = GET2(ecode, 3);
06272         if (max == 0) max = INT_MAX;
06273         ecode += 5;
06274         break;
06275 
06276         default:               /* No repeat follows */
06277         if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
06278         eptr += length;
06279         continue;              /* With the main loop */
06280         }
06281 
06282       /* If the length of the reference is zero, just continue with the
06283       main loop. */
06284 
06285       if (length == 0) continue;
06286 
06287       /* First, ensure the minimum number of matches are present. We get back
06288       the length of the reference string explicitly rather than passing the
06289       address of eptr, so that eptr can be a register variable. */
06290 
06291       for (i = 1; i <= min; i++)
06292         {
06293         if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
06294         eptr += length;
06295         }
06296 
06297       /* If min = max, continue at the same level without recursion.
06298       They are not both allowed to be zero. */
06299 
06300       if (min == max) continue;
06301 
06302       /* If minimizing, keep trying and advancing the pointer */
06303 
06304       if (minimize)
06305         {
06306         for (fi = min;; fi++)
06307           {
06308           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06309           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06310           if (fi >= max || !match_ref(offset, eptr, length, md, ims))
06311             RRETURN(MATCH_NOMATCH);
06312           eptr += length;
06313           }
06314         /* Control never gets here */
06315         }
06316 
06317       /* If maximizing, find the longest string and work backwards */
06318 
06319       else
06320         {
06321         pp = eptr;
06322         for (i = min; i < max; i++)
06323           {
06324           if (!match_ref(offset, eptr, length, md, ims)) break;
06325           eptr += length;
06326           }
06327         while (eptr >= pp)
06328           {
06329           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06330           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06331           eptr -= length;
06332           }
06333         RRETURN(MATCH_NOMATCH);
06334         }
06335       }
06336     /* Control never gets here */
06337 
06338 
06339 
06340     /* Match a bit-mapped character class, possibly repeatedly. This op code is
06341     used when all the characters in the class have values in the range 0-255.
06342     The only difference between OP_CLASS and OP_NCLASS occurs when a data
06343     character outside the range is encountered.
06344 
06345     First, look past the end of the item to see if there is repeat information
06346     following. Then obey similar code to character type repeats - written out
06347     again for speed. */
06348 
06349     case OP_NCLASS:
06350     case OP_CLASS:
06351       {
06352       data = ecode + 1;                /* Save for matching */
06353       ecode += 33;                     /* Advance past the item */
06354 
06355       switch (*ecode)
06356         {
06357         case OP_CRSTAR:
06358         case OP_CRMINSTAR:
06359         case OP_CRPLUS:
06360         case OP_CRMINPLUS:
06361         case OP_CRQUERY:
06362         case OP_CRMINQUERY:
06363         c = *ecode++ - OP_CRSTAR;
06364         minimize = (c & 1) != 0;
06365         min = rep_min[c];                 /* Pick up values from tables; */
06366         max = rep_max[c];                 /* zero for max => infinity */
06367         if (max == 0) max = INT_MAX;
06368         break;
06369 
06370         case OP_CRRANGE:
06371         case OP_CRMINRANGE:
06372         minimize = (*ecode == OP_CRMINRANGE);
06373         min = GET2(ecode, 1);
06374         max = GET2(ecode, 3);
06375         if (max == 0) max = INT_MAX;
06376         ecode += 5;
06377         break;
06378 
06379         default:               /* No repeat follows */
06380         min = max = 1;
06381         break;
06382         }
06383 
06384       /* First, ensure the minimum number of matches are present. */
06385 
06386       /* Not UTF-8 mode */
06387         {
06388         for (i = 1; i <= min; i++)
06389           {
06390           if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06391           c = *eptr++;
06392           if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
06393           }
06394         }
06395 
06396       /* If max == min we can continue with the main loop without the
06397       need to recurse. */
06398 
06399       if (min == max) continue;
06400 
06401       /* If minimizing, keep testing the rest of the expression and advancing
06402       the pointer while it matches the class. */
06403 
06404       if (minimize)
06405         {
06406         /* Not UTF-8 mode */
06407           {
06408           for (fi = min;; fi++)
06409             {
06410             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06411             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06412             if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06413             c = *eptr++;
06414             if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
06415             }
06416           }
06417         /* Control never gets here */
06418         }
06419 
06420       /* If maximizing, find the longest possible run, then work backwards. */
06421 
06422       else
06423         {
06424         pp = eptr;
06425 
06426           /* Not UTF-8 mode */
06427           {
06428           for (i = min; i < max; i++)
06429             {
06430             if (eptr >= md->end_subject) break;
06431             c = *eptr;
06432             if ((data[c/8] & (1 << (c&7))) == 0) break;
06433             eptr++;
06434             }
06435           while (eptr >= pp)
06436             {
06437             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06438             eptr--;
06439             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06440             }
06441           }
06442 
06443         RRETURN(MATCH_NOMATCH);
06444         }
06445       }
06446     /* Control never gets here */
06447 
06448 
06449     /* Match an extended character class. This opcode is encountered only
06450     in UTF-8 mode, because that's the only time it is compiled. */
06451 
06452 
06453     /* Match a run of characters */
06454 
06455     case OP_CHARS:
06456       {
06457       register int slen = ecode[1];
06458       ecode += 2;
06459 
06460       if (slen > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
06461       if ((ims & PCRE_CASELESS) != 0)
06462         {
06463         while (slen-- > 0)
06464           if (md->lcc[*ecode++] != md->lcc[*eptr++])
06465             RRETURN(MATCH_NOMATCH);
06466         }
06467       else
06468         {
06469         while (slen-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
06470         }
06471       }
06472     break;
06473 
06474     /* Match a single character repeatedly; different opcodes share code. */
06475 
06476     case OP_EXACT:
06477     min = max = GET2(ecode, 1);
06478     ecode += 3;
06479     goto REPEATCHAR;
06480 
06481     case OP_UPTO:
06482     case OP_MINUPTO:
06483     min = 0;
06484     max = GET2(ecode, 1);
06485     minimize = *ecode == OP_MINUPTO;
06486     ecode += 3;
06487     goto REPEATCHAR;
06488 
06489     case OP_STAR:
06490     case OP_MINSTAR:
06491     case OP_PLUS:
06492     case OP_MINPLUS:
06493     case OP_QUERY:
06494     case OP_MINQUERY:
06495     c = *ecode++ - OP_STAR;
06496     minimize = (c & 1) != 0;
06497     min = rep_min[c];                 /* Pick up values from tables; */
06498     max = rep_max[c];                 /* zero for max => infinity */
06499     if (max == 0) max = INT_MAX;
06500 
06501     /* Common code for all repeated single-character matches. We can give
06502     up quickly if there are fewer than the minimum number of characters left in
06503     the subject. */
06504 
06505     REPEATCHAR:
06506 
06507     /* When not in UTF-8 mode, load a single-byte character. */
06508       {
06509       if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
06510       fc = *ecode++;
06511       }
06512 
06513     /* The value of fc at this point is always less than 256, though we may or
06514     may not be in UTF-8 mode. The code is duplicated for the caseless and
06515     caseful cases, for speed, since matching characters is likely to be quite
06516     common. First, ensure the minimum number of matches are present. If min =
06517     max, continue at the same level without recursing. Otherwise, if
06518     minimizing, keep trying the rest of the expression and advancing one
06519     matching character if failing, up to the maximum. Alternatively, if
06520     maximizing, find the maximum number of characters and work backwards. */
06521 
06522     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
06523       max, eptr));
06524 
06525     if ((ims & PCRE_CASELESS) != 0)
06526       {
06527       fc = md->lcc[fc];
06528       for (i = 1; i <= min; i++)
06529         if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
06530       if (min == max) continue;
06531       if (minimize)
06532         {
06533         for (fi = min;; fi++)
06534           {
06535           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06536           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06537           if (fi >= max || eptr >= md->end_subject ||
06538               fc != md->lcc[*eptr++])
06539             RRETURN(MATCH_NOMATCH);
06540           }
06541         /* Control never gets here */
06542         }
06543       else
06544         {
06545         pp = eptr;
06546         for (i = min; i < max; i++)
06547           {
06548           if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
06549           eptr++;
06550           }
06551         while (eptr >= pp)
06552           {
06553           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06554           eptr--;
06555           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06556           }
06557         RRETURN(MATCH_NOMATCH);
06558         }
06559       /* Control never gets here */
06560       }
06561 
06562     /* Caseful comparisons (includes all multi-byte characters) */
06563 
06564     else
06565       {
06566       for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
06567       if (min == max) continue;
06568       if (minimize)
06569         {
06570         for (fi = min;; fi++)
06571           {
06572           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06573           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06574           if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
06575             RRETURN(MATCH_NOMATCH);
06576           }
06577         /* Control never gets here */
06578         }
06579       else
06580         {
06581         pp = eptr;
06582         for (i = min; i < max; i++)
06583           {
06584           if (eptr >= md->end_subject || fc != *eptr) break;
06585           eptr++;
06586           }
06587         while (eptr >= pp)
06588           {
06589           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06590           eptr--;
06591           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06592           }
06593         RRETURN(MATCH_NOMATCH);
06594         }
06595       }
06596     /* Control never gets here */
06597 
06598     /* Match a negated single one-byte character. The character we are
06599     checking can be multibyte. */
06600 
06601     case OP_NOT:
06602     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06603     ecode++;
06604     GETCHARINCTEST(c, eptr);
06605     if ((ims & PCRE_CASELESS) != 0)
06606       {
06607       c = md->lcc[c];
06608       if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
06609       }
06610     else
06611       {
06612       if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
06613       }
06614     break;
06615 
06616     /* Match a negated single one-byte character repeatedly. This is almost a
06617     repeat of the code for a repeated single character, but I haven't found a
06618     nice way of commoning these up that doesn't require a test of the
06619     positive/negative option for each character match. Maybe that wouldn't add
06620     very much to the time taken, but character matching *is* what this is all
06621     about... */
06622 
06623     case OP_NOTEXACT:
06624     min = max = GET2(ecode, 1);
06625     ecode += 3;
06626     goto REPEATNOTCHAR;
06627 
06628     case OP_NOTUPTO:
06629     case OP_NOTMINUPTO:
06630     min = 0;
06631     max = GET2(ecode, 1);
06632     minimize = *ecode == OP_NOTMINUPTO;
06633     ecode += 3;
06634     goto REPEATNOTCHAR;
06635 
06636     case OP_NOTSTAR:
06637     case OP_NOTMINSTAR:
06638     case OP_NOTPLUS:
06639     case OP_NOTMINPLUS:
06640     case OP_NOTQUERY:
06641     case OP_NOTMINQUERY:
06642     c = *ecode++ - OP_NOTSTAR;
06643     minimize = (c & 1) != 0;
06644     min = rep_min[c];                 /* Pick up values from tables; */
06645     max = rep_max[c];                 /* zero for max => infinity */
06646     if (max == 0) max = INT_MAX;
06647 
06648     /* Common code for all repeated single-character (less than 255) matches.
06649     We can give up quickly if there are fewer than the minimum number of
06650     characters left in the subject. */
06651 
06652     REPEATNOTCHAR:
06653     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
06654     fc = *ecode++;
06655 
06656     /* The code is duplicated for the caseless and caseful cases, for speed,
06657     since matching characters is likely to be quite common. First, ensure the
06658     minimum number of matches are present. If min = max, continue at the same
06659     level without recursing. Otherwise, if minimizing, keep trying the rest of
06660     the expression and advancing one matching character if failing, up to the
06661     maximum. Alternatively, if maximizing, find the maximum number of
06662     characters and work backwards. */
06663 
06664     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
06665       max, eptr));
06666 
06667     if ((ims & PCRE_CASELESS) != 0)
06668       {
06669       fc = md->lcc[fc];
06670 
06671 
06672       /* Not UTF-8 mode */
06673         {
06674         for (i = 1; i <= min; i++)
06675           if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
06676         }
06677 
06678       if (min == max) continue;
06679 
06680       if (minimize)
06681         {
06682         /* Not UTF-8 mode */
06683           {
06684           for (fi = min;; fi++)
06685             {
06686             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06687             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06688             if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
06689               RRETURN(MATCH_NOMATCH);
06690             }
06691           }
06692         /* Control never gets here */
06693         }
06694 
06695       /* Maximize case */
06696 
06697       else
06698         {
06699         pp = eptr;
06700 
06701         /* Not UTF-8 mode */
06702           {
06703           for (i = min; i < max; i++)
06704             {
06705             if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
06706             eptr++;
06707             }
06708           while (eptr >= pp)
06709             {
06710             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06711             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06712             eptr--;
06713             }
06714           }
06715 
06716         RRETURN(MATCH_NOMATCH);
06717         }
06718       /* Control never gets here */
06719       }
06720 
06721     /* Caseful comparisons */
06722 
06723     else
06724       {
06725       /* Not UTF-8 mode */
06726         {
06727         for (i = 1; i <= min; i++)
06728           if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
06729         }
06730 
06731       if (min == max) continue;
06732 
06733       if (minimize)
06734         {
06735         /* Not UTF-8 mode */
06736           {
06737           for (fi = min;; fi++)
06738             {
06739             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06740             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06741             if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
06742               RRETURN(MATCH_NOMATCH);
06743             }
06744           }
06745         /* Control never gets here */
06746         }
06747 
06748       /* Maximize case */
06749 
06750       else
06751         {
06752         pp = eptr;
06753 
06754         /* Not UTF-8 mode */
06755           {
06756           for (i = min; i < max; i++)
06757             {
06758             if (eptr >= md->end_subject || fc == *eptr) break;
06759             eptr++;
06760             }
06761           while (eptr >= pp)
06762             {
06763             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06764             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06765             eptr--;
06766             }
06767           }
06768 
06769         RRETURN(MATCH_NOMATCH);
06770         }
06771       }
06772     /* Control never gets here */
06773 
06774     /* Match a single character type repeatedly; several different opcodes
06775     share code. This is very similar to the code for single characters, but we
06776     repeat it in the interests of efficiency. */
06777 
06778     case OP_TYPEEXACT:
06779     min = max = GET2(ecode, 1);
06780     minimize = true;
06781     ecode += 3;
06782     goto REPEATTYPE;
06783 
06784     case OP_TYPEUPTO:
06785     case OP_TYPEMINUPTO:
06786     min = 0;
06787     max = GET2(ecode, 1);
06788     minimize = *ecode == OP_TYPEMINUPTO;
06789     ecode += 3;
06790     goto REPEATTYPE;
06791 
06792     case OP_TYPESTAR:
06793     case OP_TYPEMINSTAR:
06794     case OP_TYPEPLUS:
06795     case OP_TYPEMINPLUS:
06796     case OP_TYPEQUERY:
06797     case OP_TYPEMINQUERY:
06798     c = *ecode++ - OP_TYPESTAR;
06799     minimize = (c & 1) != 0;
06800     min = rep_min[c];                 /* Pick up values from tables; */
06801     max = rep_max[c];                 /* zero for max => infinity */
06802     if (max == 0) max = INT_MAX;
06803 
06804     /* Common code for all repeated single character type matches. Note that
06805     in UTF-8 mode, '.' matches a character of any length, but for the other
06806     character types, the valid characters are all one-byte long. */
06807 
06808     REPEATTYPE:
06809     ctype = *ecode++;      /* Code for the character type */
06810 
06811     /* First, ensure the minimum number of matches are present. Use inline
06812     code for maximizing the speed, and do the type test once at the start
06813     (i.e. keep it out of the loop). Also we can test that there are at least
06814     the minimum number of bytes before we start. This isn't as effective in
06815     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
06816     is tidier. */
06817 
06818     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
06819     if (min > 0)
06820       {
06821 
06822       /* Code for the non-UTF-8 case for minimum matching */
06823 
06824       switch(ctype)
06825         {
06826         case OP_ANY:
06827         if ((ims & PCRE_DOTALL) == 0)
06828           {
06829           for (i = 1; i <= min; i++)
06830             if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
06831           }
06832         else eptr += min;
06833         break;
06834 
06835         case OP_ANYBYTE:
06836         eptr += min;
06837         break;
06838 
06839         case OP_NOT_DIGIT:
06840         for (i = 1; i <= min; i++)
06841           if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
06842         break;
06843 
06844         case OP_DIGIT:
06845         for (i = 1; i <= min; i++)
06846           if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
06847         break;
06848 
06849         case OP_NOT_WHITESPACE:
06850         for (i = 1; i <= min; i++)
06851           if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
06852         break;
06853 
06854         case OP_WHITESPACE:
06855         for (i = 1; i <= min; i++)
06856           if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
06857         break;
06858 
06859         case OP_NOT_WORDCHAR:
06860         for (i = 1; i <= min; i++)
06861           if ((md->ctypes[*eptr++] & ctype_word) != 0)
06862             RRETURN(MATCH_NOMATCH);
06863         break;
06864 
06865         case OP_WORDCHAR:
06866         for (i = 1; i <= min; i++)
06867           if ((md->ctypes[*eptr++] & ctype_word) == 0)
06868             RRETURN(MATCH_NOMATCH);
06869         break;
06870         }
06871       }
06872 
06873     /* If min = max, continue at the same level without recursing */
06874 
06875     if (min == max) continue;
06876 
06877     /* If minimizing, we have to test the rest of the pattern before each
06878     subsequent match. Again, separate the UTF-8 case for speed. */
06879 
06880     if (minimize)
06881       {
06882       /* Not UTF-8 mode */
06883         {
06884         for (fi = min;; fi++)
06885           {
06886           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06887           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06888           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06889           c = *eptr++;
06890           switch(ctype)
06891             {
06892             case OP_ANY:
06893             if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
06894             break;
06895 
06896             case OP_ANYBYTE:
06897             break;
06898 
06899             case OP_NOT_DIGIT:
06900             if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
06901             break;
06902 
06903             case OP_DIGIT:
06904             if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
06905             break;
06906 
06907             case OP_NOT_WHITESPACE:
06908             if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
06909             break;
06910 
06911             case OP_WHITESPACE:
06912             if  ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
06913             break;
06914 
06915             case OP_NOT_WORDCHAR:
06916             if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
06917             break;
06918 
06919             case OP_WORDCHAR:
06920             if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
06921             break;
06922             }
06923           }
06924         }
06925       /* Control never gets here */
06926       }
06927 
06928     /* If maximizing it is worth using inline code for speed, doing the type
06929     test once at the start (i.e. keep it out of the loop). Again, keep the
06930     UTF-8 stuff separate. */
06931 
06932     else
06933       {
06934       pp = eptr;
06935 
06936       /* Not UTF-8 mode */
06937         {
06938         switch(ctype)
06939           {
06940           case OP_ANY:
06941           if ((ims & PCRE_DOTALL) == 0)
06942             {
06943             for (i = min; i < max; i++)
06944               {
06945               if (eptr >= md->end_subject || *eptr == NEWLINE) break;
06946               eptr++;
06947               }
06948             break;
06949             }
06950           /* For DOTALL case, fall through and treat as \C */
06951 
06952           case OP_ANYBYTE:
06953           c = max - min;
06954           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
06955           eptr += c;
06956           break;
06957 
06958           case OP_NOT_DIGIT:
06959           for (i = min; i < max; i++)
06960             {
06961             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
06962               break;
06963             eptr++;
06964             }
06965           break;
06966 
06967           case OP_DIGIT:
06968           for (i = min; i < max; i++)
06969             {
06970             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
06971               break;
06972             eptr++;
06973             }
06974           break;
06975 
06976           case OP_NOT_WHITESPACE:
06977           for (i = min; i < max; i++)
06978             {
06979             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
06980               break;
06981             eptr++;
06982             }
06983           break;
06984 
06985           case OP_WHITESPACE:
06986           for (i = min; i < max; i++)
06987             {
06988             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
06989               break;
06990             eptr++;
06991             }
06992           break;
06993 
06994           case OP_NOT_WORDCHAR:
06995           for (i = min; i < max; i++)
06996             {
06997             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
06998               break;
06999             eptr++;
07000             }
07001           break;
07002 
07003           case OP_WORDCHAR:
07004           for (i = min; i < max; i++)
07005             {
07006             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
07007               break;
07008             eptr++;
07009             }
07010           break;
07011           }
07012 
07013         /* eptr is now past the end of the maximum run */
07014 
07015         while (eptr >= pp)
07016           {
07017           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
07018           eptr--;
07019           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
07020           }
07021         }
07022 
07023       /* Get here if we can't make it match with any permitted repetitions */
07024 
07025       RRETURN(MATCH_NOMATCH);
07026       }
07027     /* Control never gets here */
07028 
07029     /* There's been some horrible disaster. Since all codes > OP_BRA are
07030     for capturing brackets, and there shouldn't be any gaps between 0 and
07031     OP_BRA, arrival here can only mean there is something seriously wrong
07032     in the code above or the OP_xxx definitions. */
07033 
07034     default:
07035     DPRINTF(("Unknown opcode %d\n", *ecode));
07036     RRETURN(PCRE_ERROR_UNKNOWN_NODE);
07037     }
07038 
07039   /* Do not stick any code in here without much thought; it is assumed
07040   that "continue" in the code above comes out to here to repeat the main
07041   loop. */
07042 
07043   }             /* End of main loop */
07044 RRETURN(MATCH_NOMATCH);
07045 }
07046 
07047 
07048 /***************************************************************************
07049 ****************************************************************************
07050                    RECURSION IN THE match() FUNCTION
07051 
07052 Undefine all the macros that were defined above to handle this. */
07053 
07054 /* These two are defined as macros in both cases */
07055 
07056 #undef fc
07057 #undef fi
07058 
07059 /***************************************************************************
07060 ***************************************************************************/
07061 
07062 
07063 
07064 /*************************************************
07065 *         Execute a Regular Expression           *
07066 *************************************************/
07067 
07068 /* This function applies a compiled re to a subject string and picks out
07069 portions of the string if it matches. Two elements in the vector are set for
07070 each substring: the offsets to the start and end of the substring.
07071 
07072 Arguments:
07073   external_re     points to the compiled expression
07074   extra_data      points to extra data or is NULL
07075   subject         points to the subject string
07076   length          length of subject string (may contain binary zeros)
07077   start_offset    where to start in the subject string
07078   options         option bits
07079   offsets         points to a vector of ints to be filled in with offsets
07080   offsetcount     the number of elements in the vector
07081 
07082 Returns:          > 0 => success; value is the number of elements filled in
07083                   = 0 => success, but offsets is not big enough
07084                    -1 => failed to match
07085                  < -1 => some kind of unexpected problem
07086 */
07087 
07088 int
07089 pcre_exec(const pcre *external_re, const pcre_extra *extra_data,
07090   const char *subject, int length, int start_offset, int options, int *offsets,
07091   int offsetcount)
07092 {
07093 int rc, resetcount, ocount;
07094 int first_byte = -1;
07095 int req_byte = -1;
07096 int req_byte2 = -1;
07097 unsigned long int ims = 0;
07098 bool using_temporary_offsets = false;
07099 bool anchored;
07100 bool startline;
07101 bool first_byte_caseless = false;
07102 bool req_byte_caseless = false;
07103 match_data match_block;
07104 const uschar *start_bits = NULL;
07105 const uschar *start_match = (const uschar *)subject + start_offset;
07106 const uschar *end_subject;
07107 const uschar *req_byte_ptr = start_match - 1;
07108 const pcre_study_data *study;
07109 const real_pcre *re = (const real_pcre *)external_re;
07110 
07111 /* Plausibility checks */
07112 
07113 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
07114 if (re == NULL || subject == NULL ||
07115    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
07116 
07117 /* Fish out the optional data from the extra_data structure, first setting
07118 the default values. */
07119 
07120 study = NULL;
07121 match_block.match_limit = MATCH_LIMIT;
07122 match_block.callout_data = NULL;
07123 
07124 if (extra_data != NULL)
07125   {
07126   register unsigned int flags = extra_data->flags;
07127   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
07128     study = (const pcre_study_data *)extra_data->study_data;
07129   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
07130     match_block.match_limit = extra_data->match_limit;
07131   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
07132     match_block.callout_data = extra_data->callout_data;
07133   }
07134 
07135 /* Now we have re supposedly pointing to the regex */
07136 
07137 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
07138 
07139 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
07140 startline = (re->options & PCRE_STARTLINE) != 0;
07141 
07142 match_block.start_code =
07143   (const uschar *)re + sizeof(real_pcre) + re->name_count * re->name_entry_size;
07144 match_block.start_subject = (const uschar *)subject;
07145 match_block.start_offset = start_offset;
07146 match_block.end_subject = match_block.start_subject + length;
07147 end_subject = match_block.end_subject;
07148 
07149 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
07150 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
07151 
07152 match_block.notbol = (options & PCRE_NOTBOL) != 0;
07153 match_block.noteol = (options & PCRE_NOTEOL) != 0;
07154 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
07155 
07156 match_block.recursive = NULL;                   /* No recursion at top level */
07157 
07158 match_block.lcc = re->tables + lcc_offset;
07159 match_block.ctypes = re->tables + ctypes_offset;
07160 
07161 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
07162 back the character offset. */
07163 
07164 /* The ims options can vary during the matching as a result of the presence
07165 of (?ims) items in the pattern. They are kept in a local variable so that
07166 restoring at the exit of a group is easy. */
07167 
07168 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
07169 
07170 /* If the expression has got more back references than the offsets supplied can
07171 hold, we get a temporary bit of working store to use during the matching.
07172 Otherwise, we can use the vector supplied, rounding down its size to a multiple
07173 of 3. */
07174 
07175 ocount = offsetcount - (offsetcount % 3);
07176 
07177 if (re->top_backref > 0 && re->top_backref >= ocount/3)
07178   {
07179   ocount = re->top_backref * 3 + 3;
07180   match_block.offset_vector = static_cast<int *>(malloc(ocount * sizeof(int)));
07181   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
07182   using_temporary_offsets = true;
07183   DPRINTF(("Got memory to hold back references\n"));
07184   }
07185 else match_block.offset_vector = offsets;
07186 
07187 match_block.offset_end = ocount;
07188 match_block.offset_max = (2*ocount)/3;
07189 match_block.offset_overflow = false;
07190 match_block.capture_last = -1;
07191 
07192 /* Compute the minimum number of offsets that we need to reset each time. Doing
07193 this makes a huge difference to execution time when there aren't many brackets
07194 in the pattern. */
07195 
07196 resetcount = 2 + re->top_bracket * 2;
07197 if (resetcount > offsetcount) resetcount = ocount;
07198 
07199 /* Reset the working variable associated with each extraction. These should
07200 never be used unless previously set, but they get saved and restored, and so we
07201 initialize them to avoid reading uninitialized locations. */
07202 
07203 if (match_block.offset_vector != NULL)
07204   {
07205   register int *iptr = match_block.offset_vector + ocount;
07206   register int *iend = iptr - resetcount/2 + 1;
07207   while (--iptr >= iend) *iptr = -1;
07208   }
07209 
07210 /* Set up the first character to match, if available. The first_byte value is
07211 never set for an anchored regular expression, but the anchoring may be forced
07212 at run time, so we have to test for anchoring. The first char may be unset for
07213 an unanchored pattern, of course. If there's no first char and the pattern was
07214 studied, there may be a bitmap of possible first characters. */
07215 
07216 if (!anchored)
07217   {
07218   if ((re->options & PCRE_FIRSTSET) != 0)
07219     {
07220     first_byte = re->first_byte & 255;
07221     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == true)
07222       first_byte = match_block.lcc[first_byte];
07223     }
07224   else
07225     if (!startline && study != NULL &&
07226       (study->options & PCRE_STUDY_MAPPED) != 0)
07227         start_bits = study->start_bits;
07228   }
07229 
07230 /* For anchored or unanchored matches, there may be a "last known required
07231 character" set. */
07232 
07233 if ((re->options & PCRE_REQCHSET) != 0)
07234   {
07235   req_byte = re->req_byte & 255;
07236   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
07237   req_byte2 = (re->tables + fcc_offset)[req_byte];  /* case flipped */
07238   }
07239 
07240 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
07241 the loop runs just once. */
07242 
07243 do
07244   {
07245   register int *iptr = match_block.offset_vector;
07246   register int *iend = iptr + resetcount;
07247 
07248   /* Reset the maximum number of extractions we might see. */
07249 
07250   while (iptr < iend) *iptr++ = -1;
07251 
07252   /* Advance to a unique first char if possible */
07253 
07254   if (first_byte >= 0)
07255     {
07256     if (first_byte_caseless)
07257       while (start_match < end_subject &&
07258              match_block.lcc[*start_match] != first_byte)
07259         start_match++;
07260     else
07261       while (start_match < end_subject && *start_match != first_byte)
07262         start_match++;
07263     }
07264 
07265   /* Or to just after \n for a multiline match if possible */
07266 
07267   else if (startline)
07268     {
07269     if (start_match > match_block.start_subject + start_offset)
07270       {
07271       while (start_match < end_subject && start_match[-1] != NEWLINE)
07272         start_match++;
07273       }
07274     }
07275 
07276   /* Or to a non-unique first char after study */
07277 
07278   else if (start_bits != NULL)
07279     {
07280     while (start_match < end_subject)
07281       {
07282       register int c = *start_match;
07283       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
07284       }
07285     }
07286 
07287   /* If req_byte is set, we know that that character must appear in the subject
07288   for the match to succeed. If the first character is set, req_byte must be
07289   later in the subject; otherwise the test starts at the match point. This
07290   optimization can save a huge amount of backtracking in patterns with nested
07291   unlimited repeats that aren't going to match. Writing separate code for
07292   cased/caseless versions makes it go faster, as does using an autoincrement
07293   and backing off on a match.
07294 
07295   HOWEVER: when the subject string is very, very long, searching to its end can
07296   take a long time, and give bad performance on quite ordinary patterns. This
07297   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
07298   don't do this when the string is sufficiently long. */
07299 
07300   if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
07301     {
07302     register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
07303 
07304     /* We don't need to repeat the search if we haven't yet reached the
07305     place we found it at last time. */
07306 
07307     if (p > req_byte_ptr)
07308       {
07309       if (req_byte_caseless)
07310         {
07311         while (p < end_subject)
07312           {
07313           register int pp = *p++;
07314           if (pp == req_byte || pp == req_byte2) { p--; break; }
07315           }
07316         }
07317       else
07318         {
07319         while (p < end_subject)
07320           {
07321           if (*p++ == req_byte) { p--; break; }
07322           }
07323         }
07324 
07325       /* If we can't find the required character, break the matching loop */
07326 
07327       if (p >= end_subject) break;
07328 
07329       /* If we have found the required character, save the point where we
07330       found it, so that we don't search again next time round the loop if
07331       the start hasn't passed this character yet. */
07332 
07333       req_byte_ptr = p;
07334       }
07335     }
07336 
07337   /* When a match occurs, substrings will be set for all internal extractions;
07338   we just need to set up the whole thing as substring 0 before returning. If
07339   there were too many extractions, set the return code to zero. In the case
07340   where we had to get some local store to hold offsets for backreferences, copy
07341   those back references that we can. In this case there need not be overflow
07342   if certain parts of the pattern were not used. */
07343 
07344   match_block.start_match = start_match;
07345   match_block.match_call_count = 0;
07346 
07347   rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
07348     match_isgroup);
07349 
07350   if (rc == MATCH_NOMATCH)
07351     {
07352     start_match++;
07353     continue;
07354     }
07355 
07356   if (rc != MATCH_MATCH)
07357     {
07358     DPRINTF((">>>> error: returning %d\n", rc));
07359     return rc;
07360     }
07361 
07362   /* We have a match! Copy the offset information from temporary store if
07363   necessary */
07364 
07365   if (using_temporary_offsets)
07366     {
07367     if (offsetcount >= 4)
07368       {
07369       memcpy(offsets + 2, match_block.offset_vector + 2,
07370         (offsetcount - 2) * sizeof(int));
07371       DPRINTF(("Copied offsets from temporary memory\n"));
07372       }
07373     if (match_block.end_offset_top > offsetcount)
07374       match_block.offset_overflow = true;
07375 
07376     DPRINTF(("Freeing temporary memory\n"));
07377     free(match_block.offset_vector);
07378     }
07379 
07380   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
07381 
07382   if (offsetcount < 2) rc = 0; else
07383     {
07384     offsets[0] = start_match - match_block.start_subject;
07385     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
07386     }
07387 
07388   DPRINTF((">>>> returning %d\n", rc));
07389   return rc;
07390   }
07391 
07392 /* This "while" is the end of the "do" above */
07393 
07394 while (!anchored && start_match <= end_subject);
07395 
07396 if (using_temporary_offsets)
07397   {
07398   DPRINTF(("Freeing temporary memory\n"));
07399   free(match_block.offset_vector);
07400   }
07401 
07402 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
07403 
07404 return PCRE_ERROR_NOMATCH;
07405 }
07406 
07407 /* End of pcre.cpp */

Generated on Mon May 28 04:40:11 2007 for MUX by  doxygen 1.4.7