MUX: mux/src/pcre.h File Reference

#define PCRE_ANCHORED 0x0010

Definition at line 32 of file pcre.h.

Referenced by pcre_compile(), pcre_exec(), and pcre_study().

#define PCRE_CASELESS 0x0001

Definition at line 28 of file pcre.h.

Referenced by atr_match1(), check_filter(), compile_branch(), find_firstassertedchar(), match(), match_ref(), pcre_compile(), pcre_exec(), pcre_study(), process_cmdent(), real_regmatch(), real_regrab(), and set_start_bits().

#define PCRE_CONFIG_LINK_SIZE 2

Definition at line 76 of file pcre.h.

#define PCRE_CONFIG_MATCH_LIMIT 4

Definition at line 78 of file pcre.h.

#define PCRE_CONFIG_NEWLINE 1

Definition at line 75 of file pcre.h.

#define PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 3

Definition at line 77 of file pcre.h.

#define PCRE_CONFIG_STACKRECURSE 5

Definition at line 79 of file pcre.h.

#define PCRE_CONFIG_UTF8 0

Definition at line 74 of file pcre.h.

#define PCRE_DATA_SCOPE extern

Definition at line 18 of file pcre.h.

#define PCRE_DATE 01-December-2003

Definition at line 15 of file pcre.h.

#define PCRE_DOLLAR_ENDONLY 0x0020

Definition at line 33 of file pcre.h.

Referenced by pcre_exec().

#define PCRE_DOTALL 0x0004

Definition at line 30 of file pcre.h.

Referenced by compile_branch(), is_anchored(), match(), pcre_compile(), and pcre_exec().

#define PCRE_ERROR_BADMAGIC (-4)

Definition at line 48 of file pcre.h.

Referenced by pcre_exec().

#define PCRE_ERROR_BADOPTION (-3)

Definition at line 47 of file pcre.h.

Referenced by pcre_exec().

#define PCRE_ERROR_BADUTF8 (-10)

Definition at line 54 of file pcre.h.

#define PCRE_ERROR_BADUTF8_OFFSET (-11)

Definition at line 55 of file pcre.h.

#define PCRE_ERROR_CALLOUT (-9)

Definition at line 53 of file pcre.h.

#define PCRE_ERROR_MATCHLIMIT (-8)

Definition at line 52 of file pcre.h.

Referenced by match().

#define PCRE_ERROR_NOMATCH (-1)

Definition at line 45 of file pcre.h.

Referenced by pcre_exec().

#define PCRE_ERROR_NOMEMORY (-6)

Definition at line 50 of file pcre.h.

Referenced by match(), pcre_copy_substring(), and pcre_exec().

#define PCRE_ERROR_NOSUBSTRING (-7)

Definition at line 51 of file pcre.h.

Referenced by pcre_copy_substring().

#define PCRE_ERROR_NULL (-2)

Definition at line 46 of file pcre.h.

Referenced by pcre_exec().

#define PCRE_ERROR_UNKNOWN_NODE (-5)

Definition at line 49 of file pcre.h.

Referenced by match().

#define PCRE_EXTENDED 0x0008

Definition at line 31 of file pcre.h.

Referenced by compile_branch(), and pcre_compile().

#define PCRE_EXTRA 0x0040

Definition at line 34 of file pcre.h.

Referenced by check_escape(), compile_branch(), and pcre_compile().

#define PCRE_EXTRA_CALLOUT_DATA 0x0004

Definition at line 85 of file pcre.h.

Referenced by pcre_exec().

#define PCRE_EXTRA_MATCH_LIMIT 0x0002

Definition at line 84 of file pcre.h.

Referenced by pcre_exec().

#define PCRE_EXTRA_STUDY_DATA 0x0001

Definition at line 83 of file pcre.h.

Referenced by pcre_exec(), and pcre_study().

#define PCRE_INFO_BACKREFMAX 3

Definition at line 62 of file pcre.h.

#define PCRE_INFO_CAPTURECOUNT 2

Definition at line 61 of file pcre.h.

#define PCRE_INFO_FIRSTBYTE 4

Definition at line 63 of file pcre.h.

#define PCRE_INFO_FIRSTCHAR 4

Definition at line 64 of file pcre.h.

#define PCRE_INFO_FIRSTTABLE 5

Definition at line 65 of file pcre.h.

#define PCRE_INFO_LASTLITERAL 6

Definition at line 66 of file pcre.h.

#define PCRE_INFO_NAMECOUNT 8

Definition at line 68 of file pcre.h.

#define PCRE_INFO_NAMEENTRYSIZE 7

Definition at line 67 of file pcre.h.

#define PCRE_INFO_NAMETABLE 9

Definition at line 69 of file pcre.h.

#define PCRE_INFO_OPTIONS 0

Definition at line 59 of file pcre.h.

#define PCRE_INFO_SIZE 1

Definition at line 60 of file pcre.h.

#define PCRE_INFO_STUDYSIZE 10

Definition at line 70 of file pcre.h.

#define PCRE_MAJOR 4

Definition at line 13 of file pcre.h.

#define PCRE_MINOR 5

Definition at line 14 of file pcre.h.

#define PCRE_MULTILINE 0x0002

Definition at line 29 of file pcre.h.

Referenced by compile_branch(), is_anchored(), match(), pcre_compile(), and pcre_exec().

#define PCRE_NO_AUTO_CAPTURE 0x1000

Definition at line 40 of file pcre.h.

Referenced by compile_branch(), and pcre_compile().

#define PCRE_NO_UTF8_CHECK 0x2000

Definition at line 41 of file pcre.h.

#define PCRE_NOTBOL 0x0080

Definition at line 35 of file pcre.h.

Referenced by pcre_exec().

#define PCRE_NOTEMPTY 0x0400

Definition at line 38 of file pcre.h.

Referenced by pcre_exec().

#define PCRE_NOTEOL 0x0100

Definition at line 36 of file pcre.h.

Referenced by pcre_exec().

#define PCRE_UNGREEDY 0x0200

Definition at line 37 of file pcre.h.

Referenced by compile_branch(), and pcre_compile().

#define PCRE_UTF8 0x0800

Definition at line 39 of file pcre.h.

Referenced by pcre_compile(), pcre_exec(), and pcre_study().

typedef struct real_pcre pcre

Definition at line 90 of file pcre.h.

pcre* pcre_compile	(	const char *	,
		int	,
		const char **	,
		int *	,
		const unsigned char *
	)

Definition at line 4365 of file pcre.cpp.

References compile_data::backref_map, BRASTACK_SIZE, compile_data::cbits, cbits_offset, check_escape(), check_posix_syntax(), compile_regex(), ctype_digit, ctype_meta, ctype_space, ctype_word, compile_data::ctypes, ctypes_offset, digitab, DPRINTF, ERR12, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20, ERR21, ERR22, ERR23, ERR24, ERR26, ERR28, ERR29, ERR32, ERR39, ERR41, ERR42, ERR6, ESC_b, ESC_Q, ESC_REF, EXTRACT_BASIC_MAX, compile_data::fcc, fcc_offset, find_firstassertedchar(), real_pcre::first_byte, is_anchored(), is_counted_repeat(), is_startline(), compile_data::lcc, lcc_offset, LINK_SIZE, MAGIC_NUMBER, real_pcre::magic_number, MAX_PATTERN_SIZE, MAXLIT, real_pcre::name_count, compile_data::name_entry_size, real_pcre::name_entry_size, compile_data::name_table, compile_data::names_found, NEWLINE, OP_BRA, OP_END, real_pcre::options, PCRE_ANCHORED, PCRE_CASELESS, pcre_default_tables, PCRE_DOTALL, PCRE_EXTENDED, PCRE_EXTRA, PCRE_FIRSTSET, PCRE_ICHANGED, PCRE_IMS, PCRE_MULTILINE, PCRE_NO_AUTO_CAPTURE, PCRE_REQCHSET, PCRE_STARTLINE, PCRE_UNGREEDY, PCRE_UTF8, PUBLIC_OPTIONS, read_repeat_counts(), real_pcre::req_byte, REQ_CASELESS, REQ_VARY, compile_data::req_varyopt, real_pcre::size, compile_data::start_code, real_pcre::tables, real_pcre::top_backref, compile_data::top_backref, and real_pcre::top_bracket.

Referenced by CF_HAND(), check_filter(), real_regmatch(), real_regrab(), and regexp_match().

04366 {
04367 real_pcre *re;
04368 int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
04369 int runlength;
04370 int c, firstbyte, reqbyte;
04371 int bracount = 0;
04372 int branch_extra = 0;
04373 int branch_newextra;
04374 int item_count = -1;
04375 int name_count = 0;
04376 int max_name_size = 0;
04377 bool inescq = false;
04378 unsigned int brastackptr = 0;
04379 size_t size;
04380 uschar *code;
04381 const uschar *codestart;
04382 const uschar *ptr;
04383 compile_data compile_block;
04384 int brastack[BRASTACK_SIZE];
04385 uschar bralenstack[BRASTACK_SIZE];
04386 
04387 /* We can't pass back an error message if errorptr is NULL; I guess the best we
04388 can do is just return NULL. */
04389 
04390 if (errorptr == NULL) return NULL;
04391 *errorptr = NULL;
04392 
04393 /* However, we can give a message for this error */
04394 
04395 if (erroroffset == NULL)
04396   {
04397   *errorptr = ERR16;
04398   return NULL;
04399   }
04400 *erroroffset = 0;
04401 
04402 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
04403 
04404 if ((options & PCRE_UTF8) != 0)
04405   {
04406   *errorptr = ERR32;
04407   return NULL;
04408   }
04409 
04410 if ((options & ~PUBLIC_OPTIONS) != 0)
04411   {
04412   *errorptr = ERR17;
04413   return NULL;
04414   }
04415 
04416 /* Set up pointers to the individual character tables */
04417 
04418 if (tables == NULL) tables = pcre_default_tables;
04419 compile_block.lcc = tables + lcc_offset;
04420 compile_block.fcc = tables + fcc_offset;
04421 compile_block.cbits = tables + cbits_offset;
04422 compile_block.ctypes = tables + ctypes_offset;
04423 
04424 /* Maximum back reference and backref bitmap. This is updated for numeric
04425 references during the first pass, but for named references during the actual
04426 compile pass. The bitmap records up to 31 back references to help in deciding
04427 whether (.*) can be treated as anchored or not. */
04428 
04429 compile_block.top_backref = 0;
04430 compile_block.backref_map = 0;
04431 
04432 /* Reflect pattern for debugging output */
04433 
04434 DPRINTF(("------------------------------------------------------------------\n"));
04435 DPRINTF(("%s\n", pattern));
04436 
04437 /* The first thing to do is to make a pass over the pattern to compute the
04438 amount of store required to hold the compiled code. This does not have to be
04439 perfect as long as errors are overestimates. At the same time we can detect any
04440 flag settings right at the start, and extract them. Make an attempt to correct
04441 for any counted white space if an "extended" flag setting appears late in the
04442 pattern. We can't be so clever for #-comments. */
04443 
04444 ptr = (const uschar *)(pattern - 1);
04445 while ((c = *(++ptr)) != 0)
04446   {
04447   int min, max;
04448 #if defined(WIN32) && (_MSC_VER == 1200) && defined(_M_IX86) && !defined(__INTEL_COMPILER)
04449   // The addition of 'volatile' works around a bug in Version 12.0 of
04450   // Microsoft's Visual C/C++ compiler (part of Visual Studio 6.0). Without
04451   // volatile, class_optcount is calculated properly, but the compiler
04452   // clobbers the EAX register before testing it as class_optcount.
04453   //
04454   // This is not a problem with the Intel Compiler.
04455   //
04456   volatile int class_optcount;
04457 #else
04458   int class_optcount;
04459 #endif
04460   int bracket_length;
04461   int duplength;
04462 
04463   /* If we are inside a \Q...\E sequence, all chars are literal */
04464 
04465   if (inescq) goto NORMAL_CHAR;
04466 
04467   /* Otherwise, first check for ignored whitespace and comments */
04468 
04469   if ((options & PCRE_EXTENDED) != 0)
04470     {
04471     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
04472     if (c == '#')
04473       {
04474       /* The space before the ; is to avoid a warning on a silly compiler
04475       on the Macintosh. */
04476       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
04477       if (c == 0) break;
04478       continue;
04479       }
04480     }
04481 
04482   item_count++;    /* Is zero for the first non-comment item */
04483 
04484   switch(c)
04485     {
04486     /* A backslashed item may be an escaped "normal" character or a
04487     character type. For a "normal" character, put the pointers and
04488     character back so that tests for whitespace etc. in the input
04489     are done correctly. */
04490 
04491     case '\\':
04492       {
04493       const uschar *save_ptr = ptr;
04494       c = check_escape(&ptr, errorptr, bracount, options, false);
04495       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04496       if (c >= 0)
04497         {
04498         ptr = save_ptr;
04499         c = '\\';
04500         goto NORMAL_CHAR;
04501         }
04502       }
04503 
04504     /* If \Q, enter "literal" mode */
04505 
04506     if (-c == ESC_Q)
04507       {
04508       inescq = true;
04509       continue;
04510       }
04511 
04512     /* Other escapes need one byte, and are of length one for repeats */
04513 
04514     length++;
04515 
04516     /* A back reference needs an additional 2 bytes, plus either one or 5
04517     bytes for a repeat. We also need to keep the value of the highest
04518     back reference. */
04519 
04520     if (c <= -ESC_REF)
04521       {
04522       int refnum = -c - ESC_REF;
04523       compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
04524       if (refnum > compile_block.top_backref)
04525         compile_block.top_backref = refnum;
04526       length += 2;   /* For single back reference */
04527       if (ptr[1] == '{' && is_counted_repeat(ptr+2))
04528         {
04529         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
04530         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04531         if ((min == 0 && (max == 1 || max == -1)) ||
04532           (min == 1 && max == -1))
04533             length++;
04534         else length += 5;
04535         if (ptr[1] == '?') ptr++;
04536         }
04537       }
04538     continue;
04539 
04540     case '^':     /* Single-byte metacharacters */
04541     case '.':
04542     case '$':
04543     length++;
04544     continue;
04545 
04546     case '*':            /* These repeats won't be after brackets; */
04547     case '+':            /* those are handled separately */
04548     case '?':
04549     length++;
04550     goto POSESSIVE;      /* A few lines below */
04551 
04552     /* This covers the cases of braced repeats after a single char, metachar,
04553     class, or back reference. */
04554 
04555     case '{':
04556     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
04557     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
04558     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04559 
04560     /* These special cases just insert one extra opcode */
04561 
04562     if ((min == 0 && (max == 1 || max == -1)) ||
04563       (min == 1 && max == -1))
04564         length++;
04565 
04566     /* These cases might insert additional copies of a preceding character. */
04567 
04568     else
04569       {
04570 
04571       /* Not UTF-8 mode: all characters are one byte */
04572         {
04573         if (min != 1)
04574           {
04575           length--;   /* Uncount the original char or metachar */
04576           if (min > 0) length += 4;
04577           }
04578 
04579         length += (max > 0)? 4 : 2;
04580         }
04581       }
04582 
04583     if (ptr[1] == '?') ptr++;      /* Needs no extra length */
04584 
04585     POSESSIVE:                     /* Test for possessive quantifier */
04586     if (ptr[1] == '+')
04587       {
04588       ptr++;
04589       length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
04590       }
04591     continue;
04592 
04593     /* An alternation contains an offset to the next branch or ket. If any ims
04594     options changed in the previous branch(es), and/or if we are in a
04595     lookbehind assertion, extra space will be needed at the start of the
04596     branch. This is handled by branch_extra. */
04597 
04598     case '|':
04599     length += 1 + LINK_SIZE + branch_extra;
04600     continue;
04601 
04602     /* A character class uses 33 characters provided that all the character
04603     values are less than 256. Otherwise, it uses a bit map for low valued
04604     characters, and individual items for others. Don't worry about character
04605     types that aren't allowed in classes - they'll get picked up during the
04606     compile. A character class that contains only one single-byte character
04607     uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
04608     where we can. (In UTF-8 mode we can do this only for chars < 128.) */
04609 
04610     case '[':
04611     class_optcount = 0;
04612 
04613     if (*(++ptr) == '^') ptr++;
04614 
04615     /* Written as a "do" so that an initial ']' is taken as data */
04616 
04617     if (*ptr != 0) do
04618       {
04619       /* Inside \Q...\E everything is literal except \E */
04620 
04621       if (inescq)
04622         {
04623         if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
04624         inescq = false;
04625         ptr += 1;
04626         continue;
04627         }
04628 
04629       /* Outside \Q...\E, check for escapes */
04630 
04631       if (*ptr == '\\')
04632         {
04633         int ch = check_escape(&ptr, errorptr, bracount, options, true);
04634         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04635 
04636         /* \b is backspace inside a class */
04637 
04638         if (-ch == ESC_b) ch = '\b';
04639 
04640         /* \Q enters quoting mode */
04641 
04642         if (-ch == ESC_Q)
04643           {
04644           inescq = true;
04645           continue;
04646           }
04647 
04648         /* Handle escapes that turn into characters */
04649 
04650         if (ch >= 0)
04651           {
04652           class_optcount++;            /* for possible optimization */
04653           }
04654         else class_optcount = 10;      /* \d, \s etc; make sure > 1 */
04655         }
04656 
04657       /* Check the syntax for POSIX stuff. The bits we actually handle are
04658       checked during the real compile phase. */
04659 
04660       else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
04661         {
04662         ptr++;
04663         class_optcount = 10;    /* Make sure > 1 */
04664         }
04665 
04666       /* Anything else just increments the possible optimization count. If
04667       there are wide characters, we are going to have to use an XCLASS. */
04668 
04669       else
04670         {
04671         NON_SPECIAL_CHARACTER:
04672         class_optcount++;
04673 
04674         }
04675       }
04676     while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
04677 
04678     if (*ptr == 0)                          /* Missing terminating ']' */
04679       {
04680       *errorptr = ERR6;
04681       goto PCRE_ERROR_RETURN;
04682       }
04683 
04684     /* We can optimize when there was only one optimizable character. Repeats
04685     for positive and negated single one-byte chars are handled by the general
04686     code. Here, we handle repeats for the class opcodes. */
04687 
04688     if (class_optcount == 1) length += 3; else
04689       {
04690       length += 33;
04691 
04692       /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
04693       we also need extra for wrapping the whole thing in a sub-pattern. */
04694 
04695       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
04696         {
04697         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
04698         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04699         if ((min == 0 && (max == 1 || max == -1)) ||
04700           (min == 1 && max == -1))
04701             length++;
04702         else length += 5;
04703         if (ptr[1] == '+')
04704           {
04705           ptr++;
04706           length += 2 + 2*LINK_SIZE;
04707           }
04708         else if (ptr[1] == '?') ptr++;
04709         }
04710       }
04711     continue;
04712 
04713     /* Brackets may be genuine groups or special things */
04714 
04715     case '(':
04716     branch_newextra = 0;
04717     bracket_length = 1 + LINK_SIZE;
04718 
04719     /* Handle special forms of bracket, which all start (? */
04720 
04721     if (ptr[1] == '?')
04722       {
04723       int set, unset;
04724       int *optset;
04725 
04726       switch (c = ptr[2])
04727         {
04728         /* Skip over comments entirely */
04729         case '#':
04730         ptr += 3;
04731         while (*ptr != 0 && *ptr != ')') ptr++;
04732         if (*ptr == 0)
04733           {
04734           *errorptr = ERR18;
04735           goto PCRE_ERROR_RETURN;
04736           }
04737         continue;
04738 
04739         /* Non-referencing groups and lookaheads just move the pointer on, and
04740         then behave like a non-special bracket, except that they don't increment
04741         the count of extracting brackets. Ditto for the "once only" bracket,
04742         which is in Perl from version 5.005. */
04743 
04744         case ':':
04745         case '=':
04746         case '!':
04747         case '>':
04748         ptr += 2;
04749         break;
04750 
04751         /* (?R) specifies a recursive call to the regex, which is an extension
04752         to provide the facility which can be obtained by (?p{perl-code}) in
04753         Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
04754 
04755         From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
04756         the appropriate numbered brackets. This includes both recursive and
04757         non-recursive calls. (?R) is now synonymous with (?0). */
04758 
04759         case 'R':
04760         ptr++;
04761 
04762         case '0': case '1': case '2': case '3': case '4':
04763         case '5': case '6': case '7': case '8': case '9':
04764         ptr += 2;
04765         if (c != 'R')
04766           while ((digitab[*(++ptr)] & ctype_digit) != 0);
04767         if (*ptr != ')')
04768           {
04769           *errorptr = ERR29;
04770           goto PCRE_ERROR_RETURN;
04771           }
04772         length += 1 + LINK_SIZE;
04773 
04774         /* If this item is quantified, it will get wrapped inside brackets so
04775         as to use the code for quantified brackets. We jump down and use the
04776         code that handles this for real brackets. */
04777 
04778         if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
04779           {
04780           length += 2 + 2 * LINK_SIZE;       /* to make bracketed */
04781           duplength = 5 + 3 * LINK_SIZE;
04782           goto HANDLE_QUANTIFIED_BRACKETS;
04783           }
04784         continue;
04785 
04786         /* (?C) is an extension which provides "callout" - to provide a bit of
04787         the functionality of the Perl (?{...}) feature. An optional number may
04788         follow (default is zero). */
04789 
04790         case 'C':
04791         ptr += 2;
04792         while ((digitab[*(++ptr)] & ctype_digit) != 0);
04793         if (*ptr != ')')
04794           {
04795           *errorptr = ERR39;
04796           goto PCRE_ERROR_RETURN;
04797           }
04798         length += 2;
04799         continue;
04800 
04801         /* Named subpatterns are an extension copied from Python */
04802 
04803         case 'P':
04804         ptr += 3;
04805         if (*ptr == '<')
04806           {
04807           const uschar *p;    /* Don't amalgamate; some compilers */
04808           p = ++ptr;          /* grumble at autoincrement in declaration */
04809           while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
04810           if (*ptr != '>')
04811             {
04812             *errorptr = ERR42;
04813             goto PCRE_ERROR_RETURN;
04814             }
04815           name_count++;
04816           if (ptr - p > max_name_size) max_name_size = (ptr - p);
04817           break;
04818           }
04819 
04820         if (*ptr == '=' || *ptr == '>')
04821           {
04822           while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
04823           if (*ptr != ')')
04824             {
04825             *errorptr = ERR42;
04826             goto PCRE_ERROR_RETURN;
04827             }
04828           break;
04829           }
04830 
04831         /* Unknown character after (?P */
04832 
04833         *errorptr = ERR41;
04834         goto PCRE_ERROR_RETURN;
04835 
04836         /* Lookbehinds are in Perl from version 5.005 */
04837 
04838         case '<':
04839         ptr += 3;
04840         if (*ptr == '=' || *ptr == '!')
04841           {
04842           branch_newextra = 1 + LINK_SIZE;
04843           length += 1 + LINK_SIZE;         /* For the first branch */
04844           break;
04845           }
04846         *errorptr = ERR24;
04847         goto PCRE_ERROR_RETURN;
04848 
04849         /* Conditionals are in Perl from version 5.005. The bracket must either
04850         be followed by a number (for bracket reference) or by an assertion
04851         group, or (a PCRE extension) by 'R' for a recursion test. */
04852 
04853         case '(':
04854         if (ptr[3] == 'R' && ptr[4] == ')')
04855           {
04856           ptr += 4;
04857           length += 3;
04858           }
04859         else if ((digitab[ptr[3]] & ctype_digit) != 0)
04860           {
04861           ptr += 4;
04862           length += 3;
04863           while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
04864           if (*ptr != ')')
04865             {
04866             *errorptr = ERR26;
04867             goto PCRE_ERROR_RETURN;
04868             }
04869           }
04870         else   /* An assertion must follow */
04871           {
04872           ptr++;   /* Can treat like ':' as far as spacing is concerned */
04873           if (ptr[2] != '?' ||
04874              (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
04875             {
04876             ptr += 2;    /* To get right offset in message */
04877             *errorptr = ERR28;
04878             goto PCRE_ERROR_RETURN;
04879             }
04880           }
04881         break;
04882 
04883         /* Else loop checking valid options until ) is met. Anything else is an
04884         error. If we are without any brackets, i.e. at top level, the settings
04885         act as if specified in the options, so massage the options immediately.
04886         This is for backward compatibility with Perl 5.004. */
04887 
04888         default:
04889         set = unset = 0;
04890         optset = &set;
04891         ptr += 2;
04892 
04893         for (;; ptr++)
04894           {
04895           c = *ptr;
04896           switch (c)
04897             {
04898             case 'i':
04899             *optset |= PCRE_CASELESS;
04900             continue;
04901 
04902             case 'm':
04903             *optset |= PCRE_MULTILINE;
04904             continue;
04905 
04906             case 's':
04907             *optset |= PCRE_DOTALL;
04908             continue;
04909 
04910             case 'x':
04911             *optset |= PCRE_EXTENDED;
04912             continue;
04913 
04914             case 'X':
04915             *optset |= PCRE_EXTRA;
04916             continue;
04917 
04918             case 'U':
04919             *optset |= PCRE_UNGREEDY;
04920             continue;
04921 
04922             case '-':
04923             optset = &unset;
04924             continue;
04925 
04926             /* A termination by ')' indicates an options-setting-only item; if
04927             this is at the very start of the pattern (indicated by item_count
04928             being zero), we use it to set the global options. This is helpful
04929             when analyzing the pattern for first characters, etc. Otherwise
04930             nothing is done here and it is handled during the compiling
04931             process.
04932 
04933             [Historical note: Up to Perl 5.8, options settings at top level
04934             were always global settings, wherever they appeared in the pattern.
04935             That is, they were equivalent to an external setting. From 5.8
04936             onwards, they apply only to what follows (which is what you might
04937             expect).] */
04938 
04939             case ')':
04940             if (item_count == 0)
04941               {
04942               options = (options | set) & (~unset);
04943               set = unset = 0;     /* To save length */
04944               item_count--;        /* To allow for several */
04945               }
04946 
04947             /* Fall through */
04948 
04949             /* A termination by ':' indicates the start of a nested group with
04950             the given options set. This is again handled at compile time, but
04951             we must allow for compiled space if any of the ims options are
04952             set. We also have to allow for resetting space at the end of
04953             the group, which is why 4 is added to the length and not just 2.
04954             If there are several changes of options within the same group, this
04955             will lead to an over-estimate on the length, but this shouldn't
04956             matter very much. We also have to allow for resetting options at
04957             the start of any alternations, which we do by setting
04958             branch_newextra to 2. Finally, we record whether the case-dependent
04959             flag ever changes within the regex. This is used by the "required
04960             character" code. */
04961 
04962             case ':':
04963             if (((set|unset) & PCRE_IMS) != 0)
04964               {
04965               length += 4;
04966               branch_newextra = 2;
04967               if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
04968               }
04969             goto END_OPTIONS;
04970 
04971             /* Unrecognized option character */
04972 
04973             default:
04974             *errorptr = ERR12;
04975             goto PCRE_ERROR_RETURN;
04976             }
04977           }
04978 
04979         /* If we hit a closing bracket, that's it - this is a freestanding
04980         option-setting. We need to ensure that branch_extra is updated if
04981         necessary. The only values branch_newextra can have here are 0 or 2.
04982         If the value is 2, then branch_extra must either be 2 or 5, depending
04983         on whether this is a lookbehind group or not. */
04984 
04985         END_OPTIONS:
04986         if (c == ')')
04987           {
04988           if (branch_newextra == 2 &&
04989               (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
04990             branch_extra += branch_newextra;
04991           continue;
04992           }
04993 
04994         /* If options were terminated by ':' control comes here. Fall through
04995         to handle the group below. */
04996         }
04997       }
04998 
04999     /* Extracting brackets must be counted so we can process escapes in a
05000     Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
05001     need an additional 3 bytes of store per extracting bracket. However, if
05002     PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
05003     must leave the count alone (it will always be zero). */
05004 
05005     else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
05006       {
05007       bracount++;
05008       if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
05009       }
05010 
05011     /* Save length for computing whole length at end if there's a repeat that
05012     requires duplication of the group. Also save the current value of
05013     branch_extra, and start the new group with the new value. If non-zero, this
05014     will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
05015 
05016     if (brastackptr >= sizeof(brastack)/sizeof(int))
05017       {
05018       *errorptr = ERR19;
05019       goto PCRE_ERROR_RETURN;
05020       }
05021 
05022     bralenstack[brastackptr] = branch_extra;
05023     branch_extra = branch_newextra;
05024 
05025     brastack[brastackptr++] = length;
05026     length += bracket_length;
05027     continue;
05028 
05029     /* Handle ket. Look for subsequent max/min; for certain sets of values we
05030     have to replicate this bracket up to that many times. If brastackptr is
05031     0 this is an unmatched bracket which will generate an error, but take care
05032     not to try to access brastack[-1] when computing the length and restoring
05033     the branch_extra value. */
05034 
05035     case ')':
05036     length += 1 + LINK_SIZE;
05037     if (brastackptr > 0)
05038       {
05039       duplength = length - brastack[--brastackptr];
05040       branch_extra = bralenstack[brastackptr];
05041       }
05042     else duplength = 0;
05043 
05044     /* The following code is also used when a recursion such as (?3) is
05045     followed by a quantifier, because in that case, it has to be wrapped inside
05046     brackets so that the quantifier works. The value of duplength must be
05047     set before arrival. */
05048 
05049     HANDLE_QUANTIFIED_BRACKETS:
05050 
05051     /* Leave ptr at the final char; for read_repeat_counts this happens
05052     automatically; for the others we need an increment. */
05053 
05054     if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
05055       {
05056       ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
05057       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
05058       }
05059     else if (c == '*') { min = 0; max = -1; ptr++; }
05060     else if (c == '+') { min = 1; max = -1; ptr++; }
05061     else if (c == '?') { min = 0; max = 1;  ptr++; }
05062     else { min = 1; max = 1; }
05063 
05064     /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
05065     group, and if the maximum is greater than zero, we have to replicate
05066     maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
05067     bracket set. */
05068 
05069     if (min == 0)
05070       {
05071       length++;
05072       if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
05073       }
05074 
05075     /* When the minimum is greater than zero, we have to replicate up to
05076     minval-1 times, with no additions required in the copies. Then, if there
05077     is a limited maximum we have to replicate up to maxval-1 times allowing
05078     for a BRAZERO item before each optional copy and nesting brackets for all
05079     but one of the optional copies. */
05080 
05081     else
05082       {
05083       length += (min - 1) * duplength;
05084       if (max > min)   /* Need this test as max=-1 means no limit */
05085         length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
05086           - (2 + 2*LINK_SIZE);
05087       }
05088 
05089     /* Allow space for once brackets for "possessive quantifier" */
05090 
05091     if (ptr[1] == '+')
05092       {
05093       ptr++;
05094       length += 2 + 2*LINK_SIZE;
05095       }
05096     continue;
05097 
05098     /* Non-special character. For a run of such characters the length required
05099     is the number of characters + 2, except that the maximum run length is
05100     MAXLIT. We won't get a skipped space or a non-data escape or the start of a
05101     # comment as the first character, so the length can't be zero. */
05102 
05103     NORMAL_CHAR:
05104     default:
05105     length += 2;
05106     runlength = 0;
05107     do
05108       {
05109 
05110       /* If in a \Q...\E sequence, check for end; otherwise it's a literal */
05111       if (inescq)
05112         {
05113         if (c == '\\' && ptr[1] == 'E')
05114           {
05115           inescq = false;
05116           ptr++;
05117           }
05118         else runlength++;
05119         continue;
05120         }
05121 
05122       /* Skip whitespace and comments for /x */
05123 
05124       if ((options & PCRE_EXTENDED) != 0)
05125         {
05126         if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
05127         if (c == '#')
05128           {
05129           /* The space before the ; is to avoid a warning on a silly compiler
05130           on the Macintosh. */
05131           while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
05132           continue;
05133           }
05134         }
05135 
05136       /* Backslash may introduce a data char or a metacharacter; stop the
05137       string before the latter. */
05138 
05139       if (c == '\\')
05140         {
05141         const uschar *saveptr = ptr;
05142         c = check_escape(&ptr, errorptr, bracount, options, false);
05143         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
05144         if (c < 0) { ptr = saveptr; break; }
05145 
05146         /* In UTF-8 mode, add on the number of additional bytes needed to
05147         encode this character, and save the total length in case this is a
05148         final char that is repeated. */
05149 
05150         }
05151 
05152       /* Ordinary character or single-char escape */
05153 
05154       runlength++;
05155       }
05156 
05157     /* This "while" is the end of the "do" above. */
05158 
05159     while (runlength < MAXLIT &&
05160       (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
05161 
05162     /* If we hit a meta-character, back off to point to it */
05163 
05164     if (runlength < MAXLIT) ptr--;
05165 
05166     /* If the last char in the string is a UTF-8 multibyte character, we must
05167     set lastcharlength correctly. If it was specified as an escape, this will
05168     already have been done above. However, we also have to support in-line
05169     UTF-8 characters, so check backwards from where we are. */
05170 
05171 
05172     length += runlength;
05173     continue;
05174     }
05175   }
05176 
05177 length += 2 + LINK_SIZE;    /* For final KET and END */
05178 
05179 if (length > MAX_PATTERN_SIZE)
05180   {
05181   *errorptr = ERR20;
05182   return NULL;
05183   }
05184 
05185 /* Compute the size of data block needed and get it, either from malloc or
05186 externally provided function. */
05187 
05188 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
05189 re = static_cast<real_pcre *>(malloc(size));
05190 
05191 if (re == NULL)
05192   {
05193   *errorptr = ERR21;
05194   return NULL;
05195   }
05196 
05197 /* Put in the magic number, and save the size, options, and table pointer */
05198 
05199 re->magic_number = MAGIC_NUMBER;
05200 re->size = size;
05201 re->options = options;
05202 re->tables = tables;
05203 re->name_entry_size = max_name_size + 3;
05204 re->name_count = name_count;
05205 
05206 /* The starting points of the name/number translation table and of the code are
05207 passed around in the compile data block. */
05208 
05209 compile_block.names_found = 0;
05210 compile_block.name_entry_size = max_name_size + 3;
05211 compile_block.name_table = (uschar *)re + sizeof(real_pcre);
05212 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
05213 compile_block.start_code = codestart;
05214 compile_block.req_varyopt = 0;
05215 
05216 /* Set up a starting, non-extracting bracket, then compile the expression. On
05217 error, *errorptr will be set non-NULL, so we don't need to look at the result
05218 of the function here. */
05219 
05220 ptr = (const uschar *)pattern;
05221 code = (uschar *)codestart;
05222 *code = OP_BRA;
05223 bracount = 0;
05224 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
05225   errorptr, false, 0, &firstbyte, &reqbyte, NULL, &compile_block);
05226 re->top_bracket = bracount;
05227 re->top_backref = compile_block.top_backref;
05228 
05229 /* If not reached end of pattern on success, there's an excess bracket. */
05230 
05231 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
05232 
05233 /* Fill in the terminating state and check for disastrous overflow, but
05234 if debugging, leave the test till after things are printed out. */
05235 
05236 *code++ = OP_END;
05237 
05238 if (code - codestart > length) *errorptr = ERR23;
05239 
05240 /* Give an error if there's back reference to a non-existent capturing
05241 subpattern. */
05242 
05243 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
05244 
05245 /* Failed to compile, or error while post-processing */
05246 
05247 if (*errorptr != NULL)
05248   {
05249   free(re);
05250   PCRE_ERROR_RETURN:
05251   *erroroffset = ptr - (const uschar *)pattern;
05252   return NULL;
05253   }
05254 
05255 /* If the anchored option was not passed, set the flag if we can determine that
05256 the pattern is anchored by virtue of ^ characters or \A or anything else (such
05257 as starting with .* when DOTALL is set).
05258 
05259 Otherwise, if we know what the first character has to be, save it, because that
05260 speeds up unanchored matches no end. If not, see if we can set the
05261 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
05262 start with ^. and also when all branches start with .* for non-DOTALL matches.
05263 */
05264 
05265 if ((options & PCRE_ANCHORED) == 0)
05266   {
05267   int temp_options = options;
05268   if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
05269     re->options |= PCRE_ANCHORED;
05270   else
05271     {
05272     if (firstbyte < 0)
05273       firstbyte = find_firstassertedchar(codestart, &temp_options, false);
05274     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
05275       {
05276       int ch = firstbyte & 255;
05277       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
05278          compile_block.fcc[ch] == ch)? ch : firstbyte;
05279       re->options |= PCRE_FIRSTSET;
05280       }
05281     else if (is_startline(codestart, 0, compile_block.backref_map))
05282       re->options |= PCRE_STARTLINE;
05283     }
05284   }
05285 
05286 /* For an anchored pattern, we use the "required byte" only if it follows a
05287 variable length item in the regex. Remove the caseless flag for non-caseable
05288 chars. */
05289 
05290 if (reqbyte >= 0 &&
05291      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
05292   {
05293   int ch = reqbyte & 255;
05294   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
05295     compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
05296   re->options |= PCRE_REQCHSET;
05297   }
05298 
05299 return (pcre *)re;
05300 }
05301

int pcre_copy_substring	(	const char *	,
		int *	,
		int	,
		int	,
		char *	,
		int
	)

Definition at line 811 of file pcre.cpp.

References PCRE_ERROR_NOMEMORY, and PCRE_ERROR_NOSUBSTRING.

Referenced by real_regmatch(), and regexp_match().

00812 {
00813 int yield;
00814 if (stringnumber < 0 || stringnumber >= stringcount)
00815   return PCRE_ERROR_NOSUBSTRING;
00816 stringnumber *= 2;
00817 yield = ovector[stringnumber+1] - ovector[stringnumber];
00818 if (size < yield + 1) return PCRE_ERROR_NOMEMORY;
00819 memcpy(buffer, subject + ovector[stringnumber], yield);
00820 buffer[yield] = 0;
00821 return yield;
00822 }
00823

int pcre_exec	(	const pcre *	,
		const pcre_extra *	,
		const char *	,
		int	,
		int	,
		int	,
		int *	,
		int
	)

Definition at line 7090 of file pcre.cpp.

References pcre_extra::callout_data, match_data::callout_data, match_data::capture_last, match_data::ctypes, ctypes_offset, DPRINTF, match_data::end_match_ptr, match_data::end_offset_top, match_data::end_subject, match_data::endonly, fcc_offset, real_pcre::first_byte, pcre_extra::flags, match_data::lcc, lcc_offset, MAGIC_NUMBER, real_pcre::magic_number, match(), match_data::match_call_count, match_isgroup, pcre_extra::match_limit, MATCH_LIMIT, match_data::match_limit, MATCH_MATCH, MATCH_NOMATCH, real_pcre::name_count, real_pcre::name_entry_size, NEWLINE, match_data::notbol, match_data::notempty, match_data::noteol, match_data::offset_end, match_data::offset_max, match_data::offset_overflow, match_data::offset_vector, pcre_study_data::options, real_pcre::options, PCRE_ANCHORED, PCRE_CASELESS, PCRE_DOLLAR_ENDONLY, PCRE_DOTALL, PCRE_ERROR_BADMAGIC, PCRE_ERROR_BADOPTION, PCRE_ERROR_NOMATCH, PCRE_ERROR_NOMEMORY, PCRE_ERROR_NULL, PCRE_EXTRA_CALLOUT_DATA, PCRE_EXTRA_MATCH_LIMIT, PCRE_EXTRA_STUDY_DATA, PCRE_FIRSTSET, PCRE_MULTILINE, PCRE_NOTBOL, PCRE_NOTEMPTY, PCRE_NOTEOL, PCRE_REQCHSET, PCRE_STARTLINE, PCRE_STUDY_MAPPED, PCRE_UTF8, PUBLIC_EXEC_OPTIONS, match_data::recursive, real_pcre::req_byte, REQ_BYTE_MAX, REQ_CASELESS, pcre_study_data::start_bits, match_data::start_code, match_data::start_match, match_data::start_offset, match_data::start_subject, pcre_extra::study_data, real_pcre::tables, real_pcre::top_backref, real_pcre::top_bracket, and match_data::utf8.

Referenced by check_filter(), FUNCTION(), real_regmatch(), real_regrab(), and regexp_match().

07092 {
07093 int rc, resetcount, ocount;
07094 int first_byte = -1;
07095 int req_byte = -1;
07096 int req_byte2 = -1;
07097 unsigned long int ims = 0;
07098 bool using_temporary_offsets = false;
07099 bool anchored;
07100 bool startline;
07101 bool first_byte_caseless = false;
07102 bool req_byte_caseless = false;
07103 match_data match_block;
07104 const uschar *start_bits = NULL;
07105 const uschar *start_match = (const uschar *)subject + start_offset;
07106 const uschar *end_subject;
07107 const uschar *req_byte_ptr = start_match - 1;
07108 const pcre_study_data *study;
07109 const real_pcre *re = (const real_pcre *)external_re;
07110 
07111 /* Plausibility checks */
07112 
07113 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
07114 if (re == NULL || subject == NULL ||
07115    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
07116 
07117 /* Fish out the optional data from the extra_data structure, first setting
07118 the default values. */
07119 
07120 study = NULL;
07121 match_block.match_limit = MATCH_LIMIT;
07122 match_block.callout_data = NULL;
07123 
07124 if (extra_data != NULL)
07125   {
07126   register unsigned int flags = extra_data->flags;
07127   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
07128     study = (const pcre_study_data *)extra_data->study_data;
07129   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
07130     match_block.match_limit = extra_data->match_limit;
07131   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
07132     match_block.callout_data = extra_data->callout_data;
07133   }
07134 
07135 /* Now we have re supposedly pointing to the regex */
07136 
07137 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
07138 
07139 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
07140 startline = (re->options & PCRE_STARTLINE) != 0;
07141 
07142 match_block.start_code =
07143   (const uschar *)re + sizeof(real_pcre) + re->name_count * re->name_entry_size;
07144 match_block.start_subject = (const uschar *)subject;
07145 match_block.start_offset = start_offset;
07146 match_block.end_subject = match_block.start_subject + length;
07147 end_subject = match_block.end_subject;
07148 
07149 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
07150 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
07151 
07152 match_block.notbol = (options & PCRE_NOTBOL) != 0;
07153 match_block.noteol = (options & PCRE_NOTEOL) != 0;
07154 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
07155 
07156 match_block.recursive = NULL;                   /* No recursion at top level */
07157 
07158 match_block.lcc = re->tables + lcc_offset;
07159 match_block.ctypes = re->tables + ctypes_offset;
07160 
07161 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
07162 back the character offset. */
07163 
07164 /* The ims options can vary during the matching as a result of the presence
07165 of (?ims) items in the pattern. They are kept in a local variable so that
07166 restoring at the exit of a group is easy. */
07167 
07168 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
07169 
07170 /* If the expression has got more back references than the offsets supplied can
07171 hold, we get a temporary bit of working store to use during the matching.
07172 Otherwise, we can use the vector supplied, rounding down its size to a multiple
07173 of 3. */
07174 
07175 ocount = offsetcount - (offsetcount % 3);
07176 
07177 if (re->top_backref > 0 && re->top_backref >= ocount/3)
07178   {
07179   ocount = re->top_backref * 3 + 3;
07180   match_block.offset_vector = static_cast<int *>(malloc(ocount * sizeof(int)));
07181   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
07182   using_temporary_offsets = true;
07183   DPRINTF(("Got memory to hold back references\n"));
07184   }
07185 else match_block.offset_vector = offsets;
07186 
07187 match_block.offset_end = ocount;
07188 match_block.offset_max = (2*ocount)/3;
07189 match_block.offset_overflow = false;
07190 match_block.capture_last = -1;
07191 
07192 /* Compute the minimum number of offsets that we need to reset each time. Doing
07193 this makes a huge difference to execution time when there aren't many brackets
07194 in the pattern. */
07195 
07196 resetcount = 2 + re->top_bracket * 2;
07197 if (resetcount > offsetcount) resetcount = ocount;
07198 
07199 /* Reset the working variable associated with each extraction. These should
07200 never be used unless previously set, but they get saved and restored, and so we
07201 initialize them to avoid reading uninitialized locations. */
07202 
07203 if (match_block.offset_vector != NULL)
07204   {
07205   register int *iptr = match_block.offset_vector + ocount;
07206   register int *iend = iptr - resetcount/2 + 1;
07207   while (--iptr >= iend) *iptr = -1;
07208   }
07209 
07210 /* Set up the first character to match, if available. The first_byte value is
07211 never set for an anchored regular expression, but the anchoring may be forced
07212 at run time, so we have to test for anchoring. The first char may be unset for
07213 an unanchored pattern, of course. If there's no first char and the pattern was
07214 studied, there may be a bitmap of possible first characters. */
07215 
07216 if (!anchored)
07217   {
07218   if ((re->options & PCRE_FIRSTSET) != 0)
07219     {
07220     first_byte = re->first_byte & 255;
07221     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == true)
07222       first_byte = match_block.lcc[first_byte];
07223     }
07224   else
07225     if (!startline && study != NULL &&
07226       (study->options & PCRE_STUDY_MAPPED) != 0)
07227         start_bits = study->start_bits;
07228   }
07229 
07230 /* For anchored or unanchored matches, there may be a "last known required
07231 character" set. */
07232 
07233 if ((re->options & PCRE_REQCHSET) != 0)
07234   {
07235   req_byte = re->req_byte & 255;
07236   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
07237   req_byte2 = (re->tables + fcc_offset)[req_byte];  /* case flipped */
07238   }
07239 
07240 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
07241 the loop runs just once. */
07242 
07243 do
07244   {
07245   register int *iptr = match_block.offset_vector;
07246   register int *iend = iptr + resetcount;
07247 
07248   /* Reset the maximum number of extractions we might see. */
07249 
07250   while (iptr < iend) *iptr++ = -1;
07251 
07252   /* Advance to a unique first char if possible */
07253 
07254   if (first_byte >= 0)
07255     {
07256     if (first_byte_caseless)
07257       while (start_match < end_subject &&
07258              match_block.lcc[*start_match] != first_byte)
07259         start_match++;
07260     else
07261       while (start_match < end_subject && *start_match != first_byte)
07262         start_match++;
07263     }
07264 
07265   /* Or to just after \n for a multiline match if possible */
07266 
07267   else if (startline)
07268     {
07269     if (start_match > match_block.start_subject + start_offset)
07270       {
07271       while (start_match < end_subject && start_match[-1] != NEWLINE)
07272         start_match++;
07273       }
07274     }
07275 
07276   /* Or to a non-unique first char after study */
07277 
07278   else if (start_bits != NULL)
07279     {
07280     while (start_match < end_subject)
07281       {
07282       register int c = *start_match;
07283       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
07284       }
07285     }
07286 
07287   /* If req_byte is set, we know that that character must appear in the subject
07288   for the match to succeed. If the first character is set, req_byte must be
07289   later in the subject; otherwise the test starts at the match point. This
07290   optimization can save a huge amount of backtracking in patterns with nested
07291   unlimited repeats that aren't going to match. Writing separate code for
07292   cased/caseless versions makes it go faster, as does using an autoincrement
07293   and backing off on a match.
07294 
07295   HOWEVER: when the subject string is very, very long, searching to its end can
07296   take a long time, and give bad performance on quite ordinary patterns. This
07297   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
07298   don't do this when the string is sufficiently long. */
07299 
07300   if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
07301     {
07302     register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
07303 
07304     /* We don't need to repeat the search if we haven't yet reached the
07305     place we found it at last time. */
07306 
07307     if (p > req_byte_ptr)
07308       {
07309       if (req_byte_caseless)
07310         {
07311         while (p < end_subject)
07312           {
07313           register int pp = *p++;
07314           if (pp == req_byte || pp == req_byte2) { p--; break; }
07315           }
07316         }
07317       else
07318         {
07319         while (p < end_subject)
07320           {
07321           if (*p++ == req_byte) { p--; break; }
07322           }
07323         }
07324 
07325       /* If we can't find the required character, break the matching loop */
07326 
07327       if (p >= end_subject) break;
07328 
07329       /* If we have found the required character, save the point where we
07330       found it, so that we don't search again next time round the loop if
07331       the start hasn't passed this character yet. */
07332 
07333       req_byte_ptr = p;
07334       }
07335     }
07336 
07337   /* When a match occurs, substrings will be set for all internal extractions;
07338   we just need to set up the whole thing as substring 0 before returning. If
07339   there were too many extractions, set the return code to zero. In the case
07340   where we had to get some local store to hold offsets for backreferences, copy
07341   those back references that we can. In this case there need not be overflow
07342   if certain parts of the pattern were not used. */
07343 
07344   match_block.start_match = start_match;
07345   match_block.match_call_count = 0;
07346 
07347   rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
07348     match_isgroup);
07349 
07350   if (rc == MATCH_NOMATCH)
07351     {
07352     start_match++;
07353     continue;
07354     }
07355 
07356   if (rc != MATCH_MATCH)
07357     {
07358     DPRINTF((">>>> error: returning %d\n", rc));
07359     return rc;
07360     }
07361 
07362   /* We have a match! Copy the offset information from temporary store if
07363   necessary */
07364 
07365   if (using_temporary_offsets)
07366     {
07367     if (offsetcount >= 4)
07368       {
07369       memcpy(offsets + 2, match_block.offset_vector + 2,
07370         (offsetcount - 2) * sizeof(int));
07371       DPRINTF(("Copied offsets from temporary memory\n"));
07372       }
07373     if (match_block.end_offset_top > offsetcount)
07374       match_block.offset_overflow = true;
07375 
07376     DPRINTF(("Freeing temporary memory\n"));
07377     free(match_block.offset_vector);
07378     }
07379 
07380   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
07381 
07382   if (offsetcount < 2) rc = 0; else
07383     {
07384     offsets[0] = start_match - match_block.start_subject;
07385     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
07386     }
07387 
07388   DPRINTF((">>>> returning %d\n", rc));
07389   return rc;
07390   }
07391 
07392 /* This "while" is the end of the "do" above */
07393 
07394 while (!anchored && start_match <= end_subject);
07395 
07396 if (using_temporary_offsets)
07397   {
07398   DPRINTF(("Freeing temporary memory\n"));
07399   free(match_block.offset_vector);
07400   }
07401 
07402 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
07403 
07404 return PCRE_ERROR_NOMATCH;
07405 }
07406

const unsigned char* pcre_maketables ( void )

Definition at line 842 of file pcre.cpp.

References cbit_cntrl, cbit_digit, cbit_graph, cbit_length, cbit_lower, cbit_print, cbit_punct, cbit_space, cbit_upper, cbit_word, cbit_xdigit, ctype_digit, ctype_letter, ctype_meta, ctype_space, ctype_word, ctype_xdigit, and tables_length.

00842 {
00843 unsigned char *yield, *p;
00844 int i;
00845 
00846 yield = static_cast<unsigned char*>(malloc(tables_length));
00847 
00848 if (yield == NULL) return NULL;
00849 p = yield;
00850 
00851 /* First comes the lower casing table */
00852 
00853 for (i = 0; i < 256; i++) *p++ = tolower(i);
00854 
00855 /* Next the case-flipping table */
00856 
00857 for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i);
00858 
00859 /* Then the character class tables. Don't try to be clever and save effort
00860 on exclusive ones - in some locales things may be different. Note that the
00861 table for "space" includes everything "isspace" gives, including VT in the
00862 default locale. This makes it work for the POSIX class [:space:]. */
00863 
00864 memset(p, 0, cbit_length);
00865 for (i = 0; i < 256; i++)
00866   {
00867   if (isdigit(i))
00868     {
00869     p[cbit_digit  + i/8] |= 1 << (i&7);
00870     p[cbit_word   + i/8] |= 1 << (i&7);
00871     }
00872   if (isupper(i))
00873     {
00874     p[cbit_upper  + i/8] |= 1 << (i&7);
00875     p[cbit_word   + i/8] |= 1 << (i&7);
00876     }
00877   if (islower(i))
00878     {
00879     p[cbit_lower  + i/8] |= 1 << (i&7);
00880     p[cbit_word   + i/8] |= 1 << (i&7);
00881     }
00882   if (i == '_')   p[cbit_word   + i/8] |= 1 << (i&7);
00883   if (isspace(i)) p[cbit_space  + i/8] |= 1 << (i&7);
00884   if (isxdigit(i))p[cbit_xdigit + i/8] |= 1 << (i&7);
00885   if (isgraph(i)) p[cbit_graph  + i/8] |= 1 << (i&7);
00886   if (isprint(i)) p[cbit_print  + i/8] |= 1 << (i&7);
00887   if (ispunct(i)) p[cbit_punct  + i/8] |= 1 << (i&7);
00888   if (iscntrl(i)) p[cbit_cntrl  + i/8] |= 1 << (i&7);
00889   }
00890 p += cbit_length;
00891 
00892 /* Finally, the character type table. In this, we exclude VT from the white
00893 space chars, because Perl doesn't recognize it as such for \s and for comments
00894 within regexes. */
00895 
00896 for (i = 0; i < 256; i++)
00897   {
00898   int x = 0;
00899   if (i != 0x0b && isspace(i)) x += ctype_space;
00900   if (isalpha(i)) x += ctype_letter;
00901   if (isdigit(i)) x += ctype_digit;
00902   if (isxdigit(i)) x += ctype_xdigit;
00903   if (isalnum(i) || i == '_') x += ctype_word;
00904   if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta;
00905   *p++ = x;
00906   }
00907 
00908 return yield;
00909 }
00910

pcre_extra* pcre_study	(	const pcre *	,
		int	,
		const char **
	)

Definition at line 1233 of file pcre.cpp.

References compile_data::cbits, cbits_offset, compile_data::ctypes, ctypes_offset, compile_data::fcc, fcc_offset, pcre_extra::flags, compile_data::lcc, lcc_offset, MAGIC_NUMBER, real_pcre::magic_number, real_pcre::name_count, real_pcre::name_entry_size, pcre_study_data::options, real_pcre::options, PCRE_ANCHORED, PCRE_CASELESS, PCRE_EXTRA_STUDY_DATA, PCRE_FIRSTSET, PCRE_STARTLINE, PCRE_STUDY_MAPPED, PCRE_UTF8, PUBLIC_STUDY_OPTIONS, set_start_bits(), pcre_study_data::size, pcre_study_data::start_bits, pcre_extra::study_data, and real_pcre::tables.

Referenced by CF_HAND(), and real_regrab().

01233 {
01234 uschar start_bits[32];
01235 pcre_extra *extra;
01236 pcre_study_data *study;
01237 const real_pcre *re = (const real_pcre *)external_re;
01238 uschar *code = (uschar *)re + sizeof(real_pcre) +
01239   (re->name_count * re->name_entry_size);
01240 compile_data compile_block;
01241 
01242 *errorptr = NULL;
01243 
01244 if (re == NULL || re->magic_number != MAGIC_NUMBER)
01245   {
01246   *errorptr = "argument is not a compiled regular expression";
01247   return NULL;
01248   }
01249 
01250 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
01251   {
01252   *errorptr = "unknown or incorrect option bit(s) set";
01253   return NULL;
01254   }
01255 
01256 /* For an anchored pattern, or an unanchored pattern that has a first char, or
01257 a multiline pattern that matches only at "line starts", no further processing
01258 at present. */
01259 
01260 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
01261   return NULL;
01262 
01263 /* Set the character tables in the block which is passed around */
01264 
01265 compile_block.lcc = re->tables + lcc_offset;
01266 compile_block.fcc = re->tables + fcc_offset;
01267 compile_block.cbits = re->tables + cbits_offset;
01268 compile_block.ctypes = re->tables + ctypes_offset;
01269 
01270 /* See if we can find a fixed set of initial characters for the pattern. */
01271 
01272 memset(start_bits, 0, 32 * sizeof(uschar));
01273 if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
01274   (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
01275 
01276 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
01277 the latter, which is pointed to by the former, which may also get additional
01278 data set later by the calling program. At the moment, the size of
01279 pcre_study_data is fixed. We nevertheless save it in a field for returning via
01280 the pcre_fullinfo() function so that if it becomes variable in the future, we
01281 don't have to change that code. */
01282 
01283 extra = static_cast<pcre_extra *>(malloc(sizeof(pcre_extra) + sizeof(pcre_study_data)));
01284 
01285 if (extra == NULL)
01286   {
01287   *errorptr = "failed to get memory";
01288   return NULL;
01289   }
01290 
01291 // Hmm.
01292 study = reinterpret_cast<pcre_study_data *>(reinterpret_cast<char*>(extra) + sizeof(pcre_extra));
01293 extra->flags = PCRE_EXTRA_STUDY_DATA;
01294 extra->study_data = study;
01295 
01296 study->size = sizeof(pcre_study_data);
01297 study->options = PCRE_STUDY_MAPPED;
01298 memcpy(study->start_bits, start_bits, sizeof(start_bits));
01299 
01300 return extra;
01301 }
01302

mux/src/pcre.h File Reference

Data Structures

Defines

Typedefs

Functions

Define Documentation

Typedef Documentation

Function Documentation


Data Structures
struct	pcre_extra
struct	pcre_callout_block
Defines
#define	PCRE_MAJOR 4
#define	PCRE_MINOR 5
#define	PCRE_DATE 01-December-2003
#define	PCRE_DATA_SCOPE extern
#define	PCRE_CASELESS 0x0001
#define	PCRE_MULTILINE 0x0002
#define	PCRE_DOTALL 0x0004
#define	PCRE_EXTENDED 0x0008
#define	PCRE_ANCHORED 0x0010
#define	PCRE_DOLLAR_ENDONLY 0x0020
#define	PCRE_EXTRA 0x0040
#define	PCRE_NOTBOL 0x0080
#define	PCRE_NOTEOL 0x0100
#define	PCRE_UNGREEDY 0x0200
#define	PCRE_NOTEMPTY 0x0400
#define	PCRE_UTF8 0x0800
#define	PCRE_NO_AUTO_CAPTURE 0x1000
#define	PCRE_NO_UTF8_CHECK 0x2000
#define	PCRE_ERROR_NOMATCH (-1)
#define	PCRE_ERROR_NULL (-2)
#define	PCRE_ERROR_BADOPTION (-3)
#define	PCRE_ERROR_BADMAGIC (-4)
#define	PCRE_ERROR_UNKNOWN_NODE (-5)
#define	PCRE_ERROR_NOMEMORY (-6)
#define	PCRE_ERROR_NOSUBSTRING (-7)
#define	PCRE_ERROR_MATCHLIMIT (-8)
#define	PCRE_ERROR_CALLOUT (-9)
#define	PCRE_ERROR_BADUTF8 (-10)
#define	PCRE_ERROR_BADUTF8_OFFSET (-11)
#define	PCRE_INFO_OPTIONS 0
#define	PCRE_INFO_SIZE 1
#define	PCRE_INFO_CAPTURECOUNT 2
#define	PCRE_INFO_BACKREFMAX 3
#define	PCRE_INFO_FIRSTBYTE 4
#define	PCRE_INFO_FIRSTCHAR 4
#define	PCRE_INFO_FIRSTTABLE 5
#define	PCRE_INFO_LASTLITERAL 6
#define	PCRE_INFO_NAMEENTRYSIZE 7
#define	PCRE_INFO_NAMECOUNT 8
#define	PCRE_INFO_NAMETABLE 9
#define	PCRE_INFO_STUDYSIZE 10
#define	PCRE_CONFIG_UTF8 0
#define	PCRE_CONFIG_NEWLINE 1
#define	PCRE_CONFIG_LINK_SIZE 2
#define	PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 3
#define	PCRE_CONFIG_MATCH_LIMIT 4
#define	PCRE_CONFIG_STACKRECURSE 5
#define	PCRE_EXTRA_STUDY_DATA 0x0001
#define	PCRE_EXTRA_MATCH_LIMIT 0x0002
#define	PCRE_EXTRA_CALLOUT_DATA 0x0004
Typedefs
typedef real_pcre	pcre
Functions
pcre *	pcre_compile (const char *, int, const char **, int *, const unsigned char *)
int	pcre_copy_substring (const char *, int *, int, int, char *, int)
int	pcre_exec (const pcre *, const pcre_extra *, const char *, int, int, int, int *, int)
const unsigned char *	pcre_maketables (void)
pcre_extra *	pcre_study (const pcre *, int, const char **)