#include <stdlib.h>
Include dependency graph for pcre.h:
This graph shows which files directly or indirectly include this file:
Go to the source code of this file.
#define PCRE_ANCHORED 0x0010
#define PCRE_CASELESS 0x0001
Definition at line 28 of file pcre.h.
Referenced by atr_match1(), check_filter(), compile_branch(), find_firstassertedchar(), match(), match_ref(), pcre_compile(), pcre_exec(), pcre_study(), process_cmdent(), real_regmatch(), real_regrab(), and set_start_bits().
#define PCRE_DOLLAR_ENDONLY 0x0020
#define PCRE_DOTALL 0x0004
Definition at line 30 of file pcre.h.
Referenced by compile_branch(), is_anchored(), match(), pcre_compile(), and pcre_exec().
#define PCRE_ERROR_BADMAGIC (-4)
#define PCRE_ERROR_BADOPTION (-3)
#define PCRE_ERROR_NOMATCH (-1)
#define PCRE_ERROR_NOMEMORY (-6)
Definition at line 50 of file pcre.h.
Referenced by match(), pcre_copy_substring(), and pcre_exec().
#define PCRE_ERROR_NOSUBSTRING (-7)
#define PCRE_ERROR_NULL (-2)
#define PCRE_EXTENDED 0x0008
#define PCRE_EXTRA 0x0040
Definition at line 34 of file pcre.h.
Referenced by check_escape(), compile_branch(), and pcre_compile().
#define PCRE_EXTRA_CALLOUT_DATA 0x0004
#define PCRE_EXTRA_MATCH_LIMIT 0x0002
#define PCRE_EXTRA_STUDY_DATA 0x0001
#define PCRE_MULTILINE 0x0002
Definition at line 29 of file pcre.h.
Referenced by compile_branch(), is_anchored(), match(), pcre_compile(), and pcre_exec().
#define PCRE_NO_AUTO_CAPTURE 0x1000
#define PCRE_NOTBOL 0x0080
#define PCRE_NOTEMPTY 0x0400
#define PCRE_NOTEOL 0x0100
#define PCRE_UNGREEDY 0x0200
#define PCRE_UTF8 0x0800
pcre* pcre_compile(const char *pattern,
                   int options,
                   const char **errorptr,
                   int *erroroffset,
                   const unsigned char *tables)
Definition at line 4365 of file pcre.cpp.
References compile_data::backref_map, BRASTACK_SIZE, compile_data::cbits, cbits_offset, check_escape(), check_posix_syntax(), compile_regex(), ctype_digit, ctype_meta, ctype_space, ctype_word, compile_data::ctypes, ctypes_offset, digitab, DPRINTF, ERR12, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20, ERR21, ERR22, ERR23, ERR24, ERR26, ERR28, ERR29, ERR32, ERR39, ERR41, ERR42, ERR6, ESC_b, ESC_Q, ESC_REF, EXTRACT_BASIC_MAX, compile_data::fcc, fcc_offset, find_firstassertedchar(), real_pcre::first_byte, is_anchored(), is_counted_repeat(), is_startline(), compile_data::lcc, lcc_offset, LINK_SIZE, MAGIC_NUMBER, real_pcre::magic_number, MAX_PATTERN_SIZE, MAXLIT, real_pcre::name_count, compile_data::name_entry_size, real_pcre::name_entry_size, compile_data::name_table, compile_data::names_found, NEWLINE, OP_BRA, OP_END, real_pcre::options, PCRE_ANCHORED, PCRE_CASELESS, pcre_default_tables, PCRE_DOTALL, PCRE_EXTENDED, PCRE_EXTRA, PCRE_FIRSTSET, PCRE_ICHANGED, PCRE_IMS, PCRE_MULTILINE, PCRE_NO_AUTO_CAPTURE, PCRE_REQCHSET, PCRE_STARTLINE, PCRE_UNGREEDY, PCRE_UTF8, PUBLIC_OPTIONS, read_repeat_counts(), real_pcre::req_byte, REQ_CASELESS, REQ_VARY, compile_data::req_varyopt, real_pcre::size, compile_data::start_code, real_pcre::tables, real_pcre::top_backref, compile_data::top_backref, and real_pcre::top_bracket.
Referenced by CF_HAND(), check_filter(), real_regmatch(), real_regrab(), and regexp_match().
04366 { 04367 real_pcre *re; 04368 int length = 1 + LINK_SIZE; /* For initial BRA plus length */ 04369 int runlength; 04370 int c, firstbyte, reqbyte; 04371 int bracount = 0; 04372 int branch_extra = 0; 04373 int branch_newextra; 04374 int item_count = -1; 04375 int name_count = 0; 04376 int max_name_size = 0; 04377 bool inescq = false; 04378 unsigned int brastackptr = 0; 04379 size_t size; 04380 uschar *code; 04381 const uschar *codestart; 04382 const uschar *ptr; 04383 compile_data compile_block; 04384 int brastack[BRASTACK_SIZE]; 04385 uschar bralenstack[BRASTACK_SIZE]; 04386 04387 /* We can't pass back an error message if errorptr is NULL; I guess the best we 04388 can do is just return NULL. */ 04389 04390 if (errorptr == NULL) return NULL; 04391 *errorptr = NULL; 04392 04393 /* However, we can give a message for this error */ 04394 04395 if (erroroffset == NULL) 04396 { 04397 *errorptr = ERR16; 04398 return NULL; 04399 } 04400 *erroroffset = 0; 04401 04402 /* Can't support UTF8 unless PCRE has been compiled to include the code. */ 04403 04404 if ((options & PCRE_UTF8) != 0) 04405 { 04406 *errorptr = ERR32; 04407 return NULL; 04408 } 04409 04410 if ((options & ~PUBLIC_OPTIONS) != 0) 04411 { 04412 *errorptr = ERR17; 04413 return NULL; 04414 } 04415 04416 /* Set up pointers to the individual character tables */ 04417 04418 if (tables == NULL) tables = pcre_default_tables; 04419 compile_block.lcc = tables + lcc_offset; 04420 compile_block.fcc = tables + fcc_offset; 04421 compile_block.cbits = tables + cbits_offset; 04422 compile_block.ctypes = tables + ctypes_offset; 04423 04424 /* Maximum back reference and backref bitmap. This is updated for numeric 04425 references during the first pass, but for named references during the actual 04426 compile pass. The bitmap records up to 31 back references to help in deciding 04427 whether (.*) can be treated as anchored or not. 
*/ 04428 04429 compile_block.top_backref = 0; 04430 compile_block.backref_map = 0; 04431 04432 /* Reflect pattern for debugging output */ 04433 04434 DPRINTF(("------------------------------------------------------------------\n")); 04435 DPRINTF(("%s\n", pattern)); 04436 04437 /* The first thing to do is to make a pass over the pattern to compute the 04438 amount of store required to hold the compiled code. This does not have to be 04439 perfect as long as errors are overestimates. At the same time we can detect any 04440 flag settings right at the start, and extract them. Make an attempt to correct 04441 for any counted white space if an "extended" flag setting appears late in the 04442 pattern. We can't be so clever for #-comments. */ 04443 04444 ptr = (const uschar *)(pattern - 1); 04445 while ((c = *(++ptr)) != 0) 04446 { 04447 int min, max; 04448 #if defined(WIN32) && (_MSC_VER == 1200) && defined(_M_IX86) && !defined(__INTEL_COMPILER) 04449 // The addition of 'volatile' works around a bug in Version 12.0 of 04450 // Microsoft's Visual C/C++ compiler (part of Visual Studio 6.0). Without 04451 // volatile, class_optcount is calculated properly, but the compiler 04452 // clobbers the EAX register before tests it as class_optcount. 04453 // 04454 // This is not a problem with the Intel Compiler. 04455 // 04456 volatile int class_optcount; 04457 #else 04458 int class_optcount; 04459 #endif 04460 int bracket_length; 04461 int duplength; 04462 04463 /* If we are inside a \Q...\E sequence, all chars are literal */ 04464 04465 if (inescq) goto NORMAL_CHAR; 04466 04467 /* Otherwise, first check for ignored whitespace and comments */ 04468 04469 if ((options & PCRE_EXTENDED) != 0) 04470 { 04471 if ((compile_block.ctypes[c] & ctype_space) != 0) continue; 04472 if (c == '#') 04473 { 04474 /* The space before the ; is to avoid a warning on a silly compiler 04475 on the Macintosh. 
*/ 04476 while ((c = *(++ptr)) != 0 && c != NEWLINE) ; 04477 if (c == 0) break; 04478 continue; 04479 } 04480 } 04481 04482 item_count++; /* Is zero for the first non-comment item */ 04483 04484 switch(c) 04485 { 04486 /* A backslashed item may be an escaped "normal" character or a 04487 character type. For a "normal" character, put the pointers and 04488 character back so that tests for whitespace etc. in the input 04489 are done correctly. */ 04490 04491 case '\\': 04492 { 04493 const uschar *save_ptr = ptr; 04494 c = check_escape(&ptr, errorptr, bracount, options, false); 04495 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 04496 if (c >= 0) 04497 { 04498 ptr = save_ptr; 04499 c = '\\'; 04500 goto NORMAL_CHAR; 04501 } 04502 } 04503 04504 /* If \Q, enter "literal" mode */ 04505 04506 if (-c == ESC_Q) 04507 { 04508 inescq = true; 04509 continue; 04510 } 04511 04512 /* Other escapes need one byte, and are of length one for repeats */ 04513 04514 length++; 04515 04516 /* A back reference needs an additional 2 bytes, plus either one or 5 04517 bytes for a repeat. We also need to keep the value of the highest 04518 back reference. */ 04519 04520 if (c <= -ESC_REF) 04521 { 04522 int refnum = -c - ESC_REF; 04523 compile_block.backref_map |= (refnum < 32)? 
(1 << refnum) : 1; 04524 if (refnum > compile_block.top_backref) 04525 compile_block.top_backref = refnum; 04526 length += 2; /* For single back reference */ 04527 if (ptr[1] == '{' && is_counted_repeat(ptr+2)) 04528 { 04529 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr); 04530 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 04531 if ((min == 0 && (max == 1 || max == -1)) || 04532 (min == 1 && max == -1)) 04533 length++; 04534 else length += 5; 04535 if (ptr[1] == '?') ptr++; 04536 } 04537 } 04538 continue; 04539 04540 case '^': /* Single-byte metacharacters */ 04541 case '.': 04542 case '$': 04543 length++; 04544 continue; 04545 04546 case '*': /* These repeats won't be after brackets; */ 04547 case '+': /* those are handled separately */ 04548 case '?': 04549 length++; 04550 goto POSESSIVE; /* A few lines below */ 04551 04552 /* This covers the cases of braced repeats after a single char, metachar, 04553 class, or back reference. */ 04554 04555 case '{': 04556 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR; 04557 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr); 04558 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 04559 04560 /* These special cases just insert one extra opcode */ 04561 04562 if ((min == 0 && (max == 1 || max == -1)) || 04563 (min == 1 && max == -1)) 04564 length++; 04565 04566 /* These cases might insert additional copies of a preceding character. */ 04567 04568 else 04569 { 04570 04571 /* Not UTF-8 mode: all characters are one byte */ 04572 { 04573 if (min != 1) 04574 { 04575 length--; /* Uncount the original char or metachar */ 04576 if (min > 0) length += 4; 04577 } 04578 04579 length += (max > 0)? 
4 : 2; 04580 } 04581 } 04582 04583 if (ptr[1] == '?') ptr++; /* Needs no extra length */ 04584 04585 POSESSIVE: /* Test for possessive quantifier */ 04586 if (ptr[1] == '+') 04587 { 04588 ptr++; 04589 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */ 04590 } 04591 continue; 04592 04593 /* An alternation contains an offset to the next branch or ket. If any ims 04594 options changed in the previous branch(es), and/or if we are in a 04595 lookbehind assertion, extra space will be needed at the start of the 04596 branch. This is handled by branch_extra. */ 04597 04598 case '|': 04599 length += 1 + LINK_SIZE + branch_extra; 04600 continue; 04601 04602 /* A character class uses 33 characters provided that all the character 04603 values are less than 256. Otherwise, it uses a bit map for low valued 04604 characters, and individual items for others. Don't worry about character 04605 types that aren't allowed in classes - they'll get picked up during the 04606 compile. A character class that contains only one single-byte character 04607 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this 04608 where we can. (In UTF-8 mode we can do this only for chars < 128.) 
*/ 04609 04610 case '[': 04611 class_optcount = 0; 04612 04613 if (*(++ptr) == '^') ptr++; 04614 04615 /* Written as a "do" so that an initial ']' is taken as data */ 04616 04617 if (*ptr != 0) do 04618 { 04619 /* Inside \Q...\E everything is literal except \E */ 04620 04621 if (inescq) 04622 { 04623 if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER; 04624 inescq = false; 04625 ptr += 1; 04626 continue; 04627 } 04628 04629 /* Outside \Q...\E, check for escapes */ 04630 04631 if (*ptr == '\\') 04632 { 04633 int ch = check_escape(&ptr, errorptr, bracount, options, true); 04634 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 04635 04636 /* \b is backspace inside a class */ 04637 04638 if (-ch == ESC_b) ch = '\b'; 04639 04640 /* \Q enters quoting mode */ 04641 04642 if (-ch == ESC_Q) 04643 { 04644 inescq = true; 04645 continue; 04646 } 04647 04648 /* Handle escapes that turn into characters */ 04649 04650 if (ch >= 0) 04651 { 04652 class_optcount++; /* for possible optimization */ 04653 } 04654 else class_optcount = 10; /* \d, \s etc; make sure > 1 */ 04655 } 04656 04657 /* Check the syntax for POSIX stuff. The bits we actually handle are 04658 checked during the real compile phase. */ 04659 04660 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block)) 04661 { 04662 ptr++; 04663 class_optcount = 10; /* Make sure > 1 */ 04664 } 04665 04666 /* Anything else just increments the possible optimization count. If 04667 there are wide characters, we are going to have to use an XCLASS. */ 04668 04669 else 04670 { 04671 NON_SPECIAL_CHARACTER: 04672 class_optcount++; 04673 04674 } 04675 } 04676 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */ 04677 04678 if (*ptr == 0) /* Missing terminating ']' */ 04679 { 04680 *errorptr = ERR6; 04681 goto PCRE_ERROR_RETURN; 04682 } 04683 04684 /* We can optimize when there was only one optimizable character. 
Repeats 04685 for positive and negated single one-byte chars are handled by the general 04686 code. Here, we handle repeats for the class opcodes. */ 04687 04688 if (class_optcount == 1) length += 3; else 04689 { 04690 length += 33; 04691 04692 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier, 04693 we also need extra for wrapping the whole thing in a sub-pattern. */ 04694 04695 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2)) 04696 { 04697 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr); 04698 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 04699 if ((min == 0 && (max == 1 || max == -1)) || 04700 (min == 1 && max == -1)) 04701 length++; 04702 else length += 5; 04703 if (ptr[1] == '+') 04704 { 04705 ptr++; 04706 length += 2 + 2*LINK_SIZE; 04707 } 04708 else if (ptr[1] == '?') ptr++; 04709 } 04710 } 04711 continue; 04712 04713 /* Brackets may be genuine groups or special things */ 04714 04715 case '(': 04716 branch_newextra = 0; 04717 bracket_length = 1 + LINK_SIZE; 04718 04719 /* Handle special forms of bracket, which all start (? */ 04720 04721 if (ptr[1] == '?') 04722 { 04723 int set, unset; 04724 int *optset; 04725 04726 switch (c = ptr[2]) 04727 { 04728 /* Skip over comments entirely */ 04729 case '#': 04730 ptr += 3; 04731 while (*ptr != 0 && *ptr != ')') ptr++; 04732 if (*ptr == 0) 04733 { 04734 *errorptr = ERR18; 04735 goto PCRE_ERROR_RETURN; 04736 } 04737 continue; 04738 04739 /* Non-referencing groups and lookaheads just move the pointer on, and 04740 then behave like a non-special bracket, except that they don't increment 04741 the count of extracting brackets. Ditto for the "once only" bracket, 04742 which is in Perl from version 5.005. */ 04743 04744 case ':': 04745 case '=': 04746 case '!': 04747 case '>': 04748 ptr += 2; 04749 break; 04750 04751 /* (?R) specifies a recursive call to the regex, which is an extension 04752 to provide the facility which can be obtained by (?p{perl-code}) in 04753 Perl 5.6. 
In Perl 5.8 this has become (??{perl-code}). 04754 04755 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to 04756 the appropriate numbered brackets. This includes both recursive and 04757 non-recursive calls. (?R) is now synonymous with (?0). */ 04758 04759 case 'R': 04760 ptr++; 04761 04762 case '0': case '1': case '2': case '3': case '4': 04763 case '5': case '6': case '7': case '8': case '9': 04764 ptr += 2; 04765 if (c != 'R') 04766 while ((digitab[*(++ptr)] & ctype_digit) != 0); 04767 if (*ptr != ')') 04768 { 04769 *errorptr = ERR29; 04770 goto PCRE_ERROR_RETURN; 04771 } 04772 length += 1 + LINK_SIZE; 04773 04774 /* If this item is quantified, it will get wrapped inside brackets so 04775 as to use the code for quantified brackets. We jump down and use the 04776 code that handles this for real brackets. */ 04777 04778 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{') 04779 { 04780 length += 2 + 2 * LINK_SIZE; /* to make bracketed */ 04781 duplength = 5 + 3 * LINK_SIZE; 04782 goto HANDLE_QUANTIFIED_BRACKETS; 04783 } 04784 continue; 04785 04786 /* (?C) is an extension which provides "callout" - to provide a bit of 04787 the functionality of the Perl (?{...}) feature. An optional number may 04788 follow (default is zero). 
*/ 04789 04790 case 'C': 04791 ptr += 2; 04792 while ((digitab[*(++ptr)] & ctype_digit) != 0); 04793 if (*ptr != ')') 04794 { 04795 *errorptr = ERR39; 04796 goto PCRE_ERROR_RETURN; 04797 } 04798 length += 2; 04799 continue; 04800 04801 /* Named subpatterns are an extension copied from Python */ 04802 04803 case 'P': 04804 ptr += 3; 04805 if (*ptr == '<') 04806 { 04807 const uschar *p; /* Don't amalgamate; some compilers */ 04808 p = ++ptr; /* grumble at autoincrement in declaration */ 04809 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++; 04810 if (*ptr != '>') 04811 { 04812 *errorptr = ERR42; 04813 goto PCRE_ERROR_RETURN; 04814 } 04815 name_count++; 04816 if (ptr - p > max_name_size) max_name_size = (ptr - p); 04817 break; 04818 } 04819 04820 if (*ptr == '=' || *ptr == '>') 04821 { 04822 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0); 04823 if (*ptr != ')') 04824 { 04825 *errorptr = ERR42; 04826 goto PCRE_ERROR_RETURN; 04827 } 04828 break; 04829 } 04830 04831 /* Unknown character after (?P */ 04832 04833 *errorptr = ERR41; 04834 goto PCRE_ERROR_RETURN; 04835 04836 /* Lookbehinds are in Perl from version 5.005 */ 04837 04838 case '<': 04839 ptr += 3; 04840 if (*ptr == '=' || *ptr == '!') 04841 { 04842 branch_newextra = 1 + LINK_SIZE; 04843 length += 1 + LINK_SIZE; /* For the first branch */ 04844 break; 04845 } 04846 *errorptr = ERR24; 04847 goto PCRE_ERROR_RETURN; 04848 04849 /* Conditionals are in Perl from version 5.005. The bracket must either 04850 be followed by a number (for bracket reference) or by an assertion 04851 group, or (a PCRE extension) by 'R' for a recursion test. 
*/ 04852 04853 case '(': 04854 if (ptr[3] == 'R' && ptr[4] == ')') 04855 { 04856 ptr += 4; 04857 length += 3; 04858 } 04859 else if ((digitab[ptr[3]] & ctype_digit) != 0) 04860 { 04861 ptr += 4; 04862 length += 3; 04863 while ((digitab[*ptr] & ctype_digit) != 0) ptr++; 04864 if (*ptr != ')') 04865 { 04866 *errorptr = ERR26; 04867 goto PCRE_ERROR_RETURN; 04868 } 04869 } 04870 else /* An assertion must follow */ 04871 { 04872 ptr++; /* Can treat like ':' as far as spacing is concerned */ 04873 if (ptr[2] != '?' || 04874 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') ) 04875 { 04876 ptr += 2; /* To get right offset in message */ 04877 *errorptr = ERR28; 04878 goto PCRE_ERROR_RETURN; 04879 } 04880 } 04881 break; 04882 04883 /* Else loop checking valid options until ) is met. Anything else is an 04884 error. If we are without any brackets, i.e. at top level, the settings 04885 act as if specified in the options, so massage the options immediately. 04886 This is for backward compatibility with Perl 5.004. */ 04887 04888 default: 04889 set = unset = 0; 04890 optset = &set; 04891 ptr += 2; 04892 04893 for (;; ptr++) 04894 { 04895 c = *ptr; 04896 switch (c) 04897 { 04898 case 'i': 04899 *optset |= PCRE_CASELESS; 04900 continue; 04901 04902 case 'm': 04903 *optset |= PCRE_MULTILINE; 04904 continue; 04905 04906 case 's': 04907 *optset |= PCRE_DOTALL; 04908 continue; 04909 04910 case 'x': 04911 *optset |= PCRE_EXTENDED; 04912 continue; 04913 04914 case 'X': 04915 *optset |= PCRE_EXTRA; 04916 continue; 04917 04918 case 'U': 04919 *optset |= PCRE_UNGREEDY; 04920 continue; 04921 04922 case '-': 04923 optset = &unset; 04924 continue; 04925 04926 /* A termination by ')' indicates an options-setting-only item; if 04927 this is at the very start of the pattern (indicated by item_count 04928 being zero), we use it to set the global options. This is helpful 04929 when analyzing the pattern for first characters, etc. 
Otherwise 04930 nothing is done here and it is handled during the compiling 04931 process. 04932 04933 [Historical note: Up to Perl 5.8, options settings at top level 04934 were always global settings, wherever they appeared in the pattern. 04935 That is, they were equivalent to an external setting. From 5.8 04936 onwards, they apply only to what follows (which is what you might 04937 expect).] */ 04938 04939 case ')': 04940 if (item_count == 0) 04941 { 04942 options = (options | set) & (~unset); 04943 set = unset = 0; /* To save length */ 04944 item_count--; /* To allow for several */ 04945 } 04946 04947 /* Fall through */ 04948 04949 /* A termination by ':' indicates the start of a nested group with 04950 the given options set. This is again handled at compile time, but 04951 we must allow for compiled space if any of the ims options are 04952 set. We also have to allow for resetting space at the end of 04953 the group, which is why 4 is added to the length and not just 2. 04954 If there are several changes of options within the same group, this 04955 will lead to an over-estimate on the length, but this shouldn't 04956 matter very much. We also have to allow for resetting options at 04957 the start of any alternations, which we do by setting 04958 branch_newextra to 2. Finally, we record whether the case-dependent 04959 flag ever changes within the regex. This is used by the "required 04960 character" code. */ 04961 04962 case ':': 04963 if (((set|unset) & PCRE_IMS) != 0) 04964 { 04965 length += 4; 04966 branch_newextra = 2; 04967 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED; 04968 } 04969 goto END_OPTIONS; 04970 04971 /* Unrecognized option character */ 04972 04973 default: 04974 *errorptr = ERR12; 04975 goto PCRE_ERROR_RETURN; 04976 } 04977 } 04978 04979 /* If we hit a closing bracket, that's it - this is a freestanding 04980 option-setting. We need to ensure that branch_extra is updated if 04981 necessary. 
The only values branch_newextra can have here are 0 or 2. 04982 If the value is 2, then branch_extra must either be 2 or 5, depending 04983 on whether this is a lookbehind group or not. */ 04984 04985 END_OPTIONS: 04986 if (c == ')') 04987 { 04988 if (branch_newextra == 2 && 04989 (branch_extra == 0 || branch_extra == 1+LINK_SIZE)) 04990 branch_extra += branch_newextra; 04991 continue; 04992 } 04993 04994 /* If options were terminated by ':' control comes here. Fall through 04995 to handle the group below. */ 04996 } 04997 } 04998 04999 /* Extracting brackets must be counted so we can process escapes in a 05000 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to 05001 need an additional 3 bytes of store per extracting bracket. However, if 05002 PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we 05003 must leave the count alone (it will always be zero). */ 05004 05005 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0) 05006 { 05007 bracount++; 05008 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3; 05009 } 05010 05011 /* Save length for computing whole length at end if there's a repeat that 05012 requires duplication of the group. Also save the current value of 05013 branch_extra, and start the new group with the new value. If non-zero, this 05014 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */ 05015 05016 if (brastackptr >= sizeof(brastack)/sizeof(int)) 05017 { 05018 *errorptr = ERR19; 05019 goto PCRE_ERROR_RETURN; 05020 } 05021 05022 bralenstack[brastackptr] = branch_extra; 05023 branch_extra = branch_newextra; 05024 05025 brastack[brastackptr++] = length; 05026 length += bracket_length; 05027 continue; 05028 05029 /* Handle ket. Look for subsequent max/min; for certain sets of values we 05030 have to replicate this bracket up to that many times. 
If brastackptr is 05031 0 this is an unmatched bracket which will generate an error, but take care 05032 not to try to access brastack[-1] when computing the length and restoring 05033 the branch_extra value. */ 05034 05035 case ')': 05036 length += 1 + LINK_SIZE; 05037 if (brastackptr > 0) 05038 { 05039 duplength = length - brastack[--brastackptr]; 05040 branch_extra = bralenstack[brastackptr]; 05041 } 05042 else duplength = 0; 05043 05044 /* The following code is also used when a recursion such as (?3) is 05045 followed by a quantifier, because in that case, it has to be wrapped inside 05046 brackets so that the quantifier works. The value of duplength must be 05047 set before arrival. */ 05048 05049 HANDLE_QUANTIFIED_BRACKETS: 05050 05051 /* Leave ptr at the final char; for read_repeat_counts this happens 05052 automatically; for the others we need an increment. */ 05053 05054 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2)) 05055 { 05056 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr); 05057 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 05058 } 05059 else if (c == '*') { min = 0; max = -1; ptr++; } 05060 else if (c == '+') { min = 1; max = -1; ptr++; } 05061 else if (c == '?') { min = 0; max = 1; ptr++; } 05062 else { min = 1; max = 1; } 05063 05064 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the 05065 group, and if the maximum is greater than zero, we have to replicate 05066 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting 05067 bracket set. */ 05068 05069 if (min == 0) 05070 { 05071 length++; 05072 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE); 05073 } 05074 05075 /* When the minimum is greater than zero, we have to replicate up to 05076 minval-1 times, with no additions required in the copies. 
Then, if there 05077 is a limited maximum we have to replicate up to maxval-1 times allowing 05078 for a BRAZERO item before each optional copy and nesting brackets for all 05079 but one of the optional copies. */ 05080 05081 else 05082 { 05083 length += (min - 1) * duplength; 05084 if (max > min) /* Need this test as max=-1 means no limit */ 05085 length += (max - min) * (duplength + 3 + 2*LINK_SIZE) 05086 - (2 + 2*LINK_SIZE); 05087 } 05088 05089 /* Allow space for once brackets for "possessive quantifier" */ 05090 05091 if (ptr[1] == '+') 05092 { 05093 ptr++; 05094 length += 2 + 2*LINK_SIZE; 05095 } 05096 continue; 05097 05098 /* Non-special character. For a run of such characters the length required 05099 is the number of characters + 2, except that the maximum run length is 05100 MAXLIT. We won't get a skipped space or a non-data escape or the start of a 05101 # comment as the first character, so the length can't be zero. */ 05102 05103 NORMAL_CHAR: 05104 default: 05105 length += 2; 05106 runlength = 0; 05107 do 05108 { 05109 05110 /* If in a \Q...\E sequence, check for end; otherwise it's a literal */ 05111 if (inescq) 05112 { 05113 if (c == '\\' && ptr[1] == 'E') 05114 { 05115 inescq = false; 05116 ptr++; 05117 } 05118 else runlength++; 05119 continue; 05120 } 05121 05122 /* Skip whitespace and comments for /x */ 05123 05124 if ((options & PCRE_EXTENDED) != 0) 05125 { 05126 if ((compile_block.ctypes[c] & ctype_space) != 0) continue; 05127 if (c == '#') 05128 { 05129 /* The space before the ; is to avoid a warning on a silly compiler 05130 on the Macintosh. */ 05131 while ((c = *(++ptr)) != 0 && c != NEWLINE) ; 05132 continue; 05133 } 05134 } 05135 05136 /* Backslash may introduce a data char or a metacharacter; stop the 05137 string before the latter. 
*/ 05138 05139 if (c == '\\') 05140 { 05141 const uschar *saveptr = ptr; 05142 c = check_escape(&ptr, errorptr, bracount, options, false); 05143 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 05144 if (c < 0) { ptr = saveptr; break; } 05145 05146 /* In UTF-8 mode, add on the number of additional bytes needed to 05147 encode this character, and save the total length in case this is a 05148 final char that is repeated. */ 05149 05150 } 05151 05152 /* Ordinary character or single-char escape */ 05153 05154 runlength++; 05155 } 05156 05157 /* This "while" is the end of the "do" above. */ 05158 05159 while (runlength < MAXLIT && 05160 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0); 05161 05162 /* If we hit a meta-character, back off to point to it */ 05163 05164 if (runlength < MAXLIT) ptr--; 05165 05166 /* If the last char in the string is a UTF-8 multibyte character, we must 05167 set lastcharlength correctly. If it was specified as an escape, this will 05168 already have been done above. However, we also have to support in-line 05169 UTF-8 characters, so check backwards from where we are. */ 05170 05171 05172 length += runlength; 05173 continue; 05174 } 05175 } 05176 05177 length += 2 + LINK_SIZE; /* For final KET and END */ 05178 05179 if (length > MAX_PATTERN_SIZE) 05180 { 05181 *errorptr = ERR20; 05182 return NULL; 05183 } 05184 05185 /* Compute the size of data block needed and get it, either from malloc or 05186 externally provided function. 
*/ 05187 05188 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3); 05189 re = static_cast<real_pcre *>(malloc(size)); 05190 05191 if (re == NULL) 05192 { 05193 *errorptr = ERR21; 05194 return NULL; 05195 } 05196 05197 /* Put in the magic number, and save the size, options, and table pointer */ 05198 05199 re->magic_number = MAGIC_NUMBER; 05200 re->size = size; 05201 re->options = options; 05202 re->tables = tables; 05203 re->name_entry_size = max_name_size + 3; 05204 re->name_count = name_count; 05205 05206 /* The starting points of the name/number translation table and of the code are 05207 passed around in the compile data block. */ 05208 05209 compile_block.names_found = 0; 05210 compile_block.name_entry_size = max_name_size + 3; 05211 compile_block.name_table = (uschar *)re + sizeof(real_pcre); 05212 codestart = compile_block.name_table + re->name_entry_size * re->name_count; 05213 compile_block.start_code = codestart; 05214 compile_block.req_varyopt = 0; 05215 05216 /* Set up a starting, non-extracting bracket, then compile the expression. On 05217 error, *errorptr will be set non-NULL, so we don't need to look at the result 05218 of the function here. */ 05219 05220 ptr = (const uschar *)pattern; 05221 code = (uschar *)codestart; 05222 *code = OP_BRA; 05223 bracount = 0; 05224 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr, 05225 errorptr, false, 0, &firstbyte, &reqbyte, NULL, &compile_block); 05226 re->top_bracket = bracount; 05227 re->top_backref = compile_block.top_backref; 05228 05229 /* If not reached end of pattern on success, there's an excess bracket. */ 05230 05231 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22; 05232 05233 /* Fill in the terminating state and check for disastrous overflow, but 05234 if debugging, leave the test till after things are printed out. 
*/ 05235 05236 *code++ = OP_END; 05237 05238 if (code - codestart > length) *errorptr = ERR23; 05239 05240 /* Give an error if there's back reference to a non-existent capturing 05241 subpattern. */ 05242 05243 if (re->top_backref > re->top_bracket) *errorptr = ERR15; 05244 05245 /* Failed to compile, or error while post-processing */ 05246 05247 if (*errorptr != NULL) 05248 { 05249 free(re); 05250 PCRE_ERROR_RETURN: 05251 *erroroffset = ptr - (const uschar *)pattern; 05252 return NULL; 05253 } 05254 05255 /* If the anchored option was not passed, set the flag if we can determine that 05256 the pattern is anchored by virtue of ^ characters or \A or anything else (such 05257 as starting with .* when DOTALL is set). 05258 05259 Otherwise, if we know what the first character has to be, save it, because that 05260 speeds up unanchored matches no end. If not, see if we can set the 05261 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches 05262 start with ^. and also when all branches start with .* for non-DOTALL matches. 05263 */ 05264 05265 if ((options & PCRE_ANCHORED) == 0) 05266 { 05267 int temp_options = options; 05268 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map)) 05269 re->options |= PCRE_ANCHORED; 05270 else 05271 { 05272 if (firstbyte < 0) 05273 firstbyte = find_firstassertedchar(codestart, &temp_options, false); 05274 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */ 05275 { 05276 int ch = firstbyte & 255; 05277 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && 05278 compile_block.fcc[ch] == ch)? ch : firstbyte; 05279 re->options |= PCRE_FIRSTSET; 05280 } 05281 else if (is_startline(codestart, 0, compile_block.backref_map)) 05282 re->options |= PCRE_STARTLINE; 05283 } 05284 } 05285 05286 /* For an anchored pattern, we use the "required byte" only if it follows a 05287 variable length item in the regex. Remove the caseless flag for non-caseable 05288 chars. 
*/ 05289 05290 if (reqbyte >= 0 && 05291 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0)) 05292 { 05293 int ch = reqbyte & 255; 05294 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && 05295 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; 05296 re->options |= PCRE_REQCHSET; 05297 } 05298 05299 return (pcre *)re; 05300 } 05301
int pcre_copy_substring(const char *, int *, int, int, char *, int)
Definition at line 811 of file pcre.cpp.
References PCRE_ERROR_NOMEMORY, and PCRE_ERROR_NOSUBSTRING.
Referenced by real_regmatch(), and regexp_match().
00812 { 00813 int yield; 00814 if (stringnumber < 0 || stringnumber >= stringcount) 00815 return PCRE_ERROR_NOSUBSTRING; 00816 stringnumber *= 2; 00817 yield = ovector[stringnumber+1] - ovector[stringnumber]; 00818 if (size < yield + 1) return PCRE_ERROR_NOMEMORY; 00819 memcpy(buffer, subject + ovector[stringnumber], yield); 00820 buffer[yield] = 0; 00821 return yield; 00822 } 00823
int pcre_exec(const pcre *, const pcre_extra *, const char *, int, int, int, int *, int)
Definition at line 7090 of file pcre.cpp.
References pcre_extra::callout_data, match_data::callout_data, match_data::capture_last, match_data::ctypes, ctypes_offset, DPRINTF, match_data::end_match_ptr, match_data::end_offset_top, match_data::end_subject, match_data::endonly, fcc_offset, real_pcre::first_byte, pcre_extra::flags, match_data::lcc, lcc_offset, MAGIC_NUMBER, real_pcre::magic_number, match(), match_data::match_call_count, match_isgroup, pcre_extra::match_limit, MATCH_LIMIT, match_data::match_limit, MATCH_MATCH, MATCH_NOMATCH, real_pcre::name_count, real_pcre::name_entry_size, NEWLINE, match_data::notbol, match_data::notempty, match_data::noteol, match_data::offset_end, match_data::offset_max, match_data::offset_overflow, match_data::offset_vector, pcre_study_data::options, real_pcre::options, PCRE_ANCHORED, PCRE_CASELESS, PCRE_DOLLAR_ENDONLY, PCRE_DOTALL, PCRE_ERROR_BADMAGIC, PCRE_ERROR_BADOPTION, PCRE_ERROR_NOMATCH, PCRE_ERROR_NOMEMORY, PCRE_ERROR_NULL, PCRE_EXTRA_CALLOUT_DATA, PCRE_EXTRA_MATCH_LIMIT, PCRE_EXTRA_STUDY_DATA, PCRE_FIRSTSET, PCRE_MULTILINE, PCRE_NOTBOL, PCRE_NOTEMPTY, PCRE_NOTEOL, PCRE_REQCHSET, PCRE_STARTLINE, PCRE_STUDY_MAPPED, PCRE_UTF8, PUBLIC_EXEC_OPTIONS, match_data::recursive, real_pcre::req_byte, REQ_BYTE_MAX, REQ_CASELESS, pcre_study_data::start_bits, match_data::start_code, match_data::start_match, match_data::start_offset, match_data::start_subject, pcre_extra::study_data, real_pcre::tables, real_pcre::top_backref, real_pcre::top_bracket, and match_data::utf8.
Referenced by check_filter(), FUNCTION(), real_regmatch(), real_regrab(), and regexp_match().
07092 { 07093 int rc, resetcount, ocount; 07094 int first_byte = -1; 07095 int req_byte = -1; 07096 int req_byte2 = -1; 07097 unsigned long int ims = 0; 07098 bool using_temporary_offsets = false; 07099 bool anchored; 07100 bool startline; 07101 bool first_byte_caseless = false; 07102 bool req_byte_caseless = false; 07103 match_data match_block; 07104 const uschar *start_bits = NULL; 07105 const uschar *start_match = (const uschar *)subject + start_offset; 07106 const uschar *end_subject; 07107 const uschar *req_byte_ptr = start_match - 1; 07108 const pcre_study_data *study; 07109 const real_pcre *re = (const real_pcre *)external_re; 07110 07111 /* Plausibility checks */ 07112 07113 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; 07114 if (re == NULL || subject == NULL || 07115 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; 07116 07117 /* Fish out the optional data from the extra_data structure, first setting 07118 the default values. */ 07119 07120 study = NULL; 07121 match_block.match_limit = MATCH_LIMIT; 07122 match_block.callout_data = NULL; 07123 07124 if (extra_data != NULL) 07125 { 07126 register unsigned int flags = extra_data->flags; 07127 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) 07128 study = (const pcre_study_data *)extra_data->study_data; 07129 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) 07130 match_block.match_limit = extra_data->match_limit; 07131 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) 07132 match_block.callout_data = extra_data->callout_data; 07133 } 07134 07135 /* Now we have re supposedly pointing to the regex */ 07136 07137 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC; 07138 07139 anchored = ((re->options | options) & PCRE_ANCHORED) != 0; 07140 startline = (re->options & PCRE_STARTLINE) != 0; 07141 07142 match_block.start_code = 07143 (const uschar *)re + sizeof(real_pcre) + re->name_count * re->name_entry_size; 07144 match_block.start_subject = (const uschar *)subject; 07145 
match_block.start_offset = start_offset; 07146 match_block.end_subject = match_block.start_subject + length; 07147 end_subject = match_block.end_subject; 07148 07149 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; 07150 match_block.utf8 = (re->options & PCRE_UTF8) != 0; 07151 07152 match_block.notbol = (options & PCRE_NOTBOL) != 0; 07153 match_block.noteol = (options & PCRE_NOTEOL) != 0; 07154 match_block.notempty = (options & PCRE_NOTEMPTY) != 0; 07155 07156 match_block.recursive = NULL; /* No recursion at top level */ 07157 07158 match_block.lcc = re->tables + lcc_offset; 07159 match_block.ctypes = re->tables + ctypes_offset; 07160 07161 /* Check a UTF-8 string if required. Unfortunately there's no way of passing 07162 back the character offset. */ 07163 07164 /* The ims options can vary during the matching as a result of the presence 07165 of (?ims) items in the pattern. They are kept in a local variable so that 07166 restoring at the exit of a group is easy. */ 07167 07168 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL); 07169 07170 /* If the expression has got more back references than the offsets supplied can 07171 hold, we get a temporary bit of working store to use during the matching. 07172 Otherwise, we can use the vector supplied, rounding down its size to a multiple 07173 of 3. 
*/ 07174 07175 ocount = offsetcount - (offsetcount % 3); 07176 07177 if (re->top_backref > 0 && re->top_backref >= ocount/3) 07178 { 07179 ocount = re->top_backref * 3 + 3; 07180 match_block.offset_vector = static_cast<int *>(malloc(ocount * sizeof(int))); 07181 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY; 07182 using_temporary_offsets = true; 07183 DPRINTF(("Got memory to hold back references\n")); 07184 } 07185 else match_block.offset_vector = offsets; 07186 07187 match_block.offset_end = ocount; 07188 match_block.offset_max = (2*ocount)/3; 07189 match_block.offset_overflow = false; 07190 match_block.capture_last = -1; 07191 07192 /* Compute the minimum number of offsets that we need to reset each time. Doing 07193 this makes a huge difference to execution time when there aren't many brackets 07194 in the pattern. */ 07195 07196 resetcount = 2 + re->top_bracket * 2; 07197 if (resetcount > offsetcount) resetcount = ocount; 07198 07199 /* Reset the working variable associated with each extraction. These should 07200 never be used unless previously set, but they get saved and restored, and so we 07201 initialize them to avoid reading uninitialized locations. */ 07202 07203 if (match_block.offset_vector != NULL) 07204 { 07205 register int *iptr = match_block.offset_vector + ocount; 07206 register int *iend = iptr - resetcount/2 + 1; 07207 while (--iptr >= iend) *iptr = -1; 07208 } 07209 07210 /* Set up the first character to match, if available. The first_byte value is 07211 never set for an anchored regular expression, but the anchoring may be forced 07212 at run time, so we have to test for anchoring. The first char may be unset for 07213 an unanchored pattern, of course. If there's no first char and the pattern was 07214 studied, there may be a bitmap of possible first characters. 
*/ 07215 07216 if (!anchored) 07217 { 07218 if ((re->options & PCRE_FIRSTSET) != 0) 07219 { 07220 first_byte = re->first_byte & 255; 07221 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == true) 07222 first_byte = match_block.lcc[first_byte]; 07223 } 07224 else 07225 if (!startline && study != NULL && 07226 (study->options & PCRE_STUDY_MAPPED) != 0) 07227 start_bits = study->start_bits; 07228 } 07229 07230 /* For anchored or unanchored matches, there may be a "last known required 07231 character" set. */ 07232 07233 if ((re->options & PCRE_REQCHSET) != 0) 07234 { 07235 req_byte = re->req_byte & 255; 07236 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; 07237 req_byte2 = (re->tables + fcc_offset)[req_byte]; /* case flipped */ 07238 } 07239 07240 /* Loop for handling unanchored repeated matching attempts; for anchored regexs 07241 the loop runs just once. */ 07242 07243 do 07244 { 07245 register int *iptr = match_block.offset_vector; 07246 register int *iend = iptr + resetcount; 07247 07248 /* Reset the maximum number of extractions we might see. 
*/ 07249 07250 while (iptr < iend) *iptr++ = -1; 07251 07252 /* Advance to a unique first char if possible */ 07253 07254 if (first_byte >= 0) 07255 { 07256 if (first_byte_caseless) 07257 while (start_match < end_subject && 07258 match_block.lcc[*start_match] != first_byte) 07259 start_match++; 07260 else 07261 while (start_match < end_subject && *start_match != first_byte) 07262 start_match++; 07263 } 07264 07265 /* Or to just after \n for a multiline match if possible */ 07266 07267 else if (startline) 07268 { 07269 if (start_match > match_block.start_subject + start_offset) 07270 { 07271 while (start_match < end_subject && start_match[-1] != NEWLINE) 07272 start_match++; 07273 } 07274 } 07275 07276 /* Or to a non-unique first char after study */ 07277 07278 else if (start_bits != NULL) 07279 { 07280 while (start_match < end_subject) 07281 { 07282 register int c = *start_match; 07283 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break; 07284 } 07285 } 07286 07287 /* If req_byte is set, we know that that character must appear in the subject 07288 for the match to succeed. If the first character is set, req_byte must be 07289 later in the subject; otherwise the test starts at the match point. This 07290 optimization can save a huge amount of backtracking in patterns with nested 07291 unlimited repeats that aren't going to match. Writing separate code for 07292 cased/caseless versions makes it go faster, as does using an autoincrement 07293 and backing off on a match. 07294 07295 HOWEVER: when the subject string is very, very long, searching to its end can 07296 take a long time, and give bad performance on quite ordinary patterns. This 07297 showed up when somebody was matching /^C/ on a 32-megabyte string... so we 07298 don't do this when the string is sufficiently long. */ 07299 07300 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX) 07301 { 07302 register const uschar *p = start_match + ((first_byte >= 0)? 
1 : 0); 07303 07304 /* We don't need to repeat the search if we haven't yet reached the 07305 place we found it at last time. */ 07306 07307 if (p > req_byte_ptr) 07308 { 07309 if (req_byte_caseless) 07310 { 07311 while (p < end_subject) 07312 { 07313 register int pp = *p++; 07314 if (pp == req_byte || pp == req_byte2) { p--; break; } 07315 } 07316 } 07317 else 07318 { 07319 while (p < end_subject) 07320 { 07321 if (*p++ == req_byte) { p--; break; } 07322 } 07323 } 07324 07325 /* If we can't find the required character, break the matching loop */ 07326 07327 if (p >= end_subject) break; 07328 07329 /* If we have found the required character, save the point where we 07330 found it, so that we don't search again next time round the loop if 07331 the start hasn't passed this character yet. */ 07332 07333 req_byte_ptr = p; 07334 } 07335 } 07336 07337 /* When a match occurs, substrings will be set for all internal extractions; 07338 we just need to set up the whole thing as substring 0 before returning. If 07339 there were too many extractions, set the return code to zero. In the case 07340 where we had to get some local store to hold offsets for backreferences, copy 07341 those back references that we can. In this case there need not be overflow 07342 if certain parts of the pattern were not used. */ 07343 07344 match_block.start_match = start_match; 07345 match_block.match_call_count = 0; 07346 07347 rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL, 07348 match_isgroup); 07349 07350 if (rc == MATCH_NOMATCH) 07351 { 07352 start_match++; 07353 continue; 07354 } 07355 07356 if (rc != MATCH_MATCH) 07357 { 07358 DPRINTF((">>>> error: returning %d\n", rc)); 07359 return rc; 07360 } 07361 07362 /* We have a match! 
Copy the offset information from temporary store if 07363 necessary */ 07364 07365 if (using_temporary_offsets) 07366 { 07367 if (offsetcount >= 4) 07368 { 07369 memcpy(offsets + 2, match_block.offset_vector + 2, 07370 (offsetcount - 2) * sizeof(int)); 07371 DPRINTF(("Copied offsets from temporary memory\n")); 07372 } 07373 if (match_block.end_offset_top > offsetcount) 07374 match_block.offset_overflow = true; 07375 07376 DPRINTF(("Freeing temporary memory\n")); 07377 free(match_block.offset_vector); 07378 } 07379 07380 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2; 07381 07382 if (offsetcount < 2) rc = 0; else 07383 { 07384 offsets[0] = start_match - match_block.start_subject; 07385 offsets[1] = match_block.end_match_ptr - match_block.start_subject; 07386 } 07387 07388 DPRINTF((">>>> returning %d\n", rc)); 07389 return rc; 07390 } 07391 07392 /* This "while" is the end of the "do" above */ 07393 07394 while (!anchored && start_match <= end_subject); 07395 07396 if (using_temporary_offsets) 07397 { 07398 DPRINTF(("Freeing temporary memory\n")); 07399 free(match_block.offset_vector); 07400 } 07401 07402 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); 07403 07404 return PCRE_ERROR_NOMATCH; 07405 } 07406
const unsigned char* pcre_maketables(void)
Definition at line 842 of file pcre.cpp.
References cbit_cntrl, cbit_digit, cbit_graph, cbit_length, cbit_lower, cbit_print, cbit_punct, cbit_space, cbit_upper, cbit_word, cbit_xdigit, ctype_digit, ctype_letter, ctype_meta, ctype_space, ctype_word, ctype_xdigit, and tables_length.
00842 { 00843 unsigned char *yield, *p; 00844 int i; 00845 00846 yield = static_cast<unsigned char*>(malloc(tables_length)); 00847 00848 if (yield == NULL) return NULL; 00849 p = yield; 00850 00851 /* First comes the lower casing table */ 00852 00853 for (i = 0; i < 256; i++) *p++ = tolower(i); 00854 00855 /* Next the case-flipping table */ 00856 00857 for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i); 00858 00859 /* Then the character class tables. Don't try to be clever and save effort 00860 on exclusive ones - in some locales things may be different. Note that the 00861 table for "space" includes everything "isspace" gives, including VT in the 00862 default locale. This makes it work for the POSIX class [:space:]. */ 00863 00864 memset(p, 0, cbit_length); 00865 for (i = 0; i < 256; i++) 00866 { 00867 if (isdigit(i)) 00868 { 00869 p[cbit_digit + i/8] |= 1 << (i&7); 00870 p[cbit_word + i/8] |= 1 << (i&7); 00871 } 00872 if (isupper(i)) 00873 { 00874 p[cbit_upper + i/8] |= 1 << (i&7); 00875 p[cbit_word + i/8] |= 1 << (i&7); 00876 } 00877 if (islower(i)) 00878 { 00879 p[cbit_lower + i/8] |= 1 << (i&7); 00880 p[cbit_word + i/8] |= 1 << (i&7); 00881 } 00882 if (i == '_') p[cbit_word + i/8] |= 1 << (i&7); 00883 if (isspace(i)) p[cbit_space + i/8] |= 1 << (i&7); 00884 if (isxdigit(i))p[cbit_xdigit + i/8] |= 1 << (i&7); 00885 if (isgraph(i)) p[cbit_graph + i/8] |= 1 << (i&7); 00886 if (isprint(i)) p[cbit_print + i/8] |= 1 << (i&7); 00887 if (ispunct(i)) p[cbit_punct + i/8] |= 1 << (i&7); 00888 if (iscntrl(i)) p[cbit_cntrl + i/8] |= 1 << (i&7); 00889 } 00890 p += cbit_length; 00891 00892 /* Finally, the character type table. In this, we exclude VT from the white 00893 space chars, because Perl doesn't recognize it as such for \s and for comments 00894 within regexes. 
*/ 00895 00896 for (i = 0; i < 256; i++) 00897 { 00898 int x = 0; 00899 if (i != 0x0b && isspace(i)) x += ctype_space; 00900 if (isalpha(i)) x += ctype_letter; 00901 if (isdigit(i)) x += ctype_digit; 00902 if (isxdigit(i)) x += ctype_xdigit; 00903 if (isalnum(i) || i == '_') x += ctype_word; 00904 if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta; 00905 *p++ = x; 00906 } 00907 00908 return yield; 00909 } 00910
pcre_extra* pcre_study(const pcre *, int, const char **)
Definition at line 1233 of file pcre.cpp.
References compile_data::cbits, cbits_offset, compile_data::ctypes, ctypes_offset, compile_data::fcc, fcc_offset, pcre_extra::flags, compile_data::lcc, lcc_offset, MAGIC_NUMBER, real_pcre::magic_number, real_pcre::name_count, real_pcre::name_entry_size, pcre_study_data::options, real_pcre::options, PCRE_ANCHORED, PCRE_CASELESS, PCRE_EXTRA_STUDY_DATA, PCRE_FIRSTSET, PCRE_STARTLINE, PCRE_STUDY_MAPPED, PCRE_UTF8, PUBLIC_STUDY_OPTIONS, set_start_bits(), pcre_study_data::size, pcre_study_data::start_bits, pcre_extra::study_data, and real_pcre::tables.
Referenced by CF_HAND(), and real_regrab().
01233 { 01234 uschar start_bits[32]; 01235 pcre_extra *extra; 01236 pcre_study_data *study; 01237 const real_pcre *re = (const real_pcre *)external_re; 01238 uschar *code = (uschar *)re + sizeof(real_pcre) + 01239 (re->name_count * re->name_entry_size); 01240 compile_data compile_block; 01241 01242 *errorptr = NULL; 01243 01244 if (re == NULL || re->magic_number != MAGIC_NUMBER) 01245 { 01246 *errorptr = "argument is not a compiled regular expression"; 01247 return NULL; 01248 } 01249 01250 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) 01251 { 01252 *errorptr = "unknown or incorrect option bit(s) set"; 01253 return NULL; 01254 } 01255 01256 /* For an anchored pattern, or an unanchored pattern that has a first char, or 01257 a multiline pattern that matches only at "line starts", no further processing 01258 at present. */ 01259 01260 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) 01261 return NULL; 01262 01263 /* Set the character tables in the block which is passed around */ 01264 01265 compile_block.lcc = re->tables + lcc_offset; 01266 compile_block.fcc = re->tables + fcc_offset; 01267 compile_block.cbits = re->tables + cbits_offset; 01268 compile_block.ctypes = re->tables + ctypes_offset; 01269 01270 /* See if we can find a fixed set of initial characters for the pattern. */ 01271 01272 memset(start_bits, 0, 32 * sizeof(uschar)); 01273 if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, 01274 (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL; 01275 01276 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in 01277 the latter, which is pointed to by the former, which may also get additional 01278 data set later by the calling program. At the moment, the size of 01279 pcre_study_data is fixed. We nevertheless save it in a field for returning via 01280 the pcre_fullinfo() function so that if it becomes variable in the future, we 01281 don't have to change that code. 
*/ 01282 01283 extra = static_cast<pcre_extra *>(malloc(sizeof(pcre_extra) + sizeof(pcre_study_data))); 01284 01285 if (extra == NULL) 01286 { 01287 *errorptr = "failed to get memory"; 01288 return NULL; 01289 } 01290 01291 // Hmm. 01292 study = reinterpret_cast<pcre_study_data *>(reinterpret_cast<char*>(extra) + sizeof(pcre_extra)); 01293 extra->flags = PCRE_EXTRA_STUDY_DATA; 01294 extra->study_data = study; 01295 01296 study->size = sizeof(pcre_study_data); 01297 study->options = PCRE_STUDY_MAPPED; 01298 memcpy(study->start_bits, start_bits, sizeof(start_bits)); 01299 01300 return extra; 01301 } 01302