#include "autoconf.h"
#include "config.h"
#include <limits.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <stddef.h>
#include "pcre.h"
#include "externs.h"
#include "timeutil.h"
Include dependency graph for pcre.cpp:
Go to the source code of this file.
Data Structures | |
struct | real_pcre |
struct | pcre_study_data |
struct | compile_data |
struct | branch_chain |
struct | recursion_info |
struct | match_data |
struct | eptrblock |
Defines | |
#define | LINK_SIZE 2 |
#define | MATCH_LIMIT 100000 |
#define | NEWLINE '\n' |
#define | PUT(a, n, d) |
#define | GET(a, n) (((a)[n] << 8) | (a)[(n)+1]) |
#define | MAX_PATTERN_SIZE (1 << 16) |
#define | PUTINC(a, n, d) PUT(a,n,d), a += LINK_SIZE |
#define | PUT2(a, n, d) |
#define | GET2(a, n) (((a)[n] << 8) | (a)[(n)+1]) |
#define | PUT2INC(a, n, d) PUT2(a,n,d), a += 2 |
#define | PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL) |
#define | PCRE_FIRSTSET 0x40000000 |
#define | PCRE_REQCHSET 0x20000000 |
#define | PCRE_STARTLINE 0x10000000 |
#define | PCRE_ICHANGED 0x08000000 |
#define | PCRE_STUDY_MAPPED 0x01 |
#define | PUBLIC_OPTIONS |
#define | PUBLIC_EXEC_OPTIONS (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK) |
#define | PUBLIC_STUDY_OPTIONS 0 |
#define | MAGIC_NUMBER 0x50435245UL |
#define | REQ_UNSET (-2) |
#define | REQ_NONE (-1) |
#define | REQ_CASELESS 0x0100 |
#define | REQ_VARY 0x0200 |
#define | ESC_e 27 |
#define | ESC_f '\f' |
#define | ESC_n NEWLINE |
#define | ESC_r '\r' |
#define | ESC_tee '\t' |
#define | XCL_NOT 0x01 |
#define | XCL_MAP 0x02 |
#define | XCL_END 0 |
#define | XCL_SINGLE 1 |
#define | XCL_RANGE 2 |
#define | OP_NAME_LIST |
#define | OP_LENGTHS |
#define | EXTRACT_BASIC_MAX 150 |
#define | CREF_RECURSE 0xffff |
#define | ERR1 "\\ at end of pattern" |
#define | ERR2 "\\c at end of pattern" |
#define | ERR3 "unrecognized character follows \\" |
#define | ERR4 "numbers out of order in {} quantifier" |
#define | ERR5 "number too big in {} quantifier" |
#define | ERR6 "missing terminating ] for character class" |
#define | ERR7 "invalid escape sequence in character class" |
#define | ERR8 "range out of order in character class" |
#define | ERR9 "nothing to repeat" |
#define | ERR10 "operand of unlimited repeat could match the empty string" |
#define | ERR11 "internal error: unexpected repeat" |
#define | ERR12 "unrecognized character after (?" |
#define | ERR13 "POSIX named classes are supported only within a class" |
#define | ERR14 "missing )" |
#define | ERR15 "reference to non-existent subpattern" |
#define | ERR16 "erroffset passed as NULL" |
#define | ERR17 "unknown option bit(s) set" |
#define | ERR18 "missing ) after comment" |
#define | ERR19 "parentheses nested too deeply" |
#define | ERR20 "regular expression too large" |
#define | ERR21 "failed to get memory" |
#define | ERR22 "unmatched parentheses" |
#define | ERR23 "internal error: code overflow" |
#define | ERR24 "unrecognized character after (?<" |
#define | ERR25 "lookbehind assertion is not fixed length" |
#define | ERR26 "malformed number after (?(" |
#define | ERR27 "conditional group contains more than two branches" |
#define | ERR28 "assertion expected after (?(" |
#define | ERR29 "(?R or (?digits must be followed by )" |
#define | ERR30 "unknown POSIX class name" |
#define | ERR31 "POSIX collating elements are not supported" |
#define | ERR32 "this version of PCRE is not compiled with PCRE_UTF8 support" |
#define | ERR33 "spare error" |
#define | ERR34 "character value in \\x{...} sequence is too large" |
#define | ERR35 "invalid condition (?(0)" |
#define | ERR36 "\\C not allowed in lookbehind assertion" |
#define | ERR37 "PCRE does not support \\L, \\l, \\N, \\P, \\p, \\U, \\u, or \\X" |
#define | ERR38 "number after (?C is > 255" |
#define | ERR39 "closing ) for (?C expected" |
#define | ERR40 "recursive call could loop indefinitely" |
#define | ERR41 "unrecognized character after (?P" |
#define | ERR42 "syntax error after (?P" |
#define | ERR43 "two named groups have the same name" |
#define | ERR44 "invalid UTF-8 string" |
#define | ctype_space 0x01 |
#define | ctype_letter 0x02 |
#define | ctype_digit 0x04 |
#define | ctype_xdigit 0x08 |
#define | ctype_word 0x10 |
#define | ctype_meta 0x80 |
#define | cbit_space 0 |
#define | cbit_xdigit 32 |
#define | cbit_digit 64 |
#define | cbit_upper 96 |
#define | cbit_lower 128 |
#define | cbit_word 160 |
#define | cbit_graph 192 |
#define | cbit_print 224 |
#define | cbit_punct 256 |
#define | cbit_cntrl 288 |
#define | cbit_length 320 |
#define | lcc_offset 0 |
#define | fcc_offset 256 |
#define | cbits_offset 512 |
#define | ctypes_offset (cbits_offset + cbit_length) |
#define | tables_length (ctypes_offset + 256) |
#define | DPRINTF(p) |
#define | BRASTACK_SIZE 200 |
#define | REC_STACK_SAVE_MAX 30 |
#define | MAXLIT 250 |
#define | REQ_BYTE_MAX 1000 |
#define | match_condassert 0x01 |
#define | match_isgroup 0x02 |
#define | MATCH_MATCH 1 |
#define | MATCH_NOMATCH 0 |
#define | GETCHAR(c, eptr) c = *eptr; |
#define | GETCHARINC(c, eptr) c = *eptr++; |
#define | GETCHARINCTEST(c, eptr) c = *eptr++; |
#define | GETCHARLEN(c, eptr, len) c = *eptr; |
#define | BACKCHAR(eptr) |
#define | REGISTER register |
#define | RMATCH(rx, ra, rb, rc, rd, re, rf, rg) rx = match(ra,rb,rc,rd,re,rf,rg) |
#define | RRETURN(ra) return ra |
#define | fi i |
#define | fc c |
Typedefs | |
typedef unsigned char | uschar |
Enumerations | |
enum | { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w, ESC_dum1, ESC_C, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_REF } |
enum | { OP_END, OP_SOD, OP_SOM, OP_NOT_WORD_BOUNDARY, OP_WORD_BOUNDARY, OP_NOT_DIGIT, OP_DIGIT, OP_NOT_WHITESPACE, OP_WHITESPACE, OP_NOT_WORDCHAR, OP_WORDCHAR, OP_ANY, OP_ANYBYTE, OP_EODN, OP_EOD, OP_OPT, OP_CIRC, OP_DOLL, OP_CHARS, OP_NOT, OP_STAR, OP_MINSTAR, OP_PLUS, OP_MINPLUS, OP_QUERY, OP_MINQUERY, OP_UPTO, OP_MINUPTO, OP_EXACT, OP_NOTSTAR, OP_NOTMINSTAR, OP_NOTPLUS, OP_NOTMINPLUS, OP_NOTQUERY, OP_NOTMINQUERY, OP_NOTUPTO, OP_NOTMINUPTO, OP_NOTEXACT, OP_TYPESTAR, OP_TYPEMINSTAR, OP_TYPEPLUS, OP_TYPEMINPLUS, OP_TYPEQUERY, OP_TYPEMINQUERY, OP_TYPEUPTO, OP_TYPEMINUPTO, OP_TYPEEXACT, OP_CRSTAR, OP_CRMINSTAR, OP_CRPLUS, OP_CRMINPLUS, OP_CRQUERY, OP_CRMINQUERY, OP_CRRANGE, OP_CRMINRANGE, OP_CLASS, OP_NCLASS, OP_XCLASS, OP_REF, OP_RECURSE, OP_CALLOUT, OP_ALT, OP_KET, OP_KETRMAX, OP_KETRMIN, OP_ASSERT, OP_ASSERT_NOT, OP_ASSERTBACK, OP_ASSERTBACK_NOT, OP_REVERSE, OP_ONCE, OP_COND, OP_CREF, OP_BRAZERO, OP_BRAMINZERO, OP_BRANUMBER, OP_BRA } |
Functions | |
int | pcre_copy_substring (const char *subject, int *ovector, int stringcount, int stringnumber, char *buffer, int size) |
const unsigned char * | pcre_maketables (void) |
static void | set_bit (uschar *start_bits, int c, bool caseless, compile_data *cd) |
static bool | set_start_bits (const uschar *code, uschar *start_bits, bool caseless, bool utf8, compile_data *cd) |
pcre_extra * | pcre_study (const pcre *external_re, int options, const char **errorptr) |
static bool | compile_regex (int, int, int *, uschar **, const uschar **, const char **, bool, int, int *, int *, branch_chain *, compile_data *) |
static int | check_escape (const uschar **ptrptr, const char **errorptr, int bracount, int options, bool isclass) |
static bool | is_counted_repeat (const uschar *p) |
static const uschar * | read_repeat_counts (const uschar *p, int *minp, int *maxp, const char **errorptr) |
static const uschar * | first_significant_code (const uschar *code, int *options, int optbit) |
static int | find_fixedlength (uschar *code, int options) |
static const uschar * | find_bracket (const uschar *code, int number) |
static const uschar * | find_recurse (const uschar *code, bool utf8) |
static bool | could_be_empty_branch (const uschar *code, const uschar *endcode, bool utf8) |
static bool | could_be_empty (const uschar *code, const uschar *endcode, branch_chain *bcptr, bool utf8) |
static bool | check_posix_syntax (const uschar *ptr, const uschar **endptr, compile_data *cd) |
static int | check_posix_name (const uschar *ptr, int len) |
static void | adjust_recurse (uschar *group, int adjust, bool utf8, compile_data *cd) |
static bool | compile_branch (int *optionsptr, int *brackets, uschar **codeptr, const uschar **ptrptr, const char **errorptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd) |
static bool | is_anchored (register const uschar *code, int *options, unsigned int bracket_map, unsigned int backref_map) |
static bool | is_startline (const uschar *code, unsigned int bracket_map, unsigned int backref_map) |
static int | find_firstassertedchar (const uschar *code, int *options, bool inassert) |
pcre * | pcre_compile (const char *pattern, int options, const char **errorptr, int *erroroffset, const unsigned char *tables) |
static bool | match_ref (int offset, register const uschar *eptr, int length, match_data *md, unsigned long int ims) |
static int | match (REGISTER const uschar *eptr, REGISTER const uschar *ecode, int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, int flags) |
int | pcre_exec (const pcre *external_re, const pcre_extra *extra_data, const char *subject, int length, int start_offset, int options, int *offsets, int offsetcount) |
Variables | |
static unsigned char | pcre_default_tables [] |
static const uschar | OP_lengths [] = { OP_LENGTHS } |
static const char | rep_min [] = { 0, 0, 1, 1, 0, 0 } |
static const char | rep_max [] = { 0, 0, 0, 0, 1, 1 } |
static const short int | escapes [] |
static const char *const | posix_names [] |
static const uschar | posix_name_lengths [] |
static const int | posix_class_maps [] |
static const unsigned char | digitab [] |
int(*) | pcre_callout (pcre_callout_block *) = NULL |
#define BRASTACK_SIZE 200 |
#define cbit_cntrl 288 |
#define cbit_digit 64 |
Definition at line 578 of file pcre.cpp.
Referenced by compile_branch(), pcre_maketables(), and set_start_bits().
#define cbit_graph 192 |
#define cbit_length 320 |
#define cbit_lower 128 |
#define cbit_print 224 |
#define cbit_punct 256 |
#define cbit_space 0 |
Definition at line 576 of file pcre.cpp.
Referenced by compile_branch(), pcre_maketables(), and set_start_bits().
#define cbit_upper 96 |
#define cbit_word 160 |
Definition at line 581 of file pcre.cpp.
Referenced by compile_branch(), pcre_maketables(), and set_start_bits().
#define cbit_xdigit 32 |
#define cbits_offset 512 |
#define CREF_RECURSE 0xffff |
#define ctype_digit 0x04 |
Definition at line 568 of file pcre.cpp.
Referenced by check_escape(), compile_branch(), is_counted_repeat(), match(), pcre_compile(), pcre_maketables(), and read_repeat_counts().
#define ctype_letter 0x02 |
Definition at line 567 of file pcre.cpp.
Referenced by check_posix_syntax(), pcre_maketables(), and set_bit().
#define ctype_meta 0x80 |
Definition at line 571 of file pcre.cpp.
Referenced by compile_branch(), pcre_compile(), and pcre_maketables().
#define ctype_space 0x01 |
Definition at line 566 of file pcre.cpp.
Referenced by compile_branch(), match(), pcre_compile(), and pcre_maketables().
#define ctype_word 0x10 |
Definition at line 570 of file pcre.cpp.
Referenced by match(), pcre_compile(), and pcre_maketables().
#define ctype_xdigit 0x08 |
#define ctypes_offset (cbits_offset + cbit_length) |
Definition at line 594 of file pcre.cpp.
Referenced by pcre_compile(), pcre_exec(), and pcre_study().
#define DPRINTF | ( | p | ) |
Definition at line 1306 of file pcre.cpp.
Referenced by compile_regex(), match(), pcre_compile(), and pcre_exec().
#define ERR1 "\\ at end of pattern" |
#define ERR10 "operand of unlimited repeat could match the empty string" |
#define ERR11 "internal error: unexpected repeat" |
#define ERR12 "unrecognized character after (?" |
#define ERR13 "POSIX named classes are supported only within a class" |
#define ERR14 "missing )" |
#define ERR15 "reference to non-existent subpattern" |
#define ERR16 "erroffset passed as NULL" |
#define ERR17 "unknown option bit(s) set" |
#define ERR18 "missing ) after comment" |
#define ERR19 "parentheses nested too deeply" |
#define ERR2 "\\c at end of pattern" |
#define ERR20 "regular expression too large" |
#define ERR21 "failed to get memory" |
#define ERR22 "unmatched parentheses" |
#define ERR23 "internal error: code overflow" |
#define ERR24 "unrecognized character after (?<" |
#define ERR25 "lookbehind assertion is not fixed length" |
#define ERR26 "malformed number after (?(" |
#define ERR27 "conditional group contains more than two branches" |
#define ERR28 "assertion expected after (?(" |
#define ERR29 "(?R or (?digits must be followed by )" |
#define ERR3 "unrecognized character follows \\" |
#define ERR30 "unknown POSIX class name" |
#define ERR31 "POSIX collating elements are not supported" |
#define ERR32 "this version of PCRE is not compiled with PCRE_UTF8 support" |
#define ERR34 "character value in \\x{...} sequence is too large" |
#define ERR35 "invalid condition (?(0)" |
#define ERR36 "\\C not allowed in lookbehind assertion" |
#define ERR37 "PCRE does not support \\L, \\l, \\N, \\P, \\p, \\U, \\u, or \\X" |
#define ERR38 "number after (?C is > 255" |
#define ERR39 "closing ) for (?C expected" |
#define ERR4 "numbers out of order in {} quantifier" |
#define ERR40 "recursive call could loop indefinitely" |
#define ERR41 "unrecognized character after (?P" |
#define ERR42 "syntax error after (?P" |
#define ERR43 "two named groups have the same name" |
#define ERR5 "number too big in {} quantifier" |
#define ERR6 "missing terminating ] for character class" |
#define ERR7 "invalid escape sequence in character class" |
#define ERR8 "range out of order in character class" |
#define ERR9 "nothing to repeat" |
#define EXTRACT_BASIC_MAX 150 |
Definition at line 401 of file pcre.cpp.
Referenced by compile_branch(), find_bracket(), is_anchored(), is_startline(), match(), and pcre_compile().
#define fc c |
Referenced by do_mail_stats(), and match().
#define fcc_offset 256 |
Definition at line 592 of file pcre.cpp.
Referenced by pcre_compile(), pcre_exec(), and pcre_study().
#define fi i |
Referenced by match().
#define GET | ( | a, | |||
n | ) | (((a)[n] << 8) | (a)[(n)+1]) |
Definition at line 82 of file pcre.cpp.
Referenced by adjust_recurse(), compile_branch(), compile_regex(), could_be_empty_branch(), find_firstassertedchar(), find_fixedlength(), first_significant_code(), is_anchored(), is_startline(), match(), and set_start_bits().
#define GET2 | ( | a, | |||
n | ) | (((a)[n] << 8) | (a)[(n)+1]) |
Definition at line 101 of file pcre.cpp.
Referenced by compile_branch(), could_be_empty_branch(), find_bracket(), find_fixedlength(), is_anchored(), is_startline(), and match().
#define GETCHARINCTEST | ( | c, | |||
eptr | ) | c = *eptr++; |
#define lcc_offset 0 |
Definition at line 591 of file pcre.cpp.
Referenced by pcre_compile(), pcre_exec(), and pcre_study().
#define LINK_SIZE 2 |
Definition at line 57 of file pcre.cpp.
Referenced by adjust_recurse(), compile_branch(), compile_regex(), could_be_empty_branch(), find_bracket(), find_firstassertedchar(), find_fixedlength(), is_anchored(), is_startline(), match(), pcre_compile(), and set_start_bits().
#define MAGIC_NUMBER 0x50435245UL |
Definition at line 140 of file pcre.cpp.
Referenced by pcre_compile(), pcre_exec(), and pcre_study().
#define match_isgroup 0x02 |
#define MATCH_LIMIT 100000 |
#define MATCH_MATCH 1 |
#define MATCH_NOMATCH 0 |
#define MAX_PATTERN_SIZE (1 << 16) |
#define MAXLIT 250 |
#define NEWLINE '\n' |
Definition at line 59 of file pcre.cpp.
Referenced by compile_branch(), match(), pcre_compile(), and pcre_exec().
#define OP_NAME_LIST |
Value:
"End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \ "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", "\\Z", "\\z", \ "Opt", "^", "$", "chars", "not", \ "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ "*", "*?", "+", "+?", "?", "??", "{", "{", \ "class", "nclass", "xclass", "Ref", "Recurse", "Callout", \ "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \ "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\ "Brazero", "Braminzero", "Branumber", "Bra"
#define PCRE_FIRSTSET 0x40000000 |
Definition at line 116 of file pcre.cpp.
Referenced by pcre_compile(), pcre_exec(), and pcre_study().
#define PCRE_ICHANGED 0x08000000 |
#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL) |
Definition at line 108 of file pcre.cpp.
Referenced by compile_branch(), compile_regex(), match(), and pcre_compile().
#define PCRE_REQCHSET 0x20000000 |
#define PCRE_STARTLINE 0x10000000 |
Definition at line 118 of file pcre.cpp.
Referenced by pcre_compile(), pcre_exec(), and pcre_study().
#define PCRE_STUDY_MAPPED 0x01 |
#define PUBLIC_EXEC_OPTIONS (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK) |
#define PUBLIC_OPTIONS |
Value:
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK)
Definition at line 128 of file pcre.cpp.
Referenced by pcre_compile().
#define PUBLIC_STUDY_OPTIONS 0 |
#define PUT | ( | a, | |||
n, | |||||
d | ) |
Value:
(a[n] = (d) >> 8), \ (a[(n)+1] = (d) & 255)
Definition at line 78 of file pcre.cpp.
Referenced by adjust_recurse(), compile_branch(), and compile_regex().
#define PUT2 | ( | a, | |||
n, | |||||
d | ) |
Value:
/* Store the 16-bit value d at a[n] (high byte) and a[n+1] (low byte).
 * Written as one parenthesised comma expression, like PUT, so that the macro
 * is a single statement after `if`/`else` and composes correctly inside
 * PUT2INC. The previous form expanded to two `;`-separated statements, so an
 * unbraced `if (x) PUT2(...)` guarded only the first store. */
#define PUT2(a, n, d) \
  ((a)[n] = (d) >> 8, \
   (a)[(n)+1] = (d) & 255)
Definition at line 97 of file pcre.cpp.
Referenced by compile_branch().
#define PUT2INC | ( | a, | |||
n, | |||||
d | ) | PUT2(a,n,d), a += 2 |
#define PUTINC | ( | a, | |||
n, | |||||
d | ) | PUT(a,n,d), a += LINK_SIZE |
#define REQ_BYTE_MAX 1000 |
#define REQ_CASELESS 0x0100 |
Definition at line 150 of file pcre.cpp.
Referenced by compile_branch(), find_firstassertedchar(), pcre_compile(), and pcre_exec().
#define REQ_NONE (-1) |
#define REQ_UNSET (-2) |
#define REQ_VARY 0x0200 |
Definition at line 151 of file pcre.cpp.
Referenced by compile_branch(), compile_regex(), and pcre_compile().
#define RMATCH | ( | rx, | |||
ra, | |||||
rb, | |||||
rc, | |||||
rd, | |||||
re, | |||||
rf, | |||||
rg | ) | rx = match(ra,rb,rc,rd,re,rf,rg) |
#define tables_length (ctypes_offset + 256) |
anonymous enum |
ESC_A | |
ESC_G | |
ESC_B | |
ESC_b | |
ESC_D | |
ESC_d | |
ESC_S | |
ESC_s | |
ESC_W | |
ESC_w | |
ESC_dum1 | |
ESC_C | |
ESC_Z | |
ESC_z | |
ESC_E | |
ESC_Q | |
ESC_REF |
Definition at line 193 of file pcre.cpp.
/* Codes for backslash escapes that are not plain literal characters.
 * Numbering starts at 1. ESC_REF is the last entry; check_escape() encodes a
 * back reference to group n as -(ESC_REF + n).
 * NOTE(review): ESC_A..ESC_z appear to line up with the opcode values
 * OP_SOD..OP_EOD (1..14), with ESC_dum1 filling the OP_ANY slot - confirm
 * before reordering. */
enum {
    ESC_A = 1,
    ESC_G,
    ESC_B,
    ESC_b,
    ESC_D,
    ESC_d,
    ESC_S,
    ESC_s,
    ESC_W,
    ESC_w,
    ESC_dum1,
    ESC_C,
    ESC_Z,
    ESC_z,
    ESC_E,
    ESC_Q,
    ESC_REF
};
anonymous enum |
Definition at line 213 of file pcre.cpp.
/* Opcodes of the compiled pattern. The numeric value of every entry is
 * significant: several groups must stay adjacent and in the order shown, as
 * noted on the groups themselves. The number in each comment is the value of
 * the enumerator. */
enum {
    OP_END,                 /* 0 End of pattern */

    /* Values corresponding to backslashed metacharacters */

    OP_SOD,                 /* 1 Start of data: \A */
    OP_SOM,                 /* 2 Start of match (subject + offset): \G */
    OP_NOT_WORD_BOUNDARY,   /* 3 \B */
    OP_WORD_BOUNDARY,       /* 4 \b */
    OP_NOT_DIGIT,           /* 5 \D */
    OP_DIGIT,               /* 6 \d */
    OP_NOT_WHITESPACE,      /* 7 \S */
    OP_WHITESPACE,          /* 8 \s */
    OP_NOT_WORDCHAR,        /* 9 \W */
    OP_WORDCHAR,            /* 10 \w */
    OP_ANY,                 /* 11 Match any character */
    OP_ANYBYTE,             /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
    OP_EODN,                /* 13 End of data or \n at end of data: \Z. */
    OP_EOD,                 /* 14 End of data: \z */

    OP_OPT,                 /* 15 Set runtime options */
    OP_CIRC,                /* 16 Start of line - varies with multiline switch */
    OP_DOLL,                /* 17 End of line - varies with multiline switch */
    OP_CHARS,               /* 18 Match string of characters */
    OP_NOT,                 /* 19 Match anything but the following char */

    /* Repeats of a single character. In each of the repeat groups below the
       maximizing and minimizing opcodes come in pairs, minimizing one second. */

    OP_STAR,                /* 20 */
    OP_MINSTAR,             /* 21 */
    OP_PLUS,                /* 22 */
    OP_MINPLUS,             /* 23 */
    OP_QUERY,               /* 24 */
    OP_MINQUERY,            /* 25 */
    OP_UPTO,                /* 26 From 0 to n matches */
    OP_MINUPTO,             /* 27 */
    OP_EXACT,               /* 28 Exactly n matches */

    /* Repeats of a "not" single character; same pairing rule. */

    OP_NOTSTAR,             /* 29 */
    OP_NOTMINSTAR,          /* 30 */
    OP_NOTPLUS,             /* 31 */
    OP_NOTMINPLUS,          /* 32 */
    OP_NOTQUERY,            /* 33 */
    OP_NOTMINQUERY,         /* 34 */
    OP_NOTUPTO,             /* 35 From 0 to n matches */
    OP_NOTMINUPTO,          /* 36 */
    OP_NOTEXACT,            /* 37 Exactly n matches */

    /* Repeats of character types such as \d. These codes must be in exactly
       the same order as the two sets above. */

    OP_TYPESTAR,            /* 38 */
    OP_TYPEMINSTAR,         /* 39 */
    OP_TYPEPLUS,            /* 40 */
    OP_TYPEMINPLUS,         /* 41 */
    OP_TYPEQUERY,           /* 42 */
    OP_TYPEMINQUERY,        /* 43 */
    OP_TYPEUPTO,            /* 44 From 0 to n matches */
    OP_TYPEMINUPTO,         /* 45 */
    OP_TYPEEXACT,           /* 46 Exactly n matches */

    /* Repeats of character classes and back references. The first six must be
       in the same order as the sets above; the two range opcodes are
       different to the three sets above. */

    OP_CRSTAR,              /* 47 */
    OP_CRMINSTAR,           /* 48 */
    OP_CRPLUS,              /* 49 */
    OP_CRMINPLUS,           /* 50 */
    OP_CRQUERY,             /* 51 */
    OP_CRMINQUERY,          /* 52 */
    OP_CRRANGE,             /* 53 */
    OP_CRMINRANGE,          /* 54 */

    OP_CLASS,               /* 55 Match a character class, chars < 256 only */
    OP_NCLASS,              /* 56 Same, but the bitmap was created from a negative
                               class - the difference is relevant only when a
                               UTF-8 character > 255 is encountered. */
    OP_XCLASS,              /* 57 Extended class for handling UTF-8 chars within
                               the class. This does both positive and negative. */

    OP_REF,                 /* 58 Match a back reference */
    OP_RECURSE,             /* 59 Match a numbered subpattern (possibly recursive) */
    OP_CALLOUT,             /* 60 Call out to external function if provided */

    OP_ALT,                 /* 61 Start of alternation */
    OP_KET,                 /* 62 End of group that doesn't have an unbounded repeat */
    OP_KETRMAX,             /* 63 These two must remain together and in this */
    OP_KETRMIN,             /* 64 order. They are for groups that repeat for ever. */

    /* The assertions must come before ONCE and COND */

    OP_ASSERT,              /* 65 Positive lookahead */
    OP_ASSERT_NOT,          /* 66 Negative lookahead */
    OP_ASSERTBACK,          /* 67 Positive lookbehind */
    OP_ASSERTBACK_NOT,      /* 68 Negative lookbehind */
    OP_REVERSE,             /* 69 Move pointer back - used in lookbehind assertions */

    /* ONCE and COND must come after the assertions, with ONCE first, as
       there's a test for >= ONCE for a subpattern that isn't an assertion. */

    OP_ONCE,                /* 70 Once matched, don't back up into the subpattern */
    OP_COND,                /* 71 Conditional group */
    OP_CREF,                /* 72 Used to hold an extraction string number (cond ref) */

    OP_BRAZERO,             /* 73 These two must remain together and in this */
    OP_BRAMINZERO,          /* 74 order. */

    OP_BRANUMBER,           /* 75 Used for extracting brackets whose number is
                               greater than can fit into an opcode. */

    OP_BRA                  /* 76 This and greater values are used for brackets
                               that extract substrings up to a basic limit.
                               After that, use is made of OP_BRANUMBER. */
};
static void adjust_recurse | ( | uschar * | group, | |
int | adjust, | |||
bool | utf8, | |||
compile_data * | cd | |||
) | [static] |
Definition at line 2299 of file pcre.cpp.
References find_recurse(), GET, LINK_SIZE, PUT, and compile_data::start_code.
Referenced by compile_branch().
02299 { 02300 uschar *ptr = group; 02301 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL) 02302 { 02303 int offset = GET(ptr, 1); 02304 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust); 02305 ptr += 1 + LINK_SIZE; 02306 } 02307 } 02308
static int check_escape | ( | const uschar ** | ptrptr, | |
const char ** | errorptr, | |||
int | bracount, | |||
int | options, | |||
bool | isclass | |||
) | [static] |
Definition at line 1525 of file pcre.cpp.
References ctype_digit, ctype_xdigit, digitab, ERR1, ERR2, ERR3, ERR37, ESC_REF, escapes, and PCRE_EXTRA.
Referenced by compile_branch(), and pcre_compile().
01526 { 01527 const uschar *ptr = *ptrptr; 01528 int c, i; 01529 01530 /* If backslash is at the end of the pattern, it's an error. */ 01531 01532 c = *(++ptr); 01533 if (c == 0) *errorptr = ERR1; 01534 01535 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in 01536 a table. A non-zero result is something that can be returned immediately. 01537 Otherwise further processing may be required. */ 01538 01539 else if (c < '0' || c > 'z') {} /* Not alphameric */ 01540 else if ((i = escapes[c - '0']) != 0) c = i; 01541 01542 /* Escapes that need further processing, or are illegal. */ 01543 01544 else 01545 { 01546 const uschar *oldptr; 01547 switch (c) 01548 { 01549 /* A number of Perl escapes are not handled by PCRE. We give an explicit 01550 error. */ 01551 01552 case 'l': 01553 case 'L': 01554 case 'N': 01555 case 'p': 01556 case 'P': 01557 case 'u': 01558 case 'U': 01559 case 'X': 01560 *errorptr = ERR37; 01561 break; 01562 01563 /* The handling of escape sequences consisting of a string of digits 01564 starting with one that is not zero is not straightforward. By experiment, 01565 the way Perl works seems to be as follows: 01566 01567 Outside a character class, the digits are read as a decimal number. If the 01568 number is less than 10, or if there are that many previous extracting 01569 left brackets, then it is a back reference. Otherwise, up to three octal 01570 digits are read to form an escaped byte. Thus \123 is likely to be octal 01571 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal 01572 value is greater than 377, the least significant 8 bits are taken. Inside a 01573 character class, \ followed by a digit is always an octal number. 
*/ 01574 01575 case '1': case '2': case '3': case '4': case '5': 01576 case '6': case '7': case '8': case '9': 01577 01578 if (!isclass) 01579 { 01580 oldptr = ptr; 01581 c -= '0'; 01582 while ((digitab[ptr[1]] & ctype_digit) != 0) 01583 c = c * 10 + *(++ptr) - '0'; 01584 if (c < 10 || c <= bracount) 01585 { 01586 c = -(ESC_REF + c); 01587 break; 01588 } 01589 ptr = oldptr; /* Put the pointer back and fall through */ 01590 } 01591 01592 /* Handle an octal number following \. If the first digit is 8 or 9, Perl 01593 generates a binary zero byte and treats the digit as a following literal. 01594 Thus we have to pull back the pointer by one. */ 01595 01596 if ((c = *ptr) >= '8') 01597 { 01598 ptr--; 01599 c = 0; 01600 break; 01601 } 01602 01603 /* \0 always starts an octal number, but we may drop through to here with a 01604 larger first octal digit. */ 01605 01606 case '0': 01607 c -= '0'; 01608 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') 01609 c = c * 8 + *(++ptr) - '0'; 01610 c &= 255; /* Take least significant 8 bits */ 01611 break; 01612 01613 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number 01614 which can be greater than 0xff, but only if the ddd are hex digits. */ 01615 01616 case 'x': 01617 01618 /* Read just a single hex char */ 01619 01620 c = 0; 01621 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) 01622 { 01623 int cc; /* Some compilers don't like ++ */ 01624 cc = *(++ptr); /* in initializers */ 01625 if (cc >= 'a') cc -= 32; /* Convert to upper case */ 01626 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10)); 01627 } 01628 break; 01629 01630 /* Other special escapes not starting with a digit are straightforward */ 01631 01632 case 'c': 01633 c = *(++ptr); 01634 if (c == 0) 01635 { 01636 *errorptr = ERR2; 01637 return 0; 01638 } 01639 01640 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding 01641 is ASCII-specific, but then the whole concept of \cx is ASCII-specific. 
*/ 01642 01643 if (c >= 'a' && c <= 'z') c -= 32; 01644 c ^= 0x40; 01645 break; 01646 01647 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any 01648 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise, 01649 for Perl compatibility, it is a literal. This code looks a bit odd, but 01650 there used to be some cases other than the default, and there may be again 01651 in future, so I haven't "optimized" it. */ 01652 01653 default: 01654 if ((options & PCRE_EXTRA) != 0) 01655 { 01656 *errorptr = ERR3; 01657 } 01658 break; 01659 } 01660 } 01661 01662 *ptrptr = ptr; 01663 return c; 01664 } 01665
static int check_posix_name | ( | const uschar * | ptr, | |
int | len | |||
) | [static] |
Definition at line 2262 of file pcre.cpp.
References posix_name_lengths, and posix_names.
Referenced by compile_branch().
02262 { 02263 register int yield = 0; 02264 while (posix_name_lengths[yield] != 0) 02265 { 02266 if (len == posix_name_lengths[yield] && 02267 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield; 02268 yield++; 02269 } 02270 return -1; 02271 } 02272
static bool check_posix_syntax | ( | const uschar * | ptr, | |
const uschar ** | endptr, | |||
compile_data * | cd | |||
) | [static] |
Definition at line 2230 of file pcre.cpp.
References ctype_letter, and compile_data::ctypes.
Referenced by compile_branch(), and pcre_compile().
02230 { 02231 int terminator; /* Don't combine these lines; the Solaris cc */ 02232 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ 02233 if (*(++ptr) == '^') ptr++; 02234 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++; 02235 if (*ptr == terminator && ptr[1] == ']') 02236 { 02237 *endptr = ptr; 02238 return true; 02239 } 02240 return false; 02241 } 02242
static bool compile_branch | ( | int * | optionsptr, | |
int * | brackets, | |||
uschar ** | codeptr, | |||
const uschar ** | ptrptr, | |||
const char ** | errorptr, | |||
int * | firstbyteptr, | |||
int * | reqbyteptr, | |||
branch_chain * | bcptr, | |||
compile_data * | cd | |||
) | [static] |
Definition at line 2336 of file pcre.cpp.
References adjust_recurse(), compile_data::backref_map, cbit_digit, cbit_space, cbit_word, compile_data::cbits, check_escape(), check_posix_name(), check_posix_syntax(), compile_regex(), could_be_empty(), CREF_RECURSE, ctype_digit, ctype_meta, ctype_space, compile_data::ctypes, digitab, ERR11, ERR13, ERR14, ERR15, ERR27, ERR30, ERR31, ERR35, ERR38, ERR40, ERR43, ERR7, ERR8, ERR9, ESC_b, ESC_D, ESC_d, ESC_Q, ESC_REF, ESC_S, ESC_s, ESC_W, ESC_w, ESC_Z, EXTRACT_BASIC_MAX, compile_data::fcc, find_bracket(), GET, GET2, is_counted_repeat(), LINK_SIZE, MAXLIT, compile_data::name_entry_size, compile_data::name_table, compile_data::names_found, NEWLINE, OP_ANY, OP_ASSERT, OP_ASSERT_NOT, OP_ASSERTBACK, OP_ASSERTBACK_NOT, OP_BRA, OP_BRANUMBER, OP_BRAZERO, OP_CALLOUT, OP_CHARS, OP_CIRC, OP_CLASS, OP_COND, OP_CREF, OP_CRPLUS, OP_CRQUERY, OP_CRRANGE, OP_CRSTAR, OP_DOLL, OP_END, OP_EODN, OP_EXACT, OP_KET, OP_KETRMAX, OP_NCLASS, OP_NOT, OP_NOTSTAR, OP_ONCE, OP_OPT, OP_PLUS, OP_QUERY, OP_RECURSE, OP_REF, OP_STAR, OP_TYPESTAR, OP_UPTO, PCRE_CASELESS, PCRE_DOTALL, PCRE_EXTENDED, PCRE_EXTRA, PCRE_IMS, PCRE_MULTILINE, PCRE_NO_AUTO_CAPTURE, PCRE_UNGREEDY, posix_class_maps, PUT, PUT2, PUT2INC, PUTINC, read_repeat_counts(), REQ_CASELESS, REQ_NONE, REQ_UNSET, REQ_VARY, compile_data::req_varyopt, compile_data::start_code, and compile_data::top_backref.
Referenced by compile_regex().
02338 { 02339 int repeat_type, op_type; 02340 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ 02341 int bravalue = 0; 02342 int length; 02343 int greedy_default, greedy_non_default; 02344 int firstbyte, reqbyte; 02345 int zeroreqbyte, zerofirstbyte; 02346 int req_caseopt, reqvary, tempreqvary; 02347 int condcount = 0; 02348 int options = *optionsptr; 02349 register int c; 02350 register uschar *code = *codeptr; 02351 uschar *tempcode; 02352 bool inescq = false; 02353 bool groupsetfirstbyte = false; 02354 const uschar *ptr = *ptrptr; 02355 const uschar *tempptr; 02356 uschar *previous = NULL; 02357 uschar classa[32]; 02358 02359 bool utf8 = false; 02360 02361 /* Set up the default and non-default settings for greediness */ 02362 02363 greedy_default = ((options & PCRE_UNGREEDY) != 0); 02364 greedy_non_default = greedy_default ^ 1; 02365 02366 /* Initialize no first char, no required char. REQ_UNSET means "no char 02367 matching encountered yet". It gets changed to REQ_NONE if we hit something that 02368 matches a non-fixed char first char; reqbyte just remains unset if we never 02369 find one. 02370 02371 When we hit a repeat whose minimum is zero, we may have to adjust these values 02372 to take the zero repeat into account. This is implemented by setting them to 02373 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual 02374 item types that can be repeated set these backoff variables appropriately. */ 02375 02376 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET; 02377 02378 /* The variable req_caseopt contains either the REQ_CASELESS value or zero, 02379 according to the current setting of the caseless flag. REQ_CASELESS is a bit 02380 value > 255. It is added into the firstbyte or reqbyte variables to record the 02381 case status of the value. */ 02382 02383 req_caseopt = ((options & PCRE_CASELESS) != 0)? 
REQ_CASELESS : 0; 02384 02385 /* Switch on next character until the end of the branch */ 02386 02387 for (;; ptr++) 02388 { 02389 bool negate_class; 02390 bool possessive_quantifier; 02391 int class_charcount; 02392 int class_lastchar; 02393 int newoptions; 02394 int recno; 02395 int skipbytes; 02396 int subreqbyte; 02397 int subfirstbyte; 02398 02399 c = *ptr; 02400 if (inescq && c != 0) goto NORMAL_CHAR; 02401 02402 if ((options & PCRE_EXTENDED) != 0) 02403 { 02404 if ((cd->ctypes[c] & ctype_space) != 0) continue; 02405 if (c == '#') 02406 { 02407 /* The space before the ; is to avoid a warning on a silly compiler 02408 on the Macintosh. */ 02409 while ((c = *(++ptr)) != 0 && c != NEWLINE) ; 02410 if (c != 0) continue; /* Else fall through to handle end of string */ 02411 } 02412 } 02413 02414 switch(c) 02415 { 02416 /* The branch terminates at end of string, |, or ). */ 02417 02418 case 0: 02419 case '|': 02420 case ')': 02421 *firstbyteptr = firstbyte; 02422 *reqbyteptr = reqbyte; 02423 *codeptr = code; 02424 *ptrptr = ptr; 02425 return true; 02426 02427 /* Handle single-character metacharacters. In multiline mode, ^ disables 02428 the setting of any following char as a first character. */ 02429 02430 case '^': 02431 if ((options & PCRE_MULTILINE) != 0) 02432 { 02433 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; 02434 } 02435 previous = NULL; 02436 *code++ = OP_CIRC; 02437 break; 02438 02439 case '$': 02440 previous = NULL; 02441 *code++ = OP_DOLL; 02442 break; 02443 02444 /* There can never be a first char if '.' is first, whatever happens about 02445 repeats. The value of reqbyte doesn't change either. */ 02446 02447 case '.': 02448 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; 02449 zerofirstbyte = firstbyte; 02450 zeroreqbyte = reqbyte; 02451 previous = code; 02452 *code++ = OP_ANY; 02453 break; 02454 02455 /* Character classes. 
If the included characters are all < 255 in value, we 02456 build a 32-byte bitmap of the permitted characters, except in the special 02457 case where there is only one such character. For negated classes, we build 02458 the map as usual, then invert it at the end. However, we use a different 02459 opcode so that data characters > 255 can be handled correctly. 02460 02461 If the class contains characters outside the 0-255 range, a different 02462 opcode is compiled. It may optionally have a bit map for characters < 256, 02463 but those above are are explicitly listed afterwards. A flag byte tells 02464 whether the bitmap is present, and whether this is a negated class or not. 02465 */ 02466 02467 case '[': 02468 previous = code; 02469 02470 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if 02471 they are encountered at the top level, so we'll do that too. */ 02472 02473 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && 02474 check_posix_syntax(ptr, &tempptr, cd)) 02475 { 02476 *errorptr = (ptr[1] == ':')? ERR13 : ERR31; 02477 goto FAILED; 02478 } 02479 02480 /* If the first character is '^', set the negation flag and skip it. */ 02481 02482 if ((c = *(++ptr)) == '^') 02483 { 02484 negate_class = true; 02485 c = *(++ptr); 02486 } 02487 else 02488 { 02489 negate_class = false; 02490 } 02491 02492 /* Keep a count of chars with values < 256 so that we can optimize the case 02493 of just a single character (as long as it's < 256). For higher valued UTF-8 02494 characters, we don't yet do any optimization. */ 02495 02496 class_charcount = 0; 02497 class_lastchar = -1; 02498 02499 02500 /* Initialize the 32-char bit map to all zeros. We have to build the 02501 map in a temporary bit of store, in case the class contains only 1 02502 character (< 256), because in that case the compiled code doesn't use the 02503 bit map. */ 02504 02505 memset(classa, 0, 32 * sizeof(uschar)); 02506 02507 /* Process characters until ] is reached. 
By writing this as a "do" it 02508 means that an initial ] is taken as a data character. The first pass 02509 through the regex checked the overall syntax, so we don't need to be very 02510 strict here. At the start of the loop, c contains the first byte of the 02511 character. */ 02512 02513 do 02514 { 02515 02516 /* Inside \Q...\E everything is literal except \E */ 02517 02518 if (inescq) 02519 { 02520 if (c == '\\' && ptr[1] == 'E') 02521 { 02522 inescq = false; 02523 ptr++; 02524 continue; 02525 } 02526 else goto LONE_SINGLE_CHARACTER; 02527 } 02528 02529 /* Handle POSIX class names. Perl allows a negation extension of the 02530 form [:^name:]. A square bracket that doesn't match the syntax is 02531 treated as a literal. We also recognize the POSIX constructions 02532 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl 02533 5.6 and 5.8 do. */ 02534 02535 if (c == '[' && 02536 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && 02537 check_posix_syntax(ptr, &tempptr, cd)) 02538 { 02539 bool local_negate = false; 02540 int posix_class, i; 02541 register const uschar *cbits = cd->cbits; 02542 02543 if (ptr[1] != ':') 02544 { 02545 *errorptr = ERR31; 02546 goto FAILED; 02547 } 02548 02549 ptr += 2; 02550 if (*ptr == '^') 02551 { 02552 local_negate = true; 02553 ptr++; 02554 } 02555 02556 posix_class = check_posix_name(ptr, tempptr - ptr); 02557 if (posix_class < 0) 02558 { 02559 *errorptr = ERR30; 02560 goto FAILED; 02561 } 02562 02563 /* If matching is caseless, upper and lower are converted to 02564 alpha. This relies on the fact that the class table starts with 02565 alpha, lower, upper as the first 3 entries. */ 02566 02567 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) 02568 posix_class = 0; 02569 02570 /* Or into the map we are building up to 3 of the static class 02571 tables, or their negations. The [:blank:] class sets up the same 02572 chars as the [:space:] class (all white space). 
We remove the vertical 02573 white space chars afterwards. */ 02574 02575 posix_class *= 3; 02576 for (i = 0; i < 3; i++) 02577 { 02578 bool blankclass = strncmp((char *)ptr, "blank", 5) == 0; 02579 int taboffset = posix_class_maps[posix_class + i]; 02580 if (taboffset < 0) break; 02581 if (local_negate) 02582 { 02583 for (c = 0; c < 32; c++) classa[c] |= ~cbits[c+taboffset]; 02584 if (blankclass) classa[1] |= 0x3c; 02585 } 02586 else 02587 { 02588 for (c = 0; c < 32; c++) classa[c] |= cbits[c+taboffset]; 02589 if (blankclass) classa[1] &= ~0x3c; 02590 } 02591 } 02592 02593 ptr = tempptr + 1; 02594 class_charcount = 10; /* Set > 1; assumes more than 1 per class */ 02595 continue; /* End of POSIX syntax handling */ 02596 } 02597 02598 /* Backslash may introduce a single character, or it may introduce one 02599 of the specials, which just set a flag. Escaped items are checked for 02600 validity in the pre-compiling pass. The sequence \b is a special case. 02601 Inside a class (and only there) it is treated as backspace. Elsewhere 02602 it marks a word boundary. Other escapes have preset maps ready to 02603 or into the one we are building. We assume they have more than one 02604 character in them, so set class_charcount bigger than one. 
*/ 02605 02606 if (c == '\\') 02607 { 02608 c = check_escape(&ptr, errorptr, *brackets, options, true); 02609 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */ 02610 02611 if (-c == ESC_Q) /* Handle start of quoted string */ 02612 { 02613 if (ptr[1] == '\\' && ptr[2] == 'E') 02614 { 02615 ptr += 2; /* avoid empty string */ 02616 } 02617 else inescq = true; 02618 continue; 02619 } 02620 02621 else if (c < 0) 02622 { 02623 register const uschar *cbits = cd->cbits; 02624 class_charcount = 10; /* Greater than 1 is what matters */ 02625 switch (-c) 02626 { 02627 case ESC_d: 02628 for (c = 0; c < 32; c++) classa[c] |= cbits[c+cbit_digit]; 02629 continue; 02630 02631 case ESC_D: 02632 for (c = 0; c < 32; c++) classa[c] |= ~cbits[c+cbit_digit]; 02633 continue; 02634 02635 case ESC_w: 02636 for (c = 0; c < 32; c++) classa[c] |= cbits[c+cbit_word]; 02637 continue; 02638 02639 case ESC_W: 02640 for (c = 0; c < 32; c++) classa[c] |= ~cbits[c+cbit_word]; 02641 continue; 02642 02643 case ESC_s: 02644 for (c = 0; c < 32; c++) classa[c] |= cbits[c+cbit_space]; 02645 classa[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */ 02646 continue; 02647 02648 case ESC_S: 02649 for (c = 0; c < 32; c++) classa[c] |= ~cbits[c+cbit_space]; 02650 classa[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ 02651 continue; 02652 02653 /* Unrecognized escapes are faulted if PCRE is running in its 02654 strict mode. By default, for compatibility with Perl, they are 02655 treated as literals. */ 02656 02657 default: 02658 if ((options & PCRE_EXTRA) != 0) 02659 { 02660 *errorptr = ERR7; 02661 goto FAILED; 02662 } 02663 c = *ptr; /* The final character */ 02664 } 02665 } 02666 02667 /* Fall through if we have a single character (c >= 0). This may be 02668 > 256 in UTF-8 mode. */ 02669 02670 } /* End of backslash handling */ 02671 02672 /* A single character may be followed by '-' to form a range. However, 02673 Perl does not permit ']' to be the end of the range. 
A '-' character 02674 here is treated as a literal. */ 02675 02676 if (ptr[1] == '-' && ptr[2] != ']') 02677 { 02678 int d; 02679 ptr += 2; 02680 02681 d = *ptr; 02682 02683 /* The second part of a range can be a single-character escape, but 02684 not any of the other escapes. Perl 5.6 treats a hyphen as a literal 02685 in such circumstances. */ 02686 02687 if (d == '\\') 02688 { 02689 const uschar *oldptr = ptr; 02690 d = check_escape(&ptr, errorptr, *brackets, options, true); 02691 02692 /* \b is backslash; any other special means the '-' was literal */ 02693 02694 if (d < 0) 02695 { 02696 if (d == -ESC_b) d = '\b'; else 02697 { 02698 ptr = oldptr - 2; 02699 goto LONE_SINGLE_CHARACTER; /* A few lines below */ 02700 } 02701 } 02702 } 02703 02704 /* Check that the two values are in the correct order */ 02705 02706 if (d < c) 02707 { 02708 *errorptr = ERR8; 02709 goto FAILED; 02710 } 02711 02712 /* If d is greater than 255, we can't just use the bit map, so set up 02713 for the UTF-8 supporting class type. If we are not caseless, we can 02714 just set up a single range. If we are caseless, the characters < 256 02715 are handled with a bitmap, in order to get the case-insensitive 02716 handling. */ 02717 02718 /* We use the bit map if the range is entirely < 255, or if part of it 02719 is < 255 and matching is caseless. */ 02720 02721 for (; c <= d; c++) 02722 { 02723 classa[c/8] |= (1 << (c&7)); 02724 if ((options & PCRE_CASELESS) != 0) 02725 { 02726 int uc = cd->fcc[c]; /* flip case */ 02727 classa[uc/8] |= (1 << (uc&7)); 02728 } 02729 class_charcount++; /* in case a one-char range */ 02730 class_lastchar = c; 02731 } 02732 02733 continue; /* Go get the next char in the class */ 02734 } 02735 02736 /* Handle a lone single character - we can get here for a normal 02737 non-escape char, or after \ that introduces a single character. 
*/ 02738 02739 LONE_SINGLE_CHARACTER: 02740 02741 /* Handle a single-byte character */ 02742 { 02743 classa[c/8] |= (1 << (c&7)); 02744 if ((options & PCRE_CASELESS) != 0) 02745 { 02746 c = cd->fcc[c]; /* flip case */ 02747 classa[c/8] |= (1 << (c&7)); 02748 } 02749 class_charcount++; 02750 class_lastchar = c; 02751 } 02752 } 02753 02754 /* Loop until ']' reached; the check for end of string happens inside the 02755 loop. This "while" is the end of the "do" above. */ 02756 02757 while ((c = *(++ptr)) != ']' || inescq); 02758 02759 /* If class_charcount is 1, we saw precisely one character with a value < 02760 256. In UTF-8 mode, we can optimize if there were no characters >= 256 and 02761 the one character is < 128. In non-UTF-8 mode we can always optimize. 02762 02763 The optimization throws away the bit map. We turn the item into a 02764 1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note 02765 that OP_NOT does not support multibyte characters. In the positive case, it 02766 can cause firstbyte to be set. Otherwise, there can be no first char if 02767 this item is first, whatever repeat count may follow. In the case of 02768 reqbyte, save the previous value for reinstating. 
*/ 02769 02770 if (class_charcount == 1) 02771 { 02772 zeroreqbyte = reqbyte; 02773 if (negate_class) 02774 { 02775 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; 02776 zerofirstbyte = firstbyte; 02777 *code++ = OP_NOT; 02778 } 02779 else 02780 { 02781 if (firstbyte == REQ_UNSET) 02782 { 02783 zerofirstbyte = REQ_NONE; 02784 firstbyte = class_lastchar | req_caseopt; 02785 } 02786 else 02787 { 02788 zerofirstbyte = firstbyte; 02789 reqbyte = class_lastchar | req_caseopt | cd->req_varyopt; 02790 } 02791 *code++ = OP_CHARS; 02792 *code++ = 1; 02793 } 02794 *code++ = class_lastchar; 02795 break; /* End of class handling */ 02796 } /* End of 1-byte optimization */ 02797 02798 /* Otherwise, if this is the first thing in the branch, there can be no 02799 first char setting, whatever the repeat count. Any reqbyte setting must 02800 remain unchanged after any kind of repeat. */ 02801 02802 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; 02803 zerofirstbyte = firstbyte; 02804 zeroreqbyte = reqbyte; 02805 02806 /* If there are characters with values > 255, we have to compile an 02807 extended class, with its own opcode. If there are no characters < 256, 02808 we can omit the bitmap. */ 02809 02810 02811 /* If there are no characters > 255, negate the 32-byte map if necessary, 02812 and copy it into the code vector. If this is the first thing in the branch, 02813 there can be no first char setting, whatever the repeat count. Any reqbyte 02814 setting must remain unchanged after any kind of repeat. 
*/ 02815 02816 if (negate_class) 02817 { 02818 *code++ = OP_NCLASS; 02819 for (c = 0; c < 32; c++) code[c] = ~classa[c]; 02820 } 02821 else 02822 { 02823 *code++ = OP_CLASS; 02824 memcpy(code, classa, 32); 02825 } 02826 code += 32; 02827 break; 02828 02829 /* Various kinds of repeat */ 02830 02831 case '{': 02832 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR; 02833 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr); 02834 if (*errorptr != NULL) goto FAILED; 02835 goto REPEAT; 02836 02837 case '*': 02838 repeat_min = 0; 02839 repeat_max = -1; 02840 goto REPEAT; 02841 02842 case '+': 02843 repeat_min = 1; 02844 repeat_max = -1; 02845 goto REPEAT; 02846 02847 case '?': 02848 repeat_min = 0; 02849 repeat_max = 1; 02850 02851 REPEAT: 02852 if (previous == NULL) 02853 { 02854 *errorptr = ERR9; 02855 goto FAILED; 02856 } 02857 02858 if (repeat_min == 0) 02859 { 02860 firstbyte = zerofirstbyte; /* Adjust for zero repeat */ 02861 reqbyte = zeroreqbyte; /* Ditto */ 02862 } 02863 02864 /* Remember whether this is a variable length repeat */ 02865 02866 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY; 02867 02868 op_type = 0; /* Default single-char op codes */ 02869 possessive_quantifier = false; /* Default not possessive quantifier */ 02870 02871 /* Save start of previous item, in case we have to move it up to make space 02872 for an inserted OP_ONCE for the additional '+' extension. */ 02873 02874 tempcode = previous; 02875 02876 /* If the next character is '+', we have a possessive quantifier. This 02877 implies greediness, whatever the setting of the PCRE_UNGREEDY option. 02878 If the next character is '?' this is a minimizing repeat, by default, 02879 but if PCRE_UNGREEDY is set, it works the other way round. We change the 02880 repeat type to the non-default. 
*/ 02881 02882 if (ptr[1] == '+') 02883 { 02884 repeat_type = 0; /* Force greedy */ 02885 possessive_quantifier = true; 02886 ptr++; 02887 } 02888 else if (ptr[1] == '?') 02889 { 02890 repeat_type = greedy_non_default; 02891 ptr++; 02892 } 02893 else repeat_type = greedy_default; 02894 02895 /* If previous was a recursion, we need to wrap it inside brackets so that 02896 it can be replicated if necessary. */ 02897 02898 if (*previous == OP_RECURSE) 02899 { 02900 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE); 02901 code += 1 + LINK_SIZE; 02902 *previous = OP_BRA; 02903 PUT(previous, 1, code - previous); 02904 *code = OP_KET; 02905 PUT(code, 1, code - previous); 02906 code += 1 + LINK_SIZE; 02907 } 02908 02909 /* If previous was a string of characters, chop off the last one and use it 02910 as the subject of the repeat. If there was only one character, we can 02911 abolish the previous item altogether. If a one-char item has a minumum of 02912 more than one, ensure that it is set in reqbyte - it might not be if a 02913 sequence such as x{3} is the first thing in a branch because the x will 02914 have gone into firstbyte instead. */ 02915 02916 if (*previous == OP_CHARS) 02917 { 02918 /* Deal with UTF-8 characters that take up more than one byte. It's 02919 easier to write this out separately than try to macrify it. Use c to 02920 hold the length of the character in bytes, plus 0x80 to flag that it's a 02921 length rather than a small character. */ 02922 02923 02924 /* Handle the case of a single byte - either with no UTF8 support, or 02925 with UTF-8 disabled, or for a UTF-8 character < 128. 
*/ 02926 02927 { 02928 c = *(--code); 02929 if (code == previous + 2) /* There was only one character */ 02930 { 02931 code = previous; /* Abolish the previous item */ 02932 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt; 02933 } 02934 else 02935 { 02936 previous[1]--; /* adjust length */ 02937 tempcode = code; /* Adjust position to be moved for '+' */ 02938 } 02939 } 02940 02941 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ 02942 } 02943 02944 /* If previous was a single negated character ([^a] or similar), we use 02945 one of the special opcodes, replacing it. The code is shared with single- 02946 character repeats by setting opt_type to add a suitable offset into 02947 repeat_type. OP_NOT is currently used only for single-byte chars. */ 02948 02949 else if (*previous == OP_NOT) 02950 { 02951 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */ 02952 c = previous[1]; 02953 code = previous; 02954 goto OUTPUT_SINGLE_REPEAT; 02955 } 02956 02957 /* If previous was a character type match (\d or similar), abolish it and 02958 create a suitable repeat item. The code is shared with single-character 02959 repeats by setting op_type to add a suitable offset into repeat_type. */ 02960 02961 else if (*previous < OP_EODN) 02962 { 02963 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ 02964 c = *previous; 02965 code = previous; 02966 02967 OUTPUT_SINGLE_REPEAT: 02968 02969 /* If the maximum is zero then the minimum must also be zero; Perl allows 02970 this case, so we do too - by simply omitting the item altogether. */ 02971 02972 if (repeat_max == 0) goto END_REPEAT; 02973 02974 /* Combine the op_type with the repeat_type */ 02975 02976 repeat_type += op_type; 02977 02978 /* A minimum of zero is handled either as the special case * or ?, or as 02979 an UPTO, with the maximum given. 
*/ 02980 02981 if (repeat_min == 0) 02982 { 02983 if (repeat_max == -1) *code++ = OP_STAR + repeat_type; 02984 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type; 02985 else 02986 { 02987 *code++ = OP_UPTO + repeat_type; 02988 PUT2INC(code, 0, repeat_max); 02989 } 02990 } 02991 02992 /* The case {1,} is handled as the special case + */ 02993 02994 else if (repeat_min == 1 && repeat_max == -1) 02995 *code++ = OP_PLUS + repeat_type; 02996 02997 /* The case {n,n} is just an EXACT, while the general case {n,m} is 02998 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */ 02999 03000 else 03001 { 03002 if (repeat_min != 1) 03003 { 03004 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ 03005 PUT2INC(code, 0, repeat_min); 03006 } 03007 03008 /* If the mininum is 1 and the previous item was a character string, 03009 we either have to put back the item that got cancelled if the string 03010 length was 1, or add the character back onto the end of a longer 03011 string. For a character type nothing need be done; it will just get 03012 put back naturally. Note that the final character is always going to 03013 get added below, so we leave code ready for its insertion. */ 03014 03015 else if (*previous == OP_CHARS) 03016 { 03017 if (code == previous) code += 2; else 03018 03019 /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80 03020 bit set as a flag. The length will always be between 2 and 6. */ 03021 03022 previous[1]++; 03023 } 03024 03025 /* For a single negated character we also have to put back the 03026 item that got cancelled. At present this applies only to single byte 03027 characters in any mode. */ 03028 03029 else if (*previous == OP_NOT) code++; 03030 03031 /* If the maximum is unlimited, insert an OP_STAR. Before doing so, 03032 we have to insert the character for the previous code. In UTF-8 mode, 03033 long characters have their length in c, with the 0x80 bit as a flag. 
*/ 03034 03035 if (repeat_max < 0) 03036 { 03037 *code++ = c; 03038 *code++ = OP_STAR + repeat_type; 03039 } 03040 03041 /* Else insert an UPTO if the max is greater than the min, again 03042 preceded by the character, for the previously inserted code. */ 03043 03044 else if (repeat_max != repeat_min) 03045 { 03046 *code++ = c; 03047 repeat_max -= repeat_min; 03048 *code++ = OP_UPTO + repeat_type; 03049 PUT2INC(code, 0, repeat_max); 03050 } 03051 } 03052 03053 /* The character or character type itself comes last in all cases. */ 03054 03055 03056 *code++ = c; 03057 } 03058 03059 /* If previous was a character class or a back reference, we put the repeat 03060 stuff after it, but just skip the item if the repeat was {0,0}. */ 03061 03062 else if (*previous == OP_CLASS || 03063 *previous == OP_NCLASS || 03064 *previous == OP_REF) 03065 { 03066 if (repeat_max == 0) 03067 { 03068 code = previous; 03069 goto END_REPEAT; 03070 } 03071 if (repeat_min == 0 && repeat_max == -1) 03072 *code++ = OP_CRSTAR + repeat_type; 03073 else if (repeat_min == 1 && repeat_max == -1) 03074 *code++ = OP_CRPLUS + repeat_type; 03075 else if (repeat_min == 0 && repeat_max == 1) 03076 *code++ = OP_CRQUERY + repeat_type; 03077 else 03078 { 03079 *code++ = OP_CRRANGE + repeat_type; 03080 PUT2INC(code, 0, repeat_min); 03081 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */ 03082 PUT2INC(code, 0, repeat_max); 03083 } 03084 } 03085 03086 /* If previous was a bracket group, we may have to replicate it in certain 03087 cases. */ 03088 03089 else if (*previous >= OP_BRA || *previous == OP_ONCE || 03090 *previous == OP_COND) 03091 { 03092 register int i; 03093 int ketoffset = 0; 03094 int len = code - previous; 03095 uschar *bralink = NULL; 03096 03097 /* If the maximum repeat count is unlimited, find the end of the bracket 03098 by scanning through from the start, and compute the offset back to it 03099 from the current code pointer. 
There may be an OP_OPT setting following 03100 the final KET, so we can't find the end just by going back from the code 03101 pointer. */ 03102 03103 if (repeat_max == -1) 03104 { 03105 register uschar *ket = previous; 03106 do ket += GET(ket, 1); while (*ket != OP_KET); 03107 ketoffset = code - ket; 03108 } 03109 03110 /* The case of a zero minimum is special because of the need to stick 03111 OP_BRAZERO in front of it, and because the group appears once in the 03112 data, whereas in other cases it appears the minimum number of times. For 03113 this reason, it is simplest to treat this case separately, as otherwise 03114 the code gets far too messy. There are several special subcases when the 03115 minimum is zero. */ 03116 03117 if (repeat_min == 0) 03118 { 03119 /* If the maximum is also zero, we just omit the group from the output 03120 altogether. */ 03121 03122 if (repeat_max == 0) 03123 { 03124 code = previous; 03125 goto END_REPEAT; 03126 } 03127 03128 /* If the maximum is 1 or unlimited, we just have to stick in the 03129 BRAZERO and do no more at this point. However, we do need to adjust 03130 any OP_RECURSE calls inside the group that refer to the group itself or 03131 any internal group, because the offset is from the start of the whole 03132 regex. Temporarily terminate the pattern while doing this. */ 03133 03134 if (repeat_max <= 1) 03135 { 03136 *code = OP_END; 03137 adjust_recurse(previous, 1, utf8, cd); 03138 memmove(previous+1, previous, len); 03139 code++; 03140 *previous++ = OP_BRAZERO + repeat_type; 03141 } 03142 03143 /* If the maximum is greater than 1 and limited, we have to replicate 03144 in a nested fashion, sticking OP_BRAZERO before each set of brackets. 03145 The first one has to be handled carefully because it's the original 03146 copy, which has to be moved up. The remainder can be handled by code 03147 that is common with the non-zero minimum case below. 
We have to 03148 adjust the value or repeat_max, since one less copy is required. Once 03149 again, we may have to adjust any OP_RECURSE calls inside the group. */ 03150 03151 else 03152 { 03153 int offset; 03154 *code = OP_END; 03155 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd); 03156 memmove(previous + 2 + LINK_SIZE, previous, len); 03157 code += 2 + LINK_SIZE; 03158 *previous++ = OP_BRAZERO + repeat_type; 03159 *previous++ = OP_BRA; 03160 03161 /* We chain together the bracket offset fields that have to be 03162 filled in later when the ends of the brackets are reached. */ 03163 03164 offset = (bralink == NULL)? 0 : previous - bralink; 03165 bralink = previous; 03166 PUTINC(previous, 0, offset); 03167 } 03168 03169 repeat_max--; 03170 } 03171 03172 /* If the minimum is greater than zero, replicate the group as many 03173 times as necessary, and adjust the maximum to the number of subsequent 03174 copies that we need. If we set a first char from the group, and didn't 03175 set a required char, copy the latter from the former. */ 03176 03177 else 03178 { 03179 if (repeat_min > 1) 03180 { 03181 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte; 03182 for (i = 1; i < repeat_min; i++) 03183 { 03184 memcpy(code, previous, len); 03185 code += len; 03186 } 03187 } 03188 if (repeat_max > 0) repeat_max -= repeat_min; 03189 } 03190 03191 /* This code is common to both the zero and non-zero minimum cases. If 03192 the maximum is limited, it replicates the group in a nested fashion, 03193 remembering the bracket starts on a stack. In the case of a zero minimum, 03194 the first one was set up above. In all cases the repeat_max now specifies 03195 the number of additional copies needed. */ 03196 03197 if (repeat_max >= 0) 03198 { 03199 for (i = repeat_max - 1; i >= 0; i--) 03200 { 03201 *code++ = OP_BRAZERO + repeat_type; 03202 03203 /* All but the final copy start a new nesting, maintaining the 03204 chain of brackets outstanding. 
*/ 03205 03206 if (i != 0) 03207 { 03208 int offset; 03209 *code++ = OP_BRA; 03210 offset = (bralink == NULL)? 0 : code - bralink; 03211 bralink = code; 03212 PUTINC(code, 0, offset); 03213 } 03214 03215 memcpy(code, previous, len); 03216 code += len; 03217 } 03218 03219 /* Now chain through the pending brackets, and fill in their length 03220 fields (which are holding the chain links pro tem). */ 03221 03222 while (bralink != NULL) 03223 { 03224 int oldlinkoffset; 03225 int offset = code - bralink + 1; 03226 uschar *bra = code - offset; 03227 oldlinkoffset = GET(bra, 1); 03228 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; 03229 *code++ = OP_KET; 03230 PUTINC(code, 0, offset); 03231 PUT(bra, 1, offset); 03232 } 03233 } 03234 03235 /* If the maximum is unlimited, set a repeater in the final copy. We 03236 can't just offset backwards from the current code point, because we 03237 don't know if there's been an options resetting after the ket. The 03238 correct offset was computed above. */ 03239 03240 else code[-ketoffset] = OP_KETRMAX + repeat_type; 03241 } 03242 03243 /* Else there's some kind of shambles */ 03244 03245 else 03246 { 03247 *errorptr = ERR11; 03248 goto FAILED; 03249 } 03250 03251 /* If the character following a repeat is '+', we wrap the entire repeated 03252 item inside OP_ONCE brackets. This is just syntactic sugar, taken from 03253 Sun's Java package. The repeated item starts at tempcode, not at previous, 03254 which might be the first part of a string whose (former) last char we 03255 repeated. However, we don't support '+' after a greediness '?'. */ 03256 03257 if (possessive_quantifier) 03258 { 03259 int len = code - tempcode; 03260 memmove(tempcode + 1+LINK_SIZE, tempcode, len); 03261 code += 1 + LINK_SIZE; 03262 len += 1 + LINK_SIZE; 03263 tempcode[0] = OP_ONCE; 03264 *code++ = OP_KET; 03265 PUTINC(code, 0, len); 03266 PUT(tempcode, 1, len); 03267 } 03268 03269 /* In all case we no longer have a previous item. 
We also set the 03270 "follows varying string" flag for subsequently encountered reqbytes if 03271 it isn't already set and we have just passed a varying length item. */ 03272 03273 END_REPEAT: 03274 previous = NULL; 03275 cd->req_varyopt |= reqvary; 03276 break; 03277 03278 03279 /* Start of nested bracket sub-expression, or comment or lookahead or 03280 lookbehind or option setting or condition. First deal with special things 03281 that can come after a bracket; all are introduced by ?, and the appearance 03282 of any of them means that this is not a referencing group. They were 03283 checked for validity in the first pass over the string, so we don't have to 03284 check for syntax errors here. */ 03285 03286 case '(': 03287 newoptions = options; 03288 skipbytes = 0; 03289 03290 if (*(++ptr) == '?') 03291 { 03292 int set, unset; 03293 int *optset; 03294 03295 switch (*(++ptr)) 03296 { 03297 case '#': /* Comment; skip to ket */ 03298 ptr++; 03299 while (*ptr != ')') ptr++; 03300 continue; 03301 03302 case ':': /* Non-extracting bracket */ 03303 bravalue = OP_BRA; 03304 ptr++; 03305 break; 03306 03307 case '(': 03308 bravalue = OP_COND; /* Conditional group */ 03309 03310 /* Condition to test for recursion */ 03311 03312 if (ptr[1] == 'R') 03313 { 03314 code[1+LINK_SIZE] = OP_CREF; 03315 PUT2(code, 2+LINK_SIZE, CREF_RECURSE); 03316 skipbytes = 3; 03317 ptr += 3; 03318 } 03319 03320 /* Condition to test for a numbered subpattern match. We know that 03321 if a digit follows ( then there will just be digits until ) because 03322 the syntax was checked in the first pass. 
*/ 03323 03324 else if ((digitab[ptr[1]] && ctype_digit) != 0) 03325 { 03326 int condref; /* Don't amalgamate; some compilers */ 03327 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */ 03328 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0'; 03329 if (condref == 0) 03330 { 03331 *errorptr = ERR35; 03332 goto FAILED; 03333 } 03334 ptr++; 03335 code[1+LINK_SIZE] = OP_CREF; 03336 PUT2(code, 2+LINK_SIZE, condref); 03337 skipbytes = 3; 03338 } 03339 /* For conditions that are assertions, we just fall through, having 03340 set bravalue above. */ 03341 break; 03342 03343 case '=': /* Positive lookahead */ 03344 bravalue = OP_ASSERT; 03345 ptr++; 03346 break; 03347 03348 case '!': /* Negative lookahead */ 03349 bravalue = OP_ASSERT_NOT; 03350 ptr++; 03351 break; 03352 03353 case '<': /* Lookbehinds */ 03354 switch (*(++ptr)) 03355 { 03356 case '=': /* Positive lookbehind */ 03357 bravalue = OP_ASSERTBACK; 03358 ptr++; 03359 break; 03360 03361 case '!': /* Negative lookbehind */ 03362 bravalue = OP_ASSERTBACK_NOT; 03363 ptr++; 03364 break; 03365 } 03366 break; 03367 03368 case '>': /* One-time brackets */ 03369 bravalue = OP_ONCE; 03370 ptr++; 03371 break; 03372 03373 case 'C': /* Callout - may be followed by digits */ 03374 *code++ = OP_CALLOUT; 03375 { 03376 int n = 0; 03377 while ((digitab[*(++ptr)] & ctype_digit) != 0) 03378 n = n * 10 + *ptr - '0'; 03379 if (n > 255) 03380 { 03381 *errorptr = ERR38; 03382 goto FAILED; 03383 } 03384 *code++ = n; 03385 } 03386 previous = NULL; 03387 continue; 03388 03389 case 'P': /* Named subpattern handling */ 03390 if (*(++ptr) == '<') /* Definition */ 03391 { 03392 int i, namelen; 03393 uschar *slot = cd->name_table; 03394 const uschar *name; /* Don't amalgamate; some compilers */ 03395 name = ++ptr; /* grumble at autoincrement in declaration */ 03396 03397 while (*ptr++ != '>'); 03398 namelen = ptr - name - 1; 03399 03400 for (i = 0; i < cd->names_found; i++) 03401 { 03402 int crc = memcmp(name, slot+2, 
namelen); 03403 if (crc == 0) 03404 { 03405 if (slot[2+namelen] == 0) 03406 { 03407 *errorptr = ERR43; 03408 goto FAILED; 03409 } 03410 crc = -1; /* Current name is substring */ 03411 } 03412 if (crc < 0) 03413 { 03414 memmove(slot + cd->name_entry_size, slot, 03415 (cd->names_found - i) * cd->name_entry_size); 03416 break; 03417 } 03418 slot += cd->name_entry_size; 03419 } 03420 03421 PUT2(slot, 0, *brackets + 1); 03422 memcpy(slot + 2, name, namelen); 03423 slot[2+namelen] = 0; 03424 cd->names_found++; 03425 goto NUMBERED_GROUP; 03426 } 03427 03428 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */ 03429 { 03430 int i, namelen; 03431 int type = *ptr++; 03432 const uschar *name = ptr; 03433 uschar *slot = cd->name_table; 03434 03435 while (*ptr != ')') ptr++; 03436 namelen = ptr - name; 03437 03438 for (i = 0; i < cd->names_found; i++) 03439 { 03440 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; 03441 slot += cd->name_entry_size; 03442 } 03443 if (i >= cd->names_found) 03444 { 03445 *errorptr = ERR15; 03446 goto FAILED; 03447 } 03448 03449 recno = GET2(slot, 0); 03450 03451 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */ 03452 03453 /* Back reference */ 03454 03455 previous = code; 03456 *code++ = OP_REF; 03457 PUT2INC(code, 0, recno); 03458 cd->backref_map |= (recno < 32)? 
(1 << recno) : 1; 03459 if (recno > cd->top_backref) cd->top_backref = recno; 03460 continue; 03461 } 03462 03463 /* Should never happen */ 03464 break; 03465 03466 case 'R': /* Pattern recursion */ 03467 ptr++; /* Same as (?0) */ 03468 /* Fall through */ 03469 03470 /* Recursion or "subroutine" call */ 03471 03472 case '0': case '1': case '2': case '3': case '4': 03473 case '5': case '6': case '7': case '8': case '9': 03474 { 03475 const uschar *called; 03476 recno = 0; 03477 while((digitab[*ptr] & ctype_digit) != 0) 03478 recno = recno * 10 + *ptr++ - '0'; 03479 03480 /* Come here from code above that handles a named recursion */ 03481 03482 HANDLE_RECURSION: 03483 03484 previous = code; 03485 03486 /* Find the bracket that is being referenced. Temporarily end the 03487 regex in case it doesn't exist. */ 03488 03489 *code = OP_END; 03490 called = (recno == 0)? 03491 cd->start_code : find_bracket(cd->start_code, recno); 03492 03493 if (called == NULL) 03494 { 03495 *errorptr = ERR15; 03496 goto FAILED; 03497 } 03498 03499 /* If the subpattern is still open, this is a recursive call. We 03500 check to see if this is a left recursion that could loop for ever, 03501 and diagnose that case. */ 03502 03503 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8)) 03504 { 03505 *errorptr = ERR40; 03506 goto FAILED; 03507 } 03508 03509 /* Insert the recursion/subroutine item */ 03510 03511 *code = OP_RECURSE; 03512 PUT(code, 1, called - cd->start_code); 03513 code += 1 + LINK_SIZE; 03514 } 03515 continue; 03516 03517 /* Character after (? 
not specially recognized */ 03518 03519 default: /* Option setting */ 03520 set = unset = 0; 03521 optset = &set; 03522 03523 while (*ptr != ')' && *ptr != ':') 03524 { 03525 switch (*ptr++) 03526 { 03527 case '-': optset = &unset; break; 03528 03529 case 'i': *optset |= PCRE_CASELESS; break; 03530 case 'm': *optset |= PCRE_MULTILINE; break; 03531 case 's': *optset |= PCRE_DOTALL; break; 03532 case 'x': *optset |= PCRE_EXTENDED; break; 03533 case 'U': *optset |= PCRE_UNGREEDY; break; 03534 case 'X': *optset |= PCRE_EXTRA; break; 03535 } 03536 } 03537 03538 /* Set up the changed option bits, but don't change anything yet. */ 03539 03540 newoptions = (options | set) & (~unset); 03541 03542 /* If the options ended with ')' this is not the start of a nested 03543 group with option changes, so the options change at this level. Compile 03544 code to change the ims options if this setting actually changes any of 03545 them. We also pass the new setting back so that it can be put at the 03546 start of any following branches, and when this group ends (if we are in 03547 a group), a resetting item can be compiled. 03548 03549 Note that if this item is right at the start of the pattern, the 03550 options will have been abstracted and made global, so there will be no 03551 change to compile. */ 03552 03553 if (*ptr == ')') 03554 { 03555 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS)) 03556 { 03557 *code++ = OP_OPT; 03558 *code++ = newoptions & PCRE_IMS; 03559 } 03560 03561 /* Change options at this level, and pass them back for use 03562 in subsequent branches. Reset the greedy defaults and the case 03563 value for firstbyte and reqbyte. */ 03564 03565 *optionsptr = options = newoptions; 03566 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0); 03567 greedy_non_default = greedy_default ^ 1; 03568 req_caseopt = ((options & PCRE_CASELESS) != 0)? 
REQ_CASELESS : 0; 03569 03570 previous = NULL; /* This item can't be repeated */ 03571 continue; /* It is complete */ 03572 } 03573 03574 /* If the options ended with ':' we are heading into a nested group 03575 with possible change of options. Such groups are non-capturing and are 03576 not assertions of any kind. All we need to do is skip over the ':'; 03577 the newoptions value is handled below. */ 03578 03579 bravalue = OP_BRA; 03580 ptr++; 03581 } 03582 } 03583 03584 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become 03585 non-capturing and behave like (?:...) brackets */ 03586 03587 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0) 03588 { 03589 bravalue = OP_BRA; 03590 } 03591 03592 /* Else we have a referencing group; adjust the opcode. If the bracket 03593 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and 03594 arrange for the true number to follow later, in an OP_BRANUMBER item. */ 03595 03596 else 03597 { 03598 NUMBERED_GROUP: 03599 if (++(*brackets) > EXTRACT_BASIC_MAX) 03600 { 03601 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1; 03602 code[1+LINK_SIZE] = OP_BRANUMBER; 03603 PUT2(code, 2+LINK_SIZE, *brackets); 03604 skipbytes = 3; 03605 } 03606 else bravalue = OP_BRA + *brackets; 03607 } 03608 03609 /* Process nested bracketed re. Assertions may not be repeated, but other 03610 kinds can be. We copy code into a non-register variable in order to be able 03611 to pass its address because some compilers complain otherwise. Pass in a 03612 new setting for the ims options if they have changed. */ 03613 03614 previous = (bravalue >= OP_ONCE)? 
code : NULL; 03615 *code = bravalue; 03616 tempcode = code; 03617 tempreqvary = cd->req_varyopt; /* Save value before bracket */ 03618 03619 if (!compile_regex( 03620 newoptions, /* The complete new option state */ 03621 options & PCRE_IMS, /* The previous ims option state */ 03622 brackets, /* Extracting bracket count */ 03623 &tempcode, /* Where to put code (updated) */ 03624 &ptr, /* Input pointer (updated) */ 03625 errorptr, /* Where to put an error message */ 03626 (bravalue == OP_ASSERTBACK || 03627 bravalue == OP_ASSERTBACK_NOT), /* true if back assert */ 03628 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */ 03629 &subfirstbyte, /* For possible first char */ 03630 &subreqbyte, /* For possible last char */ 03631 bcptr, /* Current branch chain */ 03632 cd)) /* Tables block */ 03633 goto FAILED; 03634 03635 /* At the end of compiling, code is still pointing to the start of the 03636 group, while tempcode has been updated to point past the end of the group 03637 and any option resetting that may follow it. The pattern pointer (ptr) 03638 is on the bracket. */ 03639 03640 /* If this is a conditional bracket, check that there are no more than 03641 two branches in the group. */ 03642 03643 else if (bravalue == OP_COND) 03644 { 03645 uschar *tc = code; 03646 condcount = 0; 03647 03648 do { 03649 condcount++; 03650 tc += GET(tc,1); 03651 } 03652 while (*tc != OP_KET); 03653 03654 if (condcount > 2) 03655 { 03656 *errorptr = ERR27; 03657 goto FAILED; 03658 } 03659 03660 /* If there is just one branch, we must not make use of its firstbyte or 03661 reqbyte, because this is equivalent to an empty second branch. */ 03662 03663 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE; 03664 } 03665 03666 /* Handle updating of the required and first characters. Update for normal 03667 brackets of all kinds, and conditions with two branches (see code above). 03668 If the bracket is followed by a quantifier with zero repeat, we have to 03669 back off. 
Hence the definition of zeroreqbyte and zerofirstbyte outside the 03670 main loop so that they can be accessed for the back off. */ 03671 03672 zeroreqbyte = reqbyte; 03673 zerofirstbyte = firstbyte; 03674 groupsetfirstbyte = false; 03675 03676 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND) 03677 { 03678 /* If we have not yet set a firstbyte in this branch, take it from the 03679 subpattern, remembering that it was set here so that a repeat of more 03680 than one can replicate it as reqbyte if necessary. If the subpattern has 03681 no firstbyte, set "none" for the whole branch. In both cases, a zero 03682 repeat forces firstbyte to "none". */ 03683 03684 if (firstbyte == REQ_UNSET) 03685 { 03686 if (subfirstbyte >= 0) 03687 { 03688 firstbyte = subfirstbyte; 03689 groupsetfirstbyte = true; 03690 } 03691 else firstbyte = REQ_NONE; 03692 zerofirstbyte = REQ_NONE; 03693 } 03694 03695 /* If firstbyte was previously set, convert the subpattern's firstbyte 03696 into reqbyte if there wasn't one, using the vary flag that was in 03697 existence beforehand. */ 03698 03699 else if (subfirstbyte >= 0 && subreqbyte < 0) 03700 subreqbyte = subfirstbyte | tempreqvary; 03701 03702 /* If the subpattern set a required byte (or set a first byte that isn't 03703 really the first byte - see above), set it. */ 03704 03705 if (subreqbyte >= 0) reqbyte = subreqbyte; 03706 } 03707 03708 /* For a forward assertion, we take the reqbyte, if set. This can be 03709 helpful if the pattern that follows the assertion doesn't set a different 03710 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte 03711 for an assertion, however because it leads to incorrect effect for patterns 03712 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead 03713 of a firstbyte. This is overcome by a scan at the end if there's no 03714 firstbyte, looking for an asserted first char. 
*/ 03715 03716 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte; 03717 03718 /* Now update the main code pointer to the end of the group. */ 03719 03720 code = tempcode; 03721 03722 /* Error if hit end of pattern */ 03723 03724 if (*ptr != ')') 03725 { 03726 *errorptr = ERR14; 03727 goto FAILED; 03728 } 03729 break; 03730 03731 /* Check \ for being a real metacharacter; if not, fall through and handle 03732 it as a data character at the start of a string. Escape items are checked 03733 for validity in the pre-compiling pass. */ 03734 03735 case '\\': 03736 tempptr = ptr; 03737 c = check_escape(&ptr, errorptr, *brackets, options, false); 03738 03739 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values 03740 are arranged to be the negation of the corresponding OP_values. For the 03741 back references, the values are ESC_REF plus the reference number. Only 03742 back references and those types that consume a character may be repeated. 03743 We can test for values between ESC_b and ESC_Z for the latter; this may 03744 have to change if any new ones are ever created. */ 03745 03746 if (c < 0) 03747 { 03748 if (-c == ESC_Q) /* Handle start of quoted string */ 03749 { 03750 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */ 03751 else inescq = true; 03752 continue; 03753 } 03754 03755 /* For metasequences that actually match a character, we disable the 03756 setting of a first character if it hasn't already been set. */ 03757 03758 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z) 03759 firstbyte = REQ_NONE; 03760 03761 /* Set values to reset to if this is followed by a zero repeat. 
*/ 03762 03763 zerofirstbyte = firstbyte; 03764 zeroreqbyte = reqbyte; 03765 03766 /* Back references are handled specially */ 03767 03768 if (-c >= ESC_REF) 03769 { 03770 int number = -c - ESC_REF; 03771 previous = code; 03772 *code++ = OP_REF; 03773 PUT2INC(code, 0, number); 03774 } 03775 else 03776 { 03777 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; 03778 *code++ = -c; 03779 } 03780 continue; 03781 } 03782 03783 /* Data character: reset and fall through */ 03784 03785 ptr = tempptr; 03786 c = '\\'; 03787 03788 /* Handle a run of data characters until a metacharacter is encountered. 03789 The first character is guaranteed not to be whitespace or # when the 03790 extended flag is set. */ 03791 03792 NORMAL_CHAR: 03793 default: 03794 previous = code; 03795 *code = OP_CHARS; 03796 code += 2; 03797 length = 0; 03798 03799 do 03800 { 03801 /* If in \Q...\E, check for the end; if not, we always have a literal */ 03802 03803 if (inescq) 03804 { 03805 if (c == '\\' && ptr[1] == 'E') 03806 { 03807 inescq = false; 03808 ptr++; 03809 } 03810 else 03811 { 03812 *code++ = c; 03813 length++; 03814 } 03815 continue; 03816 } 03817 03818 /* Skip white space and comments for /x patterns */ 03819 03820 if ((options & PCRE_EXTENDED) != 0) 03821 { 03822 if ((cd->ctypes[c] & ctype_space) != 0) continue; 03823 if (c == '#') 03824 { 03825 /* The space before the ; is to avoid a warning on a silly compiler 03826 on the Macintosh. */ 03827 while ((c = *(++ptr)) != 0 && c != NEWLINE) ; 03828 if (c == 0) break; 03829 continue; 03830 } 03831 } 03832 03833 /* Backslash may introduce a data char or a metacharacter. Escaped items 03834 are checked for validity in the pre-compiling pass. Stop the string 03835 before a metaitem. 
*/ 03836 03837 if (c == '\\') 03838 { 03839 tempptr = ptr; 03840 c = check_escape(&ptr, errorptr, *brackets, options, false); 03841 if (c < 0) { ptr = tempptr; break; } 03842 03843 /* If a character is > 127 in UTF-8 mode, we have to turn it into 03844 two or more bytes in the UTF-8 encoding. */ 03845 03846 } 03847 03848 /* Ordinary character or single-char escape */ 03849 03850 *code++ = c; 03851 length++; 03852 } 03853 03854 /* This "while" is the end of the "do" above. */ 03855 03856 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0); 03857 03858 /* Update the first and last requirements. These are always bytes, even in 03859 UTF-8 mode. However, there is a special case to be considered when there 03860 are only one or two characters. Because this gets messy in UTF-8 mode, the 03861 code is kept separate. When we get here "length" contains the number of 03862 bytes. */ 03863 03864 03865 /* This is the code for non-UTF-8 operation, either without UTF-8 support, 03866 or when UTF-8 is not enabled. */ 03867 03868 { 03869 /* firstbyte was not previously set; take it from this string */ 03870 03871 if (firstbyte == REQ_UNSET) 03872 { 03873 if (length == 1) 03874 { 03875 zerofirstbyte = REQ_NONE; 03876 firstbyte = previous[2] | req_caseopt; 03877 zeroreqbyte = reqbyte; 03878 } 03879 else 03880 { 03881 zerofirstbyte = firstbyte = previous[2] | req_caseopt; 03882 zeroreqbyte = (length > 2)? 03883 (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte; 03884 reqbyte = code[-1] | req_caseopt | cd->req_varyopt; 03885 } 03886 } 03887 03888 /* firstbyte was previously set */ 03889 03890 else 03891 { 03892 zerofirstbyte = firstbyte; 03893 zeroreqbyte = (length == 1)? reqbyte : 03894 code[-2] | req_caseopt | cd->req_varyopt; 03895 reqbyte = code[-1] | req_caseopt | cd->req_varyopt; 03896 } 03897 } 03898 03899 /* Set the length in the data vector, and advance to the next state. 
*/ 03900 03901 previous[1] = length; 03902 if (length < MAXLIT) ptr--; 03903 break; 03904 } 03905 } /* end of big loop */ 03906 03907 /* Control never reaches here by falling through, only by a goto for all the 03908 error states. Pass back the position in the pattern so that it can be displayed 03909 to the user for diagnosing the error. */ 03910 03911 FAILED: 03912 *ptrptr = ptr; 03913 return false; 03914 } 03915
static bool compile_regex | ( | int | , | |
int | , | |||
int * | , | |||
uschar ** | , | |||
const uschar ** | , | |||
const char ** | , | |||
bool | , | |||
int | , | |||
int * | , | |||
int * | , | |||
branch_chain * | , | |||
compile_data * | ||||
) | [static] |
Definition at line 3950 of file pcre.cpp.
References CMuxAlarm::bAlarmed, compile_branch(), branch_chain::current, DPRINTF, ERR25, ERR36, find_fixedlength(), GET, LINK_SIZE, MuxAlarm, OP_ALT, OP_END, OP_KET, OP_OPT, OP_REVERSE, branch_chain::outer, PCRE_IMS, PUT, PUTINC, REQ_NONE, REQ_UNSET, and REQ_VARY.
Referenced by compile_branch(), and pcre_compile().
03952 { 03953 const uschar *ptr = *ptrptr; 03954 uschar *code = *codeptr; 03955 uschar *last_branch = code; 03956 uschar *start_bracket = code; 03957 uschar *reverse_count = NULL; 03958 int firstbyte, reqbyte; 03959 int branchfirstbyte, branchreqbyte; 03960 branch_chain bc; 03961 03962 bc.outer = bcptr; 03963 bc.current = code; 03964 03965 firstbyte = reqbyte = REQ_UNSET; 03966 03967 /* Offset is set zero to mark that this bracket is still open */ 03968 03969 PUT(code, 1, 0); 03970 code += 1 + LINK_SIZE + skipbytes; 03971 03972 /* Loop for each alternative branch */ 03973 03974 for (;!MuxAlarm.bAlarmed;) 03975 { 03976 /* Handle a change of ims options at the start of the branch */ 03977 03978 if ((options & PCRE_IMS) != oldims) 03979 { 03980 *code++ = OP_OPT; 03981 *code++ = options & PCRE_IMS; 03982 } 03983 03984 /* Set up dummy OP_REVERSE if lookbehind assertion */ 03985 03986 if (lookbehind) 03987 { 03988 *code++ = OP_REVERSE; 03989 reverse_count = code; 03990 PUTINC(code, 0, 0); 03991 } 03992 03993 /* Now compile the branch */ 03994 03995 if (!compile_branch(&options, brackets, &code, &ptr, errorptr, 03996 &branchfirstbyte, &branchreqbyte, &bc, cd)) 03997 { 03998 *ptrptr = ptr; 03999 return false; 04000 } 04001 04002 /* If this is the first branch, the firstbyte and reqbyte values for the 04003 branch become the values for the regex. */ 04004 04005 if (*last_branch != OP_ALT) 04006 { 04007 firstbyte = branchfirstbyte; 04008 reqbyte = branchreqbyte; 04009 } 04010 04011 /* If this is not the first branch, the first char and reqbyte have to 04012 match the values from all the previous branches, except that if the previous 04013 value for reqbyte didn't have REQ_VARY set, it can still match, and we set 04014 REQ_VARY for the regex. 
*/ 04015 04016 else 04017 { 04018 /* If we previously had a firstbyte, but it doesn't match the new branch, 04019 we have to abandon the firstbyte for the regex, but if there was previously 04020 no reqbyte, it takes on the value of the old firstbyte. */ 04021 04022 if (firstbyte >= 0 && firstbyte != branchfirstbyte) 04023 { 04024 if (reqbyte < 0) reqbyte = firstbyte; 04025 firstbyte = REQ_NONE; 04026 } 04027 04028 /* If we (now or from before) have no firstbyte, a firstbyte from the 04029 branch becomes a reqbyte if there isn't a branch reqbyte. */ 04030 04031 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0) 04032 branchreqbyte = branchfirstbyte; 04033 04034 /* Now ensure that the reqbytes match */ 04035 04036 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY)) 04037 reqbyte = REQ_NONE; 04038 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */ 04039 } 04040 04041 /* If lookbehind, check that this branch matches a fixed-length string, 04042 and put the length into the OP_REVERSE item. Temporarily mark the end of 04043 the branch with OP_END. */ 04044 04045 if (lookbehind) 04046 { 04047 int length; 04048 *code = OP_END; 04049 length = find_fixedlength(last_branch, options); 04050 DPRINTF(("fixed length = %d\n", length)); 04051 if (length < 0) 04052 { 04053 *errorptr = (length == -2)? ERR36 : ERR25; 04054 *ptrptr = ptr; 04055 return false; 04056 } 04057 PUT(reverse_count, 0, length); 04058 } 04059 04060 /* Reached end of expression, either ')' or end of pattern. Go back through 04061 the alternative branches and reverse the chain of offsets, with the field in 04062 the BRA item now becoming an offset to the first alternative. If there are 04063 no alternatives, it points to the end of the group. The length in the 04064 terminating ket is always the length of the whole bracketed item. If any of 04065 the ims options were changed inside the group, compile a resetting op-code 04066 following, except at the very end of the pattern. 
Return leaving the pointer 04067 at the terminating char. */ 04068 04069 if (*ptr != '|') 04070 { 04071 int length = code - last_branch; 04072 do 04073 { 04074 int prev_length = GET(last_branch, 1); 04075 PUT(last_branch, 1, length); 04076 length = prev_length; 04077 last_branch -= length; 04078 } 04079 while (length > 0); 04080 04081 /* Fill in the ket */ 04082 04083 *code = OP_KET; 04084 PUT(code, 1, code - start_bracket); 04085 code += 1 + LINK_SIZE; 04086 04087 /* Resetting option if needed */ 04088 04089 if ((options & PCRE_IMS) != oldims && *ptr == ')') 04090 { 04091 *code++ = OP_OPT; 04092 *code++ = oldims; 04093 } 04094 04095 /* Set values to pass back */ 04096 04097 *codeptr = code; 04098 *ptrptr = ptr; 04099 *firstbyteptr = firstbyte; 04100 *reqbyteptr = reqbyte; 04101 return true; 04102 } 04103 04104 /* Another branch follows; insert an "or" node. Its length field points back 04105 to the previous branch while the bracket remains open. At the end the chain 04106 is reversed. It's done like this so that the start of the bracket has a 04107 zero offset until it is closed, making it possible to detect recursion. */ 04108 04109 *code = OP_ALT; 04110 PUT(code, 1, code - last_branch); 04111 bc.current = last_branch = code; 04112 code += 1 + LINK_SIZE; 04113 ptr++; 04114 } 04115 return false; 04116 } 04117
static bool could_be_empty | ( | const uschar * | code, | |
const uschar * | endcode, | |||
branch_chain * | bcptr, | |||
bool | utf8 | |||
) | [static] |
Definition at line 2199 of file pcre.cpp.
References could_be_empty_branch(), branch_chain::current, and branch_chain::outer.
Referenced by compile_branch().
02200 { 02201 while (bcptr != NULL && bcptr->current >= code) 02202 { 02203 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return false; 02204 bcptr = bcptr->outer; 02205 } 02206 return true; 02207 } 02208
static bool could_be_empty_branch | ( | const uschar * | code, | |
const uschar * | endcode, | |||
bool | utf8 | |||
) | [static] |
Definition at line 2080 of file pcre.cpp.
References first_significant_code(), GET, GET2, LINK_SIZE, OP_ALT, OP_ANY, OP_ANYBYTE, OP_BRA, OP_CHARS, OP_CLASS, OP_CRMINPLUS, OP_CRMINQUERY, OP_CRMINRANGE, OP_CRMINSTAR, OP_CRPLUS, OP_CRQUERY, OP_CRRANGE, OP_CRSTAR, OP_DIGIT, OP_EXACT, OP_KET, OP_KETRMAX, OP_KETRMIN, OP_lengths, OP_MINPLUS, OP_NCLASS, OP_NOT, OP_NOT_DIGIT, OP_NOT_WHITESPACE, OP_NOT_WORDCHAR, OP_NOTEXACT, OP_NOTMINPLUS, OP_NOTPLUS, OP_PLUS, OP_TYPEEXACT, OP_TYPEMINPLUS, OP_TYPEPLUS, OP_WHITESPACE, and OP_WORDCHAR.
Referenced by could_be_empty().
02080 { 02081 register int c; 02082 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0); 02083 code < endcode; 02084 code = first_significant_code(code + OP_lengths[c], NULL, 0)) 02085 { 02086 const uschar *ccode; 02087 02088 c = *code; 02089 02090 if (c >= OP_BRA) 02091 { 02092 bool empty_branch; 02093 if (GET(code, 1) == 0) return true; /* Hit unclosed bracket */ 02094 02095 /* Scan a closed bracket */ 02096 02097 empty_branch = false; 02098 do 02099 { 02100 if (!empty_branch && could_be_empty_branch(code, endcode, utf8)) 02101 empty_branch = true; 02102 code += GET(code, 1); 02103 } 02104 while (*code == OP_ALT); 02105 if (!empty_branch) return false; /* All branches are non-empty */ 02106 code += 1 + LINK_SIZE; 02107 c = *code; 02108 } 02109 02110 else switch (c) 02111 { 02112 /* Check for quantifiers after a class */ 02113 02114 02115 case OP_CLASS: 02116 case OP_NCLASS: 02117 ccode = code + 33; 02118 02119 02120 switch (*ccode) 02121 { 02122 case OP_CRSTAR: /* These could be empty; continue */ 02123 case OP_CRMINSTAR: 02124 case OP_CRQUERY: 02125 case OP_CRMINQUERY: 02126 break; 02127 02128 default: /* Non-repeat => class must match */ 02129 case OP_CRPLUS: /* These repeats aren't empty */ 02130 case OP_CRMINPLUS: 02131 return false; 02132 02133 case OP_CRRANGE: 02134 case OP_CRMINRANGE: 02135 if (GET2(ccode, 1) > 0) return false; /* Minimum > 0 */ 02136 break; 02137 } 02138 break; 02139 02140 /* Opcodes that must match a character */ 02141 02142 case OP_NOT_DIGIT: 02143 case OP_DIGIT: 02144 case OP_NOT_WHITESPACE: 02145 case OP_WHITESPACE: 02146 case OP_NOT_WORDCHAR: 02147 case OP_WORDCHAR: 02148 case OP_ANY: 02149 case OP_ANYBYTE: 02150 case OP_CHARS: 02151 case OP_NOT: 02152 case OP_PLUS: 02153 case OP_MINPLUS: 02154 case OP_EXACT: 02155 case OP_NOTPLUS: 02156 case OP_NOTMINPLUS: 02157 case OP_NOTEXACT: 02158 case OP_TYPEPLUS: 02159 case OP_TYPEMINPLUS: 02160 case OP_TYPEEXACT: 02161 return false; 02162 02163 /* End of branch */ 02164 02165 
case OP_KET: 02166 case OP_KETRMAX: 02167 case OP_KETRMIN: 02168 case OP_ALT: 02169 return true; 02170 02171 } 02172 } 02173 02174 return true; 02175 } 02176
Definition at line 1997 of file pcre.cpp.
References EXTRACT_BASIC_MAX, GET2, LINK_SIZE, OP_BRA, OP_CHARS, OP_END, and OP_lengths.
Referenced by compile_branch().
/*
 * Body listing of the bracket lookup used by compile_branch().
 * NOTE(review): the signature is not part of this listing; the call site
 * find_bracket(cd->start_code, recno) suggests this is find_bracket(code,
 * number) -- confirm against the source file.
 *
 * Walks the compiled items from `code`:
 *   - OP_END stops the search and yields NULL;
 *   - OP_CHARS is skipped using its length byte (code[1]) plus the table
 *     length for the opcode;
 *   - an opcode above OP_BRA is a numbered bracket.  Its number is the
 *     opcode minus OP_BRA, or, when that exceeds EXTRACT_BASIC_MAX, the
 *     two-byte value read at offset 2+LINK_SIZE.  A match on `number`
 *     returns a pointer to the bracket; otherwise only the bracket header
 *     is stepped over, so the scan continues inside the group;
 *   - every other opcode is skipped by its OP_lengths entry.
 */
01997 { 01998 01999 for (;;) 02000 { 02001 register int c = *code; 02002 if (c == OP_END) return NULL; 02003 else if (c == OP_CHARS) code += code[1] + OP_lengths[c]; 02004 else if (c > OP_BRA) 02005 { 02006 int n = c - OP_BRA; 02007 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE); 02008 if (n == number) return (uschar *)code; 02009 code += OP_lengths[OP_BRA]; 02010 } 02011 else 02012 { 02013 code += OP_lengths[c]; 02014 02015 } 02016 } 02017 } 02018
static int find_firstassertedchar | ( | const uschar * | code, | |
int * | options, | |||
bool | inassert | |||
) | [static] |
Definition at line 4294 of file pcre.cpp.
References first_significant_code(), GET, LINK_SIZE, OP_ALT, OP_ASSERT, OP_BRA, OP_CHARS, OP_COND, OP_EXACT, OP_MINPLUS, OP_ONCE, OP_PLUS, PCRE_CASELESS, and REQ_CASELESS.
Referenced by pcre_compile().
04294 { 04295 register int c = -1; 04296 do { 04297 int d; 04298 const uschar *scode = 04299 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS); 04300 register int op = *scode; 04301 04302 if (op >= OP_BRA) op = OP_BRA; 04303 04304 switch(op) 04305 { 04306 default: 04307 return -1; 04308 04309 case OP_BRA: 04310 case OP_ASSERT: 04311 case OP_ONCE: 04312 case OP_COND: 04313 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0) 04314 return -1; 04315 if (c < 0) c = d; else if (c != d) return -1; 04316 break; 04317 04318 case OP_EXACT: /* Fall through */ 04319 scode++; 04320 04321 case OP_CHARS: /* Fall through */ 04322 scode++; 04323 04324 case OP_PLUS: 04325 case OP_MINPLUS: 04326 if (!inassert) return -1; 04327 if (c < 0) 04328 { 04329 c = scode[1]; 04330 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS; 04331 } 04332 else if (c != scode[1]) return -1; 04333 break; 04334 } 04335 04336 code += GET(code, 1); 04337 } 04338 while (*code == OP_ALT); 04339 return c; 04340 } 04341
static int find_fixedlength | ( | uschar * | code, | |
int | options | |||
) | [static] |
Definition at line 1830 of file pcre.cpp.
References GET, GET2, LINK_SIZE, OP_ALT, OP_ANY, OP_ANYBYTE, OP_ASSERT, OP_ASSERT_NOT, OP_ASSERTBACK, OP_ASSERTBACK_NOT, OP_BRA, OP_BRANUMBER, OP_CALLOUT, OP_CHARS, OP_CIRC, OP_CLASS, OP_COND, OP_CREF, OP_CRMINQUERY, OP_CRMINRANGE, OP_CRMINSTAR, OP_CRQUERY, OP_CRRANGE, OP_CRSTAR, OP_DIGIT, OP_DOLL, OP_END, OP_EOD, OP_EODN, OP_EXACT, OP_KET, OP_KETRMAX, OP_KETRMIN, OP_lengths, OP_NCLASS, OP_NOT_DIGIT, OP_NOT_WHITESPACE, OP_NOT_WORD_BOUNDARY, OP_NOT_WORDCHAR, OP_ONCE, OP_OPT, OP_REVERSE, OP_SOD, OP_SOM, OP_TYPEEXACT, OP_WHITESPACE, OP_WORD_BOUNDARY, and OP_WORDCHAR.
Referenced by compile_regex().
01830 { 01831 int length = -1; 01832 01833 register int branchlength = 0; 01834 register uschar *cc = code + 1 + LINK_SIZE; 01835 01836 /* Scan along the opcodes for this branch. If we get to the end of the 01837 branch, check the length against that of the other branches. */ 01838 01839 for (;;) 01840 { 01841 int d; 01842 register int op = *cc; 01843 if (op >= OP_BRA) op = OP_BRA; 01844 01845 switch (op) 01846 { 01847 case OP_BRA: 01848 case OP_ONCE: 01849 case OP_COND: 01850 d = find_fixedlength(cc, options); 01851 if (d < 0) return d; 01852 branchlength += d; 01853 do cc += GET(cc, 1); while (*cc == OP_ALT); 01854 cc += 1 + LINK_SIZE; 01855 break; 01856 01857 /* Reached end of a branch; if it's a ket it is the end of a nested 01858 call. If it's ALT it is an alternation in a nested call. If it is 01859 END it's the end of the outer call. All can be handled by the same code. */ 01860 01861 case OP_ALT: 01862 case OP_KET: 01863 case OP_KETRMAX: 01864 case OP_KETRMIN: 01865 case OP_END: 01866 if (length < 0) length = branchlength; 01867 else if (length != branchlength) return -1; 01868 if (*cc != OP_ALT) return length; 01869 cc += 1 + LINK_SIZE; 01870 branchlength = 0; 01871 break; 01872 01873 /* Skip over assertive subpatterns */ 01874 01875 case OP_ASSERT: 01876 case OP_ASSERT_NOT: 01877 case OP_ASSERTBACK: 01878 case OP_ASSERTBACK_NOT: 01879 do cc += GET(cc, 1); while (*cc == OP_ALT); 01880 /* Fall through */ 01881 01882 /* Skip over things that don't match chars */ 01883 01884 case OP_REVERSE: 01885 case OP_BRANUMBER: 01886 case OP_CREF: 01887 case OP_OPT: 01888 case OP_CALLOUT: 01889 case OP_SOD: 01890 case OP_SOM: 01891 case OP_EOD: 01892 case OP_EODN: 01893 case OP_CIRC: 01894 case OP_DOLL: 01895 case OP_NOT_WORD_BOUNDARY: 01896 case OP_WORD_BOUNDARY: 01897 cc += OP_lengths[*cc]; 01898 break; 01899 01900 /* Handle char strings. In UTF-8 mode we must count characters, not bytes. 01901 This requires a scan of the string, unfortunately. 
We assume valid UTF-8 01902 strings, so all we do is reduce the length by one for every byte whose bits 01903 are 10xxxxxx. */ 01904 01905 case OP_CHARS: 01906 branchlength += *(++cc); 01907 cc += *cc + 1; 01908 break; 01909 01910 /* Handle exact repetitions. The count is already in characters, but we 01911 need to skip over a multibyte character in UTF8 mode. */ 01912 01913 case OP_EXACT: 01914 branchlength += GET2(cc,1); 01915 cc += 4; 01916 break; 01917 01918 case OP_TYPEEXACT: 01919 branchlength += GET2(cc,1); 01920 cc += 4; 01921 break; 01922 01923 /* Handle single-char matchers */ 01924 01925 case OP_NOT_DIGIT: 01926 case OP_DIGIT: 01927 case OP_NOT_WHITESPACE: 01928 case OP_WHITESPACE: 01929 case OP_NOT_WORDCHAR: 01930 case OP_WORDCHAR: 01931 case OP_ANY: 01932 branchlength++; 01933 cc++; 01934 break; 01935 01936 /* The single-byte matcher isn't allowed */ 01937 01938 case OP_ANYBYTE: 01939 return -2; 01940 01941 /* Check a class for variable quantification */ 01942 01943 01944 case OP_CLASS: 01945 case OP_NCLASS: 01946 cc += 33; 01947 01948 switch (*cc) 01949 { 01950 case OP_CRSTAR: 01951 case OP_CRMINSTAR: 01952 case OP_CRQUERY: 01953 case OP_CRMINQUERY: 01954 return -1; 01955 01956 case OP_CRRANGE: 01957 case OP_CRMINRANGE: 01958 if (GET2(cc,1) != GET2(cc,3)) return -1; 01959 branchlength += GET2(cc,1); 01960 cc += 5; 01961 break; 01962 01963 default: 01964 branchlength++; 01965 } 01966 break; 01967 01968 /* Anything else is variable length */ 01969 01970 default: 01971 return -1; 01972 } 01973 } 01974 /* Control never gets here */ 01975 } 01976
Definition at line 2037 of file pcre.cpp.
References OP_BRA, OP_CHARS, OP_END, OP_lengths, and OP_RECURSE.
Referenced by adjust_recurse().
/*
 * Body listing of the OP_RECURSE scanner used by adjust_recurse().
 * NOTE(review): the signature is not part of this listing; the body reads
 * `code` and `utf8`, which suggests find_recurse(code, utf8) -- confirm
 * against the source file.
 *
 * Walks the compiled items from `code`:
 *   - OP_END stops the search and yields NULL;
 *   - OP_RECURSE yields a pointer to that item;
 *   - OP_CHARS is skipped using its length byte (code[1]) plus the table
 *     length for the opcode;
 *   - an opcode above OP_BRA is stepped over by the OP_BRA header length
 *     only, so the scan continues inside the group;
 *   - every other opcode is skipped by its OP_lengths entry.
 * `utf8` is not used; the self-assignment only silences a compiler warning.
 */
02037 { 02038 utf8 = utf8; /* Stop pedantic compilers complaining */ 02039 02040 for (;;) 02041 { 02042 register int c = *code; 02043 if (c == OP_END) return NULL; 02044 else if (c == OP_RECURSE) return code; 02045 else if (c == OP_CHARS) code += code[1] + OP_lengths[c]; 02046 else if (c > OP_BRA) 02047 { 02048 code += OP_lengths[OP_BRA]; 02049 } 02050 else 02051 { 02052 code += OP_lengths[c]; 02053 02054 } 02055 } 02056 } 02057
static const uschar* first_significant_code | ( | const uschar * | code, | |
int * | options, | |||
int | optbit | |||
) | [static] |
Definition at line 1777 of file pcre.cpp.
References GET, OP_ALT, OP_ASSERT_NOT, OP_ASSERTBACK, OP_ASSERTBACK_NOT, OP_BRANUMBER, OP_CALLOUT, OP_CREF, OP_lengths, OP_NOT_WORD_BOUNDARY, OP_OPT, and OP_WORD_BOUNDARY.
Referenced by could_be_empty_branch(), find_firstassertedchar(), is_anchored(), and is_startline().
01777 { 01778 for (;;) 01779 { 01780 switch ((int)*code) 01781 { 01782 case OP_OPT: 01783 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit)) 01784 *options = (int)code[1]; 01785 code += 2; 01786 break; 01787 01788 case OP_ASSERT_NOT: 01789 case OP_ASSERTBACK: 01790 case OP_ASSERTBACK_NOT: 01791 do code += GET(code, 1); while (*code == OP_ALT); 01792 /* Fall through */ 01793 01794 case OP_CALLOUT: 01795 case OP_CREF: 01796 case OP_BRANUMBER: 01797 case OP_WORD_BOUNDARY: 01798 case OP_NOT_WORD_BOUNDARY: 01799 code += OP_lengths[*code]; 01800 break; 01801 01802 default: 01803 return code; 01804 } 01805 } 01806 /* Control never reaches here */ 01807 } 01808
static bool is_anchored | ( | register const uschar * | code, | |
int * | options, | |||
unsigned int | bracket_map, | |||
unsigned int | backref_map | |||
) | [static] |
Definition at line 4161 of file pcre.cpp.
References EXTRACT_BASIC_MAX, first_significant_code(), GET, GET2, LINK_SIZE, OP_ALT, OP_ANY, OP_ASSERT, OP_BRA, OP_CIRC, OP_COND, OP_ONCE, OP_SOD, OP_SOM, OP_TYPEMINSTAR, OP_TYPESTAR, PCRE_DOTALL, and PCRE_MULTILINE.
Referenced by pcre_compile().
04162 { 04163 do { 04164 const uschar *scode = 04165 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE); 04166 register int op = *scode; 04167 04168 /* Capturing brackets */ 04169 04170 if (op > OP_BRA) 04171 { 04172 int new_map; 04173 op -= OP_BRA; 04174 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE); 04175 new_map = bracket_map | ((op < 32)? (1 << op) : 1); 04176 if (!is_anchored(scode, options, new_map, backref_map)) return false; 04177 } 04178 04179 /* Other brackets */ 04180 04181 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND) 04182 { 04183 if (!is_anchored(scode, options, bracket_map, backref_map)) return false; 04184 } 04185 04186 /* .* is not anchored unless DOTALL is set and it isn't in brackets that 04187 are or may be referenced. */ 04188 04189 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) && 04190 (*options & PCRE_DOTALL) != 0) 04191 { 04192 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return false; 04193 } 04194 04195 /* Check for explicit anchoring */ 04196 04197 else if (op != OP_SOD && op != OP_SOM && 04198 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC)) 04199 return false; 04200 code += GET(code, 1); 04201 } 04202 while (*code == OP_ALT); /* Loop for each alternative */ 04203 return true; 04204 } 04205
static bool is_counted_repeat | ( | const uschar * | p | ) | [static] |
Definition at line 1685 of file pcre.cpp.
References ctype_digit, and digitab.
Referenced by compile_branch(), and pcre_compile().
01685 { 01686 if ((digitab[*p++] & ctype_digit) == 0) return false; 01687 while ((digitab[*p] & ctype_digit) != 0) p++; 01688 if (*p == '}') return true; 01689 01690 if (*p++ != ',') return false; 01691 if (*p == '}') return true; 01692 01693 if ((digitab[*p++] & ctype_digit) == 0) return false; 01694 while ((digitab[*p] & ctype_digit) != 0) p++; 01695 01696 return (*p == '}'); 01697 } 01698
static bool is_startline | ( | const uschar * | code, | |
unsigned int | bracket_map, | |||
unsigned int | backref_map | |||
) | [static] |
Definition at line 4231 of file pcre.cpp.
References EXTRACT_BASIC_MAX, first_significant_code(), GET, GET2, LINK_SIZE, OP_ALT, OP_ANY, OP_ASSERT, OP_BRA, OP_CIRC, OP_COND, OP_ONCE, OP_TYPEMINSTAR, and OP_TYPESTAR.
Referenced by pcre_compile().
04232 { 04233 do { 04234 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0); 04235 register int op = *scode; 04236 04237 /* Capturing brackets */ 04238 04239 if (op > OP_BRA) 04240 { 04241 int new_map; 04242 op -= OP_BRA; 04243 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE); 04244 new_map = bracket_map | ((op < 32)? (1 << op) : 1); 04245 if (!is_startline(scode, new_map, backref_map)) return false; 04246 } 04247 04248 /* Other brackets */ 04249 04250 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND) 04251 { if (!is_startline(scode, bracket_map, backref_map)) return false; } 04252 04253 /* .* is not anchored unless DOTALL is set and it isn't in brackets that 04254 may be referenced. */ 04255 04256 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR) 04257 { 04258 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return false; 04259 } 04260 04261 /* Check for explicit circumflex */ 04262 04263 else if (op != OP_CIRC) return false; 04264 code += GET(code, 1); 04265 } 04266 while (*code == OP_ALT); /* Loop for each alternative */ 04267 return true; 04268 } 04269
static int match | ( | REGISTER const uschar * | eptr, | |
REGISTER const uschar * | ecode, | |||
int | offset_top, | |||
match_data * | md, | |||
unsigned long int | ims, | |||
eptrblock * | eptrb, | |||
int | flags | |||
) | [static] |
Definition at line 5414 of file pcre.cpp.
References recursion_info::after_call, CMuxAlarm::bAlarmed, pcre_callout_block::callout_data, pcre_callout_block::callout_number, pcre_callout_block::capture_last, pcre_callout_block::capture_top, CREF_RECURSE, ctype_digit, ctype_space, ctype_word, pcre_callout_block::current_position, DPRINTF, eptrblock::epb_prev, eptrblock::epb_saved_eptr, EXTRACT_BASIC_MAX, fc, fi, GET, GET2, GETCHARINCTEST, recursion_info::group_num, LINK_SIZE, match_condassert, match_isgroup, MATCH_MATCH, MATCH_NOMATCH, match_ref(), md, MuxAlarm, NEWLINE, next, recursion_info::offset_save, pcre_callout_block::offset_vector, OP_ALT, OP_ANY, OP_ANYBYTE, OP_ASSERT, OP_ASSERT_NOT, OP_ASSERTBACK, OP_ASSERTBACK_NOT, OP_BRA, OP_BRAMINZERO, OP_BRANUMBER, OP_BRAZERO, OP_CALLOUT, OP_CHARS, OP_CIRC, OP_CLASS, OP_COND, OP_CREF, OP_CRMINPLUS, OP_CRMINQUERY, OP_CRMINRANGE, OP_CRMINSTAR, OP_CRPLUS, OP_CRQUERY, OP_CRRANGE, OP_CRSTAR, OP_DIGIT, OP_DOLL, OP_END, OP_EOD, OP_EODN, OP_EXACT, OP_KET, OP_KETRMAX, OP_KETRMIN, OP_MINPLUS, OP_MINQUERY, OP_MINSTAR, OP_MINUPTO, OP_NCLASS, OP_NOT, OP_NOT_DIGIT, OP_NOT_WHITESPACE, OP_NOT_WORD_BOUNDARY, OP_NOT_WORDCHAR, OP_NOTEXACT, OP_NOTMINPLUS, OP_NOTMINQUERY, OP_NOTMINSTAR, OP_NOTMINUPTO, OP_NOTPLUS, OP_NOTQUERY, OP_NOTSTAR, OP_NOTUPTO, OP_ONCE, OP_OPT, OP_PLUS, OP_QUERY, OP_RECURSE, OP_REF, OP_REVERSE, OP_SOD, OP_SOM, OP_STAR, OP_TYPEEXACT, OP_TYPEMINPLUS, OP_TYPEMINQUERY, OP_TYPEMINSTAR, OP_TYPEMINUPTO, OP_TYPEPLUS, OP_TYPEQUERY, OP_TYPESTAR, OP_TYPEUPTO, OP_UPTO, OP_WHITESPACE, OP_WORD_BOUNDARY, OP_WORDCHAR, pcre_callout, PCRE_CASELESS, PCRE_DOTALL, PCRE_ERROR_MATCHLIMIT, PCRE_ERROR_NOMEMORY, PCRE_ERROR_UNKNOWN_NODE, PCRE_IMS, PCRE_MULTILINE, recursion_info::prevrec, REC_STACK_SAVE_MAX, rep_max, rep_min, RMATCH, RRETURN, recursion_info::save_start, recursion_info::saved_max, pcre_callout_block::start_match, pcre_callout_block::subject, pcre_callout_block::subject_length, and pcre_callout_block::version.
Referenced by absolute_name(), atr_match(), atr_match1(), match_numeric(), match_player(), and pcre_exec().
05416 { 05417 /* These variables do not need to be preserved over recursion in this function, 05418 so they can be ordinary variables in all cases. Mark them with "register" 05419 because they are used a lot in loops. */ 05420 05421 register int rrc; /* Returns from recursive calls */ 05422 register int i; /* Used for loops not involving calls to RMATCH() */ 05423 register int c; /* Character values not kept over RMATCH() calls */ 05424 05425 /* When recursion is not being used, all "local" variables that have to be 05426 preserved over calls to RMATCH() are part of a "frame" which is obtained from 05427 heap storage. Set up the top-level frame here; others are obtained from the 05428 heap whenever RMATCH() does a "recursion". See the macro definitions above. */ 05429 05430 #define fi i 05431 #define fc c 05432 05433 const uschar *callpat; /* Many of these variables are used ony */ 05434 /* small blocks of the code. My normal */ 05435 const uschar *data; /* style of coding would have declared */ 05436 /* them within each of those blocks. */ 05437 const uschar *next; /* However, in order to accommodate the */ 05438 const uschar *pp; /* version of this code that uses an */ 05439 const uschar *prev; /* external "stack" implemented on the */ 05440 const uschar *saved_eptr; /* heap, it is easier to declare them */ 05441 /* all here, so the declarations can */ 05442 recursion_info new_recursive; /* be cut out in a block. The only */ 05443 /* declarations within blocks below are */ 05444 bool cur_is_word; /* for variables that do not have to */ 05445 bool condition; /* be preserved over a recursive call */ 05446 bool minimize; /* to RMATCH(). 
*/ 05447 bool prev_is_word; 05448 05449 unsigned long int original_ims; 05450 05451 int ctype; 05452 int length; 05453 int max; 05454 int min; 05455 int number; 05456 int offset; 05457 int op; 05458 int save_capture_last; 05459 int save_offset1, save_offset2, save_offset3; 05460 int stacksave[REC_STACK_SAVE_MAX]; 05461 05462 eptrblock newptrb; 05463 05464 /* OK, now we can get on with the real code of the function. Recursion is 05465 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined, 05466 these just turn into a recursive call to match() and a "return", respectively. 05467 However, RMATCH isn't like a function call because it's quite a complicated 05468 macro. It has to be used in one particular way. This shouldn't, however, impact 05469 performance when true recursion is being used. */ 05470 05471 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); 05472 05473 original_ims = ims; /* Save for resetting on ')' */ 05474 05475 /* At the start of a bracketed group, add the current subject pointer to the 05476 stack of such pointers, to be re-instated at the end of the group when we hit 05477 the closing ket. When match() is called in other circumstances, we don't add to 05478 this stack. */ 05479 05480 if ((flags & match_isgroup) != 0) 05481 { 05482 newptrb.epb_prev = eptrb; 05483 newptrb.epb_saved_eptr = eptr; 05484 eptrb = &newptrb; 05485 } 05486 05487 /* Now start processing the operations. */ 05488 05489 for (;!MuxAlarm.bAlarmed;) 05490 { 05491 op = *ecode; 05492 minimize = false; 05493 05494 /* Opening capturing bracket. If there is space in the offset vector, save 05495 the current subject position in the working slot at the top of the vector. We 05496 mustn't change the current values of the data slot, because they may be set 05497 from a previous iteration of this group, and be referred to by a reference 05498 inside the group. 
05499 05500 If the bracket fails to match, we need to restore this value and also the 05501 values of the final offsets, in case they were set by a previous iteration of 05502 the same bracket. 05503 05504 If there isn't enough space in the offset vector, treat this as if it were a 05505 non-capturing bracket. Don't worry about setting the flag for the error case 05506 here; that is handled in the code for KET. */ 05507 05508 if (op > OP_BRA) 05509 { 05510 number = op - OP_BRA; 05511 05512 /* For extended extraction brackets (large number), we have to fish out the 05513 number from a dummy opcode at the start. */ 05514 05515 if (number > EXTRACT_BASIC_MAX) 05516 number = GET2(ecode, 2+LINK_SIZE); 05517 offset = number << 1; 05518 05519 if (offset < md->offset_max) 05520 { 05521 save_offset1 = md->offset_vector[offset]; 05522 save_offset2 = md->offset_vector[offset+1]; 05523 save_offset3 = md->offset_vector[md->offset_end - number]; 05524 save_capture_last = md->capture_last; 05525 05526 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); 05527 md->offset_vector[md->offset_end - number] = eptr - md->start_subject; 05528 05529 do 05530 { 05531 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 05532 match_isgroup); 05533 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 05534 md->capture_last = save_capture_last; 05535 ecode += GET(ecode, 1); 05536 } 05537 while (*ecode == OP_ALT); 05538 05539 DPRINTF(("bracket %d failed\n", number)); 05540 05541 md->offset_vector[offset] = save_offset1; 05542 md->offset_vector[offset+1] = save_offset2; 05543 md->offset_vector[md->offset_end - number] = save_offset3; 05544 05545 RRETURN(MATCH_NOMATCH); 05546 } 05547 05548 /* Insufficient room for saving captured contents */ 05549 05550 else op = OP_BRA; 05551 } 05552 05553 /* Other types of node can be handled by a switch */ 05554 05555 switch(op) 05556 { 05557 case OP_BRA: /* Non-capturing bracket: optimized */ 05558 DPRINTF(("start bracket 0\n")); 
05559 do 05560 { 05561 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 05562 match_isgroup); 05563 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 05564 ecode += GET(ecode, 1); 05565 } 05566 while (*ecode == OP_ALT); 05567 DPRINTF(("bracket 0 failed\n")); 05568 RRETURN(MATCH_NOMATCH); 05569 05570 /* Conditional group: compilation checked that there are no more than 05571 two branches. If the condition is false, skipping the first branch takes us 05572 past the end if there is only one branch, but that's OK because that is 05573 exactly what going to the ket would do. */ 05574 05575 case OP_COND: 05576 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */ 05577 { 05578 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ 05579 condition = (offset == CREF_RECURSE * 2)? 05580 (md->recursive != NULL) : 05581 (offset < offset_top && md->offset_vector[offset] >= 0); 05582 RMATCH(rrc, eptr, ecode + (condition? 05583 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))), 05584 offset_top, md, ims, eptrb, match_isgroup); 05585 RRETURN(rrc); 05586 } 05587 05588 /* The condition is an assertion. Call match() to evaluate it - setting 05589 the final argument true causes it to stop at the end of an assertion. */ 05590 05591 else 05592 { 05593 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 05594 match_condassert | match_isgroup); 05595 if (rrc == MATCH_MATCH) 05596 { 05597 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2); 05598 while (*ecode == OP_ALT) ecode += GET(ecode, 1); 05599 } 05600 else if (rrc != MATCH_NOMATCH) 05601 { 05602 RRETURN(rrc); /* Need braces because of following else */ 05603 } 05604 else ecode += GET(ecode, 1); 05605 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 05606 match_isgroup); 05607 RRETURN(rrc); 05608 } 05609 /* Control never reaches here */ 05610 05611 /* Skip over conditional reference or large extraction number data if 05612 encountered. 
*/ 05613 05614 case OP_CREF: 05615 case OP_BRANUMBER: 05616 ecode += 3; 05617 break; 05618 05619 /* End of the pattern. If we are in a recursion, we should restore the 05620 offsets appropriately and continue from after the call. */ 05621 05622 case OP_END: 05623 if (md->recursive != NULL && md->recursive->group_num == 0) 05624 { 05625 recursion_info *rec = md->recursive; 05626 DPRINTF(("Hit the end in a (?0) recursion\n")); 05627 md->recursive = rec->prevrec; 05628 memmove(md->offset_vector, rec->offset_save, 05629 rec->saved_max * sizeof(int)); 05630 md->start_match = rec->save_start; 05631 ims = original_ims; 05632 ecode = rec->after_call; 05633 break; 05634 } 05635 05636 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty 05637 string - backtracking will then try other alternatives, if any. */ 05638 05639 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH); 05640 md->end_match_ptr = eptr; /* Record where we ended */ 05641 md->end_offset_top = offset_top; /* and how many extracts were taken */ 05642 RRETURN(MATCH_MATCH); 05643 05644 /* Change option settings */ 05645 05646 case OP_OPT: 05647 ims = ecode[1]; 05648 ecode += 2; 05649 DPRINTF(("ims set to %02lx\n", ims)); 05650 break; 05651 05652 /* Assertion brackets. Check the alternative branches in turn - the 05653 matching won't pass the KET for an assertion. If any one branch matches, 05654 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the 05655 start of each branch to move the current point backwards, so the code at 05656 this level is identical to the lookahead case. 
*/ 05657 05658 case OP_ASSERT: 05659 case OP_ASSERTBACK: 05660 do 05661 { 05662 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 05663 match_isgroup); 05664 if (rrc == MATCH_MATCH) break; 05665 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 05666 ecode += GET(ecode, 1); 05667 } 05668 while (*ecode == OP_ALT); 05669 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); 05670 05671 /* If checking an assertion for a condition, return MATCH_MATCH. */ 05672 05673 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); 05674 05675 /* Continue from after the assertion, updating the offsets high water 05676 mark, since extracts may have been taken during the assertion. */ 05677 05678 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 05679 ecode += 1 + LINK_SIZE; 05680 offset_top = md->end_offset_top; 05681 continue; 05682 05683 /* Negative assertion: all branches must fail to match */ 05684 05685 case OP_ASSERT_NOT: 05686 case OP_ASSERTBACK_NOT: 05687 do 05688 { 05689 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 05690 match_isgroup); 05691 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH); 05692 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 05693 ecode += GET(ecode,1); 05694 } 05695 while (*ecode == OP_ALT); 05696 05697 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); 05698 05699 ecode += 1 + LINK_SIZE; 05700 continue; 05701 05702 /* Move the subject pointer back. This occurs only at the start of 05703 each branch of a lookbehind assertion. If we are too close to the start to 05704 move back, this match function fails. When working with UTF-8 we move 05705 back a number of characters, not bytes. 
*/ 05706 05707 case OP_REVERSE: 05708 05709 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ 05710 05711 { 05712 eptr -= GET(ecode,1); 05713 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); 05714 } 05715 05716 /* Skip to next op code */ 05717 05718 ecode += 1 + LINK_SIZE; 05719 break; 05720 05721 /* The callout item calls an external function, if one is provided, passing 05722 details of the match so far. This is mainly for debugging, though the 05723 function is able to force a failure. */ 05724 05725 case OP_CALLOUT: 05726 if (pcre_callout != NULL) 05727 { 05728 pcre_callout_block cb; 05729 cb.version = 0; /* Version 0 of the callout block */ 05730 cb.callout_number = ecode[1]; 05731 cb.offset_vector = md->offset_vector; 05732 cb.subject = (const char *)md->start_subject; 05733 cb.subject_length = md->end_subject - md->start_subject; 05734 cb.start_match = md->start_match - md->start_subject; 05735 cb.current_position = eptr - md->start_subject; 05736 cb.capture_top = offset_top/2; 05737 cb.capture_last = md->capture_last; 05738 cb.callout_data = md->callout_data; 05739 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); 05740 if (rrc < 0) RRETURN(rrc); 05741 } 05742 ecode += 2; 05743 break; 05744 05745 /* Recursion either matches the current regex, or some subexpression. The 05746 offset data is the offset to the starting bracket from the start of the 05747 whole pattern. (This is so that it works from duplicated subpatterns.) 05748 05749 If there are any capturing brackets started but not finished, we have to 05750 save their starting points and reinstate them after the recursion. However, 05751 we don't know how many such there are (offset_top records the completed 05752 total) so we just have to save all the potential data. There may be up to 05753 65535 such values, which is too large to put on the stack, but using malloc 05754 for small numbers seems expensive. 
As a compromise, the stack is used when 05755 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc 05756 is used. A problem is what to do if the malloc fails ... there is no way of 05757 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX 05758 values on the stack, and accept that the rest may be wrong. 05759 05760 There are also other values that have to be saved. We use a chained 05761 sequence of blocks that actually live on the stack. Thanks to Robin Houston 05762 for the original version of this logic. */ 05763 05764 case OP_RECURSE: 05765 { 05766 callpat = md->start_code + GET(ecode, 1); 05767 new_recursive.group_num = *callpat - OP_BRA; 05768 05769 /* For extended extraction brackets (large number), we have to fish out 05770 the number from a dummy opcode at the start. */ 05771 05772 if (new_recursive.group_num > EXTRACT_BASIC_MAX) 05773 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE); 05774 05775 /* Add to "recursing stack" */ 05776 05777 new_recursive.prevrec = md->recursive; 05778 md->recursive = &new_recursive; 05779 05780 /* Find where to continue from afterwards */ 05781 05782 ecode += 1 + LINK_SIZE; 05783 new_recursive.after_call = ecode; 05784 05785 /* Now save the offset data. */ 05786 05787 new_recursive.saved_max = md->offset_end; 05788 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) 05789 new_recursive.offset_save = stacksave; 05790 else 05791 { 05792 new_recursive.offset_save = 05793 static_cast<int *>(malloc(new_recursive.saved_max * sizeof(int))); 05794 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); 05795 } 05796 05797 memcpy(new_recursive.offset_save, md->offset_vector, 05798 new_recursive.saved_max * sizeof(int)); 05799 new_recursive.save_start = md->start_match; 05800 md->start_match = eptr; 05801 05802 /* OK, now we can do the recursion. For each top-level alternative we 05803 restore the offset and recursion data. 
*/ 05804 05805 DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); 05806 do 05807 { 05808 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims, 05809 eptrb, match_isgroup); 05810 if (rrc == MATCH_MATCH) 05811 { 05812 md->recursive = new_recursive.prevrec; 05813 if (new_recursive.offset_save != stacksave) 05814 free(new_recursive.offset_save); 05815 RRETURN(MATCH_MATCH); 05816 } 05817 else if (rrc != MATCH_NOMATCH) RRETURN(rrc); 05818 05819 md->recursive = &new_recursive; 05820 memcpy(md->offset_vector, new_recursive.offset_save, 05821 new_recursive.saved_max * sizeof(int)); 05822 callpat += GET(callpat, 1); 05823 } 05824 while (*callpat == OP_ALT); 05825 05826 DPRINTF(("Recursion didn't match\n")); 05827 md->recursive = new_recursive.prevrec; 05828 if (new_recursive.offset_save != stacksave) 05829 free(new_recursive.offset_save); 05830 RRETURN(MATCH_NOMATCH); 05831 } 05832 /* Control never reaches here */ 05833 05834 /* "Once" brackets are like assertion brackets except that after a match, 05835 the point in the subject string is not moved back. Thus there can never be 05836 a move back into the brackets. Friedl calls these "atomic" subpatterns. 05837 Check the alternative branches in turn - the matching won't pass the KET 05838 for this kind of subpattern. If any one branch matches, we carry on as at 05839 the end of a normal bracket, leaving the subject pointer. 
*/ 05840 05841 case OP_ONCE: 05842 { 05843 prev = ecode; 05844 saved_eptr = eptr; 05845 05846 do 05847 { 05848 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, 05849 eptrb, match_isgroup); 05850 if (rrc == MATCH_MATCH) break; 05851 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 05852 ecode += GET(ecode,1); 05853 } 05854 while (*ecode == OP_ALT); 05855 05856 /* If hit the end of the group (which could be repeated), fail */ 05857 05858 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH); 05859 05860 /* Continue as from after the assertion, updating the offsets high water 05861 mark, since extracts may have been taken. */ 05862 05863 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 05864 05865 offset_top = md->end_offset_top; 05866 eptr = md->end_match_ptr; 05867 05868 /* For a non-repeating ket, just continue at this level. This also 05869 happens for a repeating ket if no characters were matched in the group. 05870 This is the forcible breaking of infinite loops as implemented in Perl 05871 5.005. If there is an options reset, it will get obeyed in the normal 05872 course of events. */ 05873 05874 if (*ecode == OP_KET || eptr == saved_eptr) 05875 { 05876 ecode += 1+LINK_SIZE; 05877 break; 05878 } 05879 05880 /* The repeating kets try the rest of the pattern or restart from the 05881 preceding bracket, in the appropriate order. We need to reset any options 05882 that changed within the bracket before re-running it, so check the next 05883 opcode. 
*/ 05884 05885 if (ecode[1+LINK_SIZE] == OP_OPT) 05886 { 05887 ims = (ims & ~PCRE_IMS) | ecode[4]; 05888 DPRINTF(("ims set to %02lx at group repeat\n", ims)); 05889 } 05890 05891 if (*ecode == OP_KETRMIN) 05892 { 05893 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0); 05894 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 05895 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); 05896 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 05897 } 05898 else /* OP_KETRMAX */ 05899 { 05900 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); 05901 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 05902 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); 05903 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 05904 } 05905 } 05906 RRETURN(MATCH_NOMATCH); 05907 05908 /* An alternation is the end of a branch; scan along to find the end of the 05909 bracketed group and go to there. */ 05910 05911 case OP_ALT: 05912 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 05913 break; 05914 05915 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating 05916 that it may occur zero times. It may repeat infinitely, or not at all - 05917 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper 05918 repeat limits are compiled as a number of copies, with the optional ones 05919 preceded by BRAZERO or BRAMINZERO. 
*/ 05920 05921 case OP_BRAZERO: 05922 { 05923 next = ecode+1; 05924 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup); 05925 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 05926 do next += GET(next,1); while (*next == OP_ALT); 05927 ecode = next + 1+LINK_SIZE; 05928 } 05929 break; 05930 05931 case OP_BRAMINZERO: 05932 { 05933 next = ecode+1; 05934 do next += GET(next,1); while (*next == OP_ALT); 05935 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 05936 match_isgroup); 05937 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 05938 ecode++; 05939 } 05940 break; 05941 05942 /* End of a group, repeated or non-repeating. If we are at the end of 05943 an assertion "group", stop matching and return MATCH_MATCH, but record the 05944 current high water mark for use by positive assertions. Do this also 05945 for the "once" (not-backup up) groups. */ 05946 05947 case OP_KET: 05948 case OP_KETRMIN: 05949 case OP_KETRMAX: 05950 { 05951 prev = ecode - GET(ecode, 1); 05952 saved_eptr = eptrb->epb_saved_eptr; 05953 05954 /* Back up the stack of bracket start pointers. */ 05955 05956 eptrb = eptrb->epb_prev; 05957 05958 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || 05959 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT || 05960 *prev == OP_ONCE) 05961 { 05962 md->end_match_ptr = eptr; /* For ONCE */ 05963 md->end_offset_top = offset_top; 05964 RRETURN(MATCH_MATCH); 05965 } 05966 05967 /* In all other cases except a conditional group we have to check the 05968 group number back at the start and if necessary complete handling an 05969 extraction by setting the offsets and bumping the high water mark. */ 05970 05971 if (*prev != OP_COND) 05972 { 05973 number = *prev - OP_BRA; 05974 05975 /* For extended extraction brackets (large number), we have to fish out 05976 the number from a dummy opcode at the start. */ 05977 05978 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE); 05979 offset = number << 1; 05980 05981 /* Test for a numbered group. 
This includes groups called as a result 05982 of recursion. Note that whole-pattern recursion is coded as a recurse 05983 into group 0, so it won't be picked up here. Instead, we catch it when 05984 the OP_END is reached. */ 05985 05986 if (number > 0) 05987 { 05988 md->capture_last = number; 05989 if (offset >= md->offset_max) md->offset_overflow = true; else 05990 { 05991 md->offset_vector[offset] = 05992 md->offset_vector[md->offset_end - number]; 05993 md->offset_vector[offset+1] = eptr - md->start_subject; 05994 if (offset_top <= offset) offset_top = offset + 2; 05995 } 05996 05997 /* Handle a recursively called group. Restore the offsets 05998 appropriately and continue from after the call. */ 05999 06000 if (md->recursive != NULL && md->recursive->group_num == number) 06001 { 06002 recursion_info *rec = md->recursive; 06003 DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); 06004 md->recursive = rec->prevrec; 06005 md->start_match = rec->save_start; 06006 memcpy(md->offset_vector, rec->offset_save, 06007 rec->saved_max * sizeof(int)); 06008 ecode = rec->after_call; 06009 ims = original_ims; 06010 break; 06011 } 06012 } 06013 } 06014 06015 /* Reset the value of the ims flags, in case they got changed during 06016 the group. */ 06017 06018 ims = original_ims; 06019 DPRINTF(("ims reset to %02lx\n", ims)); 06020 06021 /* For a non-repeating ket, just continue at this level. This also 06022 happens for a repeating ket if no characters were matched in the group. 06023 This is the forcible breaking of infinite loops as implemented in Perl 06024 5.005. If there is an options reset, it will get obeyed in the normal 06025 course of events. */ 06026 06027 if (*ecode == OP_KET || eptr == saved_eptr) 06028 { 06029 ecode += 1 + LINK_SIZE; 06030 break; 06031 } 06032 06033 /* The repeating kets try the rest of the pattern or restart from the 06034 preceding bracket, in the appropriate order. 
*/ 06035 06036 if (*ecode == OP_KETRMIN) 06037 { 06038 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); 06039 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06040 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); 06041 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06042 } 06043 else /* OP_KETRMAX */ 06044 { 06045 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); 06046 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06047 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); 06048 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06049 } 06050 } 06051 06052 RRETURN(MATCH_NOMATCH); 06053 06054 /* Start of subject unless notbol, or after internal newline if multiline */ 06055 06056 case OP_CIRC: 06057 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); 06058 if ((ims & PCRE_MULTILINE) != 0) 06059 { 06060 if (eptr != md->start_subject && eptr[-1] != NEWLINE) 06061 RRETURN(MATCH_NOMATCH); 06062 ecode++; 06063 break; 06064 } 06065 /* ... else fall through */ 06066 06067 /* Start of subject assertion */ 06068 06069 case OP_SOD: 06070 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH); 06071 ecode++; 06072 break; 06073 06074 /* Start of match assertion */ 06075 06076 case OP_SOM: 06077 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH); 06078 ecode++; 06079 break; 06080 06081 /* Assert before internal newline if multiline, or before a terminating 06082 newline unless endonly is set, else end of subject unless noteol is set. 
*/ 06083 06084 case OP_DOLL: 06085 if ((ims & PCRE_MULTILINE) != 0) 06086 { 06087 if (eptr < md->end_subject) 06088 { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); } 06089 else 06090 { if (md->noteol) RRETURN(MATCH_NOMATCH); } 06091 ecode++; 06092 break; 06093 } 06094 else 06095 { 06096 if (md->noteol) RRETURN(MATCH_NOMATCH); 06097 if (!md->endonly) 06098 { 06099 if (eptr < md->end_subject - 1 || 06100 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) 06101 RRETURN(MATCH_NOMATCH); 06102 ecode++; 06103 break; 06104 } 06105 } 06106 /* ... else fall through */ 06107 06108 /* End of subject assertion (\z) */ 06109 06110 case OP_EOD: 06111 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH); 06112 ecode++; 06113 break; 06114 06115 /* End of subject or ending \n assertion (\Z) */ 06116 06117 case OP_EODN: 06118 if (eptr < md->end_subject - 1 || 06119 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH); 06120 ecode++; 06121 break; 06122 06123 /* Word boundary assertions */ 06124 06125 case OP_NOT_WORD_BOUNDARY: 06126 case OP_WORD_BOUNDARY: 06127 { 06128 06129 /* Find out if the previous and current characters are "word" characters. 06130 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to 06131 be "non-word" characters. */ 06132 06133 06134 /* More streamlined when not in UTF-8 mode */ 06135 06136 { 06137 prev_is_word = (eptr != md->start_subject) && 06138 ((md->ctypes[eptr[-1]] & ctype_word) != 0); 06139 cur_is_word = (eptr < md->end_subject) && 06140 ((md->ctypes[*eptr] & ctype_word) != 0); 06141 } 06142 06143 /* Now see if the situation is what we want */ 06144 06145 if ((*ecode++ == OP_WORD_BOUNDARY)? 
06146 cur_is_word == prev_is_word : cur_is_word != prev_is_word) 06147 RRETURN(MATCH_NOMATCH); 06148 } 06149 break; 06150 06151 /* Match a single character type; inline for speed */ 06152 06153 case OP_ANY: 06154 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE) 06155 RRETURN(MATCH_NOMATCH); 06156 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); 06157 ecode++; 06158 break; 06159 06160 /* Match a single byte, even in UTF-8 mode. This opcode really does match 06161 any byte, even newline, independent of the setting of PCRE_DOTALL. */ 06162 06163 case OP_ANYBYTE: 06164 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); 06165 ecode++; 06166 break; 06167 06168 case OP_NOT_DIGIT: 06169 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 06170 GETCHARINCTEST(c, eptr); 06171 if ( 06172 (md->ctypes[c] & ctype_digit) != 0 06173 ) 06174 RRETURN(MATCH_NOMATCH); 06175 ecode++; 06176 break; 06177 06178 case OP_DIGIT: 06179 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 06180 GETCHARINCTEST(c, eptr); 06181 if ( 06182 (md->ctypes[c] & ctype_digit) == 0 06183 ) 06184 RRETURN(MATCH_NOMATCH); 06185 ecode++; 06186 break; 06187 06188 case OP_NOT_WHITESPACE: 06189 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 06190 GETCHARINCTEST(c, eptr); 06191 if ( 06192 (md->ctypes[c] & ctype_space) != 0 06193 ) 06194 RRETURN(MATCH_NOMATCH); 06195 ecode++; 06196 break; 06197 06198 case OP_WHITESPACE: 06199 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 06200 GETCHARINCTEST(c, eptr); 06201 if ( 06202 (md->ctypes[c] & ctype_space) == 0 06203 ) 06204 RRETURN(MATCH_NOMATCH); 06205 ecode++; 06206 break; 06207 06208 case OP_NOT_WORDCHAR: 06209 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 06210 GETCHARINCTEST(c, eptr); 06211 if ( 06212 (md->ctypes[c] & ctype_word) != 0 06213 ) 06214 RRETURN(MATCH_NOMATCH); 06215 ecode++; 06216 break; 06217 06218 case OP_WORDCHAR: 06219 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 06220 GETCHARINCTEST(c, 
eptr); 06221 if ( 06222 (md->ctypes[c] & ctype_word) == 0 06223 ) 06224 RRETURN(MATCH_NOMATCH); 06225 ecode++; 06226 break; 06227 06228 /* Match a back reference, possibly repeatedly. Look past the end of the 06229 item to see if there is repeat information following. The code is similar 06230 to that for character classes, but repeated for efficiency. Then obey 06231 similar code to character type repeats - written out again for speed. 06232 However, if the referenced string is the empty string, always treat 06233 it as matched, any number of times (otherwise there could be infinite 06234 loops). */ 06235 06236 case OP_REF: 06237 { 06238 offset = GET2(ecode, 1) << 1; /* Doubled ref number */ 06239 ecode += 3; /* Advance past item */ 06240 06241 /* If the reference is unset, set the length to be longer than the amount 06242 of subject left; this ensures that every attempt at a match fails. We 06243 can't just fail here, because of the possibility of quantifiers with zero 06244 minima. */ 06245 06246 length = (offset >= offset_top || md->offset_vector[offset] < 0)? 
06247 md->end_subject - eptr + 1 : 06248 md->offset_vector[offset+1] - md->offset_vector[offset]; 06249 06250 /* Set up for repetition, or handle the non-repeated case */ 06251 06252 switch (*ecode) 06253 { 06254 case OP_CRSTAR: 06255 case OP_CRMINSTAR: 06256 case OP_CRPLUS: 06257 case OP_CRMINPLUS: 06258 case OP_CRQUERY: 06259 case OP_CRMINQUERY: 06260 c = *ecode++ - OP_CRSTAR; 06261 minimize = (c & 1) != 0; 06262 min = rep_min[c]; /* Pick up values from tables; */ 06263 max = rep_max[c]; /* zero for max => infinity */ 06264 if (max == 0) max = INT_MAX; 06265 break; 06266 06267 case OP_CRRANGE: 06268 case OP_CRMINRANGE: 06269 minimize = (*ecode == OP_CRMINRANGE); 06270 min = GET2(ecode, 1); 06271 max = GET2(ecode, 3); 06272 if (max == 0) max = INT_MAX; 06273 ecode += 5; 06274 break; 06275 06276 default: /* No repeat follows */ 06277 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); 06278 eptr += length; 06279 continue; /* With the main loop */ 06280 } 06281 06282 /* If the length of the reference is zero, just continue with the 06283 main loop. */ 06284 06285 if (length == 0) continue; 06286 06287 /* First, ensure the minimum number of matches are present. We get back 06288 the length of the reference string explicitly rather than passing the 06289 address of eptr, so that eptr can be a register variable. */ 06290 06291 for (i = 1; i <= min; i++) 06292 { 06293 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); 06294 eptr += length; 06295 } 06296 06297 /* If min = max, continue at the same level without recursion. 06298 They are not both allowed to be zero. 
*/ 06299 06300 if (min == max) continue; 06301 06302 /* If minimizing, keep trying and advancing the pointer */ 06303 06304 if (minimize) 06305 { 06306 for (fi = min;; fi++) 06307 { 06308 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 06309 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06310 if (fi >= max || !match_ref(offset, eptr, length, md, ims)) 06311 RRETURN(MATCH_NOMATCH); 06312 eptr += length; 06313 } 06314 /* Control never gets here */ 06315 } 06316 06317 /* If maximizing, find the longest string and work backwards */ 06318 06319 else 06320 { 06321 pp = eptr; 06322 for (i = min; i < max; i++) 06323 { 06324 if (!match_ref(offset, eptr, length, md, ims)) break; 06325 eptr += length; 06326 } 06327 while (eptr >= pp) 06328 { 06329 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 06330 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06331 eptr -= length; 06332 } 06333 RRETURN(MATCH_NOMATCH); 06334 } 06335 } 06336 /* Control never gets here */ 06337 06338 06339 06340 /* Match a bit-mapped character class, possibly repeatedly. This op code is 06341 used when all the characters in the class have values in the range 0-255. 06342 The only difference between OP_CLASS and OP_NCLASS occurs when a data 06343 character outside the range is encountered. 06344 06345 First, look past the end of the item to see if there is repeat information 06346 following. Then obey similar code to character type repeats - written out 06347 again for speed. 
*/ 06348 06349 case OP_NCLASS: 06350 case OP_CLASS: 06351 { 06352 data = ecode + 1; /* Save for matching */ 06353 ecode += 33; /* Advance past the item */ 06354 06355 switch (*ecode) 06356 { 06357 case OP_CRSTAR: 06358 case OP_CRMINSTAR: 06359 case OP_CRPLUS: 06360 case OP_CRMINPLUS: 06361 case OP_CRQUERY: 06362 case OP_CRMINQUERY: 06363 c = *ecode++ - OP_CRSTAR; 06364 minimize = (c & 1) != 0; 06365 min = rep_min[c]; /* Pick up values from tables; */ 06366 max = rep_max[c]; /* zero for max => infinity */ 06367 if (max == 0) max = INT_MAX; 06368 break; 06369 06370 case OP_CRRANGE: 06371 case OP_CRMINRANGE: 06372 minimize = (*ecode == OP_CRMINRANGE); 06373 min = GET2(ecode, 1); 06374 max = GET2(ecode, 3); 06375 if (max == 0) max = INT_MAX; 06376 ecode += 5; 06377 break; 06378 06379 default: /* No repeat follows */ 06380 min = max = 1; 06381 break; 06382 } 06383 06384 /* First, ensure the minimum number of matches are present. */ 06385 06386 /* Not UTF-8 mode */ 06387 { 06388 for (i = 1; i <= min; i++) 06389 { 06390 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 06391 c = *eptr++; 06392 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 06393 } 06394 } 06395 06396 /* If max == min we can continue with the main loop without the 06397 need to recurse. */ 06398 06399 if (min == max) continue; 06400 06401 /* If minimizing, keep testing the rest of the expression and advancing 06402 the pointer while it matches the class. */ 06403 06404 if (minimize) 06405 { 06406 /* Not UTF-8 mode */ 06407 { 06408 for (fi = min;; fi++) 06409 { 06410 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 06411 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06412 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 06413 c = *eptr++; 06414 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 06415 } 06416 } 06417 /* Control never gets here */ 06418 } 06419 06420 /* If maximizing, find the longest possible run, then work backwards. 
*/ 06421 06422 else 06423 { 06424 pp = eptr; 06425 06426 /* Not UTF-8 mode */ 06427 { 06428 for (i = min; i < max; i++) 06429 { 06430 if (eptr >= md->end_subject) break; 06431 c = *eptr; 06432 if ((data[c/8] & (1 << (c&7))) == 0) break; 06433 eptr++; 06434 } 06435 while (eptr >= pp) 06436 { 06437 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 06438 eptr--; 06439 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06440 } 06441 } 06442 06443 RRETURN(MATCH_NOMATCH); 06444 } 06445 } 06446 /* Control never gets here */ 06447 06448 06449 /* Match an extended character class. This opcode is encountered only 06450 in UTF-8 mode, because that's the only time it is compiled. */ 06451 06452 06453 /* Match a run of characters */ 06454 06455 case OP_CHARS: 06456 { 06457 register int slen = ecode[1]; 06458 ecode += 2; 06459 06460 if (slen > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); 06461 if ((ims & PCRE_CASELESS) != 0) 06462 { 06463 while (slen-- > 0) 06464 if (md->lcc[*ecode++] != md->lcc[*eptr++]) 06465 RRETURN(MATCH_NOMATCH); 06466 } 06467 else 06468 { 06469 while (slen-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH); 06470 } 06471 } 06472 break; 06473 06474 /* Match a single character repeatedly; different opcodes share code. */ 06475 06476 case OP_EXACT: 06477 min = max = GET2(ecode, 1); 06478 ecode += 3; 06479 goto REPEATCHAR; 06480 06481 case OP_UPTO: 06482 case OP_MINUPTO: 06483 min = 0; 06484 max = GET2(ecode, 1); 06485 minimize = *ecode == OP_MINUPTO; 06486 ecode += 3; 06487 goto REPEATCHAR; 06488 06489 case OP_STAR: 06490 case OP_MINSTAR: 06491 case OP_PLUS: 06492 case OP_MINPLUS: 06493 case OP_QUERY: 06494 case OP_MINQUERY: 06495 c = *ecode++ - OP_STAR; 06496 minimize = (c & 1) != 0; 06497 min = rep_min[c]; /* Pick up values from tables; */ 06498 max = rep_max[c]; /* zero for max => infinity */ 06499 if (max == 0) max = INT_MAX; 06500 06501 /* Common code for all repeated single-character matches. 
We can give 06502 up quickly if there are fewer than the minimum number of characters left in 06503 the subject. */ 06504 06505 REPEATCHAR: 06506 06507 /* When not in UTF-8 mode, load a single-byte character. */ 06508 { 06509 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); 06510 fc = *ecode++; 06511 } 06512 06513 /* The value of fc at this point is always less than 256, though we may or 06514 may not be in UTF-8 mode. The code is duplicated for the caseless and 06515 caseful cases, for speed, since matching characters is likely to be quite 06516 common. First, ensure the minimum number of matches are present. If min = 06517 max, continue at the same level without recursing. Otherwise, if 06518 minimizing, keep trying the rest of the expression and advancing one 06519 matching character if failing, up to the maximum. Alternatively, if 06520 maximizing, find the maximum number of characters and work backwards. */ 06521 06522 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max, 06523 max, eptr)); 06524 06525 if ((ims & PCRE_CASELESS) != 0) 06526 { 06527 fc = md->lcc[fc]; 06528 for (i = 1; i <= min; i++) 06529 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); 06530 if (min == max) continue; 06531 if (minimize) 06532 { 06533 for (fi = min;; fi++) 06534 { 06535 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 06536 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06537 if (fi >= max || eptr >= md->end_subject || 06538 fc != md->lcc[*eptr++]) 06539 RRETURN(MATCH_NOMATCH); 06540 } 06541 /* Control never gets here */ 06542 } 06543 else 06544 { 06545 pp = eptr; 06546 for (i = min; i < max; i++) 06547 { 06548 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break; 06549 eptr++; 06550 } 06551 while (eptr >= pp) 06552 { 06553 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 06554 eptr--; 06555 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06556 } 06557 RRETURN(MATCH_NOMATCH); 06558 } 06559 /* Control never gets here */ 06560 } 06561 06562 /* 
Caseful comparisons (includes all multi-byte characters) */ 06563 06564 else 06565 { 06566 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH); 06567 if (min == max) continue; 06568 if (minimize) 06569 { 06570 for (fi = min;; fi++) 06571 { 06572 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 06573 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06574 if (fi >= max || eptr >= md->end_subject || fc != *eptr++) 06575 RRETURN(MATCH_NOMATCH); 06576 } 06577 /* Control never gets here */ 06578 } 06579 else 06580 { 06581 pp = eptr; 06582 for (i = min; i < max; i++) 06583 { 06584 if (eptr >= md->end_subject || fc != *eptr) break; 06585 eptr++; 06586 } 06587 while (eptr >= pp) 06588 { 06589 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 06590 eptr--; 06591 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06592 } 06593 RRETURN(MATCH_NOMATCH); 06594 } 06595 } 06596 /* Control never gets here */ 06597 06598 /* Match a negated single one-byte character. The character we are 06599 checking can be multibyte. */ 06600 06601 case OP_NOT: 06602 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 06603 ecode++; 06604 GETCHARINCTEST(c, eptr); 06605 if ((ims & PCRE_CASELESS) != 0) 06606 { 06607 c = md->lcc[c]; 06608 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH); 06609 } 06610 else 06611 { 06612 if (*ecode++ == c) RRETURN(MATCH_NOMATCH); 06613 } 06614 break; 06615 06616 /* Match a negated single one-byte character repeatedly. This is almost a 06617 repeat of the code for a repeated single character, but I haven't found a 06618 nice way of commoning these up that doesn't require a test of the 06619 positive/negative option for each character match. Maybe that wouldn't add 06620 very much to the time taken, but character matching *is* what this is all 06621 about... 
*/ 06622 06623 case OP_NOTEXACT: 06624 min = max = GET2(ecode, 1); 06625 ecode += 3; 06626 goto REPEATNOTCHAR; 06627 06628 case OP_NOTUPTO: 06629 case OP_NOTMINUPTO: 06630 min = 0; 06631 max = GET2(ecode, 1); 06632 minimize = *ecode == OP_NOTMINUPTO; 06633 ecode += 3; 06634 goto REPEATNOTCHAR; 06635 06636 case OP_NOTSTAR: 06637 case OP_NOTMINSTAR: 06638 case OP_NOTPLUS: 06639 case OP_NOTMINPLUS: 06640 case OP_NOTQUERY: 06641 case OP_NOTMINQUERY: 06642 c = *ecode++ - OP_NOTSTAR; 06643 minimize = (c & 1) != 0; 06644 min = rep_min[c]; /* Pick up values from tables; */ 06645 max = rep_max[c]; /* zero for max => infinity */ 06646 if (max == 0) max = INT_MAX; 06647 06648 /* Common code for all repeated single-character (less than 255) matches. 06649 We can give up quickly if there are fewer than the minimum number of 06650 characters left in the subject. */ 06651 06652 REPEATNOTCHAR: 06653 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); 06654 fc = *ecode++; 06655 06656 /* The code is duplicated for the caseless and caseful cases, for speed, 06657 since matching characters is likely to be quite common. First, ensure the 06658 minimum number of matches are present. If min = max, continue at the same 06659 level without recursing. Otherwise, if minimizing, keep trying the rest of 06660 the expression and advancing one matching character if failing, up to the 06661 maximum. Alternatively, if maximizing, find the maximum number of 06662 characters and work backwards. 
*/ 06663 06664 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max, 06665 max, eptr)); 06666 06667 if ((ims & PCRE_CASELESS) != 0) 06668 { 06669 fc = md->lcc[fc]; 06670 06671 06672 /* Not UTF-8 mode */ 06673 { 06674 for (i = 1; i <= min; i++) 06675 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); 06676 } 06677 06678 if (min == max) continue; 06679 06680 if (minimize) 06681 { 06682 /* Not UTF-8 mode */ 06683 { 06684 for (fi = min;; fi++) 06685 { 06686 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 06687 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06688 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++]) 06689 RRETURN(MATCH_NOMATCH); 06690 } 06691 } 06692 /* Control never gets here */ 06693 } 06694 06695 /* Maximize case */ 06696 06697 else 06698 { 06699 pp = eptr; 06700 06701 /* Not UTF-8 mode */ 06702 { 06703 for (i = min; i < max; i++) 06704 { 06705 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break; 06706 eptr++; 06707 } 06708 while (eptr >= pp) 06709 { 06710 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 06711 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06712 eptr--; 06713 } 06714 } 06715 06716 RRETURN(MATCH_NOMATCH); 06717 } 06718 /* Control never gets here */ 06719 } 06720 06721 /* Caseful comparisons */ 06722 06723 else 06724 { 06725 /* Not UTF-8 mode */ 06726 { 06727 for (i = 1; i <= min; i++) 06728 if (fc == *eptr++) RRETURN(MATCH_NOMATCH); 06729 } 06730 06731 if (min == max) continue; 06732 06733 if (minimize) 06734 { 06735 /* Not UTF-8 mode */ 06736 { 06737 for (fi = min;; fi++) 06738 { 06739 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 06740 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06741 if (fi >= max || eptr >= md->end_subject || fc == *eptr++) 06742 RRETURN(MATCH_NOMATCH); 06743 } 06744 } 06745 /* Control never gets here */ 06746 } 06747 06748 /* Maximize case */ 06749 06750 else 06751 { 06752 pp = eptr; 06753 06754 /* Not UTF-8 mode */ 06755 { 06756 for (i = min; i < max; i++) 
06757 { 06758 if (eptr >= md->end_subject || fc == *eptr) break; 06759 eptr++; 06760 } 06761 while (eptr >= pp) 06762 { 06763 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 06764 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06765 eptr--; 06766 } 06767 } 06768 06769 RRETURN(MATCH_NOMATCH); 06770 } 06771 } 06772 /* Control never gets here */ 06773 06774 /* Match a single character type repeatedly; several different opcodes 06775 share code. This is very similar to the code for single characters, but we 06776 repeat it in the interests of efficiency. */ 06777 06778 case OP_TYPEEXACT: 06779 min = max = GET2(ecode, 1); 06780 minimize = true; 06781 ecode += 3; 06782 goto REPEATTYPE; 06783 06784 case OP_TYPEUPTO: 06785 case OP_TYPEMINUPTO: 06786 min = 0; 06787 max = GET2(ecode, 1); 06788 minimize = *ecode == OP_TYPEMINUPTO; 06789 ecode += 3; 06790 goto REPEATTYPE; 06791 06792 case OP_TYPESTAR: 06793 case OP_TYPEMINSTAR: 06794 case OP_TYPEPLUS: 06795 case OP_TYPEMINPLUS: 06796 case OP_TYPEQUERY: 06797 case OP_TYPEMINQUERY: 06798 c = *ecode++ - OP_TYPESTAR; 06799 minimize = (c & 1) != 0; 06800 min = rep_min[c]; /* Pick up values from tables; */ 06801 max = rep_max[c]; /* zero for max => infinity */ 06802 if (max == 0) max = INT_MAX; 06803 06804 /* Common code for all repeated single character type matches. Note that 06805 in UTF-8 mode, '.' matches a character of any length, but for the other 06806 character types, the valid characters are all one-byte long. */ 06807 06808 REPEATTYPE: 06809 ctype = *ecode++; /* Code for the character type */ 06810 06811 /* First, ensure the minimum number of matches are present. Use inline 06812 code for maximizing the speed, and do the type test once at the start 06813 (i.e. keep it out of the loop). Also we can test that there are at least 06814 the minimum number of bytes before we start. This isn't as effective in 06815 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that 06816 is tidier. 
*/ 06817 06818 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); 06819 if (min > 0) 06820 { 06821 06822 /* Code for the non-UTF-8 case for minimum matching */ 06823 06824 switch(ctype) 06825 { 06826 case OP_ANY: 06827 if ((ims & PCRE_DOTALL) == 0) 06828 { 06829 for (i = 1; i <= min; i++) 06830 if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH); 06831 } 06832 else eptr += min; 06833 break; 06834 06835 case OP_ANYBYTE: 06836 eptr += min; 06837 break; 06838 06839 case OP_NOT_DIGIT: 06840 for (i = 1; i <= min; i++) 06841 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); 06842 break; 06843 06844 case OP_DIGIT: 06845 for (i = 1; i <= min; i++) 06846 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); 06847 break; 06848 06849 case OP_NOT_WHITESPACE: 06850 for (i = 1; i <= min; i++) 06851 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); 06852 break; 06853 06854 case OP_WHITESPACE: 06855 for (i = 1; i <= min; i++) 06856 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); 06857 break; 06858 06859 case OP_NOT_WORDCHAR: 06860 for (i = 1; i <= min; i++) 06861 if ((md->ctypes[*eptr++] & ctype_word) != 0) 06862 RRETURN(MATCH_NOMATCH); 06863 break; 06864 06865 case OP_WORDCHAR: 06866 for (i = 1; i <= min; i++) 06867 if ((md->ctypes[*eptr++] & ctype_word) == 0) 06868 RRETURN(MATCH_NOMATCH); 06869 break; 06870 } 06871 } 06872 06873 /* If min = max, continue at the same level without recursing */ 06874 06875 if (min == max) continue; 06876 06877 /* If minimizing, we have to test the rest of the pattern before each 06878 subsequent match. Again, separate the UTF-8 case for speed. 
*/ 06879 06880 if (minimize) 06881 { 06882 /* Not UTF-8 mode */ 06883 { 06884 for (fi = min;; fi++) 06885 { 06886 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 06887 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 06888 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 06889 c = *eptr++; 06890 switch(ctype) 06891 { 06892 case OP_ANY: 06893 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH); 06894 break; 06895 06896 case OP_ANYBYTE: 06897 break; 06898 06899 case OP_NOT_DIGIT: 06900 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); 06901 break; 06902 06903 case OP_DIGIT: 06904 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); 06905 break; 06906 06907 case OP_NOT_WHITESPACE: 06908 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); 06909 break; 06910 06911 case OP_WHITESPACE: 06912 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); 06913 break; 06914 06915 case OP_NOT_WORDCHAR: 06916 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); 06917 break; 06918 06919 case OP_WORDCHAR: 06920 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); 06921 break; 06922 } 06923 } 06924 } 06925 /* Control never gets here */ 06926 } 06927 06928 /* If maximizing it is worth using inline code for speed, doing the type 06929 test once at the start (i.e. keep it out of the loop). Again, keep the 06930 UTF-8 stuff separate. 
*/ 06931 06932 else 06933 { 06934 pp = eptr; 06935 06936 /* Not UTF-8 mode */ 06937 { 06938 switch(ctype) 06939 { 06940 case OP_ANY: 06941 if ((ims & PCRE_DOTALL) == 0) 06942 { 06943 for (i = min; i < max; i++) 06944 { 06945 if (eptr >= md->end_subject || *eptr == NEWLINE) break; 06946 eptr++; 06947 } 06948 break; 06949 } 06950 /* For DOTALL case, fall through and treat as \C */ 06951 06952 case OP_ANYBYTE: 06953 c = max - min; 06954 if (c > md->end_subject - eptr) c = md->end_subject - eptr; 06955 eptr += c; 06956 break; 06957 06958 case OP_NOT_DIGIT: 06959 for (i = min; i < max; i++) 06960 { 06961 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0) 06962 break; 06963 eptr++; 06964 } 06965 break; 06966 06967 case OP_DIGIT: 06968 for (i = min; i < max; i++) 06969 { 06970 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0) 06971 break; 06972 eptr++; 06973 } 06974 break; 06975 06976 case OP_NOT_WHITESPACE: 06977 for (i = min; i < max; i++) 06978 { 06979 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0) 06980 break; 06981 eptr++; 06982 } 06983 break; 06984 06985 case OP_WHITESPACE: 06986 for (i = min; i < max; i++) 06987 { 06988 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0) 06989 break; 06990 eptr++; 06991 } 06992 break; 06993 06994 case OP_NOT_WORDCHAR: 06995 for (i = min; i < max; i++) 06996 { 06997 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0) 06998 break; 06999 eptr++; 07000 } 07001 break; 07002 07003 case OP_WORDCHAR: 07004 for (i = min; i < max; i++) 07005 { 07006 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0) 07007 break; 07008 eptr++; 07009 } 07010 break; 07011 } 07012 07013 /* eptr is now past the end of the maximum run */ 07014 07015 while (eptr >= pp) 07016 { 07017 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 07018 eptr--; 07019 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 07020 } 07021 } 07022 07023 /* Get here if we 
can't make it match with any permitted repetitions */ 07024 07025 RRETURN(MATCH_NOMATCH); 07026 } 07027 /* Control never gets here */ 07028 07029 /* There's been some horrible disaster. Since all codes > OP_BRA are 07030 for capturing brackets, and there shouldn't be any gaps between 0 and 07031 OP_BRA, arrival here can only mean there is something seriously wrong 07032 in the code above or the OP_xxx definitions. */ 07033 07034 default: 07035 DPRINTF(("Unknown opcode %d\n", *ecode)); 07036 RRETURN(PCRE_ERROR_UNKNOWN_NODE); 07037 } 07038 07039 /* Do not stick any code in here without much thought; it is assumed 07040 that "continue" in the code above comes out to here to repeat the main 07041 loop. */ 07042 07043 } /* End of main loop */ 07044 RRETURN(MATCH_NOMATCH); 07045 } 07046
static bool match_ref(int offset,
                      register const uschar *eptr,
                      int length,
                      match_data *md,
                      unsigned long int ims) [static]
Definition at line 5323 of file pcre.cpp.
References md, and PCRE_CASELESS.
Referenced by match().
05324 { 05325 const uschar *p = md->start_subject + md->offset_vector[offset]; 05326 05327 /* Always fail if not enough characters left */ 05328 05329 if (length > md->end_subject - eptr) return false; 05330 05331 /* Separate the caselesss case for speed */ 05332 05333 if ((ims & PCRE_CASELESS) != 0) 05334 { 05335 while (length-- > 0) 05336 if (md->lcc[*p++] != md->lcc[*eptr++]) return false; 05337 } 05338 else 05339 { while (length-- > 0) if (*p++ != *eptr++) return false; } 05340 05341 return true; 05342 } 05343
pcre* pcre_compile(const char *pattern,
                   int options,
                   const char **errorptr,
                   int *erroroffset,
                   const unsigned char *tables)
Definition at line 4365 of file pcre.cpp.
References compile_data::backref_map, BRASTACK_SIZE, compile_data::cbits, cbits_offset, check_escape(), check_posix_syntax(), compile_regex(), ctype_digit, ctype_meta, ctype_space, ctype_word, compile_data::ctypes, ctypes_offset, digitab, DPRINTF, ERR12, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20, ERR21, ERR22, ERR23, ERR24, ERR26, ERR28, ERR29, ERR32, ERR39, ERR41, ERR42, ERR6, ESC_b, ESC_Q, ESC_REF, EXTRACT_BASIC_MAX, compile_data::fcc, fcc_offset, find_firstassertedchar(), real_pcre::first_byte, is_anchored(), is_counted_repeat(), is_startline(), compile_data::lcc, lcc_offset, LINK_SIZE, real_pcre::magic_number, MAGIC_NUMBER, MAX_PATTERN_SIZE, MAXLIT, real_pcre::name_count, real_pcre::name_entry_size, compile_data::name_entry_size, compile_data::name_table, compile_data::names_found, NEWLINE, OP_BRA, OP_END, real_pcre::options, PCRE_ANCHORED, PCRE_CASELESS, pcre_default_tables, PCRE_DOTALL, PCRE_EXTENDED, PCRE_EXTRA, PCRE_FIRSTSET, PCRE_ICHANGED, PCRE_IMS, PCRE_MULTILINE, PCRE_NO_AUTO_CAPTURE, PCRE_REQCHSET, PCRE_STARTLINE, PCRE_UNGREEDY, PCRE_UTF8, PUBLIC_OPTIONS, read_repeat_counts(), real_pcre::req_byte, REQ_CASELESS, REQ_VARY, compile_data::req_varyopt, real_pcre::size, compile_data::start_code, real_pcre::tables, compile_data::top_backref, real_pcre::top_backref, and real_pcre::top_bracket.
Referenced by CF_HAND(), check_filter(), real_regmatch(), real_regrab(), and regexp_match().
04366 { 04367 real_pcre *re; 04368 int length = 1 + LINK_SIZE; /* For initial BRA plus length */ 04369 int runlength; 04370 int c, firstbyte, reqbyte; 04371 int bracount = 0; 04372 int branch_extra = 0; 04373 int branch_newextra; 04374 int item_count = -1; 04375 int name_count = 0; 04376 int max_name_size = 0; 04377 bool inescq = false; 04378 unsigned int brastackptr = 0; 04379 size_t size; 04380 uschar *code; 04381 const uschar *codestart; 04382 const uschar *ptr; 04383 compile_data compile_block; 04384 int brastack[BRASTACK_SIZE]; 04385 uschar bralenstack[BRASTACK_SIZE]; 04386 04387 /* We can't pass back an error message if errorptr is NULL; I guess the best we 04388 can do is just return NULL. */ 04389 04390 if (errorptr == NULL) return NULL; 04391 *errorptr = NULL; 04392 04393 /* However, we can give a message for this error */ 04394 04395 if (erroroffset == NULL) 04396 { 04397 *errorptr = ERR16; 04398 return NULL; 04399 } 04400 *erroroffset = 0; 04401 04402 /* Can't support UTF8 unless PCRE has been compiled to include the code. */ 04403 04404 if ((options & PCRE_UTF8) != 0) 04405 { 04406 *errorptr = ERR32; 04407 return NULL; 04408 } 04409 04410 if ((options & ~PUBLIC_OPTIONS) != 0) 04411 { 04412 *errorptr = ERR17; 04413 return NULL; 04414 } 04415 04416 /* Set up pointers to the individual character tables */ 04417 04418 if (tables == NULL) tables = pcre_default_tables; 04419 compile_block.lcc = tables + lcc_offset; 04420 compile_block.fcc = tables + fcc_offset; 04421 compile_block.cbits = tables + cbits_offset; 04422 compile_block.ctypes = tables + ctypes_offset; 04423 04424 /* Maximum back reference and backref bitmap. This is updated for numeric 04425 references during the first pass, but for named references during the actual 04426 compile pass. The bitmap records up to 31 back references to help in deciding 04427 whether (.*) can be treated as anchored or not. 
*/ 04428 04429 compile_block.top_backref = 0; 04430 compile_block.backref_map = 0; 04431 04432 /* Reflect pattern for debugging output */ 04433 04434 DPRINTF(("------------------------------------------------------------------\n")); 04435 DPRINTF(("%s\n", pattern)); 04436 04437 /* The first thing to do is to make a pass over the pattern to compute the 04438 amount of store required to hold the compiled code. This does not have to be 04439 perfect as long as errors are overestimates. At the same time we can detect any 04440 flag settings right at the start, and extract them. Make an attempt to correct 04441 for any counted white space if an "extended" flag setting appears late in the 04442 pattern. We can't be so clever for #-comments. */ 04443 04444 ptr = (const uschar *)(pattern - 1); 04445 while ((c = *(++ptr)) != 0) 04446 { 04447 int min, max; 04448 #if defined(WIN32) && (_MSC_VER == 1200) && defined(_M_IX86) && !defined(__INTEL_COMPILER) 04449 // The addition of 'volatile' works around a bug in Version 12.0 of 04450 // Microsoft's Visual C/C++ compiler (part of Visual Studio 6.0). Without 04451 // volatile, class_optcount is calculated properly, but the compiler 04452 // clobbers the EAX register before tests it as class_optcount. 04453 // 04454 // This is not a problem with the Intel Compiler. 04455 // 04456 volatile int class_optcount; 04457 #else 04458 int class_optcount; 04459 #endif 04460 int bracket_length; 04461 int duplength; 04462 04463 /* If we are inside a \Q...\E sequence, all chars are literal */ 04464 04465 if (inescq) goto NORMAL_CHAR; 04466 04467 /* Otherwise, first check for ignored whitespace and comments */ 04468 04469 if ((options & PCRE_EXTENDED) != 0) 04470 { 04471 if ((compile_block.ctypes[c] & ctype_space) != 0) continue; 04472 if (c == '#') 04473 { 04474 /* The space before the ; is to avoid a warning on a silly compiler 04475 on the Macintosh. 
*/ 04476 while ((c = *(++ptr)) != 0 && c != NEWLINE) ; 04477 if (c == 0) break; 04478 continue; 04479 } 04480 } 04481 04482 item_count++; /* Is zero for the first non-comment item */ 04483 04484 switch(c) 04485 { 04486 /* A backslashed item may be an escaped "normal" character or a 04487 character type. For a "normal" character, put the pointers and 04488 character back so that tests for whitespace etc. in the input 04489 are done correctly. */ 04490 04491 case '\\': 04492 { 04493 const uschar *save_ptr = ptr; 04494 c = check_escape(&ptr, errorptr, bracount, options, false); 04495 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 04496 if (c >= 0) 04497 { 04498 ptr = save_ptr; 04499 c = '\\'; 04500 goto NORMAL_CHAR; 04501 } 04502 } 04503 04504 /* If \Q, enter "literal" mode */ 04505 04506 if (-c == ESC_Q) 04507 { 04508 inescq = true; 04509 continue; 04510 } 04511 04512 /* Other escapes need one byte, and are of length one for repeats */ 04513 04514 length++; 04515 04516 /* A back reference needs an additional 2 bytes, plus either one or 5 04517 bytes for a repeat. We also need to keep the value of the highest 04518 back reference. */ 04519 04520 if (c <= -ESC_REF) 04521 { 04522 int refnum = -c - ESC_REF; 04523 compile_block.backref_map |= (refnum < 32)? 
(1 << refnum) : 1; 04524 if (refnum > compile_block.top_backref) 04525 compile_block.top_backref = refnum; 04526 length += 2; /* For single back reference */ 04527 if (ptr[1] == '{' && is_counted_repeat(ptr+2)) 04528 { 04529 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr); 04530 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 04531 if ((min == 0 && (max == 1 || max == -1)) || 04532 (min == 1 && max == -1)) 04533 length++; 04534 else length += 5; 04535 if (ptr[1] == '?') ptr++; 04536 } 04537 } 04538 continue; 04539 04540 case '^': /* Single-byte metacharacters */ 04541 case '.': 04542 case '$': 04543 length++; 04544 continue; 04545 04546 case '*': /* These repeats won't be after brackets; */ 04547 case '+': /* those are handled separately */ 04548 case '?': 04549 length++; 04550 goto POSESSIVE; /* A few lines below */ 04551 04552 /* This covers the cases of braced repeats after a single char, metachar, 04553 class, or back reference. */ 04554 04555 case '{': 04556 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR; 04557 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr); 04558 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 04559 04560 /* These special cases just insert one extra opcode */ 04561 04562 if ((min == 0 && (max == 1 || max == -1)) || 04563 (min == 1 && max == -1)) 04564 length++; 04565 04566 /* These cases might insert additional copies of a preceding character. */ 04567 04568 else 04569 { 04570 04571 /* Not UTF-8 mode: all characters are one byte */ 04572 { 04573 if (min != 1) 04574 { 04575 length--; /* Uncount the original char or metachar */ 04576 if (min > 0) length += 4; 04577 } 04578 04579 length += (max > 0)? 
4 : 2; 04580 } 04581 } 04582 04583 if (ptr[1] == '?') ptr++; /* Needs no extra length */ 04584 04585 POSESSIVE: /* Test for possessive quantifier */ 04586 if (ptr[1] == '+') 04587 { 04588 ptr++; 04589 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */ 04590 } 04591 continue; 04592 04593 /* An alternation contains an offset to the next branch or ket. If any ims 04594 options changed in the previous branch(es), and/or if we are in a 04595 lookbehind assertion, extra space will be needed at the start of the 04596 branch. This is handled by branch_extra. */ 04597 04598 case '|': 04599 length += 1 + LINK_SIZE + branch_extra; 04600 continue; 04601 04602 /* A character class uses 33 characters provided that all the character 04603 values are less than 256. Otherwise, it uses a bit map for low valued 04604 characters, and individual items for others. Don't worry about character 04605 types that aren't allowed in classes - they'll get picked up during the 04606 compile. A character class that contains only one single-byte character 04607 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this 04608 where we can. (In UTF-8 mode we can do this only for chars < 128.) 
*/ 04609 04610 case '[': 04611 class_optcount = 0; 04612 04613 if (*(++ptr) == '^') ptr++; 04614 04615 /* Written as a "do" so that an initial ']' is taken as data */ 04616 04617 if (*ptr != 0) do 04618 { 04619 /* Inside \Q...\E everything is literal except \E */ 04620 04621 if (inescq) 04622 { 04623 if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER; 04624 inescq = false; 04625 ptr += 1; 04626 continue; 04627 } 04628 04629 /* Outside \Q...\E, check for escapes */ 04630 04631 if (*ptr == '\\') 04632 { 04633 int ch = check_escape(&ptr, errorptr, bracount, options, true); 04634 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 04635 04636 /* \b is backspace inside a class */ 04637 04638 if (-ch == ESC_b) ch = '\b'; 04639 04640 /* \Q enters quoting mode */ 04641 04642 if (-ch == ESC_Q) 04643 { 04644 inescq = true; 04645 continue; 04646 } 04647 04648 /* Handle escapes that turn into characters */ 04649 04650 if (ch >= 0) 04651 { 04652 class_optcount++; /* for possible optimization */ 04653 } 04654 else class_optcount = 10; /* \d, \s etc; make sure > 1 */ 04655 } 04656 04657 /* Check the syntax for POSIX stuff. The bits we actually handle are 04658 checked during the real compile phase. */ 04659 04660 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block)) 04661 { 04662 ptr++; 04663 class_optcount = 10; /* Make sure > 1 */ 04664 } 04665 04666 /* Anything else just increments the possible optimization count. If 04667 there are wide characters, we are going to have to use an XCLASS. */ 04668 04669 else 04670 { 04671 NON_SPECIAL_CHARACTER: 04672 class_optcount++; 04673 04674 } 04675 } 04676 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */ 04677 04678 if (*ptr == 0) /* Missing terminating ']' */ 04679 { 04680 *errorptr = ERR6; 04681 goto PCRE_ERROR_RETURN; 04682 } 04683 04684 /* We can optimize when there was only one optimizable character. 
Repeats 04685 for positive and negated single one-byte chars are handled by the general 04686 code. Here, we handle repeats for the class opcodes. */ 04687 04688 if (class_optcount == 1) length += 3; else 04689 { 04690 length += 33; 04691 04692 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier, 04693 we also need extra for wrapping the whole thing in a sub-pattern. */ 04694 04695 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2)) 04696 { 04697 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr); 04698 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 04699 if ((min == 0 && (max == 1 || max == -1)) || 04700 (min == 1 && max == -1)) 04701 length++; 04702 else length += 5; 04703 if (ptr[1] == '+') 04704 { 04705 ptr++; 04706 length += 2 + 2*LINK_SIZE; 04707 } 04708 else if (ptr[1] == '?') ptr++; 04709 } 04710 } 04711 continue; 04712 04713 /* Brackets may be genuine groups or special things */ 04714 04715 case '(': 04716 branch_newextra = 0; 04717 bracket_length = 1 + LINK_SIZE; 04718 04719 /* Handle special forms of bracket, which all start (? */ 04720 04721 if (ptr[1] == '?') 04722 { 04723 int set, unset; 04724 int *optset; 04725 04726 switch (c = ptr[2]) 04727 { 04728 /* Skip over comments entirely */ 04729 case '#': 04730 ptr += 3; 04731 while (*ptr != 0 && *ptr != ')') ptr++; 04732 if (*ptr == 0) 04733 { 04734 *errorptr = ERR18; 04735 goto PCRE_ERROR_RETURN; 04736 } 04737 continue; 04738 04739 /* Non-referencing groups and lookaheads just move the pointer on, and 04740 then behave like a non-special bracket, except that they don't increment 04741 the count of extracting brackets. Ditto for the "once only" bracket, 04742 which is in Perl from version 5.005. */ 04743 04744 case ':': 04745 case '=': 04746 case '!': 04747 case '>': 04748 ptr += 2; 04749 break; 04750 04751 /* (?R) specifies a recursive call to the regex, which is an extension 04752 to provide the facility which can be obtained by (?p{perl-code}) in 04753 Perl 5.6. 
In Perl 5.8 this has become (??{perl-code}). 04754 04755 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to 04756 the appropriate numbered brackets. This includes both recursive and 04757 non-recursive calls. (?R) is now synonymous with (?0). */ 04758 04759 case 'R': 04760 ptr++; 04761 04762 case '0': case '1': case '2': case '3': case '4': 04763 case '5': case '6': case '7': case '8': case '9': 04764 ptr += 2; 04765 if (c != 'R') 04766 while ((digitab[*(++ptr)] & ctype_digit) != 0); 04767 if (*ptr != ')') 04768 { 04769 *errorptr = ERR29; 04770 goto PCRE_ERROR_RETURN; 04771 } 04772 length += 1 + LINK_SIZE; 04773 04774 /* If this item is quantified, it will get wrapped inside brackets so 04775 as to use the code for quantified brackets. We jump down and use the 04776 code that handles this for real brackets. */ 04777 04778 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{') 04779 { 04780 length += 2 + 2 * LINK_SIZE; /* to make bracketed */ 04781 duplength = 5 + 3 * LINK_SIZE; 04782 goto HANDLE_QUANTIFIED_BRACKETS; 04783 } 04784 continue; 04785 04786 /* (?C) is an extension which provides "callout" - to provide a bit of 04787 the functionality of the Perl (?{...}) feature. An optional number may 04788 follow (default is zero). 
*/ 04789 04790 case 'C': 04791 ptr += 2; 04792 while ((digitab[*(++ptr)] & ctype_digit) != 0); 04793 if (*ptr != ')') 04794 { 04795 *errorptr = ERR39; 04796 goto PCRE_ERROR_RETURN; 04797 } 04798 length += 2; 04799 continue; 04800 04801 /* Named subpatterns are an extension copied from Python */ 04802 04803 case 'P': 04804 ptr += 3; 04805 if (*ptr == '<') 04806 { 04807 const uschar *p; /* Don't amalgamate; some compilers */ 04808 p = ++ptr; /* grumble at autoincrement in declaration */ 04809 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++; 04810 if (*ptr != '>') 04811 { 04812 *errorptr = ERR42; 04813 goto PCRE_ERROR_RETURN; 04814 } 04815 name_count++; 04816 if (ptr - p > max_name_size) max_name_size = (ptr - p); 04817 break; 04818 } 04819 04820 if (*ptr == '=' || *ptr == '>') 04821 { 04822 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0); 04823 if (*ptr != ')') 04824 { 04825 *errorptr = ERR42; 04826 goto PCRE_ERROR_RETURN; 04827 } 04828 break; 04829 } 04830 04831 /* Unknown character after (?P */ 04832 04833 *errorptr = ERR41; 04834 goto PCRE_ERROR_RETURN; 04835 04836 /* Lookbehinds are in Perl from version 5.005 */ 04837 04838 case '<': 04839 ptr += 3; 04840 if (*ptr == '=' || *ptr == '!') 04841 { 04842 branch_newextra = 1 + LINK_SIZE; 04843 length += 1 + LINK_SIZE; /* For the first branch */ 04844 break; 04845 } 04846 *errorptr = ERR24; 04847 goto PCRE_ERROR_RETURN; 04848 04849 /* Conditionals are in Perl from version 5.005. The bracket must either 04850 be followed by a number (for bracket reference) or by an assertion 04851 group, or (a PCRE extension) by 'R' for a recursion test. 
*/ 04852 04853 case '(': 04854 if (ptr[3] == 'R' && ptr[4] == ')') 04855 { 04856 ptr += 4; 04857 length += 3; 04858 } 04859 else if ((digitab[ptr[3]] & ctype_digit) != 0) 04860 { 04861 ptr += 4; 04862 length += 3; 04863 while ((digitab[*ptr] & ctype_digit) != 0) ptr++; 04864 if (*ptr != ')') 04865 { 04866 *errorptr = ERR26; 04867 goto PCRE_ERROR_RETURN; 04868 } 04869 } 04870 else /* An assertion must follow */ 04871 { 04872 ptr++; /* Can treat like ':' as far as spacing is concerned */ 04873 if (ptr[2] != '?' || 04874 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') ) 04875 { 04876 ptr += 2; /* To get right offset in message */ 04877 *errorptr = ERR28; 04878 goto PCRE_ERROR_RETURN; 04879 } 04880 } 04881 break; 04882 04883 /* Else loop checking valid options until ) is met. Anything else is an 04884 error. If we are without any brackets, i.e. at top level, the settings 04885 act as if specified in the options, so massage the options immediately. 04886 This is for backward compatibility with Perl 5.004. */ 04887 04888 default: 04889 set = unset = 0; 04890 optset = &set; 04891 ptr += 2; 04892 04893 for (;; ptr++) 04894 { 04895 c = *ptr; 04896 switch (c) 04897 { 04898 case 'i': 04899 *optset |= PCRE_CASELESS; 04900 continue; 04901 04902 case 'm': 04903 *optset |= PCRE_MULTILINE; 04904 continue; 04905 04906 case 's': 04907 *optset |= PCRE_DOTALL; 04908 continue; 04909 04910 case 'x': 04911 *optset |= PCRE_EXTENDED; 04912 continue; 04913 04914 case 'X': 04915 *optset |= PCRE_EXTRA; 04916 continue; 04917 04918 case 'U': 04919 *optset |= PCRE_UNGREEDY; 04920 continue; 04921 04922 case '-': 04923 optset = &unset; 04924 continue; 04925 04926 /* A termination by ')' indicates an options-setting-only item; if 04927 this is at the very start of the pattern (indicated by item_count 04928 being zero), we use it to set the global options. This is helpful 04929 when analyzing the pattern for first characters, etc. 
Otherwise 04930 nothing is done here and it is handled during the compiling 04931 process. 04932 04933 [Historical note: Up to Perl 5.8, options settings at top level 04934 were always global settings, wherever they appeared in the pattern. 04935 That is, they were equivalent to an external setting. From 5.8 04936 onwards, they apply only to what follows (which is what you might 04937 expect).] */ 04938 04939 case ')': 04940 if (item_count == 0) 04941 { 04942 options = (options | set) & (~unset); 04943 set = unset = 0; /* To save length */ 04944 item_count--; /* To allow for several */ 04945 } 04946 04947 /* Fall through */ 04948 04949 /* A termination by ':' indicates the start of a nested group with 04950 the given options set. This is again handled at compile time, but 04951 we must allow for compiled space if any of the ims options are 04952 set. We also have to allow for resetting space at the end of 04953 the group, which is why 4 is added to the length and not just 2. 04954 If there are several changes of options within the same group, this 04955 will lead to an over-estimate on the length, but this shouldn't 04956 matter very much. We also have to allow for resetting options at 04957 the start of any alternations, which we do by setting 04958 branch_newextra to 2. Finally, we record whether the case-dependent 04959 flag ever changes within the regex. This is used by the "required 04960 character" code. */ 04961 04962 case ':': 04963 if (((set|unset) & PCRE_IMS) != 0) 04964 { 04965 length += 4; 04966 branch_newextra = 2; 04967 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED; 04968 } 04969 goto END_OPTIONS; 04970 04971 /* Unrecognized option character */ 04972 04973 default: 04974 *errorptr = ERR12; 04975 goto PCRE_ERROR_RETURN; 04976 } 04977 } 04978 04979 /* If we hit a closing bracket, that's it - this is a freestanding 04980 option-setting. We need to ensure that branch_extra is updated if 04981 necessary. 
The only values branch_newextra can have here are 0 or 2. 04982 If the value is 2, then branch_extra must either be 2 or 5, depending 04983 on whether this is a lookbehind group or not. */ 04984 04985 END_OPTIONS: 04986 if (c == ')') 04987 { 04988 if (branch_newextra == 2 && 04989 (branch_extra == 0 || branch_extra == 1+LINK_SIZE)) 04990 branch_extra += branch_newextra; 04991 continue; 04992 } 04993 04994 /* If options were terminated by ':' control comes here. Fall through 04995 to handle the group below. */ 04996 } 04997 } 04998 04999 /* Extracting brackets must be counted so we can process escapes in a 05000 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to 05001 need an additional 3 bytes of store per extracting bracket. However, if 05002 PCRE_NO_AUTO)CAPTURE is set, unadorned brackets become non-capturing, so we 05003 must leave the count alone (it will aways be zero). */ 05004 05005 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0) 05006 { 05007 bracount++; 05008 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3; 05009 } 05010 05011 /* Save length for computing whole length at end if there's a repeat that 05012 requires duplication of the group. Also save the current value of 05013 branch_extra, and start the new group with the new value. If non-zero, this 05014 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */ 05015 05016 if (brastackptr >= sizeof(brastack)/sizeof(int)) 05017 { 05018 *errorptr = ERR19; 05019 goto PCRE_ERROR_RETURN; 05020 } 05021 05022 bralenstack[brastackptr] = branch_extra; 05023 branch_extra = branch_newextra; 05024 05025 brastack[brastackptr++] = length; 05026 length += bracket_length; 05027 continue; 05028 05029 /* Handle ket. Look for subsequent max/min; for certain sets of values we 05030 have to replicate this bracket up to that many times. 
If brastackptr is 05031 0 this is an unmatched bracket which will generate an error, but take care 05032 not to try to access brastack[-1] when computing the length and restoring 05033 the branch_extra value. */ 05034 05035 case ')': 05036 length += 1 + LINK_SIZE; 05037 if (brastackptr > 0) 05038 { 05039 duplength = length - brastack[--brastackptr]; 05040 branch_extra = bralenstack[brastackptr]; 05041 } 05042 else duplength = 0; 05043 05044 /* The following code is also used when a recursion such as (?3) is 05045 followed by a quantifier, because in that case, it has to be wrapped inside 05046 brackets so that the quantifier works. The value of duplength must be 05047 set before arrival. */ 05048 05049 HANDLE_QUANTIFIED_BRACKETS: 05050 05051 /* Leave ptr at the final char; for read_repeat_counts this happens 05052 automatically; for the others we need an increment. */ 05053 05054 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2)) 05055 { 05056 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr); 05057 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 05058 } 05059 else if (c == '*') { min = 0; max = -1; ptr++; } 05060 else if (c == '+') { min = 1; max = -1; ptr++; } 05061 else if (c == '?') { min = 0; max = 1; ptr++; } 05062 else { min = 1; max = 1; } 05063 05064 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the 05065 group, and if the maximum is greater than zero, we have to replicate 05066 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting 05067 bracket set. */ 05068 05069 if (min == 0) 05070 { 05071 length++; 05072 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE); 05073 } 05074 05075 /* When the minimum is greater than zero, we have to replicate up to 05076 minval-1 times, with no additions required in the copies. 
Then, if there 05077 is a limited maximum we have to replicate up to maxval-1 times allowing 05078 for a BRAZERO item before each optional copy and nesting brackets for all 05079 but one of the optional copies. */ 05080 05081 else 05082 { 05083 length += (min - 1) * duplength; 05084 if (max > min) /* Need this test as max=-1 means no limit */ 05085 length += (max - min) * (duplength + 3 + 2*LINK_SIZE) 05086 - (2 + 2*LINK_SIZE); 05087 } 05088 05089 /* Allow space for once brackets for "possessive quantifier" */ 05090 05091 if (ptr[1] == '+') 05092 { 05093 ptr++; 05094 length += 2 + 2*LINK_SIZE; 05095 } 05096 continue; 05097 05098 /* Non-special character. For a run of such characters the length required 05099 is the number of characters + 2, except that the maximum run length is 05100 MAXLIT. We won't get a skipped space or a non-data escape or the start of a 05101 # comment as the first character, so the length can't be zero. */ 05102 05103 NORMAL_CHAR: 05104 default: 05105 length += 2; 05106 runlength = 0; 05107 do 05108 { 05109 05110 /* If in a \Q...\E sequence, check for end; otherwise it's a literal */ 05111 if (inescq) 05112 { 05113 if (c == '\\' && ptr[1] == 'E') 05114 { 05115 inescq = false; 05116 ptr++; 05117 } 05118 else runlength++; 05119 continue; 05120 } 05121 05122 /* Skip whitespace and comments for /x */ 05123 05124 if ((options & PCRE_EXTENDED) != 0) 05125 { 05126 if ((compile_block.ctypes[c] & ctype_space) != 0) continue; 05127 if (c == '#') 05128 { 05129 /* The space before the ; is to avoid a warning on a silly compiler 05130 on the Macintosh. */ 05131 while ((c = *(++ptr)) != 0 && c != NEWLINE) ; 05132 continue; 05133 } 05134 } 05135 05136 /* Backslash may introduce a data char or a metacharacter; stop the 05137 string before the latter. 
*/ 05138 05139 if (c == '\\') 05140 { 05141 const uschar *saveptr = ptr; 05142 c = check_escape(&ptr, errorptr, bracount, options, false); 05143 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 05144 if (c < 0) { ptr = saveptr; break; } 05145 05146 /* In UTF-8 mode, add on the number of additional bytes needed to 05147 encode this character, and save the total length in case this is a 05148 final char that is repeated. */ 05149 05150 } 05151 05152 /* Ordinary character or single-char escape */ 05153 05154 runlength++; 05155 } 05156 05157 /* This "while" is the end of the "do" above. */ 05158 05159 while (runlength < MAXLIT && 05160 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0); 05161 05162 /* If we hit a meta-character, back off to point to it */ 05163 05164 if (runlength < MAXLIT) ptr--; 05165 05166 /* If the last char in the string is a UTF-8 multibyte character, we must 05167 set lastcharlength correctly. If it was specified as an escape, this will 05168 already have been done above. However, we also have to support in-line 05169 UTF-8 characters, so check backwards from where we are. */ 05170 05171 05172 length += runlength; 05173 continue; 05174 } 05175 } 05176 05177 length += 2 + LINK_SIZE; /* For final KET and END */ 05178 05179 if (length > MAX_PATTERN_SIZE) 05180 { 05181 *errorptr = ERR20; 05182 return NULL; 05183 } 05184 05185 /* Compute the size of data block needed and get it, either from malloc or 05186 externally provided function. 
*/ 05187 05188 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3); 05189 re = static_cast<real_pcre *>(malloc(size)); 05190 05191 if (re == NULL) 05192 { 05193 *errorptr = ERR21; 05194 return NULL; 05195 } 05196 05197 /* Put in the magic number, and save the size, options, and table pointer */ 05198 05199 re->magic_number = MAGIC_NUMBER; 05200 re->size = size; 05201 re->options = options; 05202 re->tables = tables; 05203 re->name_entry_size = max_name_size + 3; 05204 re->name_count = name_count; 05205 05206 /* The starting points of the name/number translation table and of the code are 05207 passed around in the compile data block. */ 05208 05209 compile_block.names_found = 0; 05210 compile_block.name_entry_size = max_name_size + 3; 05211 compile_block.name_table = (uschar *)re + sizeof(real_pcre); 05212 codestart = compile_block.name_table + re->name_entry_size * re->name_count; 05213 compile_block.start_code = codestart; 05214 compile_block.req_varyopt = 0; 05215 05216 /* Set up a starting, non-extracting bracket, then compile the expression. On 05217 error, *errorptr will be set non-NULL, so we don't need to look at the result 05218 of the function here. */ 05219 05220 ptr = (const uschar *)pattern; 05221 code = (uschar *)codestart; 05222 *code = OP_BRA; 05223 bracount = 0; 05224 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr, 05225 errorptr, false, 0, &firstbyte, &reqbyte, NULL, &compile_block); 05226 re->top_bracket = bracount; 05227 re->top_backref = compile_block.top_backref; 05228 05229 /* If not reached end of pattern on success, there's an excess bracket. */ 05230 05231 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22; 05232 05233 /* Fill in the terminating state and check for disastrous overflow, but 05234 if debugging, leave the test till after things are printed out. 
*/ 05235 05236 *code++ = OP_END; 05237 05238 if (code - codestart > length) *errorptr = ERR23; 05239 05240 /* Give an error if there's back reference to a non-existent capturing 05241 subpattern. */ 05242 05243 if (re->top_backref > re->top_bracket) *errorptr = ERR15; 05244 05245 /* Failed to compile, or error while post-processing */ 05246 05247 if (*errorptr != NULL) 05248 { 05249 free(re); 05250 PCRE_ERROR_RETURN: 05251 *erroroffset = ptr - (const uschar *)pattern; 05252 return NULL; 05253 } 05254 05255 /* If the anchored option was not passed, set the flag if we can determine that 05256 the pattern is anchored by virtue of ^ characters or \A or anything else (such 05257 as starting with .* when DOTALL is set). 05258 05259 Otherwise, if we know what the first character has to be, save it, because that 05260 speeds up unanchored matches no end. If not, see if we can set the 05261 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches 05262 start with ^. and also when all branches start with .* for non-DOTALL matches. 05263 */ 05264 05265 if ((options & PCRE_ANCHORED) == 0) 05266 { 05267 int temp_options = options; 05268 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map)) 05269 re->options |= PCRE_ANCHORED; 05270 else 05271 { 05272 if (firstbyte < 0) 05273 firstbyte = find_firstassertedchar(codestart, &temp_options, false); 05274 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */ 05275 { 05276 int ch = firstbyte & 255; 05277 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && 05278 compile_block.fcc[ch] == ch)? ch : firstbyte; 05279 re->options |= PCRE_FIRSTSET; 05280 } 05281 else if (is_startline(codestart, 0, compile_block.backref_map)) 05282 re->options |= PCRE_STARTLINE; 05283 } 05284 } 05285 05286 /* For an anchored pattern, we use the "required byte" only if it follows a 05287 variable length item in the regex. Remove the caseless flag for non-caseable 05288 chars. 
*/ 05289 05290 if (reqbyte >= 0 && 05291 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0)) 05292 { 05293 int ch = reqbyte & 255; 05294 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && 05295 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; 05296 re->options |= PCRE_REQCHSET; 05297 } 05298 05299 return (pcre *)re; 05300 } 05301
int pcre_copy_substring | ( | const char * | subject, | |
int * | ovector, | |||
int | stringcount, | |||
int | stringnumber, | |||
char * | buffer, | |||
int | size | |||
) |
Definition at line 811 of file pcre.cpp.
References PCRE_ERROR_NOMEMORY, and PCRE_ERROR_NOSUBSTRING.
Referenced by real_regmatch(), and regexp_match().
00812 { 00813 int yield; 00814 if (stringnumber < 0 || stringnumber >= stringcount) 00815 return PCRE_ERROR_NOSUBSTRING; 00816 stringnumber *= 2; 00817 yield = ovector[stringnumber+1] - ovector[stringnumber]; 00818 if (size < yield + 1) return PCRE_ERROR_NOMEMORY; 00819 memcpy(buffer, subject + ovector[stringnumber], yield); 00820 buffer[yield] = 0; 00821 return yield; 00822 } 00823
int pcre_exec(const pcre *external_re,
              const pcre_extra *extra_data,
              const char *subject,
              int length,
              int start_offset,
              int options,
              int *offsets,
              int offsetcount)
Definition at line 7090 of file pcre.cpp.
References match_data::callout_data, pcre_extra::callout_data, match_data::capture_last, match_data::ctypes, ctypes_offset, DPRINTF, match_data::end_match_ptr, match_data::end_offset_top, match_data::end_subject, match_data::endonly, fcc_offset, real_pcre::first_byte, pcre_extra::flags, match_data::lcc, lcc_offset, real_pcre::magic_number, MAGIC_NUMBER, match(), match_data::match_call_count, match_isgroup, match_data::match_limit, MATCH_LIMIT, pcre_extra::match_limit, MATCH_MATCH, MATCH_NOMATCH, real_pcre::name_count, real_pcre::name_entry_size, NEWLINE, match_data::notbol, match_data::notempty, match_data::noteol, match_data::offset_end, match_data::offset_max, match_data::offset_overflow, match_data::offset_vector, real_pcre::options, pcre_study_data::options, PCRE_ANCHORED, PCRE_CASELESS, PCRE_DOLLAR_ENDONLY, PCRE_DOTALL, PCRE_ERROR_BADMAGIC, PCRE_ERROR_BADOPTION, PCRE_ERROR_NOMATCH, PCRE_ERROR_NOMEMORY, PCRE_ERROR_NULL, PCRE_EXTRA_CALLOUT_DATA, PCRE_EXTRA_MATCH_LIMIT, PCRE_EXTRA_STUDY_DATA, PCRE_FIRSTSET, PCRE_MULTILINE, PCRE_NOTBOL, PCRE_NOTEMPTY, PCRE_NOTEOL, PCRE_REQCHSET, PCRE_STARTLINE, PCRE_STUDY_MAPPED, PCRE_UTF8, PUBLIC_EXEC_OPTIONS, match_data::recursive, real_pcre::req_byte, REQ_BYTE_MAX, REQ_CASELESS, pcre_study_data::start_bits, match_data::start_code, match_data::start_match, match_data::start_offset, match_data::start_subject, pcre_extra::study_data, real_pcre::tables, real_pcre::top_backref, real_pcre::top_bracket, and match_data::utf8.
Referenced by check_filter(), FUNCTION(), real_regmatch(), real_regrab(), and regexp_match().
07092 { 07093 int rc, resetcount, ocount; 07094 int first_byte = -1; 07095 int req_byte = -1; 07096 int req_byte2 = -1; 07097 unsigned long int ims = 0; 07098 bool using_temporary_offsets = false; 07099 bool anchored; 07100 bool startline; 07101 bool first_byte_caseless = false; 07102 bool req_byte_caseless = false; 07103 match_data match_block; 07104 const uschar *start_bits = NULL; 07105 const uschar *start_match = (const uschar *)subject + start_offset; 07106 const uschar *end_subject; 07107 const uschar *req_byte_ptr = start_match - 1; 07108 const pcre_study_data *study; 07109 const real_pcre *re = (const real_pcre *)external_re; 07110 07111 /* Plausibility checks */ 07112 07113 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; 07114 if (re == NULL || subject == NULL || 07115 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; 07116 07117 /* Fish out the optional data from the extra_data structure, first setting 07118 the default values. */ 07119 07120 study = NULL; 07121 match_block.match_limit = MATCH_LIMIT; 07122 match_block.callout_data = NULL; 07123 07124 if (extra_data != NULL) 07125 { 07126 register unsigned int flags = extra_data->flags; 07127 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) 07128 study = (const pcre_study_data *)extra_data->study_data; 07129 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) 07130 match_block.match_limit = extra_data->match_limit; 07131 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) 07132 match_block.callout_data = extra_data->callout_data; 07133 } 07134 07135 /* Now we have re supposedly pointing to the regex */ 07136 07137 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC; 07138 07139 anchored = ((re->options | options) & PCRE_ANCHORED) != 0; 07140 startline = (re->options & PCRE_STARTLINE) != 0; 07141 07142 match_block.start_code = 07143 (const uschar *)re + sizeof(real_pcre) + re->name_count * re->name_entry_size; 07144 match_block.start_subject = (const uschar *)subject; 07145 
match_block.start_offset = start_offset; 07146 match_block.end_subject = match_block.start_subject + length; 07147 end_subject = match_block.end_subject; 07148 07149 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; 07150 match_block.utf8 = (re->options & PCRE_UTF8) != 0; 07151 07152 match_block.notbol = (options & PCRE_NOTBOL) != 0; 07153 match_block.noteol = (options & PCRE_NOTEOL) != 0; 07154 match_block.notempty = (options & PCRE_NOTEMPTY) != 0; 07155 07156 match_block.recursive = NULL; /* No recursion at top level */ 07157 07158 match_block.lcc = re->tables + lcc_offset; 07159 match_block.ctypes = re->tables + ctypes_offset; 07160 07161 /* Check a UTF-8 string if required. Unfortunately there's no way of passing 07162 back the character offset. */ 07163 07164 /* The ims options can vary during the matching as a result of the presence 07165 of (?ims) items in the pattern. They are kept in a local variable so that 07166 restoring at the exit of a group is easy. */ 07167 07168 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL); 07169 07170 /* If the expression has got more back references than the offsets supplied can 07171 hold, we get a temporary bit of working store to use during the matching. 07172 Otherwise, we can use the vector supplied, rounding down its size to a multiple 07173 of 3. 
*/ 07174 07175 ocount = offsetcount - (offsetcount % 3); 07176 07177 if (re->top_backref > 0 && re->top_backref >= ocount/3) 07178 { 07179 ocount = re->top_backref * 3 + 3; 07180 match_block.offset_vector = static_cast<int *>(malloc(ocount * sizeof(int))); 07181 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY; 07182 using_temporary_offsets = true; 07183 DPRINTF(("Got memory to hold back references\n")); 07184 } 07185 else match_block.offset_vector = offsets; 07186 07187 match_block.offset_end = ocount; 07188 match_block.offset_max = (2*ocount)/3; 07189 match_block.offset_overflow = false; 07190 match_block.capture_last = -1; 07191 07192 /* Compute the minimum number of offsets that we need to reset each time. Doing 07193 this makes a huge difference to execution time when there aren't many brackets 07194 in the pattern. */ 07195 07196 resetcount = 2 + re->top_bracket * 2; 07197 if (resetcount > offsetcount) resetcount = ocount; 07198 07199 /* Reset the working variable associated with each extraction. These should 07200 never be used unless previously set, but they get saved and restored, and so we 07201 initialize them to avoid reading uninitialized locations. */ 07202 07203 if (match_block.offset_vector != NULL) 07204 { 07205 register int *iptr = match_block.offset_vector + ocount; 07206 register int *iend = iptr - resetcount/2 + 1; 07207 while (--iptr >= iend) *iptr = -1; 07208 } 07209 07210 /* Set up the first character to match, if available. The first_byte value is 07211 never set for an anchored regular expression, but the anchoring may be forced 07212 at run time, so we have to test for anchoring. The first char may be unset for 07213 an unanchored pattern, of course. If there's no first char and the pattern was 07214 studied, there may be a bitmap of possible first characters. 
*/ 07215 07216 if (!anchored) 07217 { 07218 if ((re->options & PCRE_FIRSTSET) != 0) 07219 { 07220 first_byte = re->first_byte & 255; 07221 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == true) 07222 first_byte = match_block.lcc[first_byte]; 07223 } 07224 else 07225 if (!startline && study != NULL && 07226 (study->options & PCRE_STUDY_MAPPED) != 0) 07227 start_bits = study->start_bits; 07228 } 07229 07230 /* For anchored or unanchored matches, there may be a "last known required 07231 character" set. */ 07232 07233 if ((re->options & PCRE_REQCHSET) != 0) 07234 { 07235 req_byte = re->req_byte & 255; 07236 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; 07237 req_byte2 = (re->tables + fcc_offset)[req_byte]; /* case flipped */ 07238 } 07239 07240 /* Loop for handling unanchored repeated matching attempts; for anchored regexs 07241 the loop runs just once. */ 07242 07243 do 07244 { 07245 register int *iptr = match_block.offset_vector; 07246 register int *iend = iptr + resetcount; 07247 07248 /* Reset the maximum number of extractions we might see. 
*/ 07249 07250 while (iptr < iend) *iptr++ = -1; 07251 07252 /* Advance to a unique first char if possible */ 07253 07254 if (first_byte >= 0) 07255 { 07256 if (first_byte_caseless) 07257 while (start_match < end_subject && 07258 match_block.lcc[*start_match] != first_byte) 07259 start_match++; 07260 else 07261 while (start_match < end_subject && *start_match != first_byte) 07262 start_match++; 07263 } 07264 07265 /* Or to just after \n for a multiline match if possible */ 07266 07267 else if (startline) 07268 { 07269 if (start_match > match_block.start_subject + start_offset) 07270 { 07271 while (start_match < end_subject && start_match[-1] != NEWLINE) 07272 start_match++; 07273 } 07274 } 07275 07276 /* Or to a non-unique first char after study */ 07277 07278 else if (start_bits != NULL) 07279 { 07280 while (start_match < end_subject) 07281 { 07282 register int c = *start_match; 07283 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break; 07284 } 07285 } 07286 07287 /* If req_byte is set, we know that that character must appear in the subject 07288 for the match to succeed. If the first character is set, req_byte must be 07289 later in the subject; otherwise the test starts at the match point. This 07290 optimization can save a huge amount of backtracking in patterns with nested 07291 unlimited repeats that aren't going to match. Writing separate code for 07292 cased/caseless versions makes it go faster, as does using an autoincrement 07293 and backing off on a match. 07294 07295 HOWEVER: when the subject string is very, very long, searching to its end can 07296 take a long time, and give bad performance on quite ordinary patterns. This 07297 showed up when somebody was matching /^C/ on a 32-megabyte string... so we 07298 don't do this when the string is sufficiently long. */ 07299 07300 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX) 07301 { 07302 register const uschar *p = start_match + ((first_byte >= 0)? 
1 : 0); 07303 07304 /* We don't need to repeat the search if we haven't yet reached the 07305 place we found it at last time. */ 07306 07307 if (p > req_byte_ptr) 07308 { 07309 if (req_byte_caseless) 07310 { 07311 while (p < end_subject) 07312 { 07313 register int pp = *p++; 07314 if (pp == req_byte || pp == req_byte2) { p--; break; } 07315 } 07316 } 07317 else 07318 { 07319 while (p < end_subject) 07320 { 07321 if (*p++ == req_byte) { p--; break; } 07322 } 07323 } 07324 07325 /* If we can't find the required character, break the matching loop */ 07326 07327 if (p >= end_subject) break; 07328 07329 /* If we have found the required character, save the point where we 07330 found it, so that we don't search again next time round the loop if 07331 the start hasn't passed this character yet. */ 07332 07333 req_byte_ptr = p; 07334 } 07335 } 07336 07337 /* When a match occurs, substrings will be set for all internal extractions; 07338 we just need to set up the whole thing as substring 0 before returning. If 07339 there were too many extractions, set the return code to zero. In the case 07340 where we had to get some local store to hold offsets for backreferences, copy 07341 those back references that we can. In this case there need not be overflow 07342 if certain parts of the pattern were not used. */ 07343 07344 match_block.start_match = start_match; 07345 match_block.match_call_count = 0; 07346 07347 rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL, 07348 match_isgroup); 07349 07350 if (rc == MATCH_NOMATCH) 07351 { 07352 start_match++; 07353 continue; 07354 } 07355 07356 if (rc != MATCH_MATCH) 07357 { 07358 DPRINTF((">>>> error: returning %d\n", rc)); 07359 return rc; 07360 } 07361 07362 /* We have a match! 
Copy the offset information from temporary store if 07363 necessary */ 07364 07365 if (using_temporary_offsets) 07366 { 07367 if (offsetcount >= 4) 07368 { 07369 memcpy(offsets + 2, match_block.offset_vector + 2, 07370 (offsetcount - 2) * sizeof(int)); 07371 DPRINTF(("Copied offsets from temporary memory\n")); 07372 } 07373 if (match_block.end_offset_top > offsetcount) 07374 match_block.offset_overflow = true; 07375 07376 DPRINTF(("Freeing temporary memory\n")); 07377 free(match_block.offset_vector); 07378 } 07379 07380 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2; 07381 07382 if (offsetcount < 2) rc = 0; else 07383 { 07384 offsets[0] = start_match - match_block.start_subject; 07385 offsets[1] = match_block.end_match_ptr - match_block.start_subject; 07386 } 07387 07388 DPRINTF((">>>> returning %d\n", rc)); 07389 return rc; 07390 } 07391 07392 /* This "while" is the end of the "do" above */ 07393 07394 while (!anchored && start_match <= end_subject); 07395 07396 if (using_temporary_offsets) 07397 { 07398 DPRINTF(("Freeing temporary memory\n")); 07399 free(match_block.offset_vector); 07400 } 07401 07402 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); 07403 07404 return PCRE_ERROR_NOMATCH; 07405 } 07406
const unsigned char *pcre_maketables(void)
Definition at line 842 of file pcre.cpp.
References cbit_cntrl, cbit_digit, cbit_graph, cbit_length, cbit_lower, cbit_print, cbit_punct, cbit_space, cbit_upper, cbit_word, cbit_xdigit, ctype_digit, ctype_letter, ctype_meta, ctype_space, ctype_word, ctype_xdigit, and tables_length.
00842 { 00843 unsigned char *yield, *p; 00844 int i; 00845 00846 yield = static_cast<unsigned char*>(malloc(tables_length)); 00847 00848 if (yield == NULL) return NULL; 00849 p = yield; 00850 00851 /* First comes the lower casing table */ 00852 00853 for (i = 0; i < 256; i++) *p++ = tolower(i); 00854 00855 /* Next the case-flipping table */ 00856 00857 for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i); 00858 00859 /* Then the character class tables. Don't try to be clever and save effort 00860 on exclusive ones - in some locales things may be different. Note that the 00861 table for "space" includes everything "isspace" gives, including VT in the 00862 default locale. This makes it work for the POSIX class [:space:]. */ 00863 00864 memset(p, 0, cbit_length); 00865 for (i = 0; i < 256; i++) 00866 { 00867 if (isdigit(i)) 00868 { 00869 p[cbit_digit + i/8] |= 1 << (i&7); 00870 p[cbit_word + i/8] |= 1 << (i&7); 00871 } 00872 if (isupper(i)) 00873 { 00874 p[cbit_upper + i/8] |= 1 << (i&7); 00875 p[cbit_word + i/8] |= 1 << (i&7); 00876 } 00877 if (islower(i)) 00878 { 00879 p[cbit_lower + i/8] |= 1 << (i&7); 00880 p[cbit_word + i/8] |= 1 << (i&7); 00881 } 00882 if (i == '_') p[cbit_word + i/8] |= 1 << (i&7); 00883 if (isspace(i)) p[cbit_space + i/8] |= 1 << (i&7); 00884 if (isxdigit(i))p[cbit_xdigit + i/8] |= 1 << (i&7); 00885 if (isgraph(i)) p[cbit_graph + i/8] |= 1 << (i&7); 00886 if (isprint(i)) p[cbit_print + i/8] |= 1 << (i&7); 00887 if (ispunct(i)) p[cbit_punct + i/8] |= 1 << (i&7); 00888 if (iscntrl(i)) p[cbit_cntrl + i/8] |= 1 << (i&7); 00889 } 00890 p += cbit_length; 00891 00892 /* Finally, the character type table. In this, we exclude VT from the white 00893 space chars, because Perl doesn't recognize it as such for \s and for comments 00894 within regexes. 
*/ 00895 00896 for (i = 0; i < 256; i++) 00897 { 00898 int x = 0; 00899 if (i != 0x0b && isspace(i)) x += ctype_space; 00900 if (isalpha(i)) x += ctype_letter; 00901 if (isdigit(i)) x += ctype_digit; 00902 if (isxdigit(i)) x += ctype_xdigit; 00903 if (isalnum(i) || i == '_') x += ctype_word; 00904 if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta; 00905 *p++ = x; 00906 } 00907 00908 return yield; 00909 } 00910
pcre_extra *pcre_study(const pcre *external_re,
                       int options,
                       const char **errorptr)
Definition at line 1233 of file pcre.cpp.
References compile_data::cbits, cbits_offset, compile_data::ctypes, ctypes_offset, compile_data::fcc, fcc_offset, pcre_extra::flags, compile_data::lcc, lcc_offset, real_pcre::magic_number, MAGIC_NUMBER, real_pcre::name_count, real_pcre::name_entry_size, real_pcre::options, pcre_study_data::options, PCRE_ANCHORED, PCRE_CASELESS, PCRE_EXTRA_STUDY_DATA, PCRE_FIRSTSET, PCRE_STARTLINE, PCRE_STUDY_MAPPED, PCRE_UTF8, PUBLIC_STUDY_OPTIONS, set_start_bits(), pcre_study_data::size, pcre_study_data::start_bits, pcre_extra::study_data, and real_pcre::tables.
Referenced by CF_HAND(), and real_regrab().
01233 { 01234 uschar start_bits[32]; 01235 pcre_extra *extra; 01236 pcre_study_data *study; 01237 const real_pcre *re = (const real_pcre *)external_re; 01238 uschar *code = (uschar *)re + sizeof(real_pcre) + 01239 (re->name_count * re->name_entry_size); 01240 compile_data compile_block; 01241 01242 *errorptr = NULL; 01243 01244 if (re == NULL || re->magic_number != MAGIC_NUMBER) 01245 { 01246 *errorptr = "argument is not a compiled regular expression"; 01247 return NULL; 01248 } 01249 01250 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) 01251 { 01252 *errorptr = "unknown or incorrect option bit(s) set"; 01253 return NULL; 01254 } 01255 01256 /* For an anchored pattern, or an unanchored pattern that has a first char, or 01257 a multiline pattern that matches only at "line starts", no further processing 01258 at present. */ 01259 01260 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) 01261 return NULL; 01262 01263 /* Set the character tables in the block which is passed around */ 01264 01265 compile_block.lcc = re->tables + lcc_offset; 01266 compile_block.fcc = re->tables + fcc_offset; 01267 compile_block.cbits = re->tables + cbits_offset; 01268 compile_block.ctypes = re->tables + ctypes_offset; 01269 01270 /* See if we can find a fixed set of initial characters for the pattern. */ 01271 01272 memset(start_bits, 0, 32 * sizeof(uschar)); 01273 if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, 01274 (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL; 01275 01276 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in 01277 the latter, which is pointed to by the former, which may also get additional 01278 data set later by the calling program. At the moment, the size of 01279 pcre_study_data is fixed. We nevertheless save it in a field for returning via 01280 the pcre_fullinfo() function so that if it becomes variable in the future, we 01281 don't have to change that code. 
*/ 01282 01283 extra = static_cast<pcre_extra *>(malloc(sizeof(pcre_extra) + sizeof(pcre_study_data))); 01284 01285 if (extra == NULL) 01286 { 01287 *errorptr = "failed to get memory"; 01288 return NULL; 01289 } 01290 01291 // Hmm. 01292 study = reinterpret_cast<pcre_study_data *>(reinterpret_cast<char*>(extra) + sizeof(pcre_extra)); 01293 extra->flags = PCRE_EXTRA_STUDY_DATA; 01294 extra->study_data = study; 01295 01296 study->size = sizeof(pcre_study_data); 01297 study->options = PCRE_STUDY_MAPPED; 01298 memcpy(study->start_bits, start_bits, sizeof(start_bits)); 01299 01300 return extra; 01301 } 01302
static const uschar *read_repeat_counts(const uschar *p,
                                        int *minp,
                                        int *maxp,
                                        const char **errorptr) [static]
Definition at line 1722 of file pcre.cpp.
References ctype_digit, digitab, ERR4, and ERR5.
Referenced by compile_branch(), and pcre_compile().
01722 { 01723 int min = 0; 01724 int max = -1; 01725 01726 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0'; 01727 01728 if (*p == '}') max = min; else 01729 { 01730 if (*(++p) != '}') 01731 { 01732 max = 0; 01733 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0'; 01734 if (max < min) 01735 { 01736 *errorptr = ERR4; 01737 return p; 01738 } 01739 } 01740 } 01741 01742 /* Do paranoid checks, then fill in the required variables, and pass back the 01743 pointer to the terminating '}'. */ 01744 01745 if (min < 0 || 65535 < min || 01746 max < -1 || 65535 < max) 01747 *errorptr = ERR5; 01748 else 01749 { 01750 *minp = min; 01751 *maxp = max; 01752 } 01753 return p; 01754 } 01755
static void set_bit(uschar *start_bits,
                    int c,
                    bool caseless,
                    compile_data *cd) [static]
Definition at line 932 of file pcre.cpp.
References ctype_letter, compile_data::ctypes, and compile_data::fcc.
Referenced by set_start_bits().
00932 { 00933 start_bits[c/8] |= (1 << (c&7)); 00934 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) 00935 start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7)); 00936 } 00937
static bool set_start_bits(const uschar *code,
                           uschar *start_bits,
                           bool caseless,
                           bool utf8,
                           compile_data *cd) [static]
Definition at line 960 of file pcre.cpp.
References cbit_digit, cbit_space, cbit_word, compile_data::cbits, GET, LINK_SIZE, OP_ALT, OP_ASSERT, OP_ASSERT_NOT, OP_ASSERTBACK, OP_ASSERTBACK_NOT, OP_BRA, OP_BRAMINZERO, OP_BRANUMBER, OP_BRAZERO, OP_CALLOUT, OP_CHARS, OP_CLASS, OP_CRMINQUERY, OP_CRMINRANGE, OP_CRMINSTAR, OP_CRQUERY, OP_CRRANGE, OP_CRSTAR, OP_DIGIT, OP_EXACT, OP_MINPLUS, OP_MINQUERY, OP_MINSTAR, OP_MINUPTO, OP_NCLASS, OP_NOT_DIGIT, OP_NOT_WHITESPACE, OP_NOT_WORDCHAR, OP_OPT, OP_PLUS, OP_QUERY, OP_STAR, OP_TYPEEXACT, OP_TYPEMINPLUS, OP_TYPEMINQUERY, OP_TYPEMINSTAR, OP_TYPEMINUPTO, OP_TYPEPLUS, OP_TYPEQUERY, OP_TYPESTAR, OP_TYPEUPTO, OP_UPTO, OP_WHITESPACE, OP_WORDCHAR, PCRE_CASELESS, and set_bit().
Referenced by pcre_study().
00961 { 00962 register int c; 00963 00964 /* This next statement and the later reference to dummy are here in order to 00965 trick the optimizer of the IBM C compiler for OS/2 into generating correct 00966 code. Apparently IBM isn't going to fix the problem, and we would rather not 00967 disable optimization (in this module it actually makes a big difference, and 00968 the pcre module can use all the optimization it can get). */ 00969 00970 volatile int dummy; 00971 00972 do 00973 { 00974 const uschar *tcode = code + 1 + LINK_SIZE; 00975 bool try_next = true; 00976 00977 while (try_next) 00978 { 00979 /* If a branch starts with a bracket or a positive lookahead assertion, 00980 recurse to set bits from within them. That's all for this branch. */ 00981 00982 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT) 00983 { 00984 if (!set_start_bits(tcode, start_bits, caseless, utf8, cd)) 00985 return false; 00986 try_next = false; 00987 } 00988 00989 else switch(*tcode) 00990 { 00991 default: 00992 return false; 00993 00994 /* Skip over callout */ 00995 00996 case OP_CALLOUT: 00997 tcode += 2; 00998 break; 00999 01000 /* Skip over extended extraction bracket number */ 01001 01002 case OP_BRANUMBER: 01003 tcode += 3; 01004 break; 01005 01006 /* Skip over lookbehind and negative lookahead assertions */ 01007 01008 case OP_ASSERT_NOT: 01009 case OP_ASSERTBACK: 01010 case OP_ASSERTBACK_NOT: 01011 do tcode += GET(tcode, 1); while (*tcode == OP_ALT); 01012 tcode += 1+LINK_SIZE; 01013 break; 01014 01015 /* Skip over an option setting, changing the caseless flag */ 01016 01017 case OP_OPT: 01018 caseless = (tcode[1] & PCRE_CASELESS) != 0; 01019 tcode += 2; 01020 break; 01021 01022 /* BRAZERO does the bracket, but carries on. 
*/ 01023 01024 case OP_BRAZERO: 01025 case OP_BRAMINZERO: 01026 if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd)) 01027 return false; 01028 dummy = 1; 01029 do tcode += GET(tcode,1); while (*tcode == OP_ALT); 01030 tcode += 1+LINK_SIZE; 01031 break; 01032 01033 /* Single-char * or ? sets the bit and tries the next item */ 01034 01035 case OP_STAR: 01036 case OP_MINSTAR: 01037 case OP_QUERY: 01038 case OP_MINQUERY: 01039 set_bit(start_bits, tcode[1], caseless, cd); 01040 tcode += 2; 01041 break; 01042 01043 /* Single-char upto sets the bit and tries the next */ 01044 01045 case OP_UPTO: 01046 case OP_MINUPTO: 01047 set_bit(start_bits, tcode[3], caseless, cd); 01048 tcode += 4; 01049 break; 01050 01051 /* At least one single char sets the bit and stops */ 01052 01053 case OP_EXACT: /* Fall through */ 01054 tcode++; 01055 01056 case OP_CHARS: /* Fall through */ 01057 tcode++; 01058 01059 case OP_PLUS: 01060 case OP_MINPLUS: 01061 set_bit(start_bits, tcode[1], caseless, cd); 01062 try_next = false; 01063 break; 01064 01065 /* Single character type sets the bits and stops */ 01066 01067 case OP_NOT_DIGIT: 01068 for (c = 0; c < 32; c++) 01069 start_bits[c] |= ~cd->cbits[c+cbit_digit]; 01070 try_next = false; 01071 break; 01072 01073 case OP_DIGIT: 01074 for (c = 0; c < 32; c++) 01075 start_bits[c] |= cd->cbits[c+cbit_digit]; 01076 try_next = false; 01077 break; 01078 01079 case OP_NOT_WHITESPACE: 01080 for (c = 0; c < 32; c++) 01081 start_bits[c] |= ~cd->cbits[c+cbit_space]; 01082 try_next = false; 01083 break; 01084 01085 case OP_WHITESPACE: 01086 for (c = 0; c < 32; c++) 01087 start_bits[c] |= cd->cbits[c+cbit_space]; 01088 try_next = false; 01089 break; 01090 01091 case OP_NOT_WORDCHAR: 01092 for (c = 0; c < 32; c++) 01093 start_bits[c] |= ~cd->cbits[c+cbit_word]; 01094 try_next = false; 01095 break; 01096 01097 case OP_WORDCHAR: 01098 for (c = 0; c < 32; c++) 01099 start_bits[c] |= cd->cbits[c+cbit_word]; 01100 try_next = false; 01101 break; 01102 01103 
/* One or more character type fudges the pointer and restarts, knowing 01104 it will hit a single character type and stop there. */ 01105 01106 case OP_TYPEPLUS: 01107 case OP_TYPEMINPLUS: 01108 tcode++; 01109 break; 01110 01111 case OP_TYPEEXACT: 01112 tcode += 3; 01113 break; 01114 01115 /* Zero or more repeats of character types set the bits and then 01116 try again. */ 01117 01118 case OP_TYPEUPTO: 01119 case OP_TYPEMINUPTO: 01120 tcode += 2; /* Fall through */ 01121 01122 case OP_TYPESTAR: 01123 case OP_TYPEMINSTAR: 01124 case OP_TYPEQUERY: 01125 case OP_TYPEMINQUERY: 01126 switch(tcode[1]) 01127 { 01128 case OP_NOT_DIGIT: 01129 for (c = 0; c < 32; c++) 01130 start_bits[c] |= ~cd->cbits[c+cbit_digit]; 01131 break; 01132 01133 case OP_DIGIT: 01134 for (c = 0; c < 32; c++) 01135 start_bits[c] |= cd->cbits[c+cbit_digit]; 01136 break; 01137 01138 case OP_NOT_WHITESPACE: 01139 for (c = 0; c < 32; c++) 01140 start_bits[c] |= ~cd->cbits[c+cbit_space]; 01141 break; 01142 01143 case OP_WHITESPACE: 01144 for (c = 0; c < 32; c++) 01145 start_bits[c] |= cd->cbits[c+cbit_space]; 01146 break; 01147 01148 case OP_NOT_WORDCHAR: 01149 for (c = 0; c < 32; c++) 01150 start_bits[c] |= ~cd->cbits[c+cbit_word]; 01151 break; 01152 01153 case OP_WORDCHAR: 01154 for (c = 0; c < 32; c++) 01155 start_bits[c] |= cd->cbits[c+cbit_word]; 01156 break; 01157 } 01158 01159 tcode += 2; 01160 break; 01161 01162 /* Character class where all the information is in a bit map: set the 01163 bits and either carry on or not, according to the repeat count. If it was 01164 a negative class, and we are operating with UTF-8 characters, any byte 01165 with the top-bit set is a potentially valid starter because it may start 01166 a character with a value > 255. (This is sub-optimal in that the 01167 character may be in the range 128-255, and those characters might be 01168 unwanted, but that's as far as we go for the moment.) 
*/ 01169 01170 case OP_NCLASS: 01171 if (utf8) memset(start_bits+16, 0xff, 16); 01172 /* Fall through */ 01173 01174 case OP_CLASS: 01175 { 01176 tcode++; 01177 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; 01178 tcode += 32; 01179 switch (*tcode) 01180 { 01181 case OP_CRSTAR: 01182 case OP_CRMINSTAR: 01183 case OP_CRQUERY: 01184 case OP_CRMINQUERY: 01185 tcode++; 01186 break; 01187 01188 case OP_CRRANGE: 01189 case OP_CRMINRANGE: 01190 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5; 01191 else try_next = false; 01192 break; 01193 01194 default: 01195 try_next = false; 01196 break; 01197 } 01198 } 01199 break; /* End of bitmap class handling */ 01200 01201 } /* End of switch */ 01202 } /* End of try_next loop */ 01203 01204 code += GET(code, 1); /* Advance to next branch */ 01205 } 01206 while (*code == OP_ALT); 01207 return true; 01208 } 01209
const unsigned char digitab[] [static] |
Definition at line 1415 of file pcre.cpp.
Referenced by check_escape(), compile_branch(), is_counted_repeat(), pcre_compile(), and read_repeat_counts().
const short int escapes[] [static] |
Initial value:
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', ';', '<', '=', '>', '?', '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, 0, 0, 0, 0, 0, 0, 0, 0, 0, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, 0, 0, 0, 0, 0, 0, ESC_n, 0, 0, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, 0, 0, -ESC_z }
Definition at line 1353 of file pcre.cpp.
Referenced by check_escape().
const uschar OP_lengths[] = { OP_LENGTHS } [static] |
Definition at line 1341 of file pcre.cpp.
Referenced by could_be_empty_branch(), find_bracket(), find_fixedlength(), find_recurse(), and first_significant_code().
int(*) pcre_callout(pcre_callout_block *) = NULL |
unsigned char pcre_default_tables[] [static] |
const int posix_class_maps[] [static] |
Initial value:
{ cbit_lower, cbit_upper, -1, cbit_lower, -1, -1, cbit_upper, -1, -1, cbit_digit, cbit_lower, cbit_upper, cbit_print, cbit_cntrl, -1, cbit_space, -1, -1, cbit_cntrl, -1, -1, cbit_digit, -1, -1, cbit_graph, -1, -1, cbit_print, -1, -1, cbit_punct, -1, -1, cbit_space, -1, -1, cbit_word, -1, -1, cbit_xdigit,-1, -1 }
Definition at line 1382 of file pcre.cpp.
Referenced by compile_branch().
const uschar posix_name_lengths[] [static] |
Initial value:
{ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }
Definition at line 1375 of file pcre.cpp.
Referenced by check_posix_name().
const char* const posix_names[] [static] |
Initial value:
{ "alpha", "lower", "upper", "alnum", "ascii", "blank", "cntrl", "digit", "graph", "print", "punct", "space", "word", "xdigit" }
Definition at line 1370 of file pcre.cpp.
Referenced by check_posix_name().
const char rep_max[] = { 0, 0, 0, 0, 1, 1 } [static] |
const char rep_min[] = { 0, 0, 1, 1, 0, 0 } [static] |