mux/src/pcre.cpp File Reference

#include "autoconf.h"
#include "config.h"
#include <limits.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <stddef.h>
#include "pcre.h"
#include "externs.h"
#include "timeutil.h"

Include dependency graph for pcre.cpp:

Go to the source code of this file.

Data Structures

struct  real_pcre
struct  pcre_study_data
struct  compile_data
struct  branch_chain
struct  recursion_info
struct  match_data
struct  eptrblock

Defines

#define LINK_SIZE   2
#define MATCH_LIMIT   100000
#define NEWLINE   '\n'
#define PUT(a, n, d)
#define GET(a, n)   (((a)[n] << 8) | (a)[(n)+1])
#define MAX_PATTERN_SIZE   (1 << 16)
#define PUTINC(a, n, d)   PUT(a,n,d), a += LINK_SIZE
#define PUT2(a, n, d)
#define GET2(a, n)   (((a)[n] << 8) | (a)[(n)+1])
#define PUT2INC(a, n, d)   PUT2(a,n,d), a += 2
#define PCRE_IMS   (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
#define PCRE_FIRSTSET   0x40000000
#define PCRE_REQCHSET   0x20000000
#define PCRE_STARTLINE   0x10000000
#define PCRE_ICHANGED   0x08000000
#define PCRE_STUDY_MAPPED   0x01
#define PUBLIC_OPTIONS
#define PUBLIC_EXEC_OPTIONS   (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK)
#define PUBLIC_STUDY_OPTIONS   0
#define MAGIC_NUMBER   0x50435245UL
#define REQ_UNSET   (-2)
#define REQ_NONE   (-1)
#define REQ_CASELESS   0x0100
#define REQ_VARY   0x0200
#define ESC_e   27
#define ESC_f   '\f'
#define ESC_n   NEWLINE
#define ESC_r   '\r'
#define ESC_tee   '\t'
#define XCL_NOT   0x01
#define XCL_MAP   0x02
#define XCL_END   0
#define XCL_SINGLE   1
#define XCL_RANGE   2
#define OP_NAME_LIST
#define OP_LENGTHS
#define EXTRACT_BASIC_MAX   150
#define CREF_RECURSE   0xffff
#define ERR1   "\\ at end of pattern"
#define ERR2   "\\c at end of pattern"
#define ERR3   "unrecognized character follows \\"
#define ERR4   "numbers out of order in {} quantifier"
#define ERR5   "number too big in {} quantifier"
#define ERR6   "missing terminating ] for character class"
#define ERR7   "invalid escape sequence in character class"
#define ERR8   "range out of order in character class"
#define ERR9   "nothing to repeat"
#define ERR10   "operand of unlimited repeat could match the empty string"
#define ERR11   "internal error: unexpected repeat"
#define ERR12   "unrecognized character after (?"
#define ERR13   "POSIX named classes are supported only within a class"
#define ERR14   "missing )"
#define ERR15   "reference to non-existent subpattern"
#define ERR16   "erroffset passed as NULL"
#define ERR17   "unknown option bit(s) set"
#define ERR18   "missing ) after comment"
#define ERR19   "parentheses nested too deeply"
#define ERR20   "regular expression too large"
#define ERR21   "failed to get memory"
#define ERR22   "unmatched parentheses"
#define ERR23   "internal error: code overflow"
#define ERR24   "unrecognized character after (?<"
#define ERR25   "lookbehind assertion is not fixed length"
#define ERR26   "malformed number after (?("
#define ERR27   "conditional group contains more than two branches"
#define ERR28   "assertion expected after (?("
#define ERR29   "(?R or (?digits must be followed by )"
#define ERR30   "unknown POSIX class name"
#define ERR31   "POSIX collating elements are not supported"
#define ERR32   "this version of PCRE is not compiled with PCRE_UTF8 support"
#define ERR33   "spare error"
#define ERR34   "character value in \\x{...} sequence is too large"
#define ERR35   "invalid condition (?(0)"
#define ERR36   "\\C not allowed in lookbehind assertion"
#define ERR37   "PCRE does not support \\L, \\l, \\N, \\P, \\p, \\U, \\u, or \\X"
#define ERR38   "number after (?C is > 255"
#define ERR39   "closing ) for (?C expected"
#define ERR40   "recursive call could loop indefinitely"
#define ERR41   "unrecognized character after (?P"
#define ERR42   "syntax error after (?P"
#define ERR43   "two named groups have the same name"
#define ERR44   "invalid UTF-8 string"
#define ctype_space   0x01
#define ctype_letter   0x02
#define ctype_digit   0x04
#define ctype_xdigit   0x08
#define ctype_word   0x10
#define ctype_meta   0x80
#define cbit_space   0
#define cbit_xdigit   32
#define cbit_digit   64
#define cbit_upper   96
#define cbit_lower   128
#define cbit_word   160
#define cbit_graph   192
#define cbit_print   224
#define cbit_punct   256
#define cbit_cntrl   288
#define cbit_length   320
#define lcc_offset   0
#define fcc_offset   256
#define cbits_offset   512
#define ctypes_offset   (cbits_offset + cbit_length)
#define tables_length   (ctypes_offset + 256)
#define DPRINTF(p)
#define BRASTACK_SIZE   200
#define REC_STACK_SAVE_MAX   30
#define MAXLIT   250
#define REQ_BYTE_MAX   1000
#define match_condassert   0x01
#define match_isgroup   0x02
#define MATCH_MATCH   1
#define MATCH_NOMATCH   0
#define GETCHAR(c, eptr)   c = *eptr;
#define GETCHARINC(c, eptr)   c = *eptr++;
#define GETCHARINCTEST(c, eptr)   c = *eptr++;
#define GETCHARLEN(c, eptr, len)   c = *eptr;
#define BACKCHAR(eptr)
#define REGISTER   register
#define RMATCH(rx, ra, rb, rc, rd, re, rf, rg)   rx = match(ra,rb,rc,rd,re,rf,rg)
#define RRETURN(ra)   return ra
#define fi   i
#define fc   c

Typedefs

typedef unsigned char uschar

Enumerations

enum  {
  ESC_A = 1, ESC_G, ESC_B, ESC_b,
  ESC_D, ESC_d, ESC_S, ESC_s,
  ESC_W, ESC_w, ESC_dum1, ESC_C,
  ESC_Z, ESC_z, ESC_E, ESC_Q,
  ESC_REF
}
enum  {
  OP_END, OP_SOD, OP_SOM, OP_NOT_WORD_BOUNDARY,
  OP_WORD_BOUNDARY, OP_NOT_DIGIT, OP_DIGIT, OP_NOT_WHITESPACE,
  OP_WHITESPACE, OP_NOT_WORDCHAR, OP_WORDCHAR, OP_ANY,
  OP_ANYBYTE, OP_EODN, OP_EOD, OP_OPT,
  OP_CIRC, OP_DOLL, OP_CHARS, OP_NOT,
  OP_STAR, OP_MINSTAR, OP_PLUS, OP_MINPLUS,
  OP_QUERY, OP_MINQUERY, OP_UPTO, OP_MINUPTO,
  OP_EXACT, OP_NOTSTAR, OP_NOTMINSTAR, OP_NOTPLUS,
  OP_NOTMINPLUS, OP_NOTQUERY, OP_NOTMINQUERY, OP_NOTUPTO,
  OP_NOTMINUPTO, OP_NOTEXACT, OP_TYPESTAR, OP_TYPEMINSTAR,
  OP_TYPEPLUS, OP_TYPEMINPLUS, OP_TYPEQUERY, OP_TYPEMINQUERY,
  OP_TYPEUPTO, OP_TYPEMINUPTO, OP_TYPEEXACT, OP_CRSTAR,
  OP_CRMINSTAR, OP_CRPLUS, OP_CRMINPLUS, OP_CRQUERY,
  OP_CRMINQUERY, OP_CRRANGE, OP_CRMINRANGE, OP_CLASS,
  OP_NCLASS, OP_XCLASS, OP_REF, OP_RECURSE,
  OP_CALLOUT, OP_ALT, OP_KET, OP_KETRMAX,
  OP_KETRMIN, OP_ASSERT, OP_ASSERT_NOT, OP_ASSERTBACK,
  OP_ASSERTBACK_NOT, OP_REVERSE, OP_ONCE, OP_COND,
  OP_CREF, OP_BRAZERO, OP_BRAMINZERO, OP_BRANUMBER,
  OP_BRA
}

Functions

int pcre_copy_substring (const char *subject, int *ovector, int stringcount, int stringnumber, char *buffer, int size)
const unsigned char * pcre_maketables (void)
static void set_bit (uschar *start_bits, int c, bool caseless, compile_data *cd)
static bool set_start_bits (const uschar *code, uschar *start_bits, bool caseless, bool utf8, compile_data *cd)
pcre_extra * pcre_study (const pcre *external_re, int options, const char **errorptr)
static bool compile_regex (int, int, int *, uschar **, const uschar **, const char **, bool, int, int *, int *, branch_chain *, compile_data *)
static int check_escape (const uschar **ptrptr, const char **errorptr, int bracount, int options, bool isclass)
static bool is_counted_repeat (const uschar *p)
static const uschar * read_repeat_counts (const uschar *p, int *minp, int *maxp, const char **errorptr)
static const uschar * first_significant_code (const uschar *code, int *options, int optbit)
static int find_fixedlength (uschar *code, int options)
static const uschar * find_bracket (const uschar *code, int number)
static const uschar * find_recurse (const uschar *code, bool utf8)
static bool could_be_empty_branch (const uschar *code, const uschar *endcode, bool utf8)
static bool could_be_empty (const uschar *code, const uschar *endcode, branch_chain *bcptr, bool utf8)
static bool check_posix_syntax (const uschar *ptr, const uschar **endptr, compile_data *cd)
static int check_posix_name (const uschar *ptr, int len)
static void adjust_recurse (uschar *group, int adjust, bool utf8, compile_data *cd)
static bool compile_branch (int *optionsptr, int *brackets, uschar **codeptr, const uschar **ptrptr, const char **errorptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
static bool is_anchored (register const uschar *code, int *options, unsigned int bracket_map, unsigned int backref_map)
static bool is_startline (const uschar *code, unsigned int bracket_map, unsigned int backref_map)
static int find_firstassertedchar (const uschar *code, int *options, bool inassert)
pcre * pcre_compile (const char *pattern, int options, const char **errorptr, int *erroroffset, const unsigned char *tables)
static bool match_ref (int offset, register const uschar *eptr, int length, match_data *md, unsigned long int ims)
static int match (REGISTER const uschar *eptr, REGISTER const uschar *ecode, int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, int flags)
int pcre_exec (const pcre *external_re, const pcre_extra *extra_data, const char *subject, int length, int start_offset, int options, int *offsets, int offsetcount)

Variables

static unsigned char pcre_default_tables []
static const uschar OP_lengths [] = { OP_LENGTHS }
static const char rep_min [] = { 0, 0, 1, 1, 0, 0 }
static const char rep_max [] = { 0, 0, 0, 0, 1, 1 }
static const short int escapes []
static const char *const posix_names []
static const uschar posix_name_lengths []
static const int posix_class_maps []
static const unsigned char digitab []
int(* pcre_callout )(pcre_callout_block *) = NULL


Define Documentation

#define BACKCHAR ( eptr   ) 

Definition at line 1500 of file pcre.cpp.

#define BRASTACK_SIZE   200

Definition at line 1314 of file pcre.cpp.

Referenced by pcre_compile().

#define cbit_cntrl   288

Definition at line 585 of file pcre.cpp.

Referenced by pcre_maketables().

#define cbit_digit   64

Definition at line 578 of file pcre.cpp.

Referenced by compile_branch(), pcre_maketables(), and set_start_bits().

#define cbit_graph   192

Definition at line 582 of file pcre.cpp.

Referenced by pcre_maketables().

#define cbit_length   320

Definition at line 586 of file pcre.cpp.

Referenced by pcre_maketables().

#define cbit_lower   128

Definition at line 580 of file pcre.cpp.

Referenced by pcre_maketables().

#define cbit_print   224

Definition at line 583 of file pcre.cpp.

Referenced by pcre_maketables().

#define cbit_punct   256

Definition at line 584 of file pcre.cpp.

Referenced by pcre_maketables().

#define cbit_space   0

Definition at line 576 of file pcre.cpp.

Referenced by compile_branch(), pcre_maketables(), and set_start_bits().

#define cbit_upper   96

Definition at line 579 of file pcre.cpp.

Referenced by pcre_maketables().

#define cbit_word   160

Definition at line 581 of file pcre.cpp.

Referenced by compile_branch(), pcre_maketables(), and set_start_bits().

#define cbit_xdigit   32

Definition at line 577 of file pcre.cpp.

Referenced by pcre_maketables().

#define cbits_offset   512

Definition at line 593 of file pcre.cpp.

Referenced by pcre_compile(), and pcre_study().

#define CREF_RECURSE   0xffff

Definition at line 405 of file pcre.cpp.

Referenced by compile_branch(), and match().

#define ctype_digit   0x04

Definition at line 568 of file pcre.cpp.

Referenced by check_escape(), compile_branch(), is_counted_repeat(), match(), pcre_compile(), pcre_maketables(), and read_repeat_counts().

#define ctype_letter   0x02

Definition at line 567 of file pcre.cpp.

Referenced by check_posix_syntax(), pcre_maketables(), and set_bit().

#define ctype_meta   0x80

Definition at line 571 of file pcre.cpp.

Referenced by compile_branch(), pcre_compile(), and pcre_maketables().

#define ctype_space   0x01

Definition at line 566 of file pcre.cpp.

Referenced by compile_branch(), match(), pcre_compile(), and pcre_maketables().

#define ctype_word   0x10

Definition at line 570 of file pcre.cpp.

Referenced by match(), pcre_compile(), and pcre_maketables().

#define ctype_xdigit   0x08

Definition at line 569 of file pcre.cpp.

Referenced by check_escape(), and pcre_maketables().

#define ctypes_offset   (cbits_offset + cbit_length)

Definition at line 594 of file pcre.cpp.

Referenced by pcre_compile(), pcre_exec(), and pcre_study().

#define DPRINTF ( p   ) 

Definition at line 1306 of file pcre.cpp.

Referenced by compile_regex(), match(), pcre_compile(), and pcre_exec().

#define ERR1   "\\ at end of pattern"

Definition at line 412 of file pcre.cpp.

Referenced by check_escape().

#define ERR10   "operand of unlimited repeat could match the empty string"

Definition at line 421 of file pcre.cpp.

#define ERR11   "internal error: unexpected repeat"

Definition at line 422 of file pcre.cpp.

Referenced by compile_branch().

#define ERR12   "unrecognized character after (?"

Definition at line 423 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR13   "POSIX named classes are supported only within a class"

Definition at line 424 of file pcre.cpp.

Referenced by compile_branch().

#define ERR14   "missing )"

Definition at line 425 of file pcre.cpp.

Referenced by compile_branch().

#define ERR15   "reference to non-existent subpattern"

Definition at line 426 of file pcre.cpp.

Referenced by compile_branch(), and pcre_compile().

#define ERR16   "erroffset passed as NULL"

Definition at line 427 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR17   "unknown option bit(s) set"

Definition at line 428 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR18   "missing ) after comment"

Definition at line 429 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR19   "parentheses nested too deeply"

Definition at line 430 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR2   "\\c at end of pattern"

Definition at line 413 of file pcre.cpp.

Referenced by check_escape().

#define ERR20   "regular expression too large"

Definition at line 431 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR21   "failed to get memory"

Definition at line 432 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR22   "unmatched parentheses"

Definition at line 433 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR23   "internal error: code overflow"

Definition at line 434 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR24   "unrecognized character after (?<"

Definition at line 435 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR25   "lookbehind assertion is not fixed length"

Definition at line 436 of file pcre.cpp.

Referenced by compile_regex().

#define ERR26   "malformed number after (?("

Definition at line 437 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR27   "conditional group contains more than two branches"

Definition at line 438 of file pcre.cpp.

Referenced by compile_branch().

#define ERR28   "assertion expected after (?("

Definition at line 439 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR29   "(?R or (?digits must be followed by )"

Definition at line 440 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR3   "unrecognized character follows \\"

Definition at line 414 of file pcre.cpp.

Referenced by check_escape().

#define ERR30   "unknown POSIX class name"

Definition at line 441 of file pcre.cpp.

Referenced by compile_branch().

#define ERR31   "POSIX collating elements are not supported"

Definition at line 442 of file pcre.cpp.

Referenced by compile_branch().

#define ERR32   "this version of PCRE is not compiled with PCRE_UTF8 support"

Definition at line 443 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR33   "spare error"

Definition at line 444 of file pcre.cpp.

#define ERR34   "character value in \\x{...} sequence is too large"

Definition at line 445 of file pcre.cpp.

#define ERR35   "invalid condition (?(0)"

Definition at line 446 of file pcre.cpp.

Referenced by compile_branch().

#define ERR36   "\\C not allowed in lookbehind assertion"

Definition at line 447 of file pcre.cpp.

Referenced by compile_regex().

#define ERR37   "PCRE does not support \\L, \\l, \\N, \\P, \\p, \\U, \\u, or \\X"

Definition at line 448 of file pcre.cpp.

Referenced by check_escape().

#define ERR38   "number after (?C is > 255"

Definition at line 449 of file pcre.cpp.

Referenced by compile_branch().

#define ERR39   "closing ) for (?C expected"

Definition at line 450 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR4   "numbers out of order in {} quantifier"

Definition at line 415 of file pcre.cpp.

Referenced by read_repeat_counts().

#define ERR40   "recursive call could loop indefinitely"

Definition at line 451 of file pcre.cpp.

Referenced by compile_branch().

#define ERR41   "unrecognized character after (?P"

Definition at line 452 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR42   "syntax error after (?P"

Definition at line 453 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR43   "two named groups have the same name"

Definition at line 454 of file pcre.cpp.

Referenced by compile_branch().

#define ERR44   "invalid UTF-8 string"

Definition at line 455 of file pcre.cpp.

#define ERR5   "number too big in {} quantifier"

Definition at line 416 of file pcre.cpp.

Referenced by read_repeat_counts().

#define ERR6   "missing terminating ] for character class"

Definition at line 417 of file pcre.cpp.

Referenced by pcre_compile().

#define ERR7   "invalid escape sequence in character class"

Definition at line 418 of file pcre.cpp.

Referenced by compile_branch().

#define ERR8   "range out of order in character class"

Definition at line 419 of file pcre.cpp.

Referenced by compile_branch().

#define ERR9   "nothing to repeat"

Definition at line 420 of file pcre.cpp.

Referenced by compile_branch().

#define ESC_e   27

Definition at line 160 of file pcre.cpp.

#define ESC_f   '\f'

Definition at line 164 of file pcre.cpp.

#define ESC_n   NEWLINE

Definition at line 168 of file pcre.cpp.

#define ESC_r   '\r'

Definition at line 172 of file pcre.cpp.

#define ESC_tee   '\t'

Definition at line 179 of file pcre.cpp.

#define EXTRACT_BASIC_MAX   150

Definition at line 401 of file pcre.cpp.

Referenced by compile_branch(), find_bracket(), is_anchored(), is_startline(), match(), and pcre_compile().

#define fc   c

Referenced by do_mail_stats(), and match().

#define fcc_offset   256

Definition at line 592 of file pcre.cpp.

Referenced by pcre_compile(), pcre_exec(), and pcre_study().

#define fi   i

Referenced by match().

#define GET ( a,
n   )     (((a)[n] << 8) | (a)[(n)+1])

Definition at line 82 of file pcre.cpp.

Referenced by adjust_recurse(), compile_branch(), compile_regex(), could_be_empty_branch(), find_firstassertedchar(), find_fixedlength(), first_significant_code(), is_anchored(), is_startline(), match(), and set_start_bits().

#define GET2 ( a,
n   )     (((a)[n] << 8) | (a)[(n)+1])

Definition at line 101 of file pcre.cpp.

Referenced by compile_branch(), could_be_empty_branch(), find_bracket(), find_fixedlength(), is_anchored(), is_startline(), and match().

#define GETCHAR ( c,
eptr   )     c = *eptr;

Definition at line 1496 of file pcre.cpp.

#define GETCHARINC ( c,
eptr   )     c = *eptr++;

Definition at line 1497 of file pcre.cpp.

#define GETCHARINCTEST ( c,
eptr   )     c = *eptr++;

Definition at line 1498 of file pcre.cpp.

Referenced by match().

#define GETCHARLEN ( c,
eptr,
len   )     c = *eptr;

Definition at line 1499 of file pcre.cpp.

#define lcc_offset   0

Definition at line 591 of file pcre.cpp.

Referenced by pcre_compile(), pcre_exec(), and pcre_study().

#define LINK_SIZE   2

Definition at line 57 of file pcre.cpp.

Referenced by adjust_recurse(), compile_branch(), compile_regex(), could_be_empty_branch(), find_bracket(), find_firstassertedchar(), find_fixedlength(), is_anchored(), is_startline(), match(), pcre_compile(), and set_start_bits().

#define MAGIC_NUMBER   0x50435245UL

Definition at line 140 of file pcre.cpp.

Referenced by pcre_compile(), pcre_exec(), and pcre_study().

#define match_condassert   0x01

Definition at line 1468 of file pcre.cpp.

Referenced by match().

#define match_isgroup   0x02

Definition at line 1469 of file pcre.cpp.

Referenced by match(), and pcre_exec().

#define MATCH_LIMIT   100000

Definition at line 58 of file pcre.cpp.

Referenced by pcre_exec().

#define MATCH_MATCH   1

Definition at line 1474 of file pcre.cpp.

Referenced by match(), and pcre_exec().

#define MATCH_NOMATCH   0

Definition at line 1475 of file pcre.cpp.

Referenced by match(), and pcre_exec().

#define MAX_PATTERN_SIZE   (1 << 16)

Definition at line 85 of file pcre.cpp.

Referenced by pcre_compile().

#define MAXLIT   250

Definition at line 1329 of file pcre.cpp.

Referenced by compile_branch(), and pcre_compile().

#define NEWLINE   '\n'

Definition at line 59 of file pcre.cpp.

Referenced by compile_branch(), match(), pcre_compile(), and pcre_exec().

#define OP_LENGTHS

Definition at line 353 of file pcre.cpp.

#define OP_NAME_LIST

Value:

"End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d",                \
  "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", "\\Z", "\\z",     \
  "Opt", "^", "$", "chars", "not",                                \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
  "*", "*?", "+", "+?", "?", "??", "{", "{",                      \
  "class", "nclass", "xclass", "Ref", "Recurse", "Callout",       \
  "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",     \
  "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\
  "Brazero", "Braminzero", "Branumber", "Bra"

Definition at line 330 of file pcre.cpp.

#define PCRE_FIRSTSET   0x40000000

Definition at line 116 of file pcre.cpp.

Referenced by pcre_compile(), pcre_exec(), and pcre_study().

#define PCRE_ICHANGED   0x08000000

Definition at line 119 of file pcre.cpp.

Referenced by pcre_compile().

#define PCRE_IMS   (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)

Definition at line 108 of file pcre.cpp.

Referenced by compile_branch(), compile_regex(), match(), and pcre_compile().

#define PCRE_REQCHSET   0x20000000

Definition at line 117 of file pcre.cpp.

Referenced by pcre_compile(), and pcre_exec().

#define PCRE_STARTLINE   0x10000000

Definition at line 118 of file pcre.cpp.

Referenced by pcre_compile(), pcre_exec(), and pcre_study().

#define PCRE_STUDY_MAPPED   0x01

Definition at line 123 of file pcre.cpp.

Referenced by pcre_exec(), and pcre_study().

#define PUBLIC_EXEC_OPTIONS   (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK)

Definition at line 133 of file pcre.cpp.

Referenced by pcre_exec().

#define PUBLIC_OPTIONS

Value:

Definition at line 128 of file pcre.cpp.

Referenced by pcre_compile().

#define PUBLIC_STUDY_OPTIONS   0

Definition at line 136 of file pcre.cpp.

Referenced by pcre_study().

#define PUT ( a,
n,
d   ) 

Value:

(a[n] = (d) >> 8), \
  (a[(n)+1] = (d) & 255)

Definition at line 78 of file pcre.cpp.

Referenced by adjust_recurse(), compile_branch(), and compile_regex().

#define PUT2 ( a,
n,
d   ) 

Value:

a[n] = (d) >> 8; \
  a[(n)+1] = (d) & 255

Definition at line 97 of file pcre.cpp.

Referenced by compile_branch().

#define PUT2INC ( a,
n,
d   )     PUT2(a,n,d), a += 2

Definition at line 104 of file pcre.cpp.

Referenced by compile_branch().

#define PUTINC ( a,
n,
d   )     PUT(a,n,d), a += LINK_SIZE

Definition at line 90 of file pcre.cpp.

Referenced by compile_branch(), and compile_regex().

#define REC_STACK_SAVE_MAX   30

Definition at line 1321 of file pcre.cpp.

Referenced by match().

#define REGISTER   register

Definition at line 5371 of file pcre.cpp.

#define REQ_BYTE_MAX   1000

Definition at line 1335 of file pcre.cpp.

Referenced by pcre_exec().

#define REQ_CASELESS   0x0100

Definition at line 150 of file pcre.cpp.

Referenced by compile_branch(), find_firstassertedchar(), pcre_compile(), and pcre_exec().

#define REQ_NONE   (-1)

Definition at line 145 of file pcre.cpp.

Referenced by compile_branch(), and compile_regex().

#define REQ_UNSET   (-2)

Definition at line 144 of file pcre.cpp.

Referenced by compile_branch(), and compile_regex().

#define REQ_VARY   0x0200

Definition at line 151 of file pcre.cpp.

Referenced by compile_branch(), compile_regex(), and pcre_compile().

#define RMATCH ( rx,
ra,
rb,
rc,
rd,
re,
rf,
rg   )     rx = match(ra,rb,rc,rd,re,rf,rg)

Definition at line 5372 of file pcre.cpp.

Referenced by match().

#define RRETURN ( ra   )     return ra

Definition at line 5373 of file pcre.cpp.

Referenced by match().

#define tables_length   (ctypes_offset + 256)

Definition at line 595 of file pcre.cpp.

Referenced by pcre_maketables().

#define XCL_END   0

Definition at line 202 of file pcre.cpp.

#define XCL_MAP   0x02

Definition at line 200 of file pcre.cpp.

#define XCL_NOT   0x01

Definition at line 199 of file pcre.cpp.

#define XCL_RANGE   2

Definition at line 204 of file pcre.cpp.

#define XCL_SINGLE   1

Definition at line 203 of file pcre.cpp.


Typedef Documentation

typedef unsigned char uschar

Definition at line 464 of file pcre.cpp.


Enumeration Type Documentation

anonymous enum

Enumerator:
ESC_A 
ESC_G 
ESC_B 
ESC_b 
ESC_D 
ESC_d 
ESC_S 
ESC_s 
ESC_W 
ESC_w 
ESC_dum1 
ESC_C 
ESC_Z 
ESC_z 
ESC_E 
ESC_Q 
ESC_REF 

Definition at line 193 of file pcre.cpp.

anonymous enum

Enumerator:
OP_END 
OP_SOD 
OP_SOM 
OP_NOT_WORD_BOUNDARY 
OP_WORD_BOUNDARY 
OP_NOT_DIGIT 
OP_DIGIT 
OP_NOT_WHITESPACE 
OP_WHITESPACE 
OP_NOT_WORDCHAR 
OP_WORDCHAR 
OP_ANY 
OP_ANYBYTE 
OP_EODN 
OP_EOD 
OP_OPT 
OP_CIRC 
OP_DOLL 
OP_CHARS 
OP_NOT 
OP_STAR 
OP_MINSTAR 
OP_PLUS 
OP_MINPLUS 
OP_QUERY 
OP_MINQUERY 
OP_UPTO 
OP_MINUPTO 
OP_EXACT 
OP_NOTSTAR 
OP_NOTMINSTAR 
OP_NOTPLUS 
OP_NOTMINPLUS 
OP_NOTQUERY 
OP_NOTMINQUERY 
OP_NOTUPTO 
OP_NOTMINUPTO 
OP_NOTEXACT 
OP_TYPESTAR 
OP_TYPEMINSTAR 
OP_TYPEPLUS 
OP_TYPEMINPLUS 
OP_TYPEQUERY 
OP_TYPEMINQUERY 
OP_TYPEUPTO 
OP_TYPEMINUPTO 
OP_TYPEEXACT 
OP_CRSTAR 
OP_CRMINSTAR 
OP_CRPLUS 
OP_CRMINPLUS 
OP_CRQUERY 
OP_CRMINQUERY 
OP_CRRANGE 
OP_CRMINRANGE 
OP_CLASS 
OP_NCLASS 
OP_XCLASS 
OP_REF 
OP_RECURSE 
OP_CALLOUT 
OP_ALT 
OP_KET 
OP_KETRMAX 
OP_KETRMIN 
OP_ASSERT 
OP_ASSERT_NOT 
OP_ASSERTBACK 
OP_ASSERTBACK_NOT 
OP_REVERSE 
OP_ONCE 
OP_COND 
OP_CREF 
OP_BRAZERO 
OP_BRAMINZERO 
OP_BRANUMBER 
OP_BRA 

Definition at line 213 of file pcre.cpp.

00213      {
00214   OP_END,            /* 0 End of pattern */
00215 
00216   /* Values corresponding to backslashed metacharacters */
00217 
00218   OP_SOD,            /* 1 Start of data: \A */
00219   OP_SOM,            /* 2 Start of match (subject + offset): \G */
00220   OP_NOT_WORD_BOUNDARY,  /*  3 \B */
00221   OP_WORD_BOUNDARY,      /*  4 \b */
00222   OP_NOT_DIGIT,          /*  5 \D */
00223   OP_DIGIT,              /*  6 \d */
00224   OP_NOT_WHITESPACE,     /*  7 \S */
00225   OP_WHITESPACE,         /*  8 \s */
00226   OP_NOT_WORDCHAR,       /*  9 \W */
00227   OP_WORDCHAR,           /* 10 \w */
00228   OP_ANY,            /* 11 Match any character */
00229   OP_ANYBYTE,        /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
00230   OP_EODN,           /* 13 End of data or \n at end of data: \Z. */
00231   OP_EOD,            /* 14 End of data: \z */
00232 
00233   OP_OPT,            /* 15 Set runtime options */
00234   OP_CIRC,           /* 16 Start of line - varies with multiline switch */
00235   OP_DOLL,           /* 17 End of line - varies with multiline switch */
00236   OP_CHARS,          /* 18 Match string of characters */
00237   OP_NOT,            /* 19 Match anything but the following char */
00238 
00239   OP_STAR,           /* 20 The maximizing and minimizing versions of */
00240   OP_MINSTAR,        /* 21 all these opcodes must come in pairs, with */
00241   OP_PLUS,           /* 22 the minimizing one second. */
00242   OP_MINPLUS,        /* 23 This first set applies to single characters */
00243   OP_QUERY,          /* 24 */
00244   OP_MINQUERY,       /* 25 */
00245   OP_UPTO,           /* 26 From 0 to n matches */
00246   OP_MINUPTO,        /* 27 */
00247   OP_EXACT,          /* 28 Exactly n matches */
00248 
00249   OP_NOTSTAR,        /* 29 The maximizing and minimizing versions of */
00250   OP_NOTMINSTAR,     /* 30 all these opcodes must come in pairs, with */
00251   OP_NOTPLUS,        /* 31 the minimizing one second. */
00252   OP_NOTMINPLUS,     /* 32 This set applies to "not" single characters */
00253   OP_NOTQUERY,       /* 33 */
00254   OP_NOTMINQUERY,    /* 34 */
00255   OP_NOTUPTO,        /* 35 From 0 to n matches */
00256   OP_NOTMINUPTO,     /* 36 */
00257   OP_NOTEXACT,       /* 37 Exactly n matches */
00258 
00259   OP_TYPESTAR,       /* 38 The maximizing and minimizing versions of */
00260   OP_TYPEMINSTAR,    /* 39 all these opcodes must come in pairs, with */
00261   OP_TYPEPLUS,       /* 40 the minimizing one second. These codes must */
00262   OP_TYPEMINPLUS,    /* 41 be in exactly the same order as those above. */
00263   OP_TYPEQUERY,      /* 42 This set applies to character types such as \d */
00264   OP_TYPEMINQUERY,   /* 43 */
00265   OP_TYPEUPTO,       /* 44 From 0 to n matches */
00266   OP_TYPEMINUPTO,    /* 45 */
00267   OP_TYPEEXACT,      /* 46 Exactly n matches */
00268 
00269   OP_CRSTAR,         /* 47 The maximizing and minimizing versions of */
00270   OP_CRMINSTAR,      /* 48 all these opcodes must come in pairs, with */
00271   OP_CRPLUS,         /* 49 the minimizing one second. These codes must */
00272   OP_CRMINPLUS,      /* 50 be in exactly the same order as those above. */
00273   OP_CRQUERY,        /* 51 These are for character classes and back refs */
00274   OP_CRMINQUERY,     /* 52 */
00275   OP_CRRANGE,        /* 53 These are different to the three sets above. */
00276   OP_CRMINRANGE,     /* 54 */
00277 
00278   OP_CLASS,          /* 55 Match a character class, chars < 256 only */
00279   OP_NCLASS,         /* 56 Same, but the bitmap was created from a negative
00280                            class - the difference is relevant only when a UTF-8
00281                            character > 255 is encountered. */
00282 
00283   OP_XCLASS,         /* 57 Extended class for handling UTF-8 chars within the
00284                            class. This does both positive and negative. */
00285 
00286   OP_REF,            /* 58 Match a back reference */
00287   OP_RECURSE,        /* 59 Match a numbered subpattern (possibly recursive) */
00288   OP_CALLOUT,        /* 60 Call out to external function if provided */
00289 
00290   OP_ALT,            /* 61 Start of alternation */
00291   OP_KET,            /* 62 End of group that doesn't have an unbounded repeat */
00292   OP_KETRMAX,        /* 63 These two must remain together and in this */
00293   OP_KETRMIN,        /* 64 order. They are for groups that repeat for ever. */
00294 
00295   /* The assertions must come before ONCE and COND */
00296 
00297   OP_ASSERT,         /* 65 Positive lookahead */
00298   OP_ASSERT_NOT,     /* 66 Negative lookahead */
00299   OP_ASSERTBACK,     /* 67 Positive lookbehind */
00300   OP_ASSERTBACK_NOT, /* 68 Negative lookbehind */
00301   OP_REVERSE,        /* 69 Move pointer back - used in lookbehind assertions */
00302 
00303   /* ONCE and COND must come after the assertions, with ONCE first, as there's
00304   a test for >= ONCE for a subpattern that isn't an assertion. */
00305 
00306   OP_ONCE,           /* 70 Once matched, don't back up into the subpattern */
00307   OP_COND,           /* 71 Conditional group */
00308   OP_CREF,           /* 72 Used to hold an extraction string number (cond ref) */
00309 
00310   OP_BRAZERO,        /* 73 These two must remain together and in this */
00311   OP_BRAMINZERO,     /* 74 order. */
00312 
00313   OP_BRANUMBER,      /* 75 Used for extracting brackets whose number is greater
00314                            than can fit into an opcode. */
00315 
00316   OP_BRA             /* 76 This and greater values are used for brackets that
00317                            extract substrings up to a basic limit. After that,
00318                            use is made of OP_BRANUMBER. */
00319 };


Function Documentation

static void adjust_recurse ( uschar *  group,
int  adjust,
bool  utf8,
compile_data *  cd 
) [static]

Definition at line 2299 of file pcre.cpp.

References find_recurse(), GET, LINK_SIZE, PUT, and compile_data::start_code.

Referenced by compile_branch().

/* Walks the compiled code from `group` onward, visiting every position that
find_recurse() returns, and patches the offset stored after each one. The
offset is read from the bytes following the opcode byte; when it refers to a
location at or beyond `group` (measured from cd->start_code) it is rewritten
as offset + adjust. Offsets that point before `group` are left untouched.

NOTE(review): presumably find_recurse() locates OP_RECURSE items and the
caller has just moved the code at `group` by `adjust` bytes - confirm against
find_recurse() and the call site in compile_branch(). */
02299 {
02300 uschar *ptr = group;
02301 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
02302   {
02303   int offset = GET(ptr, 1);      /* big-endian value held in the two bytes after the opcode */
02304   if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
02305   ptr += 1 + LINK_SIZE;          /* step over the opcode byte and its link field */
02306   }
02307 }
02308 

static int check_escape ( const uschar **  ptrptr,
const char **  errorptr,
int  bracount,
int  options,
bool  isclass 
) [static]

Definition at line 1525 of file pcre.cpp.

References ctype_digit, ctype_xdigit, digitab, ERR1, ERR2, ERR3, ERR37, ESC_REF, escapes, and PCRE_EXTRA.

Referenced by compile_branch(), and pcre_compile().

/* Decodes the escape sequence that follows a backslash. On entry *ptrptr
points at the backslash; on the normal exit path it is updated to point at the
last character consumed.

Arguments:
  ptrptr     in/out pointer into the pattern
  errorptr   receives an error message (ERR1, ERR2, ERR3 or ERR37) on failure
  bracount   count compared against a decimal \nn to decide whether it is a
             back reference
  options    option bits; only PCRE_EXTRA is examined here
  isclass    true when the escape is inside a character class

Returns:     the data character value for a literal or numeric escape;
             the non-zero escapes[] table entry for escapes resolved by the
               table lookup (callers in this file compare the negated result
               with ESC_xxx values);
             -(ESC_REF + n) for back reference n;
             0, without updating *ptrptr, when \c is the last thing in the
               pattern. */
01526 {
01527 const uschar *ptr = *ptrptr;
01528 int c, i;
01529 
01530 /* If backslash is at the end of the pattern, it's an error. */
01531 
01532 c = *(++ptr);
01533 if (c == 0) *errorptr = ERR1;
01534 
01535 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
01536 a table. A non-zero result is something that can be returned immediately.
01537 Otherwise further processing may be required. */
01538 
01539 else if (c < '0' || c > 'z') {}                           /* Not alphameric */
01540 else if ((i = escapes[c - '0']) != 0) c = i;
01541 
01542 /* Escapes that need further processing, or are illegal. This branch is only
      reached when the table lookup above stored 0 in i, so i is zero on entry
      to the switch; the octal and hex loops below rely on that as their
      starting count. */
01543 
01544 else
01545   {
01546   const uschar *oldptr;
01547   switch (c)
01548     {
01549     /* A number of Perl escapes are not handled by PCRE. We give an explicit
01550     error. */
01551 
01552     case 'l':
01553     case 'L':
01554     case 'N':
01555     case 'p':
01556     case 'P':
01557     case 'u':
01558     case 'U':
01559     case 'X':
01560     *errorptr = ERR37;
01561     break;
01562 
01563     /* The handling of escape sequences consisting of a string of digits
01564     starting with one that is not zero is not straightforward. By experiment,
01565     the way Perl works seems to be as follows:
01566 
01567     Outside a character class, the digits are read as a decimal number. If the
01568     number is less than 10, or if there are that many previous extracting
01569     left brackets, then it is a back reference. Otherwise, up to three octal
01570     digits are read to form an escaped byte. Thus \123 is likely to be octal
01571     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
01572     value is greater than 377, the least significant 8 bits are taken. Inside a
01573     character class, \ followed by a digit is always an octal number. */
01574 
01575     case '1': case '2': case '3': case '4': case '5':
01576     case '6': case '7': case '8': case '9':
01577 
01578     if (!isclass)
01579       {
01580       oldptr = ptr;
01581       c -= '0';
      /* NOTE(review): every following digit is folded into c with no limit on
      the length of the run and no overflow check, so a very long digit string
      can overflow the signed int before the comparison below. */
01582       while ((digitab[ptr[1]] & ctype_digit) != 0)
01583         c = c * 10 + *(++ptr) - '0';
01584       if (c < 10 || c <= bracount)
01585         {
01586         c = -(ESC_REF + c);
01587         break;
01588         }
01589       ptr = oldptr;      /* Put the pointer back and fall through */
01590       }
01591 
01592     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
01593     generates a binary zero byte and treats the digit as a following literal.
01594     Thus we have to pull back the pointer by one. */
01595 
01596     if ((c = *ptr) >= '8')
01597       {
01598       ptr--;
01599       c = 0;
01600       break;
01601       }
01602 
01603     /* \0 always starts an octal number, but we may drop through to here with a
01604     larger first octal digit. At most two further octal digits are consumed,
      counted by i. */
01605 
01606     case '0':
01607     c -= '0';
01608     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
01609         c = c * 8 + *(++ptr) - '0';
01610     c &= 255;     /* Take least significant 8 bits */
01611     break;
01612 
01613     /* \x is followed by at most two hex digits, giving a value in the range
01614     0-255. No \x{ddd} form is recognized by this code. */
01615 
01616     case 'x':
01617 
01618     /* Read up to two hex digits (counted by i) to form a single byte value */
01619 
01620     c = 0;
01621     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
01622       {
01623       int cc;                               /* Some compilers don't like ++ */
01624       cc = *(++ptr);                        /* in initializers */
01625       if (cc >= 'a') cc -= 32;              /* Convert to upper case */
01626       c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
01627       }
01628     break;
01629 
01630     /* Other special escapes not starting with a digit are straightforward */
01631 
01632     case 'c':
01633     c = *(++ptr);
01634     if (c == 0)
01635       {
01636       *errorptr = ERR2;
01637       return 0;          /* early exit: *ptrptr is left unchanged */
01638       }
01639 
01640     /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
01641     is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
01642 
01643     if (c >= 'a' && c <= 'z') c -= 32;
01644     c ^= 0x40;
01645     break;
01646 
01647     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
01648     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
01649     for Perl compatibility, it is a literal. This code looks a bit odd, but
01650     there used to be some cases other than the default, and there may be again
01651     in future, so I haven't "optimized" it. */
01652 
01653     default:
01654     if ((options & PCRE_EXTRA) != 0)
01655       {
01656       *errorptr = ERR3;
01657       }
01658     break;
01659     }
01660   }
01661 
01662 *ptrptr = ptr;
01663 return c;
01664 }
01665 

static int check_posix_name ( const uschar *  ptr,
int  len 
) [static]

Definition at line 2262 of file pcre.cpp.

References posix_name_lengths, and posix_names.

Referenced by compile_branch().

/* Looks up a POSIX class name in the posix_names table.

Arguments:
  ptr        points to the first character of the candidate name
  len        length of the candidate name

Returns:     the index of the first table entry whose recorded length equals
             len and whose first len characters match ptr, or -1 if the table
             has no such entry. The scan stops at the first zero entry in
             posix_name_lengths. */
02262 {
02263 register int yield = 0;
02264 while (posix_name_lengths[yield] != 0)
02265   {
02266   if (len == posix_name_lengths[yield] &&
02267     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
02268   yield++;
02269   }
02270 return -1;
02271 }
02272 

static bool check_posix_syntax ( const uschar *  ptr,
const uschar **  endptr,
compile_data *  cd 
) [static]

Definition at line 2230 of file pcre.cpp.

References ctype_letter, and compile_data::ctypes.

Referenced by compile_branch(), and pcre_compile().

/* Checks whether the text at ptr has the shape of a POSIX class item such as
[:name:]. The character immediately after ptr is taken as the terminator; an
optional '^' and then a run of letters (as classified by cd->ctypes) are
skipped, and the item is accepted only if the next two characters are the
terminator followed by ']'.

Arguments:
  ptr        points to the opening character of the item (the callers in this
             file pass a pointer to '[' followed by ':', '.' or '=')
  endptr     on success, receives a pointer to the closing terminator
  cd         supplies the ctypes table used for the letter test

Returns:     true if the syntax matches (and *endptr is set), false otherwise
             (and *endptr is not written). */
02230 {
02231 int terminator;          /* Don't combine these lines; the Solaris cc */
02232 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
02233 if (*(++ptr) == '^') ptr++;
02234 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
02235 if (*ptr == terminator && ptr[1] == ']')
02236   {
02237   *endptr = ptr;
02238   return true;
02239   }
02240 return false;
02241 }
02242 

static bool compile_branch ( int *  optionsptr,
int *  brackets,
uschar **  codeptr,
const uschar **  ptrptr,
const char **  errorptr,
int *  firstbyteptr,
int *  reqbyteptr,
branch_chain *  bcptr,
compile_data *  cd 
) [static]

Definition at line 2336 of file pcre.cpp.

References adjust_recurse(), compile_data::backref_map, cbit_digit, cbit_space, cbit_word, compile_data::cbits, check_escape(), check_posix_name(), check_posix_syntax(), compile_regex(), could_be_empty(), CREF_RECURSE, ctype_digit, ctype_meta, ctype_space, compile_data::ctypes, digitab, ERR11, ERR13, ERR14, ERR15, ERR27, ERR30, ERR31, ERR35, ERR38, ERR40, ERR43, ERR7, ERR8, ERR9, ESC_b, ESC_D, ESC_d, ESC_Q, ESC_REF, ESC_S, ESC_s, ESC_W, ESC_w, ESC_Z, EXTRACT_BASIC_MAX, compile_data::fcc, find_bracket(), GET, GET2, is_counted_repeat(), LINK_SIZE, MAXLIT, compile_data::name_entry_size, compile_data::name_table, compile_data::names_found, NEWLINE, OP_ANY, OP_ASSERT, OP_ASSERT_NOT, OP_ASSERTBACK, OP_ASSERTBACK_NOT, OP_BRA, OP_BRANUMBER, OP_BRAZERO, OP_CALLOUT, OP_CHARS, OP_CIRC, OP_CLASS, OP_COND, OP_CREF, OP_CRPLUS, OP_CRQUERY, OP_CRRANGE, OP_CRSTAR, OP_DOLL, OP_END, OP_EODN, OP_EXACT, OP_KET, OP_KETRMAX, OP_NCLASS, OP_NOT, OP_NOTSTAR, OP_ONCE, OP_OPT, OP_PLUS, OP_QUERY, OP_RECURSE, OP_REF, OP_STAR, OP_TYPESTAR, OP_UPTO, PCRE_CASELESS, PCRE_DOTALL, PCRE_EXTENDED, PCRE_EXTRA, PCRE_IMS, PCRE_MULTILINE, PCRE_NO_AUTO_CAPTURE, PCRE_UNGREEDY, posix_class_maps, PUT, PUT2, PUT2INC, PUTINC, read_repeat_counts(), REQ_CASELESS, REQ_NONE, REQ_UNSET, REQ_VARY, compile_data::req_varyopt, compile_data::start_code, and compile_data::top_backref.

Referenced by compile_regex().

02338 {
02339 int repeat_type, op_type;
02340 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
02341 int bravalue = 0;
02342 int length;
02343 int greedy_default, greedy_non_default;
02344 int firstbyte, reqbyte;
02345 int zeroreqbyte, zerofirstbyte;
02346 int req_caseopt, reqvary, tempreqvary;
02347 int condcount = 0;
02348 int options = *optionsptr;
02349 register int c;
02350 register uschar *code = *codeptr;
02351 uschar *tempcode;
02352 bool inescq = false;
02353 bool groupsetfirstbyte = false;
02354 const uschar *ptr = *ptrptr;
02355 const uschar *tempptr;
02356 uschar *previous = NULL;
02357 uschar classa[32];
02358 
02359 bool utf8 = false;
02360 
02361 /* Set up the default and non-default settings for greediness */
02362 
02363 greedy_default = ((options & PCRE_UNGREEDY) != 0);
02364 greedy_non_default = greedy_default ^ 1;
02365 
02366 /* Initialize no first char, no required char. REQ_UNSET means "no char
02367 matching encountered yet". It gets changed to REQ_NONE if we hit something that
02368 matches a non-fixed char first char; reqbyte just remains unset if we never
02369 find one.
02370 
02371 When we hit a repeat whose minimum is zero, we may have to adjust these values
02372 to take the zero repeat into account. This is implemented by setting them to
02373 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
02374 item types that can be repeated set these backoff variables appropriately. */
02375 
02376 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
02377 
02378 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
02379 according to the current setting of the caseless flag. REQ_CASELESS is a bit
02380 value > 255. It is added into the firstbyte or reqbyte variables to record the
02381 case status of the value. */
02382 
02383 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
02384 
02385 /* Switch on next character until the end of the branch */
02386 
02387 for (;; ptr++)
02388   {
02389   bool negate_class;
02390   bool possessive_quantifier;
02391   int class_charcount;
02392   int class_lastchar;
02393   int newoptions;
02394   int recno;
02395   int skipbytes;
02396   int subreqbyte;
02397   int subfirstbyte;
02398 
02399   c = *ptr;
02400   if (inescq && c != 0) goto NORMAL_CHAR;
02401 
02402   if ((options & PCRE_EXTENDED) != 0)
02403     {
02404     if ((cd->ctypes[c] & ctype_space) != 0) continue;
02405     if (c == '#')
02406       {
02407       /* The space before the ; is to avoid a warning on a silly compiler
02408       on the Macintosh. */
02409       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
02410       if (c != 0) continue;   /* Else fall through to handle end of string */
02411       }
02412     }
02413 
02414   switch(c)
02415     {
02416     /* The branch terminates at end of string, |, or ). */
02417 
02418     case 0:
02419     case '|':
02420     case ')':
02421     *firstbyteptr = firstbyte;
02422     *reqbyteptr = reqbyte;
02423     *codeptr = code;
02424     *ptrptr = ptr;
02425     return true;
02426 
02427     /* Handle single-character metacharacters. In multiline mode, ^ disables
02428     the setting of any following char as a first character. */
02429 
02430     case '^':
02431     if ((options & PCRE_MULTILINE) != 0)
02432       {
02433       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
02434       }
02435     previous = NULL;
02436     *code++ = OP_CIRC;
02437     break;
02438 
02439     case '$':
02440     previous = NULL;
02441     *code++ = OP_DOLL;
02442     break;
02443 
02444     /* There can never be a first char if '.' is first, whatever happens about
02445     repeats. The value of reqbyte doesn't change either. */
02446 
02447     case '.':
02448     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
02449     zerofirstbyte = firstbyte;
02450     zeroreqbyte = reqbyte;
02451     previous = code;
02452     *code++ = OP_ANY;
02453     break;
02454 
02455     /* Character classes. If the included characters are all < 255 in value, we
02456     build a 32-byte bitmap of the permitted characters, except in the special
02457     case where there is only one such character. For negated classes, we build
02458     the map as usual, then invert it at the end. However, we use a different
02459     opcode so that data characters > 255 can be handled correctly.
02460 
02461     If the class contains characters outside the 0-255 range, a different
02462     opcode is compiled. It may optionally have a bit map for characters < 256,
02463     but those above are explicitly listed afterwards. A flag byte tells
02464     whether the bitmap is present, and whether this is a negated class or not.
02465     */
02466 
02467     case '[':
02468     previous = code;
02469 
02470     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
02471     they are encountered at the top level, so we'll do that too. */
02472 
02473     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
02474         check_posix_syntax(ptr, &tempptr, cd))
02475       {
02476       *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
02477       goto FAILED;
02478       }
02479 
02480     /* If the first character is '^', set the negation flag and skip it. */
02481 
02482     if ((c = *(++ptr)) == '^')
02483       {
02484       negate_class = true;
02485       c = *(++ptr);
02486       }
02487     else
02488       {
02489       negate_class = false;
02490       }
02491 
02492     /* Keep a count of chars with values < 256 so that we can optimize the case
02493     of just a single character (as long as it's < 256). For higher valued UTF-8
02494     characters, we don't yet do any optimization. */
02495 
02496     class_charcount = 0;
02497     class_lastchar = -1;
02498 
02499 
02500     /* Initialize the 32-char bit map to all zeros. We have to build the
02501     map in a temporary bit of store, in case the class contains only 1
02502     character (< 256), because in that case the compiled code doesn't use the
02503     bit map. */
02504 
02505     memset(classa, 0, 32 * sizeof(uschar));
02506 
02507     /* Process characters until ] is reached. By writing this as a "do" it
02508     means that an initial ] is taken as a data character. The first pass
02509     through the regex checked the overall syntax, so we don't need to be very
02510     strict here. At the start of the loop, c contains the first byte of the
02511     character. */
02512 
02513     do
02514       {
02515 
02516       /* Inside \Q...\E everything is literal except \E */
02517 
02518       if (inescq)
02519         {
02520         if (c == '\\' && ptr[1] == 'E')
02521           {
02522           inescq = false;
02523           ptr++;
02524           continue;
02525           }
02526         else goto LONE_SINGLE_CHARACTER;
02527         }
02528 
02529       /* Handle POSIX class names. Perl allows a negation extension of the
02530       form [:^name:]. A square bracket that doesn't match the syntax is
02531       treated as a literal. We also recognize the POSIX constructions
02532       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
02533       5.6 and 5.8 do. */
02534 
02535       if (c == '[' &&
02536           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
02537           check_posix_syntax(ptr, &tempptr, cd))
02538         {
02539         bool local_negate = false;
02540         int posix_class, i;
02541         register const uschar *cbits = cd->cbits;
02542 
02543         if (ptr[1] != ':')
02544           {
02545           *errorptr = ERR31;
02546           goto FAILED;
02547           }
02548 
02549         ptr += 2;
02550         if (*ptr == '^')
02551           {
02552           local_negate = true;
02553           ptr++;
02554           }
02555 
02556         posix_class = check_posix_name(ptr, tempptr - ptr);
02557         if (posix_class < 0)
02558           {
02559           *errorptr = ERR30;
02560           goto FAILED;
02561           }
02562 
02563         /* If matching is caseless, upper and lower are converted to
02564         alpha. This relies on the fact that the class table starts with
02565         alpha, lower, upper as the first 3 entries. */
02566 
02567         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
02568           posix_class = 0;
02569 
02570         /* Or into the map we are building up to 3 of the static class
02571         tables, or their negations. The [:blank:] class sets up the same
02572         chars as the [:space:] class (all white space). We remove the vertical
02573         white space chars afterwards. */
02574 
02575         posix_class *= 3;
02576         for (i = 0; i < 3; i++)
02577           {
02578           bool blankclass = strncmp((char *)ptr, "blank", 5) == 0;
02579           int taboffset = posix_class_maps[posix_class + i];
02580           if (taboffset < 0) break;
02581           if (local_negate)
02582             {
02583             for (c = 0; c < 32; c++) classa[c] |= ~cbits[c+taboffset];
02584             if (blankclass) classa[1] |= 0x3c;
02585             }
02586           else
02587             {
02588             for (c = 0; c < 32; c++) classa[c] |= cbits[c+taboffset];
02589             if (blankclass) classa[1] &= ~0x3c;
02590             }
02591           }
02592 
02593         ptr = tempptr + 1;
02594         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
02595         continue;    /* End of POSIX syntax handling */
02596         }
02597 
02598       /* Backslash may introduce a single character, or it may introduce one
02599       of the specials, which just set a flag. Escaped items are checked for
02600       validity in the pre-compiling pass. The sequence \b is a special case.
02601       Inside a class (and only there) it is treated as backspace. Elsewhere
02602       it marks a word boundary. Other escapes have preset maps ready to
02603       or into the one we are building. We assume they have more than one
02604       character in them, so set class_charcount bigger than one. */
02605 
02606       if (c == '\\')
02607         {
02608         c = check_escape(&ptr, errorptr, *brackets, options, true);
02609         if (-c == ESC_b) c = '\b';  /* \b is backspace in a class */
02610 
02611         if (-c == ESC_Q)            /* Handle start of quoted string */
02612           {
02613           if (ptr[1] == '\\' && ptr[2] == 'E')
02614             {
02615             ptr += 2; /* avoid empty string */
02616             }
02617           else inescq = true;
02618           continue;
02619           }
02620 
02621         else if (c < 0)
02622           {
02623           register const uschar *cbits = cd->cbits;
02624           class_charcount = 10;     /* Greater than 1 is what matters */
02625           switch (-c)
02626             {
02627             case ESC_d:
02628             for (c = 0; c < 32; c++) classa[c] |= cbits[c+cbit_digit];
02629             continue;
02630 
02631             case ESC_D:
02632             for (c = 0; c < 32; c++) classa[c] |= ~cbits[c+cbit_digit];
02633             continue;
02634 
02635             case ESC_w:
02636             for (c = 0; c < 32; c++) classa[c] |= cbits[c+cbit_word];
02637             continue;
02638 
02639             case ESC_W:
02640             for (c = 0; c < 32; c++) classa[c] |= ~cbits[c+cbit_word];
02641             continue;
02642 
02643             case ESC_s:
02644             for (c = 0; c < 32; c++) classa[c] |= cbits[c+cbit_space];
02645             classa[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
02646             continue;
02647 
02648             case ESC_S:
02649             for (c = 0; c < 32; c++) classa[c] |= ~cbits[c+cbit_space];
02650             classa[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
02651             continue;
02652 
02653             /* Unrecognized escapes are faulted if PCRE is running in its
02654             strict mode. By default, for compatibility with Perl, they are
02655             treated as literals. */
02656 
02657             default:
02658             if ((options & PCRE_EXTRA) != 0)
02659               {
02660               *errorptr = ERR7;
02661               goto FAILED;
02662               }
02663             c = *ptr;    /* The final character */
02664             }
02665           }
02666 
02667         /* Fall through if we have a single character (c >= 0). This may be
02668         > 256 in UTF-8 mode. */
02669 
02670         }   /* End of backslash handling */
02671 
02672       /* A single character may be followed by '-' to form a range. However,
02673       Perl does not permit ']' to be the end of the range. A '-' character
02674       here is treated as a literal. */
02675 
02676       if (ptr[1] == '-' && ptr[2] != ']')
02677         {
02678         int d;
02679         ptr += 2;
02680 
02681         d = *ptr;
02682 
02683         /* The second part of a range can be a single-character escape, but
02684         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
02685         in such circumstances. */
02686 
02687         if (d == '\\')
02688           {
02689           const uschar *oldptr = ptr;
02690           d = check_escape(&ptr, errorptr, *brackets, options, true);
02691 
02692           /* \b is backspace; any other special means the '-' was literal */
02693 
02694           if (d < 0)
02695             {
02696             if (d == -ESC_b) d = '\b'; else
02697               {
02698               ptr = oldptr - 2;
02699               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
02700               }
02701             }
02702           }
02703 
02704         /* Check that the two values are in the correct order */
02705 
02706         if (d < c)
02707           {
02708           *errorptr = ERR8;
02709           goto FAILED;
02710           }
02711 
02712         /* If d is greater than 255, we can't just use the bit map, so set up
02713         for the UTF-8 supporting class type. If we are not caseless, we can
02714         just set up a single range. If we are caseless, the characters < 256
02715         are handled with a bitmap, in order to get the case-insensitive
02716         handling. */
02717 
02718         /* We use the bit map if the range is entirely < 255, or if part of it
02719         is < 255 and matching is caseless. */
02720 
02721         for (; c <= d; c++)
02722           {
02723           classa[c/8] |= (1 << (c&7));
02724           if ((options & PCRE_CASELESS) != 0)
02725             {
02726             int uc = cd->fcc[c];           /* flip case */
02727             classa[uc/8] |= (1 << (uc&7));
02728             }
02729           class_charcount++;                /* in case a one-char range */
02730           class_lastchar = c;
02731           }
02732 
02733         continue;   /* Go get the next char in the class */
02734         }
02735 
02736       /* Handle a lone single character - we can get here for a normal
02737       non-escape char, or after \ that introduces a single character. */
02738 
02739       LONE_SINGLE_CHARACTER:
02740 
02741       /* Handle a single-byte character */
02742         {
02743         classa[c/8] |= (1 << (c&7));
02744         if ((options & PCRE_CASELESS) != 0)
02745           {
02746           c = cd->fcc[c];   /* flip case */
02747           classa[c/8] |= (1 << (c&7));
02748           }
02749         class_charcount++;
02750         class_lastchar = c;
02751         }
02752       }
02753 
02754     /* Loop until ']' reached; the check for end of string happens inside the
02755     loop. This "while" is the end of the "do" above. */
02756 
02757     while ((c = *(++ptr)) != ']' || inescq);
02758 
02759     /* If class_charcount is 1, we saw precisely one character with a value <
02760     256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
02761     the one character is < 128. In non-UTF-8 mode we can always optimize.
02762 
02763     The optimization throws away the bit map. We turn the item into a
02764     1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
02765     that OP_NOT does not support multibyte characters. In the positive case, it
02766     can cause firstbyte to be set. Otherwise, there can be no first char if
02767     this item is first, whatever repeat count may follow. In the case of
02768     reqbyte, save the previous value for reinstating. */
02769 
02770     if (class_charcount == 1)
02771       {
02772       zeroreqbyte = reqbyte;
02773       if (negate_class)
02774         {
02775         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
02776         zerofirstbyte = firstbyte;
02777         *code++ = OP_NOT;
02778         }
02779       else
02780         {
02781         if (firstbyte == REQ_UNSET)
02782           {
02783           zerofirstbyte = REQ_NONE;
02784           firstbyte = class_lastchar | req_caseopt;
02785           }
02786         else
02787           {
02788           zerofirstbyte = firstbyte;
02789           reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
02790           }
02791         *code++ = OP_CHARS;
02792         *code++ = 1;
02793         }
02794       *code++ = class_lastchar;
02795       break;  /* End of class handling */
02796       }       /* End of 1-byte optimization */
02797 
02798     /* Otherwise, if this is the first thing in the branch, there can be no
02799     first char setting, whatever the repeat count. Any reqbyte setting must
02800     remain unchanged after any kind of repeat. */
02801 
02802     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
02803     zerofirstbyte = firstbyte;
02804     zeroreqbyte = reqbyte;
02805 
02806     /* If there are characters with values > 255, we have to compile an
02807     extended class, with its own opcode. If there are no characters < 256,
02808     we can omit the bitmap. */
02809 
02810 
02811     /* If there are no characters > 255, negate the 32-byte map if necessary,
02812     and copy it into the code vector. If this is the first thing in the branch,
02813     there can be no first char setting, whatever the repeat count. Any reqbyte
02814     setting must remain unchanged after any kind of repeat. */
02815 
02816     if (negate_class)
02817       {
02818       *code++ = OP_NCLASS;
02819       for (c = 0; c < 32; c++) code[c] = ~classa[c];
02820       }
02821     else
02822       {
02823       *code++ = OP_CLASS;
02824       memcpy(code, classa, 32);
02825       }
02826     code += 32;
02827     break;
02828 
02829     /* Various kinds of repeat */
02830 
02831     case '{':
02832     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
02833     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
02834     if (*errorptr != NULL) goto FAILED;
02835     goto REPEAT;
02836 
02837     case '*':
02838     repeat_min = 0;
02839     repeat_max = -1;
02840     goto REPEAT;
02841 
02842     case '+':
02843     repeat_min = 1;
02844     repeat_max = -1;
02845     goto REPEAT;
02846 
02847     case '?':
02848     repeat_min = 0;
02849     repeat_max = 1;
02850 
02851     REPEAT:
02852     if (previous == NULL)
02853       {
02854       *errorptr = ERR9;
02855       goto FAILED;
02856       }
02857 
02858     if (repeat_min == 0)
02859       {
02860       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
02861       reqbyte = zeroreqbyte;        /* Ditto */
02862       }
02863 
02864     /* Remember whether this is a variable length repeat */
02865 
02866     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
02867 
02868     op_type = 0;                    /* Default single-char op codes */
02869     possessive_quantifier = false;  /* Default not possessive quantifier */
02870 
02871     /* Save start of previous item, in case we have to move it up to make space
02872     for an inserted OP_ONCE for the additional '+' extension. */
02873 
02874     tempcode = previous;
02875 
02876     /* If the next character is '+', we have a possessive quantifier. This
02877     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
02878     If the next character is '?' this is a minimizing repeat, by default,
02879     but if PCRE_UNGREEDY is set, it works the other way round. We change the
02880     repeat type to the non-default. */
02881 
02882     if (ptr[1] == '+')
02883       {
02884       repeat_type = 0;                  /* Force greedy */
02885       possessive_quantifier = true;
02886       ptr++;
02887       }
02888     else if (ptr[1] == '?')
02889       {
02890       repeat_type = greedy_non_default;
02891       ptr++;
02892       }
02893     else repeat_type = greedy_default;
02894 
02895     /* If previous was a recursion, we need to wrap it inside brackets so that
02896     it can be replicated if necessary. */
02897 
02898     if (*previous == OP_RECURSE)
02899       {
02900       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
02901       code += 1 + LINK_SIZE;
02902       *previous = OP_BRA;
02903       PUT(previous, 1, code - previous);
02904       *code = OP_KET;
02905       PUT(code, 1, code - previous);
02906       code += 1 + LINK_SIZE;
02907       }
02908 
02909     /* If previous was a string of characters, chop off the last one and use it
02910     as the subject of the repeat. If there was only one character, we can
02911     abolish the previous item altogether. If a one-char item has a minimum of
02912     more than one, ensure that it is set in reqbyte - it might not be if a
02913     sequence such as x{3} is the first thing in a branch because the x will
02914     have gone into firstbyte instead.  */
02915 
02916     if (*previous == OP_CHARS)
02917       {
02918       /* Deal with UTF-8 characters that take up more than one byte. It's
02919       easier to write this out separately than try to macrify it. Use c to
02920       hold the length of the character in bytes, plus 0x80 to flag that it's a
02921       length rather than a small character. */
02922 
02923 
02924       /* Handle the case of a single byte - either with no UTF8 support, or
02925       with UTF-8 disabled, or for a UTF-8 character < 128. */
02926 
02927         {
02928         c = *(--code);
02929         if (code == previous + 2)   /* There was only one character */
02930           {
02931           code = previous;              /* Abolish the previous item */
02932           if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
02933           }
02934         else
02935           {
02936           previous[1]--;             /* adjust length */
02937           tempcode = code;           /* Adjust position to be moved for '+' */
02938           }
02939         }
02940 
02941       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
02942       }
02943 
02944     /* If previous was a single negated character ([^a] or similar), we use
02945     one of the special opcodes, replacing it. The code is shared with single-
02946     character repeats by setting op_type to add a suitable offset into
02947     repeat_type. OP_NOT is currently used only for single-byte chars. */
02948 
02949     else if (*previous == OP_NOT)
02950       {
02951       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
02952       c = previous[1];
02953       code = previous;
02954       goto OUTPUT_SINGLE_REPEAT;
02955       }
02956 
02957     /* If previous was a character type match (\d or similar), abolish it and
02958     create a suitable repeat item. The code is shared with single-character
02959     repeats by setting op_type to add a suitable offset into repeat_type. */
02960 
02961     else if (*previous < OP_EODN)
02962       {
02963       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
02964       c = *previous;
02965       code = previous;
02966 
02967       OUTPUT_SINGLE_REPEAT:
02968 
02969       /* If the maximum is zero then the minimum must also be zero; Perl allows
02970       this case, so we do too - by simply omitting the item altogether. */
02971 
02972       if (repeat_max == 0) goto END_REPEAT;
02973 
02974       /* Combine the op_type with the repeat_type */
02975 
02976       repeat_type += op_type;
02977 
02978       /* A minimum of zero is handled either as the special case * or ?, or as
02979       an UPTO, with the maximum given. */
02980 
02981       if (repeat_min == 0)
02982         {
02983         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
02984           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
02985         else
02986           {
02987           *code++ = OP_UPTO + repeat_type;
02988           PUT2INC(code, 0, repeat_max);
02989           }
02990         }
02991 
02992       /* The case {1,} is handled as the special case + */
02993 
02994       else if (repeat_min == 1 && repeat_max == -1)
02995         *code++ = OP_PLUS + repeat_type;
02996 
02997       /* The case {n,n} is just an EXACT, while the general case {n,m} is
02998       handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
02999 
03000       else
03001         {
03002         if (repeat_min != 1)
03003           {
03004           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
03005           PUT2INC(code, 0, repeat_min);
03006           }
03007 
03008         /* If the minimum is 1 and the previous item was a character string,
03009         we either have to put back the item that got cancelled if the string
03010         length was 1, or add the character back onto the end of a longer
03011         string. For a character type nothing need be done; it will just get
03012         put back naturally. Note that the final character is always going to
03013         get added below, so we leave code ready for its insertion. */
03014 
03015         else if (*previous == OP_CHARS)
03016           {
03017           if (code == previous) code += 2; else
03018 
03019           /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
03020           bit set as a flag. The length will always be between 2 and 6. */
03021 
03022           previous[1]++;
03023           }
03024 
03025         /*  For a single negated character we also have to put back the
03026         item that got cancelled. At present this applies only to single byte
03027         characters in any mode. */
03028 
03029         else if (*previous == OP_NOT) code++;
03030 
03031         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
03032         we have to insert the character for the previous code. In UTF-8 mode,
03033         long characters have their length in c, with the 0x80 bit as a flag. */
03034 
03035         if (repeat_max < 0)
03036           {
03037           *code++ = c;
03038           *code++ = OP_STAR + repeat_type;
03039           }
03040 
03041         /* Else insert an UPTO if the max is greater than the min, again
03042         preceded by the character, for the previously inserted code. */
03043 
03044         else if (repeat_max != repeat_min)
03045           {
03046           *code++ = c;
03047           repeat_max -= repeat_min;
03048           *code++ = OP_UPTO + repeat_type;
03049           PUT2INC(code, 0, repeat_max);
03050           }
03051         }
03052 
03053       /* The character or character type itself comes last in all cases. */
03054 
03055 
03056       *code++ = c;
03057       }
03058 
03059     /* If previous was a character class or a back reference, we put the repeat
03060     stuff after it, but just skip the item if the repeat was {0,0}. */
03061 
03062     else if (*previous == OP_CLASS ||
03063              *previous == OP_NCLASS ||
03064              *previous == OP_REF)
03065       {
03066       if (repeat_max == 0)
03067         {
03068         code = previous;
03069         goto END_REPEAT;
03070         }
03071       if (repeat_min == 0 && repeat_max == -1)
03072         *code++ = OP_CRSTAR + repeat_type;
03073       else if (repeat_min == 1 && repeat_max == -1)
03074         *code++ = OP_CRPLUS + repeat_type;
03075       else if (repeat_min == 0 && repeat_max == 1)
03076         *code++ = OP_CRQUERY + repeat_type;
03077       else
03078         {
03079         *code++ = OP_CRRANGE + repeat_type;
03080         PUT2INC(code, 0, repeat_min);
03081         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
03082         PUT2INC(code, 0, repeat_max);
03083         }
03084       }
03085 
03086     /* If previous was a bracket group, we may have to replicate it in certain
03087     cases. */
03088 
03089     else if (*previous >= OP_BRA || *previous == OP_ONCE ||
03090              *previous == OP_COND)
03091       {
03092       register int i;
03093       int ketoffset = 0;
03094       int len = code - previous;
03095       uschar *bralink = NULL;
03096 
03097       /* If the maximum repeat count is unlimited, find the end of the bracket
03098       by scanning through from the start, and compute the offset back to it
03099       from the current code pointer. There may be an OP_OPT setting following
03100       the final KET, so we can't find the end just by going back from the code
03101       pointer. */
03102 
03103       if (repeat_max == -1)
03104         {
03105         register uschar *ket = previous;
03106         do ket += GET(ket, 1); while (*ket != OP_KET);
03107         ketoffset = code - ket;
03108         }
03109 
03110       /* The case of a zero minimum is special because of the need to stick
03111       OP_BRAZERO in front of it, and because the group appears once in the
03112       data, whereas in other cases it appears the minimum number of times. For
03113       this reason, it is simplest to treat this case separately, as otherwise
03114       the code gets far too messy. There are several special subcases when the
03115       minimum is zero. */
03116 
03117       if (repeat_min == 0)
03118         {
03119         /* If the maximum is also zero, we just omit the group from the output
03120         altogether. */
03121 
03122         if (repeat_max == 0)
03123           {
03124           code = previous;
03125           goto END_REPEAT;
03126           }
03127 
03128         /* If the maximum is 1 or unlimited, we just have to stick in the
03129         BRAZERO and do no more at this point. However, we do need to adjust
03130         any OP_RECURSE calls inside the group that refer to the group itself or
03131         any internal group, because the offset is from the start of the whole
03132         regex. Temporarily terminate the pattern while doing this. */
03133 
03134         if (repeat_max <= 1)
03135           {
03136           *code = OP_END;
03137           adjust_recurse(previous, 1, utf8, cd);
03138           memmove(previous+1, previous, len);
03139           code++;
03140           *previous++ = OP_BRAZERO + repeat_type;
03141           }
03142 
03143         /* If the maximum is greater than 1 and limited, we have to replicate
03144         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
03145         The first one has to be handled carefully because it's the original
03146         copy, which has to be moved up. The remainder can be handled by code
03147         that is common with the non-zero minimum case below. We have to
03148           adjust the value of repeat_max, since one less copy is required. Once
03149         again, we may have to adjust any OP_RECURSE calls inside the group. */
03150 
03151         else
03152           {
03153           int offset;
03154           *code = OP_END;
03155           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
03156           memmove(previous + 2 + LINK_SIZE, previous, len);
03157           code += 2 + LINK_SIZE;
03158           *previous++ = OP_BRAZERO + repeat_type;
03159           *previous++ = OP_BRA;
03160 
03161           /* We chain together the bracket offset fields that have to be
03162           filled in later when the ends of the brackets are reached. */
03163 
03164           offset = (bralink == NULL)? 0 : previous - bralink;
03165           bralink = previous;
03166           PUTINC(previous, 0, offset);
03167           }
03168 
03169         repeat_max--;
03170         }
03171 
03172       /* If the minimum is greater than zero, replicate the group as many
03173       times as necessary, and adjust the maximum to the number of subsequent
03174       copies that we need. If we set a first char from the group, and didn't
03175       set a required char, copy the latter from the former. */
03176 
03177       else
03178         {
03179         if (repeat_min > 1)
03180           {
03181           if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
03182           for (i = 1; i < repeat_min; i++)
03183             {
03184             memcpy(code, previous, len);
03185             code += len;
03186             }
03187           }
03188         if (repeat_max > 0) repeat_max -= repeat_min;
03189         }
03190 
03191       /* This code is common to both the zero and non-zero minimum cases. If
03192       the maximum is limited, it replicates the group in a nested fashion,
03193       remembering the bracket starts on a stack. In the case of a zero minimum,
03194       the first one was set up above. In all cases the repeat_max now specifies
03195       the number of additional copies needed. */
03196 
03197       if (repeat_max >= 0)
03198         {
03199         for (i = repeat_max - 1; i >= 0; i--)
03200           {
03201           *code++ = OP_BRAZERO + repeat_type;
03202 
03203           /* All but the final copy start a new nesting, maintaining the
03204           chain of brackets outstanding. */
03205 
03206           if (i != 0)
03207             {
03208             int offset;
03209             *code++ = OP_BRA;
03210             offset = (bralink == NULL)? 0 : code - bralink;
03211             bralink = code;
03212             PUTINC(code, 0, offset);
03213             }
03214 
03215           memcpy(code, previous, len);
03216           code += len;
03217           }
03218 
03219         /* Now chain through the pending brackets, and fill in their length
03220         fields (which are holding the chain links pro tem). */
03221 
03222         while (bralink != NULL)
03223           {
03224           int oldlinkoffset;
03225           int offset = code - bralink + 1;
03226           uschar *bra = code - offset;
03227           oldlinkoffset = GET(bra, 1);
03228           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
03229           *code++ = OP_KET;
03230           PUTINC(code, 0, offset);
03231           PUT(bra, 1, offset);
03232           }
03233         }
03234 
03235       /* If the maximum is unlimited, set a repeater in the final copy. We
03236       can't just offset backwards from the current code point, because we
03237       don't know if there's been an options resetting after the ket. The
03238       correct offset was computed above. */
03239 
03240       else code[-ketoffset] = OP_KETRMAX + repeat_type;
03241       }
03242 
03243     /* Else there's some kind of shambles */
03244 
03245     else
03246       {
03247       *errorptr = ERR11;
03248       goto FAILED;
03249       }
03250 
03251     /* If the character following a repeat is '+', we wrap the entire repeated
03252     item inside OP_ONCE brackets. This is just syntactic sugar, taken from
03253     Sun's Java package. The repeated item starts at tempcode, not at previous,
03254     which might be the first part of a string whose (former) last char we
03255     repeated. However, we don't support '+' after a greediness '?'. */
03256 
03257     if (possessive_quantifier)
03258       {
03259       int len = code - tempcode;
03260       memmove(tempcode + 1+LINK_SIZE, tempcode, len);
03261       code += 1 + LINK_SIZE;
03262       len += 1 + LINK_SIZE;
03263       tempcode[0] = OP_ONCE;
03264       *code++ = OP_KET;
03265       PUTINC(code, 0, len);
03266       PUT(tempcode, 1, len);
03267       }
03268 
03269     /* In all cases we no longer have a previous item. We also set the
03270     "follows varying string" flag for subsequently encountered reqbytes if
03271     it isn't already set and we have just passed a varying length item. */
03272 
03273     END_REPEAT:
03274     previous = NULL;
03275     cd->req_varyopt |= reqvary;
03276     break;
03277 
03278 
03279     /* Start of nested bracket sub-expression, or comment or lookahead or
03280     lookbehind or option setting or condition. First deal with special things
03281     that can come after a bracket; all are introduced by ?, and the appearance
03282     of any of them means that this is not a referencing group. They were
03283     checked for validity in the first pass over the string, so we don't have to
03284     check for syntax errors here.  */
03285 
03286     case '(':
03287     newoptions = options;
03288     skipbytes = 0;
03289 
03290     if (*(++ptr) == '?')
03291       {
03292       int set, unset;
03293       int *optset;
03294 
03295       switch (*(++ptr))
03296         {
03297         case '#':                 /* Comment; skip to ket */
03298         ptr++;
03299         while (*ptr != ')') ptr++;
03300         continue;
03301 
03302         case ':':                 /* Non-extracting bracket */
03303         bravalue = OP_BRA;
03304         ptr++;
03305         break;
03306 
03307         case '(':
03308         bravalue = OP_COND;       /* Conditional group */
03309 
03310         /* Condition to test for recursion */
03311 
03312         if (ptr[1] == 'R')
03313           {
03314           code[1+LINK_SIZE] = OP_CREF;
03315           PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
03316           skipbytes = 3;
03317           ptr += 3;
03318           }
03319 
03320         /* Condition to test for a numbered subpattern match. We know that
03321         if a digit follows ( then there will just be digits until ) because
03322         the syntax was checked in the first pass. */
03323 
03324         else if ((digitab[ptr[1]] && ctype_digit) != 0)
03325           {
03326           int condref;                 /* Don't amalgamate; some compilers */
03327           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */
03328           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
03329           if (condref == 0)
03330             {
03331             *errorptr = ERR35;
03332             goto FAILED;
03333             }
03334           ptr++;
03335           code[1+LINK_SIZE] = OP_CREF;
03336           PUT2(code, 2+LINK_SIZE, condref);
03337           skipbytes = 3;
03338           }
03339         /* For conditions that are assertions, we just fall through, having
03340         set bravalue above. */
03341         break;
03342 
03343         case '=':                 /* Positive lookahead */
03344         bravalue = OP_ASSERT;
03345         ptr++;
03346         break;
03347 
03348         case '!':                 /* Negative lookahead */
03349         bravalue = OP_ASSERT_NOT;
03350         ptr++;
03351         break;
03352 
03353         case '<':                 /* Lookbehinds */
03354         switch (*(++ptr))
03355           {
03356           case '=':               /* Positive lookbehind */
03357           bravalue = OP_ASSERTBACK;
03358           ptr++;
03359           break;
03360 
03361           case '!':               /* Negative lookbehind */
03362           bravalue = OP_ASSERTBACK_NOT;
03363           ptr++;
03364           break;
03365           }
03366         break;
03367 
03368         case '>':                 /* One-time brackets */
03369         bravalue = OP_ONCE;
03370         ptr++;
03371         break;
03372 
03373         case 'C':                 /* Callout - may be followed by digits */
03374         *code++ = OP_CALLOUT;
03375           {
03376           int n = 0;
03377           while ((digitab[*(++ptr)] & ctype_digit) != 0)
03378             n = n * 10 + *ptr - '0';
03379           if (n > 255)
03380             {
03381             *errorptr = ERR38;
03382             goto FAILED;
03383             }
03384           *code++ = n;
03385           }
03386         previous = NULL;
03387         continue;
03388 
03389         case 'P':                 /* Named subpattern handling */
03390         if (*(++ptr) == '<')      /* Definition */
03391           {
03392           int i, namelen;
03393           uschar *slot = cd->name_table;
03394           const uschar *name;     /* Don't amalgamate; some compilers */
03395           name = ++ptr;           /* grumble at autoincrement in declaration */
03396 
03397           while (*ptr++ != '>');
03398           namelen = ptr - name - 1;
03399 
03400           for (i = 0; i < cd->names_found; i++)
03401             {
03402             int crc = memcmp(name, slot+2, namelen);
03403             if (crc == 0)
03404               {
03405               if (slot[2+namelen] == 0)
03406                 {
03407                 *errorptr = ERR43;
03408                 goto FAILED;
03409                 }
03410               crc = -1;             /* Current name is substring */
03411               }
03412             if (crc < 0)
03413               {
03414               memmove(slot + cd->name_entry_size, slot,
03415                 (cd->names_found - i) * cd->name_entry_size);
03416               break;
03417               }
03418             slot += cd->name_entry_size;
03419             }
03420 
03421           PUT2(slot, 0, *brackets + 1);
03422           memcpy(slot + 2, name, namelen);
03423           slot[2+namelen] = 0;
03424           cd->names_found++;
03425           goto NUMBERED_GROUP;
03426           }
03427 
03428         if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */
03429           {
03430           int i, namelen;
03431           int type = *ptr++;
03432           const uschar *name = ptr;
03433           uschar *slot = cd->name_table;
03434 
03435           while (*ptr != ')') ptr++;
03436           namelen = ptr - name;
03437 
03438           for (i = 0; i < cd->names_found; i++)
03439             {
03440             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
03441             slot += cd->name_entry_size;
03442             }
03443           if (i >= cd->names_found)
03444             {
03445             *errorptr = ERR15;
03446             goto FAILED;
03447             }
03448 
03449           recno = GET2(slot, 0);
03450 
03451           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */
03452 
03453           /* Back reference */
03454 
03455           previous = code;
03456           *code++ = OP_REF;
03457           PUT2INC(code, 0, recno);
03458           cd->backref_map |= (recno < 32)? (1 << recno) : 1;
03459           if (recno > cd->top_backref) cd->top_backref = recno;
03460           continue;
03461           }
03462 
03463         /* Should never happen */
03464         break;
03465 
03466         case 'R':                 /* Pattern recursion */
03467         ptr++;                    /* Same as (?0)      */
03468         /* Fall through */
03469 
03470         /* Recursion or "subroutine" call */
03471 
03472         case '0': case '1': case '2': case '3': case '4':
03473         case '5': case '6': case '7': case '8': case '9':
03474           {
03475           const uschar *called;
03476           recno = 0;
03477           while((digitab[*ptr] & ctype_digit) != 0)
03478             recno = recno * 10 + *ptr++ - '0';
03479 
03480           /* Come here from code above that handles a named recursion */
03481 
03482           HANDLE_RECURSION:
03483 
03484           previous = code;
03485 
03486           /* Find the bracket that is being referenced. Temporarily end the
03487           regex in case it doesn't exist. */
03488 
03489           *code = OP_END;
03490           called = (recno == 0)?
03491             cd->start_code : find_bracket(cd->start_code, recno);
03492 
03493           if (called == NULL)
03494             {
03495             *errorptr = ERR15;
03496             goto FAILED;
03497             }
03498 
03499           /* If the subpattern is still open, this is a recursive call. We
03500           check to see if this is a left recursion that could loop for ever,
03501           and diagnose that case. */
03502 
03503           if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
03504             {
03505             *errorptr = ERR40;
03506             goto FAILED;
03507             }
03508 
03509           /* Insert the recursion/subroutine item */
03510 
03511           *code = OP_RECURSE;
03512           PUT(code, 1, called - cd->start_code);
03513           code += 1 + LINK_SIZE;
03514           }
03515         continue;
03516 
03517         /* Character after (? not specially recognized */
03518 
03519         default:                  /* Option setting */
03520         set = unset = 0;
03521         optset = &set;
03522 
03523         while (*ptr != ')' && *ptr != ':')
03524           {
03525           switch (*ptr++)
03526             {
03527             case '-': optset = &unset; break;
03528 
03529             case 'i': *optset |= PCRE_CASELESS; break;
03530             case 'm': *optset |= PCRE_MULTILINE; break;
03531             case 's': *optset |= PCRE_DOTALL; break;
03532             case 'x': *optset |= PCRE_EXTENDED; break;
03533             case 'U': *optset |= PCRE_UNGREEDY; break;
03534             case 'X': *optset |= PCRE_EXTRA; break;
03535             }
03536           }
03537 
03538         /* Set up the changed option bits, but don't change anything yet. */
03539 
03540         newoptions = (options | set) & (~unset);
03541 
03542         /* If the options ended with ')' this is not the start of a nested
03543         group with option changes, so the options change at this level. Compile
03544         code to change the ims options if this setting actually changes any of
03545         them. We also pass the new setting back so that it can be put at the
03546         start of any following branches, and when this group ends (if we are in
03547         a group), a resetting item can be compiled.
03548 
03549         Note that if this item is right at the start of the pattern, the
03550         options will have been abstracted and made global, so there will be no
03551         change to compile. */
03552 
03553         if (*ptr == ')')
03554           {
03555           if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
03556             {
03557             *code++ = OP_OPT;
03558             *code++ = newoptions & PCRE_IMS;
03559             }
03560 
03561           /* Change options at this level, and pass them back for use
03562           in subsequent branches. Reset the greedy defaults and the case
03563           value for firstbyte and reqbyte. */
03564 
03565           *optionsptr = options = newoptions;
03566           greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
03567           greedy_non_default = greedy_default ^ 1;
03568           req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
03569 
03570           previous = NULL;       /* This item can't be repeated */
03571           continue;              /* It is complete */
03572           }
03573 
03574         /* If the options ended with ':' we are heading into a nested group
03575         with possible change of options. Such groups are non-capturing and are
03576         not assertions of any kind. All we need to do is skip over the ':';
03577         the newoptions value is handled below. */
03578 
03579         bravalue = OP_BRA;
03580         ptr++;
03581         }
03582       }
03583 
03584     /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
03585     non-capturing and behave like (?:...) brackets */
03586 
03587     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
03588       {
03589       bravalue = OP_BRA;
03590       }
03591 
03592     /* Else we have a referencing group; adjust the opcode. If the bracket
03593     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
03594     arrange for the true number to follow later, in an OP_BRANUMBER item. */
03595 
03596     else
03597       {
03598       NUMBERED_GROUP:
03599       if (++(*brackets) > EXTRACT_BASIC_MAX)
03600         {
03601         bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
03602         code[1+LINK_SIZE] = OP_BRANUMBER;
03603         PUT2(code, 2+LINK_SIZE, *brackets);
03604         skipbytes = 3;
03605         }
03606       else bravalue = OP_BRA + *brackets;
03607       }
03608 
03609     /* Process nested bracketed re. Assertions may not be repeated, but other
03610     kinds can be. We copy code into a non-register variable in order to be able
03611     to pass its address because some compilers complain otherwise. Pass in a
03612     new setting for the ims options if they have changed. */
03613 
03614     previous = (bravalue >= OP_ONCE)? code : NULL;
03615     *code = bravalue;
03616     tempcode = code;
03617     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
03618 
03619     if (!compile_regex(
03620          newoptions,                   /* The complete new option state */
03621          options & PCRE_IMS,           /* The previous ims option state */
03622          brackets,                     /* Extracting bracket count */
03623          &tempcode,                    /* Where to put code (updated) */
03624          &ptr,                         /* Input pointer (updated) */
03625          errorptr,                     /* Where to put an error message */
03626          (bravalue == OP_ASSERTBACK ||
03627           bravalue == OP_ASSERTBACK_NOT), /* true if back assert */
03628          skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
03629          &subfirstbyte,                /* For possible first char */
03630          &subreqbyte,                  /* For possible last char */
03631          bcptr,                        /* Current branch chain */
03632          cd))                          /* Tables block */
03633       goto FAILED;
03634 
03635     /* At the end of compiling, code is still pointing to the start of the
03636     group, while tempcode has been updated to point past the end of the group
03637     and any option resetting that may follow it. The pattern pointer (ptr)
03638     is on the bracket. */
03639 
03640     /* If this is a conditional bracket, check that there are no more than
03641     two branches in the group. */
03642 
03643     else if (bravalue == OP_COND)
03644       {
03645       uschar *tc = code;
03646       condcount = 0;
03647 
03648       do {
03649          condcount++;
03650          tc += GET(tc,1);
03651          }
03652       while (*tc != OP_KET);
03653 
03654       if (condcount > 2)
03655         {
03656         *errorptr = ERR27;
03657         goto FAILED;
03658         }
03659 
03660       /* If there is just one branch, we must not make use of its firstbyte or
03661       reqbyte, because this is equivalent to an empty second branch. */
03662 
03663       if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
03664       }
03665 
03666     /* Handle updating of the required and first characters. Update for normal
03667     brackets of all kinds, and conditions with two branches (see code above).
03668     If the bracket is followed by a quantifier with zero repeat, we have to
03669     back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
03670     main loop so that they can be accessed for the back off. */
03671 
03672     zeroreqbyte = reqbyte;
03673     zerofirstbyte = firstbyte;
03674     groupsetfirstbyte = false;
03675 
03676     if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
03677       {
03678       /* If we have not yet set a firstbyte in this branch, take it from the
03679       subpattern, remembering that it was set here so that a repeat of more
03680       than one can replicate it as reqbyte if necessary. If the subpattern has
03681       no firstbyte, set "none" for the whole branch. In both cases, a zero
03682       repeat forces firstbyte to "none". */
03683 
03684       if (firstbyte == REQ_UNSET)
03685         {
03686         if (subfirstbyte >= 0)
03687           {
03688           firstbyte = subfirstbyte;
03689           groupsetfirstbyte = true;
03690           }
03691         else firstbyte = REQ_NONE;
03692         zerofirstbyte = REQ_NONE;
03693         }
03694 
03695       /* If firstbyte was previously set, convert the subpattern's firstbyte
03696       into reqbyte if there wasn't one, using the vary flag that was in
03697       existence beforehand. */
03698 
03699       else if (subfirstbyte >= 0 && subreqbyte < 0)
03700         subreqbyte = subfirstbyte | tempreqvary;
03701 
03702       /* If the subpattern set a required byte (or set a first byte that isn't
03703       really the first byte - see above), set it. */
03704 
03705       if (subreqbyte >= 0) reqbyte = subreqbyte;
03706       }
03707 
03708     /* For a forward assertion, we take the reqbyte, if set. This can be
03709     helpful if the pattern that follows the assertion doesn't set a different
03710     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
03711     for an assertion, however because it leads to incorrect effect for patterns
03712     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
03713     of a firstbyte. This is overcome by a scan at the end if there's no
03714     firstbyte, looking for an asserted first char. */
03715 
03716     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
03717 
03718     /* Now update the main code pointer to the end of the group. */
03719 
03720     code = tempcode;
03721 
03722     /* Error if hit end of pattern */
03723 
03724     if (*ptr != ')')
03725       {
03726       *errorptr = ERR14;
03727       goto FAILED;
03728       }
03729     break;
03730 
03731     /* Check \ for being a real metacharacter; if not, fall through and handle
03732     it as a data character at the start of a string. Escape items are checked
03733     for validity in the pre-compiling pass. */
03734 
03735     case '\\':
03736     tempptr = ptr;
03737     c = check_escape(&ptr, errorptr, *brackets, options, false);
03738 
03739     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
03740     are arranged to be the negation of the corresponding OP_values. For the
03741     back references, the values are ESC_REF plus the reference number. Only
03742     back references and those types that consume a character may be repeated.
03743     We can test for values between ESC_b and ESC_Z for the latter; this may
03744     have to change if any new ones are ever created. */
03745 
03746     if (c < 0)
03747       {
03748       if (-c == ESC_Q)            /* Handle start of quoted string */
03749         {
03750         if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
03751           else inescq = true;
03752         continue;
03753         }
03754 
03755       /* For metasequences that actually match a character, we disable the
03756       setting of a first character if it hasn't already been set. */
03757 
03758       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
03759         firstbyte = REQ_NONE;
03760 
03761       /* Set values to reset to if this is followed by a zero repeat. */
03762 
03763       zerofirstbyte = firstbyte;
03764       zeroreqbyte = reqbyte;
03765 
03766       /* Back references are handled specially */
03767 
03768       if (-c >= ESC_REF)
03769         {
03770         int number = -c - ESC_REF;
03771         previous = code;
03772         *code++ = OP_REF;
03773         PUT2INC(code, 0, number);
03774         }
03775       else
03776         {
03777         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
03778         *code++ = -c;
03779         }
03780       continue;
03781       }
03782 
03783     /* Data character: reset and fall through */
03784 
03785     ptr = tempptr;
03786     c = '\\';
03787 
03788     /* Handle a run of data characters until a metacharacter is encountered.
03789     The first character is guaranteed not to be whitespace or # when the
03790     extended flag is set. */
03791 
03792     NORMAL_CHAR:
03793     default:
03794     previous = code;
03795     *code = OP_CHARS;
03796     code += 2;
03797     length = 0;
03798 
03799     do
03800       {
03801       /* If in \Q...\E, check for the end; if not, we always have a literal */
03802 
03803       if (inescq)
03804         {
03805         if (c == '\\' && ptr[1] == 'E')
03806           {
03807           inescq = false;
03808           ptr++;
03809           }
03810         else
03811           {
03812           *code++ = c;
03813           length++;
03814           }
03815         continue;
03816         }
03817 
03818       /* Skip white space and comments for /x patterns */
03819 
03820       if ((options & PCRE_EXTENDED) != 0)
03821         {
03822         if ((cd->ctypes[c] & ctype_space) != 0) continue;
03823         if (c == '#')
03824           {
03825           /* The space before the ; is to avoid a warning on a silly compiler
03826           on the Macintosh. */
03827           while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
03828           if (c == 0) break;
03829           continue;
03830           }
03831         }
03832 
03833       /* Backslash may introduce a data char or a metacharacter. Escaped items
03834       are checked for validity in the pre-compiling pass. Stop the string
03835       before a metaitem. */
03836 
03837       if (c == '\\')
03838         {
03839         tempptr = ptr;
03840         c = check_escape(&ptr, errorptr, *brackets, options, false);
03841         if (c < 0) { ptr = tempptr; break; }
03842 
03843         /* If a character is > 127 in UTF-8 mode, we have to turn it into
03844         two or more bytes in the UTF-8 encoding. */
03845 
03846         }
03847 
03848       /* Ordinary character or single-char escape */
03849 
03850       *code++ = c;
03851       length++;
03852       }
03853 
03854     /* This "while" is the end of the "do" above. */
03855 
03856     while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
03857 
03858     /* Update the first and last requirements. These are always bytes, even in
03859     UTF-8 mode. However, there is a special case to be considered when there
03860     are only one or two characters. Because this gets messy in UTF-8 mode, the
03861     code is kept separate. When we get here "length" contains the number of
03862     bytes. */
03863 
03864 
03865     /* This is the code for non-UTF-8 operation, either without UTF-8 support,
03866     or when UTF-8 is not enabled. */
03867 
03868       {
03869       /* firstbyte was not previously set; take it from this string */
03870 
03871       if (firstbyte == REQ_UNSET)
03872         {
03873         if (length == 1)
03874           {
03875           zerofirstbyte = REQ_NONE;
03876           firstbyte = previous[2] | req_caseopt;
03877           zeroreqbyte = reqbyte;
03878           }
03879         else
03880           {
03881           zerofirstbyte = firstbyte = previous[2] | req_caseopt;
03882           zeroreqbyte = (length > 2)?
03883             (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
03884           reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
03885           }
03886         }
03887 
03888       /* firstbyte was previously set */
03889 
03890       else
03891         {
03892         zerofirstbyte = firstbyte;
03893         zeroreqbyte = (length == 1)? reqbyte :
03894           code[-2] | req_caseopt | cd->req_varyopt;
03895         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
03896         }
03897       }
03898 
03899     /* Set the length in the data vector, and advance to the next state. */
03900 
03901     previous[1] = length;
03902     if (length < MAXLIT) ptr--;
03903     break;
03904     }
03905   }                   /* end of big loop */
03906 
03907 /* Control never reaches here by falling through, only by a goto for all the
03908 error states. Pass back the position in the pattern so that it can be displayed
03909 to the user for diagnosing the error. */
03910 
03911 FAILED:
03912 *ptrptr = ptr;
03913 return false;
03914 }
03915 

static bool compile_regex ( int  options,
int  oldims,
int *  brackets,
uschar **  codeptr,
const uschar **  ptrptr,
const char **  errorptr,
bool  lookbehind,
int  skipbytes,
int *  firstbyteptr,
int *  reqbyteptr,
branch_chain *  bcptr,
compile_data *  cd 
) [static]

Definition at line 3950 of file pcre.cpp.

References CMuxAlarm::bAlarmed, compile_branch(), branch_chain::current, DPRINTF, ERR25, ERR36, find_fixedlength(), GET, LINK_SIZE, MuxAlarm, OP_ALT, OP_END, OP_KET, OP_OPT, OP_REVERSE, branch_chain::outer, PCRE_IMS, PUT, PUTINC, REQ_NONE, REQ_UNSET, and REQ_VARY.

Referenced by compile_branch(), and pcre_compile().

03952 {
      /* Compile one bracketed group (or the whole pattern). Each '|'-separated
      branch is compiled by compile_branch() and linked to its predecessor; when
      the group ends the links are reversed, an OP_KET is appended, and the
      merged firstbyte/reqbyte values are stored through firstbyteptr and
      reqbyteptr. Returns true on success. Returns false when compile_branch()
      fails or a lookbehind branch has no fixed length; in both of those cases
      *ptrptr is updated to the current pattern position first. */
03953 const uschar *ptr = *ptrptr;
03954 uschar *code = *codeptr;
03955 uschar *last_branch = code;
03956 uschar *start_bracket = code;
03957 uschar *reverse_count = NULL;
03958 int firstbyte, reqbyte;
03959 int branchfirstbyte, branchreqbyte;
03960 branch_chain bc;
03961 
      /* Chain this group's record onto the caller's record; it is handed to
      compile_branch() below. */
03962 bc.outer = bcptr;
03963 bc.current = code;
03964 
03965 firstbyte = reqbyte = REQ_UNSET;
03966 
03967 /* Offset is set zero to mark that this bracket is still open */
03968 
03969 PUT(code, 1, 0);
      /* Step over the opcode byte, the link field and skipbytes further bytes. */
03970 code += 1 + LINK_SIZE + skipbytes;
03971 
03972 /* Loop for each alternative branch */
03973 
      /* The loop also stops if MuxAlarm.bAlarmed becomes set. */
03974 for (;!MuxAlarm.bAlarmed;)
03975   {
03976   /* Handle a change of ims options at the start of the branch */
03977 
03978   if ((options & PCRE_IMS) != oldims)
03979     {
03980     *code++ = OP_OPT;
03981     *code++ = options & PCRE_IMS;
03982     }
03983 
03984   /* Set up dummy OP_REVERSE if lookbehind assertion */
03985 
03986   if (lookbehind)
03987     {
03988     *code++ = OP_REVERSE;
      /* Remember where the length goes; it is filled in after the branch has
      been compiled. */
03989     reverse_count = code;
03990     PUTINC(code, 0, 0);
03991     }
03992 
03993   /* Now compile the branch */
03994 
03995   if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
03996         &branchfirstbyte, &branchreqbyte, &bc, cd))
03997     {
03998     *ptrptr = ptr;
03999     return false;
04000     }
04001 
04002   /* If this is the first branch, the firstbyte and reqbyte values for the
04003   branch become the values for the regex. */
04004 
      /* last_branch still addresses the start of the group until the first
      OP_ALT is written at the bottom of this loop. */
04005   if (*last_branch != OP_ALT)
04006     {
04007     firstbyte = branchfirstbyte;
04008     reqbyte = branchreqbyte;
04009     }
04010 
04011   /* If this is not the first branch, the first char and reqbyte have to
04012   match the values from all the previous branches, except that if the previous
04013   value for reqbyte didn't have REQ_VARY set, it can still match, and we set
04014   REQ_VARY for the regex. */
04015 
04016   else
04017     {
04018     /* If we previously had a firstbyte, but it doesn't match the new branch,
04019     we have to abandon the firstbyte for the regex, but if there was previously
04020     no reqbyte, it takes on the value of the old firstbyte. */
04021 
04022     if (firstbyte >= 0 && firstbyte != branchfirstbyte)
04023       {
04024       if (reqbyte < 0) reqbyte = firstbyte;
04025       firstbyte = REQ_NONE;
04026       }
04027 
04028     /* If we (now or from before) have no firstbyte, a firstbyte from the
04029     branch becomes a reqbyte if there isn't a branch reqbyte. */
04030 
04031     if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
04032         branchreqbyte = branchfirstbyte;
04033 
04034     /* Now ensure that the reqbytes match */
04035 
04036     if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
04037       reqbyte = REQ_NONE;
04038     else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
04039     }
04040 
04041   /* If lookbehind, check that this branch matches a fixed-length string,
04042   and put the length into the OP_REVERSE item. Temporarily mark the end of
04043   the branch with OP_END. */
04044 
04045   if (lookbehind)
04046     {
04047     int length;
04048     *code = OP_END;
04049     length = find_fixedlength(last_branch, options);
04050     DPRINTF(("fixed length = %d\n", length));
      /* A negative result selects the error text: -2 gives ERR36, any other
      negative value gives ERR25. */
04051     if (length < 0)
04052       {
04053       *errorptr = (length == -2)? ERR36 : ERR25;
04054       *ptrptr = ptr;
04055       return false;
04056       }
04057     PUT(reverse_count, 0, length);
04058     }
04059 
04060   /* Reached end of expression, either ')' or end of pattern. Go back through
04061   the alternative branches and reverse the chain of offsets, with the field in
04062   the BRA item now becoming an offset to the first alternative. If there are
04063   no alternatives, it points to the end of the group. The length in the
04064   terminating ket is always the length of the whole bracketed item. If any of
04065   the ims options were changed inside the group, compile a resetting op-code
04066   following, except at the very end of the pattern. Return leaving the pointer
04067   at the terminating char. */
04068 
04069   if (*ptr != '|')
04070     {
04071     int length = code - last_branch;
      /* Each link currently holds the distance back to the previous branch
      (zero at the start of the group). Overwrite it with the forward distance
      and step back until the zero link has been processed. */
04072     do
04073       {
04074       int prev_length = GET(last_branch, 1);
04075       PUT(last_branch, 1, length);
04076       length = prev_length;
04077       last_branch -= length;
04078       }
04079     while (length > 0);
04080 
04081     /* Fill in the ket */
04082 
04083     *code = OP_KET;
04084     PUT(code, 1, code - start_bracket);
04085     code += 1 + LINK_SIZE;
04086 
04087     /* Resetting option if needed */
04088 
04089     if ((options & PCRE_IMS) != oldims && *ptr == ')')
04090       {
04091       *code++ = OP_OPT;
04092       *code++ = oldims;
04093       }
04094 
04095     /* Set values to pass back */
04096 
04097     *codeptr = code;
04098     *ptrptr = ptr;
04099     *firstbyteptr = firstbyte;
04100     *reqbyteptr = reqbyte;
04101     return true;
04102     }
04103 
04104   /* Another branch follows; insert an "or" node. Its length field points back
04105   to the previous branch while the bracket remains open. At the end the chain
04106   is reversed. It's done like this so that the start of the bracket has a
04107   zero offset until it is closed, making it possible to detect recursion. */
04108 
04109   *code = OP_ALT;
04110   PUT(code, 1, code - last_branch);
04111   bc.current = last_branch = code;
04112   code += 1 + LINK_SIZE;
      /* Move past the '|' in the pattern. */
04113   ptr++;
04114   }
      /* Reached only when MuxAlarm.bAlarmed ended the loop.
      NOTE(review): unlike the other failure exits, this path writes neither
      *ptrptr nor *errorptr -- confirm that callers cope with that. */
04115 return false;
04116 }
04117 

static bool could_be_empty ( const uschar *  code,
const uschar *  endcode,
branch_chain *  bcptr,
bool  utf8 
) [static]

Definition at line 2199 of file pcre.cpp.

References could_be_empty_branch(), branch_chain::current, and branch_chain::outer.

Referenced by compile_branch().

02200 {
      /* Returns true only if every record in the bcptr chain whose current
      pointer is at or after code is judged able to match an empty string by
      could_be_empty_branch(); returns false as soon as one is not. The walk
      stops at the end of the chain or at the first record that starts before
      code. */
02201 while (bcptr != NULL && bcptr->current >= code)
02202   {
02203   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return false;
      /* Move on to the next record out in the chain. */
02204   bcptr = bcptr->outer;
02205   }
02206 return true;
02207 }
02208 

static bool could_be_empty_branch ( const uschar *  code,
const uschar *  endcode,
bool  utf8 
) [static]

Definition at line 2080 of file pcre.cpp.

References first_significant_code(), GET, GET2, LINK_SIZE, OP_ALT, OP_ANY, OP_ANYBYTE, OP_BRA, OP_CHARS, OP_CLASS, OP_CRMINPLUS, OP_CRMINQUERY, OP_CRMINRANGE, OP_CRMINSTAR, OP_CRPLUS, OP_CRQUERY, OP_CRRANGE, OP_CRSTAR, OP_DIGIT, OP_EXACT, OP_KET, OP_KETRMAX, OP_KETRMIN, OP_lengths, OP_MINPLUS, OP_NCLASS, OP_NOT, OP_NOT_DIGIT, OP_NOT_WHITESPACE, OP_NOT_WORDCHAR, OP_NOTEXACT, OP_NOTMINPLUS, OP_NOTPLUS, OP_PLUS, OP_TYPEEXACT, OP_TYPEMINPLUS, OP_TYPEPLUS, OP_WHITESPACE, and OP_WORDCHAR.

Referenced by could_be_empty().

02080 {
      /* Scans the compiled items of one branch, starting after the opcode and
      link field at code and stopping before endcode. Returns false as soon as
      an item is found that must match at least one character; returns true on
      reaching the end of the branch, endcode, or a bracket that is still open.
      utf8 is only passed on to the recursive calls. */
02081 register int c;
02082 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
02083      code < endcode;
02084      code = first_significant_code(code + OP_lengths[c], NULL, 0))
02085   {
02086   const uschar *ccode;
02087 
02088   c = *code;
02089 
02090   if (c >= OP_BRA)
02091     {
02092     bool empty_branch;
02093     if (GET(code, 1) == 0) return true;    /* Hit unclosed bracket */
02094 
02095     /* Scan a closed bracket */
02096 
02097     empty_branch = false;
      /* Visit every alternative of the nested group; one alternative that can
      be empty is enough. */
02098     do
02099       {
02100       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
02101         empty_branch = true;
02102       code += GET(code, 1);
02103       }
02104     while (*code == OP_ALT);
02105     if (!empty_branch) return false;   /* All branches are non-empty */
      /* NOTE(review): code is advanced by 1 + LINK_SIZE from where the chain of
      alternatives ended (presumably the closing ket) and c is reloaded there,
      so the loop increment then adds OP_lengths[c] for the opcode that follows
      the group and that opcode is never examined by the checks in this loop --
      confirm this is intended. */
02106     code += 1 + LINK_SIZE;
02107     c = *code;
02108     }
02109 
      /* Opcodes with no case below fall out of the switch and scanning simply
      continues with the next item. */
02110   else switch (c)
02111     {
02112     /* Check for quantifiers after a class */
02113 
02114 
02115     case OP_CLASS:
02116     case OP_NCLASS:
      /* The repeat opcode, if any, is looked for 33 bytes further on --
      presumably one opcode byte plus a 32-byte class bitmap; TODO confirm. */
02117     ccode = code + 33;
02118 
02119 
02120     switch (*ccode)
02121       {
02122       case OP_CRSTAR:            /* These could be empty; continue */
02123       case OP_CRMINSTAR:
02124       case OP_CRQUERY:
02125       case OP_CRMINQUERY:
02126       break;
02127 
02128       default:                   /* Non-repeat => class must match */
02129       case OP_CRPLUS:            /* These repeats aren't empty */
02130       case OP_CRMINPLUS:
02131       return false;
02132 
02133       case OP_CRRANGE:
02134       case OP_CRMINRANGE:
02135       if (GET2(ccode, 1) > 0) return false;  /* Minimum > 0 */
02136       break;
02137       }
02138     break;
02139 
02140     /* Opcodes that must match a character */
02141 
02142     case OP_NOT_DIGIT:
02143     case OP_DIGIT:
02144     case OP_NOT_WHITESPACE:
02145     case OP_WHITESPACE:
02146     case OP_NOT_WORDCHAR:
02147     case OP_WORDCHAR:
02148     case OP_ANY:
02149     case OP_ANYBYTE:
02150     case OP_CHARS:
02151     case OP_NOT:
02152     case OP_PLUS:
02153     case OP_MINPLUS:
02154     case OP_EXACT:
02155     case OP_NOTPLUS:
02156     case OP_NOTMINPLUS:
02157     case OP_NOTEXACT:
02158     case OP_TYPEPLUS:
02159     case OP_TYPEMINPLUS:
02160     case OP_TYPEEXACT:
02161     return false;
02162 
02163     /* End of branch */
02164 
02165     case OP_KET:
02166     case OP_KETRMAX:
02167     case OP_KETRMIN:
02168     case OP_ALT:
02169     return true;
02170 
02171     }
02172   }
02173 
      /* Ran up to endcode without meeting anything that must match. */
02174 return true;
02175 }
02176 

static const uschar* find_bracket ( const uschar *  code,
int  number 
) [static]

Definition at line 1997 of file pcre.cpp.

References EXTRACT_BASIC_MAX, GET2, LINK_SIZE, OP_BRA, OP_CHARS, OP_END, and OP_lengths.

Referenced by compile_branch().

01997 {
      /* Scans compiled code forward from code for the capturing bracket whose
      number equals number. Returns a pointer to that bracket's opcode, or NULL
      if OP_END is reached first. */
01998 
01999 for (;;)
02000   {
02001   register int c = *code;
02002   if (c == OP_END) return NULL;
      /* OP_CHARS is variable length: code[1] is added to the table length. */
02003   else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
02004   else if (c > OP_BRA)
02005     {
      /* The bracket number is the distance above OP_BRA; when that exceeds
      EXTRACT_BASIC_MAX the number is read with GET2 at offset 2+LINK_SIZE
      instead. */
02006     int n = c - OP_BRA;
02007     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
      /* The cast drops the const qualifier of the parameter. */
02008     if (n == number) return (uschar *)code;
      /* Not the wanted bracket: step to its first item, so nested brackets
      are scanned as well. */
02009     code += OP_lengths[OP_BRA];
02010     }
02011   else
02012     {
02013     code += OP_lengths[c];
02014 
02015     }
02016   }
02017 }
02018 

static int find_firstassertedchar ( const uschar *  code,
int *  options,
bool  inassert 
) [static]

Definition at line 4294 of file pcre.cpp.

References first_significant_code(), GET, LINK_SIZE, OP_ALT, OP_ASSERT, OP_BRA, OP_CHARS, OP_COND, OP_EXACT, OP_MINPLUS, OP_ONCE, OP_PLUS, PCRE_CASELESS, and REQ_CASELESS.

Referenced by pcre_compile().

04294 {
      /* Looks for one character value with which every alternative of the
      group at code starts, counting literal characters only when inassert is
      true. Returns that value, with REQ_CASELESS or'ed in when PCRE_CASELESS
      is set in *options at the point it is first recorded, or -1 when no such
      value exists. */
04295 register int c = -1;
04296 do {
04297    int d;
      /* first_significant_code() may update *options from any OP_OPT it
      skips, because PCRE_CASELESS is passed as the option bit to track. */
04298    const uschar *scode =
04299      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
04300    register int op = *scode;
04301 
      /* Fold all opcodes at or above OP_BRA into the single OP_BRA case. */
04302    if (op >= OP_BRA) op = OP_BRA;
04303 
04304    switch(op)
04305      {
04306      default:
04307      return -1;
04308 
04309      case OP_BRA:
04310      case OP_ASSERT:
04311      case OP_ONCE:
04312      case OP_COND:
      /* Recurse into the nested group; only for OP_ASSERT is the nested call
      made with inassert true. All alternatives must agree on one value. */
04313      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
04314        return -1;
04315      if (c < 0) c = d; else if (c != d) return -1;
04316      break;
04317 
      /* Each scode++ below moves the byte later read as scode[1] one place
      further on; OP_EXACT passes through both increments, OP_CHARS through
      one. */
04318      case OP_EXACT:       /* Fall through */
04319      scode++;
04320 
04321      case OP_CHARS:       /* Fall through */
04322      scode++;
04323 
04324      case OP_PLUS:
04325      case OP_MINPLUS:
04326      if (!inassert) return -1;
04327      if (c < 0)
04328        {
04329        c = scode[1];
04330        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
04331        }
04332      else if (c != scode[1]) return -1;
04333      break;
04334      }
04335 
      /* Follow the link field to the next alternative (or the end of the
      group). */
04336    code += GET(code, 1);
04337    }
04338 while (*code == OP_ALT);
04339 return c;
04340 }
04341 

static int find_fixedlength ( uschar *  code,
int  options 
) [static]

Definition at line 1830 of file pcre.cpp.

References GET, GET2, LINK_SIZE, OP_ALT, OP_ANY, OP_ANYBYTE, OP_ASSERT, OP_ASSERT_NOT, OP_ASSERTBACK, OP_ASSERTBACK_NOT, OP_BRA, OP_BRANUMBER, OP_CALLOUT, OP_CHARS, OP_CIRC, OP_CLASS, OP_COND, OP_CREF, OP_CRMINQUERY, OP_CRMINRANGE, OP_CRMINSTAR, OP_CRQUERY, OP_CRRANGE, OP_CRSTAR, OP_DIGIT, OP_DOLL, OP_END, OP_EOD, OP_EODN, OP_EXACT, OP_KET, OP_KETRMAX, OP_KETRMIN, OP_lengths, OP_NCLASS, OP_NOT_DIGIT, OP_NOT_WHITESPACE, OP_NOT_WORD_BOUNDARY, OP_NOT_WORDCHAR, OP_ONCE, OP_OPT, OP_REVERSE, OP_SOD, OP_SOM, OP_TYPEEXACT, OP_WHITESPACE, OP_WORD_BOUNDARY, and OP_WORDCHAR.

Referenced by compile_regex().

01830 {
      /* Computes the fixed length matched by the group at code. Returns that
      length; -1 if an item of variable length is met or two alternatives
      differ in length; -2 if OP_ANYBYTE is met. Within this function options
      is only passed on to the recursive call. */
01831 int length = -1;
01832 
01833 register int branchlength = 0;
01834 register uschar *cc = code + 1 + LINK_SIZE;
01835 
01836 /* Scan along the opcodes for this branch. If we get to the end of the
01837 branch, check the length against that of the other branches. */
01838 
01839 for (;;)
01840   {
01841   int d;
01842   register int op = *cc;
      /* Fold all opcodes at or above OP_BRA into the single OP_BRA case. */
01843   if (op >= OP_BRA) op = OP_BRA;
01844 
01845   switch (op)
01846     {
01847     case OP_BRA:
01848     case OP_ONCE:
01849     case OP_COND:
      /* A nested group contributes its own fixed length; then follow its
      links past all alternatives and step over the item that closes it. */
01850     d = find_fixedlength(cc, options);
01851     if (d < 0) return d;
01852     branchlength += d;
01853     do cc += GET(cc, 1); while (*cc == OP_ALT);
01854     cc += 1 + LINK_SIZE;
01855     break;
01856 
01857     /* Reached end of a branch; if it's a ket it is the end of a nested
01858     call. If it's ALT it is an alternation in a nested call. If it is
01859     END it's the end of the outer call. All can be handled by the same code. */
01860 
01861     case OP_ALT:
01862     case OP_KET:
01863     case OP_KETRMAX:
01864     case OP_KETRMIN:
01865     case OP_END:
      /* The first branch sets the length; every later one must equal it. */
01866     if (length < 0) length = branchlength;
01867       else if (length != branchlength) return -1;
01868     if (*cc != OP_ALT) return length;
01869     cc += 1 + LINK_SIZE;
01870     branchlength = 0;
01871     break;
01872 
01873     /* Skip over assertive subpatterns */
01874 
01875     case OP_ASSERT:
01876     case OP_ASSERT_NOT:
01877     case OP_ASSERTBACK:
01878     case OP_ASSERTBACK_NOT:
01879     do cc += GET(cc, 1); while (*cc == OP_ALT);
01880     /* Fall through */
01881 
01882     /* Skip over things that don't match chars */
01883 
01884     case OP_REVERSE:
01885     case OP_BRANUMBER:
01886     case OP_CREF:
01887     case OP_OPT:
01888     case OP_CALLOUT:
01889     case OP_SOD:
01890     case OP_SOM:
01891     case OP_EOD:
01892     case OP_EODN:
01893     case OP_CIRC:
01894     case OP_DOLL:
01895     case OP_NOT_WORD_BOUNDARY:
01896     case OP_WORD_BOUNDARY:
01897     cc += OP_lengths[*cc];
01898     break;
01899 
01900     /* Handle char strings. The byte after the opcode holds the string length,
01901     which is added to the branch length as it stands; the length byte and the
01902     string are then skipped. No UTF-8 character counting is done in this
01903     code. */
01904 
01905     case OP_CHARS:
01906     branchlength += *(++cc);
01907     cc += *cc + 1;
01908     break;
01909 
01910     /* Handle exact repetitions. The count is read with GET2 from the bytes
01911     after the opcode, and cc then moves on by 4 bytes. */
01912 
01913     case OP_EXACT:
01914     branchlength += GET2(cc,1);
01915     cc += 4;
01916     break;
01917 
01918     case OP_TYPEEXACT:
01919     branchlength += GET2(cc,1);
01920     cc += 4;
01921     break;
01922 
01923     /* Handle single-char matchers */
01924 
01925     case OP_NOT_DIGIT:
01926     case OP_DIGIT:
01927     case OP_NOT_WHITESPACE:
01928     case OP_WHITESPACE:
01929     case OP_NOT_WORDCHAR:
01930     case OP_WORDCHAR:
01931     case OP_ANY:
01932     branchlength++;
01933     cc++;
01934     break;
01935 
01936     /* The single-byte matcher isn't allowed */
01937 
01938     case OP_ANYBYTE:
01939     return -2;
01940 
01941     /* Check a class for variable quantification */
01942 
01943 
01944     case OP_CLASS:
01945     case OP_NCLASS:
      /* Move 33 bytes on to whatever follows the class -- presumably one
      opcode byte plus a 32-byte bitmap; TODO confirm. */
01946     cc += 33;
01947 
01948     switch (*cc)
01949       {
01950       case OP_CRSTAR:
01951       case OP_CRMINSTAR:
01952       case OP_CRQUERY:
01953       case OP_CRMINQUERY:
01954       return -1;
01955 
01956       case OP_CRRANGE:
01957       case OP_CRMINRANGE:
      /* A range is fixed length only when its two 2-byte counts are equal. */
01958       if (GET2(cc,1) != GET2(cc,3)) return -1;
01959       branchlength += GET2(cc,1);
01960       cc += 5;
01961       break;
01962 
      /* Anything else after the class: the class counts as one character and
      cc is left on that following opcode for the next pass of the loop. */
01963       default:
01964       branchlength++;
01965       }
01966     break;
01967 
01968     /* Anything else is variable length */
01969 
01970     default:
01971     return -1;
01972     }
01973   }
01974 /* Control never gets here */
01975 }
01976 

static const uschar* find_recurse ( const uschar *  code,
bool  utf8 
) [static]

Definition at line 2037 of file pcre.cpp.

References OP_BRA, OP_CHARS, OP_END, OP_lengths, and OP_RECURSE.

Referenced by adjust_recurse().

02037 {
      /* Scans compiled code forward from code for the next OP_RECURSE. Returns
      a pointer to it, or NULL if OP_END is reached first. utf8 is not used by
      the scan. */
02038 utf8 = utf8;               /* Stop pedantic compilers complaining */
02039 
02040 for (;;)
02041   {
02042   register int c = *code;
02043   if (c == OP_END) return NULL;
02044   else if (c == OP_RECURSE) return code;
      /* OP_CHARS is variable length: code[1] is added to the table length. */
02045   else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
      /* Every opcode above OP_BRA is stepped over using OP_BRA's table
      length. */
02046   else if (c > OP_BRA)
02047     {
02048     code += OP_lengths[OP_BRA];
02049     }
02050   else
02051     {
02052     code += OP_lengths[c];
02053 
02054     }
02055   }
02056 }
02057 

static const uschar* first_significant_code ( const uschar *  code,
int *  options,
int  optbit 
) [static]

Definition at line 1777 of file pcre.cpp.

References GET, OP_ALT, OP_ASSERT_NOT, OP_ASSERTBACK, OP_ASSERTBACK_NOT, OP_BRANUMBER, OP_CALLOUT, OP_CREF, OP_lengths, OP_NOT_WORD_BOUNDARY, OP_OPT, and OP_WORD_BOUNDARY.

Referenced by could_be_empty_branch(), find_firstassertedchar(), is_anchored(), and is_startline().

01777 {
      /* Skips over OP_OPT items, negative-lookahead and lookbehind assertions,
      callouts, OP_CREF, OP_BRANUMBER and word-boundary tests starting at code,
      and returns a pointer to the first opcode that is none of these. When
      optbit > 0, an OP_OPT whose optbit bits differ from those in *options
      overwrites *options with the OP_OPT data byte. */
01778 for (;;)
01779   {
01780   switch ((int)*code)
01781     {
01782     case OP_OPT:
      /* options is dereferenced only when optbit > 0, so the callers that
      pass NULL together with optbit 0 are safe. */
01783     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
01784       *options = (int)code[1];
01785     code += 2;
01786     break;
01787 
01788     case OP_ASSERT_NOT:
01789     case OP_ASSERTBACK:
01790     case OP_ASSERTBACK_NOT:
      /* Follow the link fields past every alternative of the assertion; the
      shared code below then steps over the opcode that ends it. */
01791     do code += GET(code, 1); while (*code == OP_ALT);
01792     /* Fall through */
01793 
01794     case OP_CALLOUT:
01795     case OP_CREF:
01796     case OP_BRANUMBER:
01797     case OP_WORD_BOUNDARY:
01798     case OP_NOT_WORD_BOUNDARY:
01799     code += OP_lengths[*code];
01800     break;
01801 
01802     default:
01803     return code;
01804     }
01805   }
01806 /* Control never reaches here */
01807 }
01808 

static bool is_anchored ( register const uschar *  code,
int *  options,
unsigned int  bracket_map,
unsigned int  backref_map 
) [static]

Definition at line 4161 of file pcre.cpp.

References EXTRACT_BASIC_MAX, first_significant_code(), GET, GET2, LINK_SIZE, OP_ALT, OP_ANY, OP_ASSERT, OP_BRA, OP_CIRC, OP_COND, OP_ONCE, OP_SOD, OP_SOM, OP_TYPEMINSTAR, OP_TYPESTAR, PCRE_DOTALL, and PCRE_MULTILINE.

Referenced by pcre_compile().

04162 {
      /* Returns true only if every alternative of the group at code begins
      with one of: OP_SOD; OP_SOM; OP_CIRC while PCRE_MULTILINE is clear in
      *options; a star-repeated OP_ANY while PCRE_DOTALL is set and no
      enclosing capturing bracket is marked in backref_map; or a nested group
      for which the same holds. bracket_map accumulates the capturing brackets
      entered so far. */
04163 do {
      /* first_significant_code() may update *options from any OP_OPT it
      skips, because PCRE_MULTILINE is passed as the option bit to track. */
04164    const uschar *scode =
04165      first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
04166    register int op = *scode;
04167 
04168    /* Capturing brackets */
04169 
04170    if (op > OP_BRA)
04171      {
04172      int new_map;
04173      op -= OP_BRA;
04174      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
      /* Brackets numbered 32 and above all share bit 0 of the map.
      NOTE(review): for op == 31 the int constant 1 is shifted into the sign
      bit -- confirm this is acceptable on the supported compilers. */
04175      new_map = bracket_map | ((op < 32)? (1 << op) : 1);
04176      if (!is_anchored(scode, options, new_map, backref_map)) return false;
04177      }
04178 
04179    /* Other brackets */
04180 
04181    else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
04182      {
04183      if (!is_anchored(scode, options, bracket_map, backref_map)) return false;
04184      }
04185 
04186    /* .* is not anchored unless DOTALL is set and it isn't in brackets that
04187    are or may be referenced. */
04188 
04189    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
04190             (*options & PCRE_DOTALL) != 0)
04191      {
04192      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return false;
04193      }
04194 
04195    /* Check for explicit anchoring */
04196 
      /* A star repeat without PCRE_DOTALL also arrives here and is rejected,
      since it is none of the anchoring opcodes. */
04197    else if (op != OP_SOD && op != OP_SOM &&
04198            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
04199      return false;
04200    code += GET(code, 1);
04201    }
04202 while (*code == OP_ALT);   /* Loop for each alternative */
04203 return true;
04204 }
04205 

static bool is_counted_repeat ( const uschar *  p  )  [static]

Definition at line 1685 of file pcre.cpp.

References ctype_digit, and digitab.

Referenced by compile_branch(), and pcre_compile().

01685 {
      /* Returns true if the text at p has one of the forms "digits}",
      "digits,}" or "digits,digits}", and false otherwise. Presumably p points
      just past the opening '{' of a quantifier -- confirm against the
      callers. */
      /* At least one digit is required first. */
01686 if ((digitab[*p++] & ctype_digit) == 0) return false;
01687 while ((digitab[*p] & ctype_digit) != 0) p++;
01688 if (*p == '}') return true;
01689 
      /* Otherwise a comma must follow; the second number is optional. */
01690 if (*p++ != ',') return false;
01691 if (*p == '}') return true;
01692 
01693 if ((digitab[*p++] & ctype_digit) == 0) return false;
01694 while ((digitab[*p] & ctype_digit) != 0) p++;
01695 
01696 return (*p == '}');
01697 }
01698 

static bool is_startline ( const uschar *  code,
unsigned int  bracket_map,
unsigned int  backref_map 
) [static]

Definition at line 4231 of file pcre.cpp.

References EXTRACT_BASIC_MAX, first_significant_code(), GET, GET2, LINK_SIZE, OP_ALT, OP_ANY, OP_ASSERT, OP_BRA, OP_CIRC, OP_COND, OP_ONCE, OP_TYPEMINSTAR, and OP_TYPESTAR.

Referenced by pcre_compile().

04232 {
      /* Returns true only if every alternative of the group at code begins
      with OP_CIRC, with a star-repeated OP_ANY that is not inside a capturing
      bracket marked in backref_map, or with a nested group for which the same
      holds. bracket_map accumulates the capturing brackets entered so far. */
04233 do {
04234    const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
04235    register int op = *scode;
04236 
04237    /* Capturing brackets */
04238 
04239    if (op > OP_BRA)
04240      {
04241      int new_map;
04242      op -= OP_BRA;
04243      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
      /* Brackets numbered 32 and above all share bit 0 of the map.
      NOTE(review): for op == 31 the int constant 1 is shifted into the sign
      bit -- confirm this is acceptable on the supported compilers. */
04244      new_map = bracket_map | ((op < 32)? (1 << op) : 1);
04245      if (!is_startline(scode, new_map, backref_map)) return false;
04246      }
04247 
04248    /* Other brackets */
04249 
04250    else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
04251      { if (!is_startline(scode, bracket_map, backref_map)) return false; }
04252 
04253    /* .* is not anchored unless DOTALL is set and it isn't in brackets that
04254    may be referenced. */
04255 
      /* Unlike is_anchored(), no option bits are consulted here; only the
      repeated item and the bracket maps are tested. */
04256    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
04257      {
04258      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return false;
04259      }
04260 
04261    /* Check for explicit circumflex */
04262 
04263    else if (op != OP_CIRC) return false;
04264    code += GET(code, 1);
04265    }
04266 while (*code == OP_ALT);  /* Loop for each alternative */
04267 return true;
04268 }
04269 

static int match ( REGISTER const uschar * eptr,
REGISTER const uschar * ecode,
int  offset_top,
match_data * md,
unsigned long int  ims,
eptrblock * eptrb,
int  flags 
) [static]

Definition at line 5414 of file pcre.cpp.

References recursion_info::after_call, CMuxAlarm::bAlarmed, pcre_callout_block::callout_data, pcre_callout_block::callout_number, pcre_callout_block::capture_last, pcre_callout_block::capture_top, CREF_RECURSE, ctype_digit, ctype_space, ctype_word, pcre_callout_block::current_position, DPRINTF, eptrblock::epb_prev, eptrblock::epb_saved_eptr, EXTRACT_BASIC_MAX, fc, fi, GET, GET2, GETCHARINCTEST, recursion_info::group_num, LINK_SIZE, match_condassert, match_isgroup, MATCH_MATCH, MATCH_NOMATCH, match_ref(), md, MuxAlarm, NEWLINE, next, recursion_info::offset_save, pcre_callout_block::offset_vector, OP_ALT, OP_ANY, OP_ANYBYTE, OP_ASSERT, OP_ASSERT_NOT, OP_ASSERTBACK, OP_ASSERTBACK_NOT, OP_BRA, OP_BRAMINZERO, OP_BRANUMBER, OP_BRAZERO, OP_CALLOUT, OP_CHARS, OP_CIRC, OP_CLASS, OP_COND, OP_CREF, OP_CRMINPLUS, OP_CRMINQUERY, OP_CRMINRANGE, OP_CRMINSTAR, OP_CRPLUS, OP_CRQUERY, OP_CRRANGE, OP_CRSTAR, OP_DIGIT, OP_DOLL, OP_END, OP_EOD, OP_EODN, OP_EXACT, OP_KET, OP_KETRMAX, OP_KETRMIN, OP_MINPLUS, OP_MINQUERY, OP_MINSTAR, OP_MINUPTO, OP_NCLASS, OP_NOT, OP_NOT_DIGIT, OP_NOT_WHITESPACE, OP_NOT_WORD_BOUNDARY, OP_NOT_WORDCHAR, OP_NOTEXACT, OP_NOTMINPLUS, OP_NOTMINQUERY, OP_NOTMINSTAR, OP_NOTMINUPTO, OP_NOTPLUS, OP_NOTQUERY, OP_NOTSTAR, OP_NOTUPTO, OP_ONCE, OP_OPT, OP_PLUS, OP_QUERY, OP_RECURSE, OP_REF, OP_REVERSE, OP_SOD, OP_SOM, OP_STAR, OP_TYPEEXACT, OP_TYPEMINPLUS, OP_TYPEMINQUERY, OP_TYPEMINSTAR, OP_TYPEMINUPTO, OP_TYPEPLUS, OP_TYPEQUERY, OP_TYPESTAR, OP_TYPEUPTO, OP_UPTO, OP_WHITESPACE, OP_WORD_BOUNDARY, OP_WORDCHAR, pcre_callout, PCRE_CASELESS, PCRE_DOTALL, PCRE_ERROR_MATCHLIMIT, PCRE_ERROR_NOMEMORY, PCRE_ERROR_UNKNOWN_NODE, PCRE_IMS, PCRE_MULTILINE, recursion_info::prevrec, REC_STACK_SAVE_MAX, rep_max, rep_min, RMATCH, RRETURN, recursion_info::save_start, recursion_info::saved_max, pcre_callout_block::start_match, pcre_callout_block::subject, pcre_callout_block::subject_length, and pcre_callout_block::version.

Referenced by absolute_name(), atr_match(), atr_match1(), match_numeric(), match_player(), and pcre_exec().

05416 {
05417 /* These variables do not need to be preserved over recursion in this function,
05418 so they can be ordinary variables in all cases. Mark them with "register"
05419 because they are used a lot in loops. */
05420 
05421 register int rrc;    /* Returns from recursive calls */
05422 register int i;      /* Used for loops not involving calls to RMATCH() */
05423 register int c;      /* Character values not kept over RMATCH() calls */
05424 
05425 /* When recursion is not being used, all "local" variables that have to be
05426 preserved over calls to RMATCH() are part of a "frame" which is obtained from
05427 heap storage. Set up the top-level frame here; others are obtained from the
05428 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
05429 
05430 #define fi i
05431 #define fc c
05432 
05433 const uschar *callpat;             /* Many of these variables are used ony */
05434                                    /* small blocks of the code. My normal  */
05435 const uschar *data;                /* style of coding would have declared  */
05436                                    /* them within each of those blocks.    */
05437 const uschar *next;                /* However, in order to accommodate the */
05438 const uschar *pp;                  /* version of this code that uses an    */
05439 const uschar *prev;                /* external "stack" implemented on the  */
05440 const uschar *saved_eptr;          /* heap, it is easier to declare them   */
05441                                    /* all here, so the declarations can    */
05442 recursion_info new_recursive;      /* be cut out in a block. The only      */
05443                                    /* declarations within blocks below are */
05444 bool cur_is_word;                  /* for variables that do not have to    */
05445 bool condition;                    /* be preserved over a recursive call   */
05446 bool minimize;                     /* to RMATCH().                         */
05447 bool prev_is_word;
05448 
05449 unsigned long int original_ims;
05450 
05451 int ctype;
05452 int length;
05453 int max;
05454 int min;
05455 int number;
05456 int offset;
05457 int op;
05458 int save_capture_last;
05459 int save_offset1, save_offset2, save_offset3;
05460 int stacksave[REC_STACK_SAVE_MAX];
05461 
05462 eptrblock newptrb;
05463 
05464 /* OK, now we can get on with the real code of the function. Recursion is
05465 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
05466 these just turn into a recursive call to match() and a "return", respectively.
05467 However, RMATCH isn't like a function call because it's quite a complicated
05468 macro. It has to be used in one particular way. This shouldn't, however, impact
05469 performance when true recursion is being used. */
05470 
05471 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
05472 
05473 original_ims = ims;    /* Save for resetting on ')' */
05474 
05475 /* At the start of a bracketed group, add the current subject pointer to the
05476 stack of such pointers, to be re-instated at the end of the group when we hit
05477 the closing ket. When match() is called in other circumstances, we don't add to
05478 this stack. */
05479 
05480 if ((flags & match_isgroup) != 0)
05481   {
05482   newptrb.epb_prev = eptrb;
05483   newptrb.epb_saved_eptr = eptr;
05484   eptrb = &newptrb;
05485   }
05486 
05487 /* Now start processing the operations. */
05488 
05489 for (;!MuxAlarm.bAlarmed;)
05490   {
05491   op = *ecode;
05492   minimize = false;
05493 
05494   /* Opening capturing bracket. If there is space in the offset vector, save
05495   the current subject position in the working slot at the top of the vector. We
05496   mustn't change the current values of the data slot, because they may be set
05497   from a previous iteration of this group, and be referred to by a reference
05498   inside the group.
05499 
05500   If the bracket fails to match, we need to restore this value and also the
05501   values of the final offsets, in case they were set by a previous iteration of
05502   the same bracket.
05503 
05504   If there isn't enough space in the offset vector, treat this as if it were a
05505   non-capturing bracket. Don't worry about setting the flag for the error case
05506   here; that is handled in the code for KET. */
05507 
05508   if (op > OP_BRA)
05509     {
05510     number = op - OP_BRA;
05511 
05512     /* For extended extraction brackets (large number), we have to fish out the
05513     number from a dummy opcode at the start. */
05514 
05515     if (number > EXTRACT_BASIC_MAX)
05516       number = GET2(ecode, 2+LINK_SIZE);
05517     offset = number << 1;
05518 
05519     if (offset < md->offset_max)
05520       {
05521       save_offset1 = md->offset_vector[offset];
05522       save_offset2 = md->offset_vector[offset+1];
05523       save_offset3 = md->offset_vector[md->offset_end - number];
05524       save_capture_last = md->capture_last;
05525 
05526       DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
05527       md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
05528 
05529       do
05530         {
05531         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
05532           match_isgroup);
05533         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05534         md->capture_last = save_capture_last;
05535         ecode += GET(ecode, 1);
05536         }
05537       while (*ecode == OP_ALT);
05538 
05539       DPRINTF(("bracket %d failed\n", number));
05540 
05541       md->offset_vector[offset] = save_offset1;
05542       md->offset_vector[offset+1] = save_offset2;
05543       md->offset_vector[md->offset_end - number] = save_offset3;
05544 
05545       RRETURN(MATCH_NOMATCH);
05546       }
05547 
05548     /* Insufficient room for saving captured contents */
05549 
05550     else op = OP_BRA;
05551     }
05552 
05553   /* Other types of node can be handled by a switch */
05554 
05555   switch(op)
05556     {
05557     case OP_BRA:     /* Non-capturing bracket: optimized */
05558     DPRINTF(("start bracket 0\n"));
05559     do
05560       {
05561       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
05562         match_isgroup);
05563       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05564       ecode += GET(ecode, 1);
05565       }
05566     while (*ecode == OP_ALT);
05567     DPRINTF(("bracket 0 failed\n"));
05568     RRETURN(MATCH_NOMATCH);
05569 
05570     /* Conditional group: compilation checked that there are no more than
05571     two branches. If the condition is false, skipping the first branch takes us
05572     past the end if there is only one branch, but that's OK because that is
05573     exactly what going to the ket would do. */
05574 
05575     case OP_COND:
05576     if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
05577       {
05578       offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
05579       condition = (offset == CREF_RECURSE * 2)?
05580         (md->recursive != NULL) :
05581         (offset < offset_top && md->offset_vector[offset] >= 0);
05582       RMATCH(rrc, eptr, ecode + (condition?
05583         (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
05584         offset_top, md, ims, eptrb, match_isgroup);
05585       RRETURN(rrc);
05586       }
05587 
05588     /* The condition is an assertion. Call match() to evaluate it - setting
05589     the final argument true causes it to stop at the end of an assertion. */
05590 
05591     else
05592       {
05593       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
05594           match_condassert | match_isgroup);
05595       if (rrc == MATCH_MATCH)
05596         {
05597         ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
05598         while (*ecode == OP_ALT) ecode += GET(ecode, 1);
05599         }
05600       else if (rrc != MATCH_NOMATCH)
05601         {
05602         RRETURN(rrc);         /* Need braces because of following else */
05603         }
05604       else ecode += GET(ecode, 1);
05605       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
05606         match_isgroup);
05607       RRETURN(rrc);
05608       }
05609     /* Control never reaches here */
05610 
05611     /* Skip over conditional reference or large extraction number data if
05612     encountered. */
05613 
05614     case OP_CREF:
05615     case OP_BRANUMBER:
05616     ecode += 3;
05617     break;
05618 
05619     /* End of the pattern. If we are in a recursion, we should restore the
05620     offsets appropriately and continue from after the call. */
05621 
05622     case OP_END:
05623     if (md->recursive != NULL && md->recursive->group_num == 0)
05624       {
05625       recursion_info *rec = md->recursive;
05626       DPRINTF(("Hit the end in a (?0) recursion\n"));
05627       md->recursive = rec->prevrec;
05628       memmove(md->offset_vector, rec->offset_save,
05629         rec->saved_max * sizeof(int));
05630       md->start_match = rec->save_start;
05631       ims = original_ims;
05632       ecode = rec->after_call;
05633       break;
05634       }
05635 
05636     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
05637     string - backtracking will then try other alternatives, if any. */
05638 
05639     if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
05640     md->end_match_ptr = eptr;          /* Record where we ended */
05641     md->end_offset_top = offset_top;   /* and how many extracts were taken */
05642     RRETURN(MATCH_MATCH);
05643 
05644     /* Change option settings */
05645 
05646     case OP_OPT:
05647     ims = ecode[1];
05648     ecode += 2;
05649     DPRINTF(("ims set to %02lx\n", ims));
05650     break;
05651 
05652     /* Assertion brackets. Check the alternative branches in turn - the
05653     matching won't pass the KET for an assertion. If any one branch matches,
05654     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
05655     start of each branch to move the current point backwards, so the code at
05656     this level is identical to the lookahead case. */
05657 
05658     case OP_ASSERT:
05659     case OP_ASSERTBACK:
05660     do
05661       {
05662       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
05663         match_isgroup);
05664       if (rrc == MATCH_MATCH) break;
05665       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05666       ecode += GET(ecode, 1);
05667       }
05668     while (*ecode == OP_ALT);
05669     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
05670 
05671     /* If checking an assertion for a condition, return MATCH_MATCH. */
05672 
05673     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
05674 
05675     /* Continue from after the assertion, updating the offsets high water
05676     mark, since extracts may have been taken during the assertion. */
05677 
05678     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
05679     ecode += 1 + LINK_SIZE;
05680     offset_top = md->end_offset_top;
05681     continue;
05682 
05683     /* Negative assertion: all branches must fail to match */
05684 
05685     case OP_ASSERT_NOT:
05686     case OP_ASSERTBACK_NOT:
05687     do
05688       {
05689       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
05690         match_isgroup);
05691       if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
05692       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05693       ecode += GET(ecode,1);
05694       }
05695     while (*ecode == OP_ALT);
05696 
05697     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
05698 
05699     ecode += 1 + LINK_SIZE;
05700     continue;
05701 
05702     /* Move the subject pointer back. This occurs only at the start of
05703     each branch of a lookbehind assertion. If we are too close to the start to
05704     move back, this match function fails. When working with UTF-8 we move
05705     back a number of characters, not bytes. */
05706 
05707     case OP_REVERSE:
05708 
05709     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
05710 
05711       {
05712       eptr -= GET(ecode,1);
05713       if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
05714       }
05715 
05716     /* Skip to next op code */
05717 
05718     ecode += 1 + LINK_SIZE;
05719     break;
05720 
05721     /* The callout item calls an external function, if one is provided, passing
05722     details of the match so far. This is mainly for debugging, though the
05723     function is able to force a failure. */
05724 
05725     case OP_CALLOUT:
05726     if (pcre_callout != NULL)
05727       {
05728       pcre_callout_block cb;
05729       cb.version          = 0;   /* Version 0 of the callout block */
05730       cb.callout_number   = ecode[1];
05731       cb.offset_vector    = md->offset_vector;
05732       cb.subject          = (const char *)md->start_subject;
05733       cb.subject_length   = md->end_subject - md->start_subject;
05734       cb.start_match      = md->start_match - md->start_subject;
05735       cb.current_position = eptr - md->start_subject;
05736       cb.capture_top      = offset_top/2;
05737       cb.capture_last     = md->capture_last;
05738       cb.callout_data     = md->callout_data;
05739       if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
05740       if (rrc < 0) RRETURN(rrc);
05741       }
05742     ecode += 2;
05743     break;
05744 
05745     /* Recursion either matches the current regex, or some subexpression. The
05746     offset data is the offset to the starting bracket from the start of the
05747     whole pattern. (This is so that it works from duplicated subpatterns.)
05748 
05749     If there are any capturing brackets started but not finished, we have to
05750     save their starting points and reinstate them after the recursion. However,
05751     we don't know how many such there are (offset_top records the completed
05752     total) so we just have to save all the potential data. There may be up to
05753     65535 such values, which is too large to put on the stack, but using malloc
05754     for small numbers seems expensive. As a compromise, the stack is used when
05755     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
05756     is used. A problem is what to do if the malloc fails ... there is no way of
05757     returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
05758     values on the stack, and accept that the rest may be wrong.
05759 
05760     There are also other values that have to be saved. We use a chained
05761     sequence of blocks that actually live on the stack. Thanks to Robin Houston
05762     for the original version of this logic. */
05763 
05764     case OP_RECURSE:
05765       {
05766       callpat = md->start_code + GET(ecode, 1);
05767       new_recursive.group_num = *callpat - OP_BRA;
05768 
05769       /* For extended extraction brackets (large number), we have to fish out
05770       the number from a dummy opcode at the start. */
05771 
05772       if (new_recursive.group_num > EXTRACT_BASIC_MAX)
05773         new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
05774 
05775       /* Add to "recursing stack" */
05776 
05777       new_recursive.prevrec = md->recursive;
05778       md->recursive = &new_recursive;
05779 
05780       /* Find where to continue from afterwards */
05781 
05782       ecode += 1 + LINK_SIZE;
05783       new_recursive.after_call = ecode;
05784 
05785       /* Now save the offset data. */
05786 
05787       new_recursive.saved_max = md->offset_end;
05788       if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
05789         new_recursive.offset_save = stacksave;
05790       else
05791         {
05792         new_recursive.offset_save =
05793           static_cast<int *>(malloc(new_recursive.saved_max * sizeof(int)));
05794         if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
05795         }
05796 
05797       memcpy(new_recursive.offset_save, md->offset_vector,
05798             new_recursive.saved_max * sizeof(int));
05799       new_recursive.save_start = md->start_match;
05800       md->start_match = eptr;
05801 
05802       /* OK, now we can do the recursion. For each top-level alternative we
05803       restore the offset and recursion data. */
05804 
05805       DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
05806       do
05807         {
05808         RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
05809             eptrb, match_isgroup);
05810         if (rrc == MATCH_MATCH)
05811           {
05812           md->recursive = new_recursive.prevrec;
05813           if (new_recursive.offset_save != stacksave)
05814             free(new_recursive.offset_save);
05815           RRETURN(MATCH_MATCH);
05816           }
05817         else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05818 
05819         md->recursive = &new_recursive;
05820         memcpy(md->offset_vector, new_recursive.offset_save,
05821             new_recursive.saved_max * sizeof(int));
05822         callpat += GET(callpat, 1);
05823         }
05824       while (*callpat == OP_ALT);
05825 
05826       DPRINTF(("Recursion didn't match\n"));
05827       md->recursive = new_recursive.prevrec;
05828       if (new_recursive.offset_save != stacksave)
05829         free(new_recursive.offset_save);
05830       RRETURN(MATCH_NOMATCH);
05831       }
05832     /* Control never reaches here */
05833 
05834     /* "Once" brackets are like assertion brackets except that after a match,
05835     the point in the subject string is not moved back. Thus there can never be
05836     a move back into the brackets. Friedl calls these "atomic" subpatterns.
05837     Check the alternative branches in turn - the matching won't pass the KET
05838     for this kind of subpattern. If any one branch matches, we carry on as at
05839     the end of a normal bracket, leaving the subject pointer. */
05840 
05841     case OP_ONCE:
05842       {
05843       prev = ecode;
05844       saved_eptr = eptr;
05845 
05846       do
05847         {
05848         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
05849           eptrb, match_isgroup);
05850         if (rrc == MATCH_MATCH) break;
05851         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05852         ecode += GET(ecode,1);
05853         }
05854       while (*ecode == OP_ALT);
05855 
05856       /* If hit the end of the group (which could be repeated), fail */
05857 
05858       if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
05859 
05860       /* Continue as from after the assertion, updating the offsets high water
05861       mark, since extracts may have been taken. */
05862 
05863       do ecode += GET(ecode,1); while (*ecode == OP_ALT);
05864 
05865       offset_top = md->end_offset_top;
05866       eptr = md->end_match_ptr;
05867 
05868       /* For a non-repeating ket, just continue at this level. This also
05869       happens for a repeating ket if no characters were matched in the group.
05870       This is the forcible breaking of infinite loops as implemented in Perl
05871       5.005. If there is an options reset, it will get obeyed in the normal
05872       course of events. */
05873 
05874       if (*ecode == OP_KET || eptr == saved_eptr)
05875         {
05876         ecode += 1+LINK_SIZE;
05877         break;
05878         }
05879 
05880       /* The repeating kets try the rest of the pattern or restart from the
05881       preceding bracket, in the appropriate order. We need to reset any options
05882       that changed within the bracket before re-running it, so check the next
05883       opcode. */
05884 
05885       if (ecode[1+LINK_SIZE] == OP_OPT)
05886         {
05887         ims = (ims & ~PCRE_IMS) | ecode[4];
05888         DPRINTF(("ims set to %02lx at group repeat\n", ims));
05889         }
05890 
05891       if (*ecode == OP_KETRMIN)
05892         {
05893         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
05894         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05895         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
05896         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05897         }
05898       else  /* OP_KETRMAX */
05899         {
05900         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
05901         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05902         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
05903         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05904         }
05905       }
05906     RRETURN(MATCH_NOMATCH);
05907 
05908     /* An alternation is the end of a branch; scan along to find the end of the
05909     bracketed group and go to there. */
05910 
05911     case OP_ALT:
05912     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
05913     break;
05914 
05915     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
05916     that it may occur zero times. It may repeat infinitely, or not at all -
05917     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
05918     repeat limits are compiled as a number of copies, with the optional ones
05919     preceded by BRAZERO or BRAMINZERO. */
05920 
05921     case OP_BRAZERO:
05922       {
05923       next = ecode+1;
05924       RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
05925       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05926       do next += GET(next,1); while (*next == OP_ALT);
05927       ecode = next + 1+LINK_SIZE;
05928       }
05929     break;
05930 
05931     case OP_BRAMINZERO:
05932       {
05933       next = ecode+1;
05934       do next += GET(next,1); while (*next == OP_ALT);
05935       RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
05936         match_isgroup);
05937       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
05938       ecode++;
05939       }
05940     break;
05941 
05942     /* End of a group, repeated or non-repeating. If we are at the end of
05943     an assertion "group", stop matching and return MATCH_MATCH, but record the
05944     current high water mark for use by positive assertions. Do this also
05945     for the "once" (not-backup up) groups. */
05946 
05947     case OP_KET:
05948     case OP_KETRMIN:
05949     case OP_KETRMAX:
05950       {
05951       prev = ecode - GET(ecode, 1);
05952       saved_eptr = eptrb->epb_saved_eptr;
05953 
05954       /* Back up the stack of bracket start pointers. */
05955 
05956       eptrb = eptrb->epb_prev;
05957 
05958       if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
05959           *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
05960           *prev == OP_ONCE)
05961         {
05962         md->end_match_ptr = eptr;      /* For ONCE */
05963         md->end_offset_top = offset_top;
05964         RRETURN(MATCH_MATCH);
05965         }
05966 
05967       /* In all other cases except a conditional group we have to check the
05968       group number back at the start and if necessary complete handling an
05969       extraction by setting the offsets and bumping the high water mark. */
05970 
05971       if (*prev != OP_COND)
05972         {
05973         number = *prev - OP_BRA;
05974 
05975         /* For extended extraction brackets (large number), we have to fish out
05976         the number from a dummy opcode at the start. */
05977 
05978         if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
05979         offset = number << 1;
05980 
05981         /* Test for a numbered group. This includes groups called as a result
05982         of recursion. Note that whole-pattern recursion is coded as a recurse
05983         into group 0, so it won't be picked up here. Instead, we catch it when
05984         the OP_END is reached. */
05985 
05986         if (number > 0)
05987           {
05988           md->capture_last = number;
05989           if (offset >= md->offset_max) md->offset_overflow = true; else
05990             {
05991             md->offset_vector[offset] =
05992               md->offset_vector[md->offset_end - number];
05993             md->offset_vector[offset+1] = eptr - md->start_subject;
05994             if (offset_top <= offset) offset_top = offset + 2;
05995             }
05996 
05997           /* Handle a recursively called group. Restore the offsets
05998           appropriately and continue from after the call. */
05999 
06000           if (md->recursive != NULL && md->recursive->group_num == number)
06001             {
06002             recursion_info *rec = md->recursive;
06003             DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
06004             md->recursive = rec->prevrec;
06005             md->start_match = rec->save_start;
06006             memcpy(md->offset_vector, rec->offset_save,
06007               rec->saved_max * sizeof(int));
06008             ecode = rec->after_call;
06009             ims = original_ims;
06010             break;
06011             }
06012           }
06013         }
06014 
06015       /* Reset the value of the ims flags, in case they got changed during
06016       the group. */
06017 
06018       ims = original_ims;
06019       DPRINTF(("ims reset to %02lx\n", ims));
06020 
06021       /* For a non-repeating ket, just continue at this level. This also
06022       happens for a repeating ket if no characters were matched in the group.
06023       This is the forcible breaking of infinite loops as implemented in Perl
06024       5.005. If there is an options reset, it will get obeyed in the normal
06025       course of events. */
06026 
06027       if (*ecode == OP_KET || eptr == saved_eptr)
06028         {
06029         ecode += 1 + LINK_SIZE;
06030         break;
06031         }
06032 
06033       /* The repeating kets try the rest of the pattern or restart from the
06034       preceding bracket, in the appropriate order. */
06035 
06036       if (*ecode == OP_KETRMIN)
06037         {
06038         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
06039         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06040         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
06041         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06042         }
06043       else  /* OP_KETRMAX */
06044         {
06045         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
06046         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06047         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
06048         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06049         }
06050       }
06051 
06052     RRETURN(MATCH_NOMATCH);
06053 
06054     /* Start of subject unless notbol, or after internal newline if multiline */
06055 
06056     case OP_CIRC:
06057     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
06058     if ((ims & PCRE_MULTILINE) != 0)
06059       {
06060       if (eptr != md->start_subject && eptr[-1] != NEWLINE)
06061         RRETURN(MATCH_NOMATCH);
06062       ecode++;
06063       break;
06064       }
06065     /* ... else fall through */
06066 
06067     /* Start of subject assertion */
06068 
06069     case OP_SOD:
06070     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
06071     ecode++;
06072     break;
06073 
06074     /* Start of match assertion */
06075 
06076     case OP_SOM:
06077     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
06078     ecode++;
06079     break;
06080 
06081     /* Assert before internal newline if multiline, or before a terminating
06082     newline unless endonly is set, else end of subject unless noteol is set. */
06083 
06084     case OP_DOLL:
06085     if ((ims & PCRE_MULTILINE) != 0)
06086       {
06087       if (eptr < md->end_subject)
06088         { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
06089       else
06090         { if (md->noteol) RRETURN(MATCH_NOMATCH); }
06091       ecode++;
06092       break;
06093       }
06094     else
06095       {
06096       if (md->noteol) RRETURN(MATCH_NOMATCH);
06097       if (!md->endonly)
06098         {
06099         if (eptr < md->end_subject - 1 ||
06100            (eptr == md->end_subject - 1 && *eptr != NEWLINE))
06101           RRETURN(MATCH_NOMATCH);
06102         ecode++;
06103         break;
06104         }
06105       }
06106     /* ... else fall through */
06107 
06108     /* End of subject assertion (\z) */
06109 
06110     case OP_EOD:
06111     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
06112     ecode++;
06113     break;
06114 
06115     /* End of subject or ending \n assertion (\Z) */
06116 
06117     case OP_EODN:
06118     if (eptr < md->end_subject - 1 ||
06119        (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
06120     ecode++;
06121     break;
06122 
06123     /* Word boundary assertions */
06124 
06125     case OP_NOT_WORD_BOUNDARY:
06126     case OP_WORD_BOUNDARY:
06127       {
06128 
06129       /* Find out if the previous and current characters are "word" characters.
06130       It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
06131       be "non-word" characters. */
06132 
06133 
06134       /* More streamlined when not in UTF-8 mode */
06135 
06136         {
06137         prev_is_word = (eptr != md->start_subject) &&
06138           ((md->ctypes[eptr[-1]] & ctype_word) != 0);
06139         cur_is_word = (eptr < md->end_subject) &&
06140           ((md->ctypes[*eptr] & ctype_word) != 0);
06141         }
06142 
06143       /* Now see if the situation is what we want */
06144 
06145       if ((*ecode++ == OP_WORD_BOUNDARY)?
06146            cur_is_word == prev_is_word : cur_is_word != prev_is_word)
06147         RRETURN(MATCH_NOMATCH);
06148       }
06149     break;
06150 
06151     /* Match a single character type; inline for speed */
06152 
06153     case OP_ANY:
06154     if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
06155       RRETURN(MATCH_NOMATCH);
06156     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
06157     ecode++;
06158     break;
06159 
06160     /* Match a single byte, even in UTF-8 mode. This opcode really does match
06161     any byte, even newline, independent of the setting of PCRE_DOTALL. */
06162 
06163     case OP_ANYBYTE:
06164     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
06165     ecode++;
06166     break;
06167 
06168     case OP_NOT_DIGIT:
06169     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06170     GETCHARINCTEST(c, eptr);
06171     if (
06172        (md->ctypes[c] & ctype_digit) != 0
06173        )
06174       RRETURN(MATCH_NOMATCH);
06175     ecode++;
06176     break;
06177 
06178     case OP_DIGIT:
06179     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06180     GETCHARINCTEST(c, eptr);
06181     if (
06182        (md->ctypes[c] & ctype_digit) == 0
06183        )
06184       RRETURN(MATCH_NOMATCH);
06185     ecode++;
06186     break;
06187 
06188     case OP_NOT_WHITESPACE:
06189     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06190     GETCHARINCTEST(c, eptr);
06191     if (
06192        (md->ctypes[c] & ctype_space) != 0
06193        )
06194       RRETURN(MATCH_NOMATCH);
06195     ecode++;
06196     break;
06197 
06198     case OP_WHITESPACE:
06199     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06200     GETCHARINCTEST(c, eptr);
06201     if (
06202        (md->ctypes[c] & ctype_space) == 0
06203        )
06204       RRETURN(MATCH_NOMATCH);
06205     ecode++;
06206     break;
06207 
06208     case OP_NOT_WORDCHAR:
06209     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06210     GETCHARINCTEST(c, eptr);
06211     if (
06212        (md->ctypes[c] & ctype_word) != 0
06213        )
06214       RRETURN(MATCH_NOMATCH);
06215     ecode++;
06216     break;
06217 
06218     case OP_WORDCHAR:
06219     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06220     GETCHARINCTEST(c, eptr);
06221     if (
06222        (md->ctypes[c] & ctype_word) == 0
06223        )
06224       RRETURN(MATCH_NOMATCH);
06225     ecode++;
06226     break;
06227 
06228     /* Match a back reference, possibly repeatedly. Look past the end of the
06229     item to see if there is repeat information following. The code is similar
06230     to that for character classes, but repeated for efficiency. Then obey
06231     similar code to character type repeats - written out again for speed.
06232     However, if the referenced string is the empty string, always treat
06233     it as matched, any number of times (otherwise there could be infinite
06234     loops). */
06235 
06236     case OP_REF:
06237       {
06238       offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
06239       ecode += 3;                                     /* Advance past item */
06240 
06241       /* If the reference is unset, set the length to be longer than the amount
06242       of subject left; this ensures that every attempt at a match fails. We
06243       can't just fail here, because of the possibility of quantifiers with zero
06244       minima. */
06245 
06246       length = (offset >= offset_top || md->offset_vector[offset] < 0)?
06247         md->end_subject - eptr + 1 :
06248         md->offset_vector[offset+1] - md->offset_vector[offset];
06249 
06250       /* Set up for repetition, or handle the non-repeated case */
06251 
06252       switch (*ecode)
06253         {
06254         case OP_CRSTAR:
06255         case OP_CRMINSTAR:
06256         case OP_CRPLUS:
06257         case OP_CRMINPLUS:
06258         case OP_CRQUERY:
06259         case OP_CRMINQUERY:
06260         c = *ecode++ - OP_CRSTAR;
06261         minimize = (c & 1) != 0;
06262         min = rep_min[c];                 /* Pick up values from tables; */
06263         max = rep_max[c];                 /* zero for max => infinity */
06264         if (max == 0) max = INT_MAX;
06265         break;
06266 
06267         case OP_CRRANGE:
06268         case OP_CRMINRANGE:
06269         minimize = (*ecode == OP_CRMINRANGE);
06270         min = GET2(ecode, 1);
06271         max = GET2(ecode, 3);
06272         if (max == 0) max = INT_MAX;
06273         ecode += 5;
06274         break;
06275 
06276         default:               /* No repeat follows */
06277         if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
06278         eptr += length;
06279         continue;              /* With the main loop */
06280         }
06281 
06282       /* If the length of the reference is zero, just continue with the
06283       main loop. */
06284 
06285       if (length == 0) continue;
06286 
06287       /* First, ensure the minimum number of matches are present. We get back
06288       the length of the reference string explicitly rather than passing the
06289       address of eptr, so that eptr can be a register variable. */
06290 
06291       for (i = 1; i <= min; i++)
06292         {
06293         if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
06294         eptr += length;
06295         }
06296 
06297       /* If min = max, continue at the same level without recursion.
06298       They are not both allowed to be zero. */
06299 
06300       if (min == max) continue;
06301 
06302       /* If minimizing, keep trying and advancing the pointer */
06303 
06304       if (minimize)
06305         {
06306         for (fi = min;; fi++)
06307           {
06308           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06309           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06310           if (fi >= max || !match_ref(offset, eptr, length, md, ims))
06311             RRETURN(MATCH_NOMATCH);
06312           eptr += length;
06313           }
06314         /* Control never gets here */
06315         }
06316 
06317       /* If maximizing, find the longest string and work backwards */
06318 
06319       else
06320         {
06321         pp = eptr;
06322         for (i = min; i < max; i++)
06323           {
06324           if (!match_ref(offset, eptr, length, md, ims)) break;
06325           eptr += length;
06326           }
06327         while (eptr >= pp)
06328           {
06329           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06330           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06331           eptr -= length;
06332           }
06333         RRETURN(MATCH_NOMATCH);
06334         }
06335       }
06336     /* Control never gets here */
06337 
06338 
06339 
06340     /* Match a bit-mapped character class, possibly repeatedly. This op code is
06341     used when all the characters in the class have values in the range 0-255.
06342     The only difference between OP_CLASS and OP_NCLASS occurs when a data
06343     character outside the range is encountered.
06344 
06345     First, look past the end of the item to see if there is repeat information
06346     following. Then obey similar code to character type repeats - written out
06347     again for speed. */
06348 
06349     case OP_NCLASS:
06350     case OP_CLASS:
06351       {
06352       data = ecode + 1;                /* Save for matching */
06353       ecode += 33;                     /* Advance past the item */
06354 
06355       switch (*ecode)
06356         {
06357         case OP_CRSTAR:
06358         case OP_CRMINSTAR:
06359         case OP_CRPLUS:
06360         case OP_CRMINPLUS:
06361         case OP_CRQUERY:
06362         case OP_CRMINQUERY:
06363         c = *ecode++ - OP_CRSTAR;
06364         minimize = (c & 1) != 0;
06365         min = rep_min[c];                 /* Pick up values from tables; */
06366         max = rep_max[c];                 /* zero for max => infinity */
06367         if (max == 0) max = INT_MAX;
06368         break;
06369 
06370         case OP_CRRANGE:
06371         case OP_CRMINRANGE:
06372         minimize = (*ecode == OP_CRMINRANGE);
06373         min = GET2(ecode, 1);
06374         max = GET2(ecode, 3);
06375         if (max == 0) max = INT_MAX;
06376         ecode += 5;
06377         break;
06378 
06379         default:               /* No repeat follows */
06380         min = max = 1;
06381         break;
06382         }
06383 
06384       /* First, ensure the minimum number of matches are present. */
06385 
06386       /* Not UTF-8 mode */
06387         {
06388         for (i = 1; i <= min; i++)
06389           {
06390           if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06391           c = *eptr++;
06392           if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
06393           }
06394         }
06395 
06396       /* If max == min we can continue with the main loop without the
06397       need to recurse. */
06398 
06399       if (min == max) continue;
06400 
06401       /* If minimizing, keep testing the rest of the expression and advancing
06402       the pointer while it matches the class. */
06403 
06404       if (minimize)
06405         {
06406         /* Not UTF-8 mode */
06407           {
06408           for (fi = min;; fi++)
06409             {
06410             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06411             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06412             if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06413             c = *eptr++;
06414             if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
06415             }
06416           }
06417         /* Control never gets here */
06418         }
06419 
06420       /* If maximizing, find the longest possible run, then work backwards. */
06421 
06422       else
06423         {
06424         pp = eptr;
06425 
06426           /* Not UTF-8 mode */
06427           {
06428           for (i = min; i < max; i++)
06429             {
06430             if (eptr >= md->end_subject) break;
06431             c = *eptr;
06432             if ((data[c/8] & (1 << (c&7))) == 0) break;
06433             eptr++;
06434             }
06435           while (eptr >= pp)
06436             {
06437             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06438             eptr--;
06439             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06440             }
06441           }
06442 
06443         RRETURN(MATCH_NOMATCH);
06444         }
06445       }
06446     /* Control never gets here */
06447 
06448 
06449     /* Match an extended character class. This opcode is encountered only
06450     in UTF-8 mode, because that's the only time it is compiled. */
06451 
06452 
06453     /* Match a run of characters */
06454 
06455     case OP_CHARS:
06456       {
06457       register int slen = ecode[1];
06458       ecode += 2;
06459 
06460       if (slen > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
06461       if ((ims & PCRE_CASELESS) != 0)
06462         {
06463         while (slen-- > 0)
06464           if (md->lcc[*ecode++] != md->lcc[*eptr++])
06465             RRETURN(MATCH_NOMATCH);
06466         }
06467       else
06468         {
06469         while (slen-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
06470         }
06471       }
06472     break;
06473 
06474     /* Match a single character repeatedly; different opcodes share code. */
06475 
06476     case OP_EXACT:
06477     min = max = GET2(ecode, 1);
06478     ecode += 3;
06479     goto REPEATCHAR;
06480 
06481     case OP_UPTO:
06482     case OP_MINUPTO:
06483     min = 0;
06484     max = GET2(ecode, 1);
06485     minimize = *ecode == OP_MINUPTO;
06486     ecode += 3;
06487     goto REPEATCHAR;
06488 
06489     case OP_STAR:
06490     case OP_MINSTAR:
06491     case OP_PLUS:
06492     case OP_MINPLUS:
06493     case OP_QUERY:
06494     case OP_MINQUERY:
06495     c = *ecode++ - OP_STAR;
06496     minimize = (c & 1) != 0;
06497     min = rep_min[c];                 /* Pick up values from tables; */
06498     max = rep_max[c];                 /* zero for max => infinity */
06499     if (max == 0) max = INT_MAX;
06500 
06501     /* Common code for all repeated single-character matches. We can give
06502     up quickly if there are fewer than the minimum number of characters left in
06503     the subject. */
06504 
06505     REPEATCHAR:
06506 
06507     /* When not in UTF-8 mode, load a single-byte character. */
06508       {
06509       if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
06510       fc = *ecode++;
06511       }
06512 
06513     /* The value of fc at this point is always less than 256, though we may or
06514     may not be in UTF-8 mode. The code is duplicated for the caseless and
06515     caseful cases, for speed, since matching characters is likely to be quite
06516     common. First, ensure the minimum number of matches are present. If min =
06517     max, continue at the same level without recursing. Otherwise, if
06518     minimizing, keep trying the rest of the expression and advancing one
06519     matching character if failing, up to the maximum. Alternatively, if
06520     maximizing, find the maximum number of characters and work backwards. */
06521 
06522     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
06523       max, eptr));
06524 
06525     if ((ims & PCRE_CASELESS) != 0)
06526       {
06527       fc = md->lcc[fc];
06528       for (i = 1; i <= min; i++)
06529         if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
06530       if (min == max) continue;
06531       if (minimize)
06532         {
06533         for (fi = min;; fi++)
06534           {
06535           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06536           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06537           if (fi >= max || eptr >= md->end_subject ||
06538               fc != md->lcc[*eptr++])
06539             RRETURN(MATCH_NOMATCH);
06540           }
06541         /* Control never gets here */
06542         }
06543       else
06544         {
06545         pp = eptr;
06546         for (i = min; i < max; i++)
06547           {
06548           if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
06549           eptr++;
06550           }
06551         while (eptr >= pp)
06552           {
06553           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06554           eptr--;
06555           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06556           }
06557         RRETURN(MATCH_NOMATCH);
06558         }
06559       /* Control never gets here */
06560       }
06561 
06562     /* Caseful comparisons (includes all multi-byte characters) */
06563 
06564     else
06565       {
06566       for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
06567       if (min == max) continue;
06568       if (minimize)
06569         {
06570         for (fi = min;; fi++)
06571           {
06572           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06573           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06574           if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
06575             RRETURN(MATCH_NOMATCH);
06576           }
06577         /* Control never gets here */
06578         }
06579       else
06580         {
06581         pp = eptr;
06582         for (i = min; i < max; i++)
06583           {
06584           if (eptr >= md->end_subject || fc != *eptr) break;
06585           eptr++;
06586           }
06587         while (eptr >= pp)
06588           {
06589           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06590           eptr--;
06591           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06592           }
06593         RRETURN(MATCH_NOMATCH);
06594         }
06595       }
06596     /* Control never gets here */
06597 
06598     /* Match a negated single one-byte character. The character we are
06599     checking can be multibyte. */
06600 
06601     case OP_NOT:
06602     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06603     ecode++;
06604     GETCHARINCTEST(c, eptr);
06605     if ((ims & PCRE_CASELESS) != 0)
06606       {
06607       c = md->lcc[c];
06608       if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
06609       }
06610     else
06611       {
06612       if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
06613       }
06614     break;
06615 
06616     /* Match a negated single one-byte character repeatedly. This is almost a
06617     repeat of the code for a repeated single character, but I haven't found a
06618     nice way of commoning these up that doesn't require a test of the
06619     positive/negative option for each character match. Maybe that wouldn't add
06620     very much to the time taken, but character matching *is* what this is all
06621     about... */
06622 
06623     case OP_NOTEXACT:
06624     min = max = GET2(ecode, 1);
06625     ecode += 3;
06626     goto REPEATNOTCHAR;
06627 
06628     case OP_NOTUPTO:
06629     case OP_NOTMINUPTO:
06630     min = 0;
06631     max = GET2(ecode, 1);
06632     minimize = *ecode == OP_NOTMINUPTO;
06633     ecode += 3;
06634     goto REPEATNOTCHAR;
06635 
06636     case OP_NOTSTAR:
06637     case OP_NOTMINSTAR:
06638     case OP_NOTPLUS:
06639     case OP_NOTMINPLUS:
06640     case OP_NOTQUERY:
06641     case OP_NOTMINQUERY:
06642     c = *ecode++ - OP_NOTSTAR;
06643     minimize = (c & 1) != 0;
06644     min = rep_min[c];                 /* Pick up values from tables; */
06645     max = rep_max[c];                 /* zero for max => infinity */
06646     if (max == 0) max = INT_MAX;
06647 
06648     /* Common code for all repeated single-character (less than 255) matches.
06649     We can give up quickly if there are fewer than the minimum number of
06650     characters left in the subject. */
06651 
06652     REPEATNOTCHAR:
06653     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
06654     fc = *ecode++;
06655 
06656     /* The code is duplicated for the caseless and caseful cases, for speed,
06657     since matching characters is likely to be quite common. First, ensure the
06658     minimum number of matches are present. If min = max, continue at the same
06659     level without recursing. Otherwise, if minimizing, keep trying the rest of
06660     the expression and advancing one matching character if failing, up to the
06661     maximum. Alternatively, if maximizing, find the maximum number of
06662     characters and work backwards. */
06663 
06664     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
06665       max, eptr));
06666 
06667     if ((ims & PCRE_CASELESS) != 0)
06668       {
06669       fc = md->lcc[fc];
06670 
06671 
06672       /* Not UTF-8 mode */
06673         {
06674         for (i = 1; i <= min; i++)
06675           if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
06676         }
06677 
06678       if (min == max) continue;
06679 
06680       if (minimize)
06681         {
06682         /* Not UTF-8 mode */
06683           {
06684           for (fi = min;; fi++)
06685             {
06686             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06687             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06688             if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
06689               RRETURN(MATCH_NOMATCH);
06690             }
06691           }
06692         /* Control never gets here */
06693         }
06694 
06695       /* Maximize case */
06696 
06697       else
06698         {
06699         pp = eptr;
06700 
06701         /* Not UTF-8 mode */
06702           {
06703           for (i = min; i < max; i++)
06704             {
06705             if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
06706             eptr++;
06707             }
06708           while (eptr >= pp)
06709             {
06710             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06711             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06712             eptr--;
06713             }
06714           }
06715 
06716         RRETURN(MATCH_NOMATCH);
06717         }
06718       /* Control never gets here */
06719       }
06720 
06721     /* Caseful comparisons */
06722 
06723     else
06724       {
06725       /* Not UTF-8 mode */
06726         {
06727         for (i = 1; i <= min; i++)
06728           if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
06729         }
06730 
06731       if (min == max) continue;
06732 
06733       if (minimize)
06734         {
06735         /* Not UTF-8 mode */
06736           {
06737           for (fi = min;; fi++)
06738             {
06739             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06740             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06741             if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
06742               RRETURN(MATCH_NOMATCH);
06743             }
06744           }
06745         /* Control never gets here */
06746         }
06747 
06748       /* Maximize case */
06749 
06750       else
06751         {
06752         pp = eptr;
06753 
06754         /* Not UTF-8 mode */
06755           {
06756           for (i = min; i < max; i++)
06757             {
06758             if (eptr >= md->end_subject || fc == *eptr) break;
06759             eptr++;
06760             }
06761           while (eptr >= pp)
06762             {
06763             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06764             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06765             eptr--;
06766             }
06767           }
06768 
06769         RRETURN(MATCH_NOMATCH);
06770         }
06771       }
06772     /* Control never gets here */
06773 
06774     /* Match a single character type repeatedly; several different opcodes
06775     share code. This is very similar to the code for single characters, but we
06776     repeat it in the interests of efficiency. */
06777 
06778     case OP_TYPEEXACT:
06779     min = max = GET2(ecode, 1);
06780     minimize = true;
06781     ecode += 3;
06782     goto REPEATTYPE;
06783 
06784     case OP_TYPEUPTO:
06785     case OP_TYPEMINUPTO:
06786     min = 0;
06787     max = GET2(ecode, 1);
06788     minimize = *ecode == OP_TYPEMINUPTO;
06789     ecode += 3;
06790     goto REPEATTYPE;
06791 
06792     case OP_TYPESTAR:
06793     case OP_TYPEMINSTAR:
06794     case OP_TYPEPLUS:
06795     case OP_TYPEMINPLUS:
06796     case OP_TYPEQUERY:
06797     case OP_TYPEMINQUERY:
06798     c = *ecode++ - OP_TYPESTAR;
06799     minimize = (c & 1) != 0;
06800     min = rep_min[c];                 /* Pick up values from tables; */
06801     max = rep_max[c];                 /* zero for max => infinity */
06802     if (max == 0) max = INT_MAX;
06803 
06804     /* Common code for all repeated single character type matches. Note that
06805     in UTF-8 mode, '.' matches a character of any length, but for the other
06806     character types, the valid characters are all one-byte long. */
06807 
06808     REPEATTYPE:
06809     ctype = *ecode++;      /* Code for the character type */
06810 
06811     /* First, ensure the minimum number of matches are present. Use inline
06812     code for maximizing the speed, and do the type test once at the start
06813     (i.e. keep it out of the loop). Also we can test that there are at least
06814     the minimum number of bytes before we start. This isn't as effective in
06815     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
06816     is tidier. */
06817 
06818     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
06819     if (min > 0)
06820       {
06821 
06822       /* Code for the non-UTF-8 case for minimum matching */
06823 
06824       switch(ctype)
06825         {
06826         case OP_ANY:
06827         if ((ims & PCRE_DOTALL) == 0)
06828           {
06829           for (i = 1; i <= min; i++)
06830             if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
06831           }
06832         else eptr += min;
06833         break;
06834 
06835         case OP_ANYBYTE:
06836         eptr += min;
06837         break;
06838 
06839         case OP_NOT_DIGIT:
06840         for (i = 1; i <= min; i++)
06841           if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
06842         break;
06843 
06844         case OP_DIGIT:
06845         for (i = 1; i <= min; i++)
06846           if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
06847         break;
06848 
06849         case OP_NOT_WHITESPACE:
06850         for (i = 1; i <= min; i++)
06851           if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
06852         break;
06853 
06854         case OP_WHITESPACE:
06855         for (i = 1; i <= min; i++)
06856           if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
06857         break;
06858 
06859         case OP_NOT_WORDCHAR:
06860         for (i = 1; i <= min; i++)
06861           if ((md->ctypes[*eptr++] & ctype_word) != 0)
06862             RRETURN(MATCH_NOMATCH);
06863         break;
06864 
06865         case OP_WORDCHAR:
06866         for (i = 1; i <= min; i++)
06867           if ((md->ctypes[*eptr++] & ctype_word) == 0)
06868             RRETURN(MATCH_NOMATCH);
06869         break;
06870         }
06871       }
06872 
06873     /* If min = max, continue at the same level without recursing */
06874 
06875     if (min == max) continue;
06876 
06877     /* If minimizing, we have to test the rest of the pattern before each
06878     subsequent match. Again, separate the UTF-8 case for speed. */
06879 
06880     if (minimize)
06881       {
06882       /* Not UTF-8 mode */
06883         {
06884         for (fi = min;; fi++)
06885           {
06886           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
06887           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
06888           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
06889           c = *eptr++;
06890           switch(ctype)
06891             {
06892             case OP_ANY:
06893             if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
06894             break;
06895 
06896             case OP_ANYBYTE:
06897             break;
06898 
06899             case OP_NOT_DIGIT:
06900             if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
06901             break;
06902 
06903             case OP_DIGIT:
06904             if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
06905             break;
06906 
06907             case OP_NOT_WHITESPACE:
06908             if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
06909             break;
06910 
06911             case OP_WHITESPACE:
06912             if  ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
06913             break;
06914 
06915             case OP_NOT_WORDCHAR:
06916             if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
06917             break;
06918 
06919             case OP_WORDCHAR:
06920             if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
06921             break;
06922             }
06923           }
06924         }
06925       /* Control never gets here */
06926       }
06927 
06928     /* If maximizing it is worth using inline code for speed, doing the type
06929     test once at the start (i.e. keep it out of the loop). Again, keep the
06930     UTF-8 stuff separate. */
06931 
06932     else
06933       {
06934       pp = eptr;
06935 
06936       /* Not UTF-8 mode */
06937         {
06938         switch(ctype)
06939           {
06940           case OP_ANY:
06941           if ((ims & PCRE_DOTALL) == 0)
06942             {
06943             for (i = min; i < max; i++)
06944               {
06945               if (eptr >= md->end_subject || *eptr == NEWLINE) break;
06946               eptr++;
06947               }
06948             break;
06949             }
06950           /* For DOTALL case, fall through and treat as \C */
06951 
06952           case OP_ANYBYTE:
06953           c = max - min;
06954           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
06955           eptr += c;
06956           break;
06957 
06958           case OP_NOT_DIGIT:
06959           for (i = min; i < max; i++)
06960             {
06961             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
06962               break;
06963             eptr++;
06964             }
06965           break;
06966 
06967           case OP_DIGIT:
06968           for (i = min; i < max; i++)
06969             {
06970             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
06971               break;
06972             eptr++;
06973             }
06974           break;
06975 
06976           case OP_NOT_WHITESPACE:
06977           for (i = min; i < max; i++)
06978             {
06979             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
06980               break;
06981             eptr++;
06982             }
06983           break;
06984 
06985           case OP_WHITESPACE:
06986           for (i = min; i < max; i++)
06987             {
06988             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
06989               break;
06990             eptr++;
06991             }
06992           break;
06993 
06994           case OP_NOT_WORDCHAR:
06995           for (i = min; i < max; i++)
06996             {
06997             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
06998               break;
06999             eptr++;
07000             }
07001           break;
07002 
07003           case OP_WORDCHAR:
07004           for (i = min; i < max; i++)
07005             {
07006             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
07007               break;
07008             eptr++;
07009             }
07010           break;
07011           }
07012 
07013         /* eptr is now past the end of the maximum run */
07014 
07015         while (eptr >= pp)
07016           {
07017           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
07018           eptr--;
07019           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
07020           }
07021         }
07022 
07023       /* Get here if we can't make it match with any permitted repetitions */
07024 
07025       RRETURN(MATCH_NOMATCH);
07026       }
07027     /* Control never gets here */
07028 
07029     /* There's been some horrible disaster. Since all codes > OP_BRA are
07030     for capturing brackets, and there shouldn't be any gaps between 0 and
07031     OP_BRA, arrival here can only mean there is something seriously wrong
07032     in the code above or the OP_xxx definitions. */
07033 
07034     default:
07035     DPRINTF(("Unknown opcode %d\n", *ecode));
07036     RRETURN(PCRE_ERROR_UNKNOWN_NODE);
07037     }
07038 
07039   /* Do not stick any code in here without much thought; it is assumed
07040   that "continue" in the code above comes out to here to repeat the main
07041   loop. */
07042 
07043   }             /* End of main loop */
07044 RRETURN(MATCH_NOMATCH);
07045 }
07046 

static bool match_ref ( int  offset,
register const uschar *  eptr,
int  length,
match_data *  md,
unsigned long int  ims 
) [static]

Definition at line 5323 of file pcre.cpp.

References md, and PCRE_CASELESS.

Referenced by match().

05324 { /* Compare length characters at eptr with the subject text starting at md->offset_vector[offset]; return true only if all of them match */
05325 const uschar *p = md->start_subject + md->offset_vector[offset]; /* p walks the referenced text inside the subject */
05326 
05327 /* Always fail if fewer than length characters remain between eptr and md->end_subject */
05328 
05329 if (length > md->end_subject - eptr) return false;
05330 
05331 /* Separate the caseless case for speed: both sides are mapped through md->lcc before being compared */
05332 
05333 if ((ims & PCRE_CASELESS) != 0)
05334   {
05335   while (length-- > 0)
05336     if (md->lcc[*p++] != md->lcc[*eptr++]) return false;
05337   }
05338 else
05339   { while (length-- > 0) if (*p++ != *eptr++) return false; } /* caseful: characters must be identical */
05340 
05341 return true; /* all length characters matched (trivially so when length is not positive) */
05342 }
05343 

pcre* pcre_compile ( const char *  pattern,
int  options,
const char **  errorptr,
int *  erroroffset,
const unsigned char *  tables 
)

Definition at line 4365 of file pcre.cpp.

References compile_data::backref_map, BRASTACK_SIZE, compile_data::cbits, cbits_offset, check_escape(), check_posix_syntax(), compile_regex(), ctype_digit, ctype_meta, ctype_space, ctype_word, compile_data::ctypes, ctypes_offset, digitab, DPRINTF, ERR12, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20, ERR21, ERR22, ERR23, ERR24, ERR26, ERR28, ERR29, ERR32, ERR39, ERR41, ERR42, ERR6, ESC_b, ESC_Q, ESC_REF, EXTRACT_BASIC_MAX, compile_data::fcc, fcc_offset, find_firstassertedchar(), real_pcre::first_byte, is_anchored(), is_counted_repeat(), is_startline(), compile_data::lcc, lcc_offset, LINK_SIZE, real_pcre::magic_number, MAGIC_NUMBER, MAX_PATTERN_SIZE, MAXLIT, real_pcre::name_count, real_pcre::name_entry_size, compile_data::name_entry_size, compile_data::name_table, compile_data::names_found, NEWLINE, OP_BRA, OP_END, real_pcre::options, PCRE_ANCHORED, PCRE_CASELESS, pcre_default_tables, PCRE_DOTALL, PCRE_EXTENDED, PCRE_EXTRA, PCRE_FIRSTSET, PCRE_ICHANGED, PCRE_IMS, PCRE_MULTILINE, PCRE_NO_AUTO_CAPTURE, PCRE_REQCHSET, PCRE_STARTLINE, PCRE_UNGREEDY, PCRE_UTF8, PUBLIC_OPTIONS, read_repeat_counts(), real_pcre::req_byte, REQ_CASELESS, REQ_VARY, compile_data::req_varyopt, real_pcre::size, compile_data::start_code, real_pcre::tables, compile_data::top_backref, real_pcre::top_backref, and real_pcre::top_bracket.

Referenced by CF_HAND(), check_filter(), real_regmatch(), real_regrab(), and regexp_match().

04366 {
04367 real_pcre *re;
04368 int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
04369 int runlength;
04370 int c, firstbyte, reqbyte;
04371 int bracount = 0;
04372 int branch_extra = 0;
04373 int branch_newextra;
04374 int item_count = -1;
04375 int name_count = 0;
04376 int max_name_size = 0;
04377 bool inescq = false;
04378 unsigned int brastackptr = 0;
04379 size_t size;
04380 uschar *code;
04381 const uschar *codestart;
04382 const uschar *ptr;
04383 compile_data compile_block;
04384 int brastack[BRASTACK_SIZE];
04385 uschar bralenstack[BRASTACK_SIZE];
04386 
04387 /* We can't pass back an error message if errorptr is NULL; I guess the best we
04388 can do is just return NULL. */
04389 
04390 if (errorptr == NULL) return NULL;
04391 *errorptr = NULL;
04392 
04393 /* However, we can give a message for this error */
04394 
04395 if (erroroffset == NULL)
04396   {
04397   *errorptr = ERR16;
04398   return NULL;
04399   }
04400 *erroroffset = 0;
04401 
04402 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
04403 
04404 if ((options & PCRE_UTF8) != 0)
04405   {
04406   *errorptr = ERR32;
04407   return NULL;
04408   }
04409 
04410 if ((options & ~PUBLIC_OPTIONS) != 0)
04411   {
04412   *errorptr = ERR17;
04413   return NULL;
04414   }
04415 
04416 /* Set up pointers to the individual character tables */
04417 
04418 if (tables == NULL) tables = pcre_default_tables;
04419 compile_block.lcc = tables + lcc_offset;
04420 compile_block.fcc = tables + fcc_offset;
04421 compile_block.cbits = tables + cbits_offset;
04422 compile_block.ctypes = tables + ctypes_offset;
04423 
04424 /* Maximum back reference and backref bitmap. This is updated for numeric
04425 references during the first pass, but for named references during the actual
04426 compile pass. The bitmap records up to 31 back references to help in deciding
04427 whether (.*) can be treated as anchored or not. */
04428 
04429 compile_block.top_backref = 0;
04430 compile_block.backref_map = 0;
04431 
04432 /* Reflect pattern for debugging output */
04433 
04434 DPRINTF(("------------------------------------------------------------------\n"));
04435 DPRINTF(("%s\n", pattern));
04436 
04437 /* The first thing to do is to make a pass over the pattern to compute the
04438 amount of store required to hold the compiled code. This does not have to be
04439 perfect as long as errors are overestimates. At the same time we can detect any
04440 flag settings right at the start, and extract them. Make an attempt to correct
04441 for any counted white space if an "extended" flag setting appears late in the
04442 pattern. We can't be so clever for #-comments. */
04443 
04444 ptr = (const uschar *)(pattern - 1);
04445 while ((c = *(++ptr)) != 0)
04446   {
04447   int min, max;
04448 #if defined(WIN32) && (_MSC_VER == 1200) && defined(_M_IX86) && !defined(__INTEL_COMPILER)
04449   // The addition of 'volatile' works around a bug in Version 12.0 of
04450   // Microsoft's Visual C/C++ compiler (part of Visual Studio 6.0). Without
04451   // volatile, class_optcount is calculated properly, but the compiler
04452   // clobbers the EAX register before it tests it as class_optcount.
04453   //
04454   // This is not a problem with the Intel Compiler.
04455   //
04456   volatile int class_optcount;
04457 #else
04458   int class_optcount;
04459 #endif
04460   int bracket_length;
04461   int duplength;
04462 
04463   /* If we are inside a \Q...\E sequence, all chars are literal */
04464 
04465   if (inescq) goto NORMAL_CHAR;
04466 
04467   /* Otherwise, first check for ignored whitespace and comments */
04468 
04469   if ((options & PCRE_EXTENDED) != 0)
04470     {
04471     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
04472     if (c == '#')
04473       {
04474       /* The space before the ; is to avoid a warning on a silly compiler
04475       on the Macintosh. */
04476       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
04477       if (c == 0) break;
04478       continue;
04479       }
04480     }
04481 
04482   item_count++;    /* Is zero for the first non-comment item */
04483 
04484   switch(c)
04485     {
04486     /* A backslashed item may be an escaped "normal" character or a
04487     character type. For a "normal" character, put the pointers and
04488     character back so that tests for whitespace etc. in the input
04489     are done correctly. */
04490 
04491     case '\\':
04492       {
04493       const uschar *save_ptr = ptr;
04494       c = check_escape(&ptr, errorptr, bracount, options, false);
04495       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04496       if (c >= 0)
04497         {
04498         ptr = save_ptr;
04499         c = '\\';
04500         goto NORMAL_CHAR;
04501         }
04502       }
04503 
04504     /* If \Q, enter "literal" mode */
04505 
04506     if (-c == ESC_Q)
04507       {
04508       inescq = true;
04509       continue;
04510       }
04511 
04512     /* Other escapes need one byte, and are of length one for repeats */
04513 
04514     length++;
04515 
04516     /* A back reference needs an additional 2 bytes, plus either one or 5
04517     bytes for a repeat. We also need to keep the value of the highest
04518     back reference. */
04519 
04520     if (c <= -ESC_REF)
04521       {
04522       int refnum = -c - ESC_REF;
04523       compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
04524       if (refnum > compile_block.top_backref)
04525         compile_block.top_backref = refnum;
04526       length += 2;   /* For single back reference */
04527       if (ptr[1] == '{' && is_counted_repeat(ptr+2))
04528         {
04529         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
04530         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04531         if ((min == 0 && (max == 1 || max == -1)) ||
04532           (min == 1 && max == -1))
04533             length++;
04534         else length += 5;
04535         if (ptr[1] == '?') ptr++;
04536         }
04537       }
04538     continue;
04539 
04540     case '^':     /* Single-byte metacharacters */
04541     case '.':
04542     case '$':
04543     length++;
04544     continue;
04545 
04546     case '*':            /* These repeats won't be after brackets; */
04547     case '+':            /* those are handled separately */
04548     case '?':
04549     length++;
04550     goto POSESSIVE;      /* A few lines below */
04551 
04552     /* This covers the cases of braced repeats after a single char, metachar,
04553     class, or back reference. */
04554 
04555     case '{':
04556     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
04557     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
04558     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04559 
04560     /* These special cases just insert one extra opcode */
04561 
04562     if ((min == 0 && (max == 1 || max == -1)) ||
04563       (min == 1 && max == -1))
04564         length++;
04565 
04566     /* These cases might insert additional copies of a preceding character. */
04567 
04568     else
04569       {
04570 
04571       /* Not UTF-8 mode: all characters are one byte */
04572         {
04573         if (min != 1)
04574           {
04575           length--;   /* Uncount the original char or metachar */
04576           if (min > 0) length += 4;
04577           }
04578 
04579         length += (max > 0)? 4 : 2;
04580         }
04581       }
04582 
04583     if (ptr[1] == '?') ptr++;      /* Needs no extra length */
04584 
04585     POSESSIVE:                     /* Test for possessive quantifier */
04586     if (ptr[1] == '+')
04587       {
04588       ptr++;
04589       length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
04590       }
04591     continue;
04592 
04593     /* An alternation contains an offset to the next branch or ket. If any ims
04594     options changed in the previous branch(es), and/or if we are in a
04595     lookbehind assertion, extra space will be needed at the start of the
04596     branch. This is handled by branch_extra. */
04597 
04598     case '|':
04599     length += 1 + LINK_SIZE + branch_extra;
04600     continue;
04601 
04602     /* A character class uses 33 characters provided that all the character
04603     values are less than 256. Otherwise, it uses a bit map for low valued
04604     characters, and individual items for others. Don't worry about character
04605     types that aren't allowed in classes - they'll get picked up during the
04606     compile. A character class that contains only one single-byte character
04607     uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
04608     where we can. (In UTF-8 mode we can do this only for chars < 128.) */
04609 
04610     case '[':
04611     class_optcount = 0;
04612 
04613     if (*(++ptr) == '^') ptr++;
04614 
04615     /* Written as a "do" so that an initial ']' is taken as data */
04616 
04617     if (*ptr != 0) do
04618       {
04619       /* Inside \Q...\E everything is literal except \E */
04620 
04621       if (inescq)
04622         {
04623         if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
04624         inescq = false;
04625         ptr += 1;
04626         continue;
04627         }
04628 
04629       /* Outside \Q...\E, check for escapes */
04630 
04631       if (*ptr == '\\')
04632         {
04633         int ch = check_escape(&ptr, errorptr, bracount, options, true);
04634         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04635 
04636         /* \b is backspace inside a class */
04637 
04638         if (-ch == ESC_b) ch = '\b';
04639 
04640         /* \Q enters quoting mode */
04641 
04642         if (-ch == ESC_Q)
04643           {
04644           inescq = true;
04645           continue;
04646           }
04647 
04648         /* Handle escapes that turn into characters */
04649 
04650         if (ch >= 0)
04651           {
04652           class_optcount++;            /* for possible optimization */
04653           }
04654         else class_optcount = 10;      /* \d, \s etc; make sure > 1 */
04655         }
04656 
04657       /* Check the syntax for POSIX stuff. The bits we actually handle are
04658       checked during the real compile phase. */
04659 
04660       else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
04661         {
04662         ptr++;
04663         class_optcount = 10;    /* Make sure > 1 */
04664         }
04665 
04666       /* Anything else just increments the possible optimization count. If
04667       there are wide characters, we are going to have to use an XCLASS. */
04668 
04669       else
04670         {
04671         NON_SPECIAL_CHARACTER:
04672         class_optcount++;
04673 
04674         }
04675       }
04676     while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
04677 
04678     if (*ptr == 0)                          /* Missing terminating ']' */
04679       {
04680       *errorptr = ERR6;
04681       goto PCRE_ERROR_RETURN;
04682       }
04683 
04684     /* We can optimize when there was only one optimizable character. Repeats
04685     for positive and negated single one-byte chars are handled by the general
04686     code. Here, we handle repeats for the class opcodes. */
04687 
04688     if (class_optcount == 1) length += 3; else
04689       {
04690       length += 33;
04691 
04692       /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
04693       we also need extra for wrapping the whole thing in a sub-pattern. */
04694 
04695       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
04696         {
04697         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
04698         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
04699         if ((min == 0 && (max == 1 || max == -1)) ||
04700           (min == 1 && max == -1))
04701             length++;
04702         else length += 5;
04703         if (ptr[1] == '+')
04704           {
04705           ptr++;
04706           length += 2 + 2*LINK_SIZE;
04707           }
04708         else if (ptr[1] == '?') ptr++;
04709         }
04710       }
04711     continue;
04712 
04713     /* Brackets may be genuine groups or special things */
04714 
04715     case '(':
04716     branch_newextra = 0;
04717     bracket_length = 1 + LINK_SIZE;
04718 
04719     /* Handle special forms of bracket, which all start (? */
04720 
04721     if (ptr[1] == '?')
04722       {
04723       int set, unset;
04724       int *optset;
04725 
04726       switch (c = ptr[2])
04727         {
04728         /* Skip over comments entirely */
04729         case '#':
04730         ptr += 3;
04731         while (*ptr != 0 && *ptr != ')') ptr++;
04732         if (*ptr == 0)
04733           {
04734           *errorptr = ERR18;
04735           goto PCRE_ERROR_RETURN;
04736           }
04737         continue;
04738 
04739         /* Non-referencing groups and lookaheads just move the pointer on, and
04740         then behave like a non-special bracket, except that they don't increment
04741         the count of extracting brackets. Ditto for the "once only" bracket,
04742         which is in Perl from version 5.005. */
04743 
04744         case ':':
04745         case '=':
04746         case '!':
04747         case '>':
04748         ptr += 2;
04749         break;
04750 
04751         /* (?R) specifies a recursive call to the regex, which is an extension
04752         to provide the facility which can be obtained by (?p{perl-code}) in
04753         Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
04754 
04755         From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
04756         the appropriate numbered brackets. This includes both recursive and
04757         non-recursive calls. (?R) is now synonymous with (?0). */
04758 
04759         case 'R':
04760         ptr++;
04761 
04762         case '0': case '1': case '2': case '3': case '4':
04763         case '5': case '6': case '7': case '8': case '9':
04764         ptr += 2;
04765         if (c != 'R')
04766           while ((digitab[*(++ptr)] & ctype_digit) != 0);
04767         if (*ptr != ')')
04768           {
04769           *errorptr = ERR29;
04770           goto PCRE_ERROR_RETURN;
04771           }
04772         length += 1 + LINK_SIZE;
04773 
04774         /* If this item is quantified, it will get wrapped inside brackets so
04775         as to use the code for quantified brackets. We jump down and use the
04776         code that handles this for real brackets. */
04777 
04778         if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
04779           {
04780           length += 2 + 2 * LINK_SIZE;       /* to make bracketed */
04781           duplength = 5 + 3 * LINK_SIZE;
04782           goto HANDLE_QUANTIFIED_BRACKETS;
04783           }
04784         continue;
04785 
04786         /* (?C) is an extension which provides "callout" - to provide a bit of
04787         the functionality of the Perl (?{...}) feature. An optional number may
04788         follow (default is zero). */
04789 
04790         case 'C':
04791         ptr += 2;
04792         while ((digitab[*(++ptr)] & ctype_digit) != 0);
04793         if (*ptr != ')')
04794           {
04795           *errorptr = ERR39;
04796           goto PCRE_ERROR_RETURN;
04797           }
04798         length += 2;
04799         continue;
04800 
04801         /* Named subpatterns are an extension copied from Python */
04802 
04803         case 'P':
04804         ptr += 3;
04805         if (*ptr == '<')
04806           {
04807           const uschar *p;    /* Don't amalgamate; some compilers */
04808           p = ++ptr;          /* grumble at autoincrement in declaration */
04809           while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
04810           if (*ptr != '>')
04811             {
04812             *errorptr = ERR42;
04813             goto PCRE_ERROR_RETURN;
04814             }
04815           name_count++;
04816           if (ptr - p > max_name_size) max_name_size = (ptr - p);
04817           break;
04818           }
04819 
04820         if (*ptr == '=' || *ptr == '>')
04821           {
04822           while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
04823           if (*ptr != ')')
04824             {
04825             *errorptr = ERR42;
04826             goto PCRE_ERROR_RETURN;
04827             }
04828           break;
04829           }
04830 
04831         /* Unknown character after (?P */
04832 
04833         *errorptr = ERR41;
04834         goto PCRE_ERROR_RETURN;
04835 
04836         /* Lookbehinds are in Perl from version 5.005 */
04837 
04838         case '<':
04839         ptr += 3;
04840         if (*ptr == '=' || *ptr == '!')
04841           {
04842           branch_newextra = 1 + LINK_SIZE;
04843           length += 1 + LINK_SIZE;         /* For the first branch */
04844           break;
04845           }
04846         *errorptr = ERR24;
04847         goto PCRE_ERROR_RETURN;
04848 
04849         /* Conditionals are in Perl from version 5.005. The bracket must either
04850         be followed by a number (for bracket reference) or by an assertion
04851         group, or (a PCRE extension) by 'R' for a recursion test. */
04852 
04853         case '(':
04854         if (ptr[3] == 'R' && ptr[4] == ')')
04855           {
04856           ptr += 4;
04857           length += 3;
04858           }
04859         else if ((digitab[ptr[3]] & ctype_digit) != 0)
04860           {
04861           ptr += 4;
04862           length += 3;
04863           while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
04864           if (*ptr != ')')
04865             {
04866             *errorptr = ERR26;
04867             goto PCRE_ERROR_RETURN;
04868             }
04869           }
04870         else   /* An assertion must follow */
04871           {
04872           ptr++;   /* Can treat like ':' as far as spacing is concerned */
04873           if (ptr[2] != '?' ||
04874              (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
04875             {
04876             ptr += 2;    /* To get right offset in message */
04877             *errorptr = ERR28;
04878             goto PCRE_ERROR_RETURN;
04879             }
04880           }
04881         break;
04882 
04883         /* Else loop checking valid options until ) is met. Anything else is an
04884         error. If we are without any brackets, i.e. at top level, the settings
04885         act as if specified in the options, so massage the options immediately.
04886         This is for backward compatibility with Perl 5.004. */
04887 
04888         default:
04889         set = unset = 0;
04890         optset = &set;
04891         ptr += 2;
04892 
04893         for (;; ptr++)
04894           {
04895           c = *ptr;
04896           switch (c)
04897             {
04898             case 'i':
04899             *optset |= PCRE_CASELESS;
04900             continue;
04901 
04902             case 'm':
04903             *optset |= PCRE_MULTILINE;
04904             continue;
04905 
04906             case 's':
04907             *optset |= PCRE_DOTALL;
04908             continue;
04909 
04910             case 'x':
04911             *optset |= PCRE_EXTENDED;
04912             continue;
04913 
04914             case 'X':
04915             *optset |= PCRE_EXTRA;
04916             continue;
04917 
04918             case 'U':
04919             *optset |= PCRE_UNGREEDY;
04920             continue;
04921 
04922             case '-':
04923             optset = &unset;
04924             continue;
04925 
04926             /* A termination by ')' indicates an options-setting-only item; if
04927             this is at the very start of the pattern (indicated by item_count
04928             being zero), we use it to set the global options. This is helpful
04929             when analyzing the pattern for first characters, etc. Otherwise
04930             nothing is done here and it is handled during the compiling
04931             process.
04932 
04933             [Historical note: Up to Perl 5.8, options settings at top level
04934             were always global settings, wherever they appeared in the pattern.
04935             That is, they were equivalent to an external setting. From 5.8
04936             onwards, they apply only to what follows (which is what you might
04937             expect).] */
04938 
04939             case ')':
04940             if (item_count == 0)
04941               {
04942               options = (options | set) & (~unset);
04943               set = unset = 0;     /* To save length */
04944               item_count--;        /* To allow for several */
04945               }
04946 
04947             /* Fall through */
04948 
04949             /* A termination by ':' indicates the start of a nested group with
04950             the given options set. This is again handled at compile time, but
04951             we must allow for compiled space if any of the ims options are
04952             set. We also have to allow for resetting space at the end of
04953             the group, which is why 4 is added to the length and not just 2.
04954             If there are several changes of options within the same group, this
04955             will lead to an over-estimate on the length, but this shouldn't
04956             matter very much. We also have to allow for resetting options at
04957             the start of any alternations, which we do by setting
04958             branch_newextra to 2. Finally, we record whether the case-dependent
04959             flag ever changes within the regex. This is used by the "required
04960             character" code. */
04961 
04962             case ':':
04963             if (((set|unset) & PCRE_IMS) != 0)
04964               {
04965               length += 4;
04966               branch_newextra = 2;
04967               if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
04968               }
04969             goto END_OPTIONS;
04970 
04971             /* Unrecognized option character */
04972 
04973             default:
04974             *errorptr = ERR12;
04975             goto PCRE_ERROR_RETURN;
04976             }
04977           }
04978 
04979         /* If we hit a closing bracket, that's it - this is a freestanding
04980         option-setting. We need to ensure that branch_extra is updated if
04981         necessary. The only values branch_newextra can have here are 0 or 2.
04982         If the value is 2, then branch_extra must either be 2 or 5, depending
04983         on whether this is a lookbehind group or not. */
04984 
04985         END_OPTIONS:
04986         if (c == ')')
04987           {
04988           if (branch_newextra == 2 &&
04989               (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
04990             branch_extra += branch_newextra;
04991           continue;
04992           }
04993 
04994         /* If options were terminated by ':' control comes here. Fall through
04995         to handle the group below. */
04996         }
04997       }
04998 
04999     /* Extracting brackets must be counted so we can process escapes in a
05000     Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
05001     need an additional 3 bytes of store per extracting bracket. However, if
05002     PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
05003     must leave the count alone (it will always be zero). */
05004 
05005     else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
05006       {
05007       bracount++;
05008       if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
05009       }
05010 
05011     /* Save length for computing whole length at end if there's a repeat that
05012     requires duplication of the group. Also save the current value of
05013     branch_extra, and start the new group with the new value. If non-zero, this
05014     will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
05015 
05016     if (brastackptr >= sizeof(brastack)/sizeof(int))
05017       {
05018       *errorptr = ERR19;
05019       goto PCRE_ERROR_RETURN;
05020       }
05021 
05022     bralenstack[brastackptr] = branch_extra;
05023     branch_extra = branch_newextra;
05024 
05025     brastack[brastackptr++] = length;
05026     length += bracket_length;
05027     continue;
05028 
05029     /* Handle ket. Look for subsequent max/min; for certain sets of values we
05030     have to replicate this bracket up to that many times. If brastackptr is
05031     0 this is an unmatched bracket which will generate an error, but take care
05032     not to try to access brastack[-1] when computing the length and restoring
05033     the branch_extra value. */
05034 
05035     case ')':
05036     length += 1 + LINK_SIZE;
05037     if (brastackptr > 0)
05038       {
05039       duplength = length - brastack[--brastackptr];
05040       branch_extra = bralenstack[brastackptr];
05041       }
05042     else duplength = 0;
05043 
05044     /* The following code is also used when a recursion such as (?3) is
05045     followed by a quantifier, because in that case, it has to be wrapped inside
05046     brackets so that the quantifier works. The value of duplength must be
05047     set before arrival. */
05048 
05049     HANDLE_QUANTIFIED_BRACKETS:
05050 
05051     /* Leave ptr at the final char; for read_repeat_counts this happens
05052     automatically; for the others we need an increment. */
05053 
05054     if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
05055       {
05056       ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
05057       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
05058       }
05059     else if (c == '*') { min = 0; max = -1; ptr++; }
05060     else if (c == '+') { min = 1; max = -1; ptr++; }
05061     else if (c == '?') { min = 0; max = 1;  ptr++; }
05062     else { min = 1; max = 1; }
05063 
05064     /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
05065     group, and if the maximum is greater than zero, we have to replicate
05066     maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
05067     bracket set. */
05068 
05069     if (min == 0)
05070       {
05071       length++;
05072       if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
05073       }
05074 
05075     /* When the minimum is greater than zero, we have to replicate up to
05076     minval-1 times, with no additions required in the copies. Then, if there
05077     is a limited maximum we have to replicate up to maxval-1 times allowing
05078     for a BRAZERO item before each optional copy and nesting brackets for all
05079     but one of the optional copies. */
05080 
05081     else
05082       {
05083       length += (min - 1) * duplength;
05084       if (max > min)   /* Need this test as max=-1 means no limit */
05085         length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
05086           - (2 + 2*LINK_SIZE);
05087       }
05088 
05089     /* Allow space for once brackets for "possessive quantifier" */
05090 
05091     if (ptr[1] == '+')
05092       {
05093       ptr++;
05094       length += 2 + 2*LINK_SIZE;
05095       }
05096     continue;
05097 
05098     /* Non-special character. For a run of such characters the length required
05099     is the number of characters + 2, except that the maximum run length is
05100     MAXLIT. We won't get a skipped space or a non-data escape or the start of a
05101     # comment as the first character, so the length can't be zero. */
05102 
05103     NORMAL_CHAR:
05104     default:
05105     length += 2;
05106     runlength = 0;
05107     do
05108       {
05109 
05110       /* If in a \Q...\E sequence, check for end; otherwise it's a literal */
05111       if (inescq)
05112         {
05113         if (c == '\\' && ptr[1] == 'E')
05114           {
05115           inescq = false;
05116           ptr++;
05117           }
05118         else runlength++;
05119         continue;
05120         }
05121 
05122       /* Skip whitespace and comments for /x */
05123 
05124       if ((options & PCRE_EXTENDED) != 0)
05125         {
05126         if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
05127         if (c == '#')
05128           {
05129           /* The space before the ; is to avoid a warning on a silly compiler
05130           on the Macintosh. */
05131           while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
05132           continue;
05133           }
05134         }
05135 
05136       /* Backslash may introduce a data char or a metacharacter; stop the
05137       string before the latter. */
05138 
05139       if (c == '\\')
05140         {
05141         const uschar *saveptr = ptr;
05142         c = check_escape(&ptr, errorptr, bracount, options, false);
05143         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
05144         if (c < 0) { ptr = saveptr; break; }
05145 
05146         /* In UTF-8 mode, add on the number of additional bytes needed to
05147         encode this character, and save the total length in case this is a
05148         final char that is repeated. */
05149 
05150         }
05151 
05152       /* Ordinary character or single-char escape */
05153 
05154       runlength++;
05155       }
05156 
05157     /* This "while" is the end of the "do" above. */
05158 
05159     while (runlength < MAXLIT &&
05160       (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
05161 
05162     /* If we hit a meta-character, back off to point to it */
05163 
05164     if (runlength < MAXLIT) ptr--;
05165 
05166     /* If the last char in the string is a UTF-8 multibyte character, we must
05167     set lastcharlength correctly. If it was specified as an escape, this will
05168     already have been done above. However, we also have to support in-line
05169     UTF-8 characters, so check backwards from where we are. */
05170 
05171 
05172     length += runlength;
05173     continue;
05174     }
05175   }
05176 
05177 length += 2 + LINK_SIZE;    /* For final KET and END */
05178 
05179 if (length > MAX_PATTERN_SIZE)
05180   {
05181   *errorptr = ERR20;
05182   return NULL;
05183   }
05184 
05185 /* Compute the size of data block needed and get it, either from malloc or
05186 externally provided function. */
05187 
05188 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
05189 re = static_cast<real_pcre *>(malloc(size));
05190 
05191 if (re == NULL)
05192   {
05193   *errorptr = ERR21;
05194   return NULL;
05195   }
05196 
05197 /* Put in the magic number, and save the size, options, and table pointer */
05198 
05199 re->magic_number = MAGIC_NUMBER;
05200 re->size = size;
05201 re->options = options;
05202 re->tables = tables;
05203 re->name_entry_size = max_name_size + 3;
05204 re->name_count = name_count;
05205 
05206 /* The starting points of the name/number translation table and of the code are
05207 passed around in the compile data block. */
05208 
05209 compile_block.names_found = 0;
05210 compile_block.name_entry_size = max_name_size + 3;
05211 compile_block.name_table = (uschar *)re + sizeof(real_pcre);
05212 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
05213 compile_block.start_code = codestart;
05214 compile_block.req_varyopt = 0;
05215 
05216 /* Set up a starting, non-extracting bracket, then compile the expression. On
05217 error, *errorptr will be set non-NULL, so we don't need to look at the result
05218 of the function here. */
05219 
05220 ptr = (const uschar *)pattern;
05221 code = (uschar *)codestart;
05222 *code = OP_BRA;
05223 bracount = 0;
05224 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
05225   errorptr, false, 0, &firstbyte, &reqbyte, NULL, &compile_block);
05226 re->top_bracket = bracount;
05227 re->top_backref = compile_block.top_backref;
05228 
05229 /* If not reached end of pattern on success, there's an excess bracket. */
05230 
05231 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
05232 
05233 /* Fill in the terminating state and check for disastrous overflow, but
05234 if debugging, leave the test till after things are printed out. */
05235 
05236 *code++ = OP_END;
05237 
05238 if (code - codestart > length) *errorptr = ERR23;
05239 
05240 /* Give an error if there's back reference to a non-existent capturing
05241 subpattern. */
05242 
05243 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
05244 
05245 /* Failed to compile, or error while post-processing */
05246 
05247 if (*errorptr != NULL)
05248   {
05249   free(re);
05250   PCRE_ERROR_RETURN:
05251   *erroroffset = ptr - (const uschar *)pattern;
05252   return NULL;
05253   }
05254 
05255 /* If the anchored option was not passed, set the flag if we can determine that
05256 the pattern is anchored by virtue of ^ characters or \A or anything else (such
05257 as starting with .* when DOTALL is set).
05258 
05259 Otherwise, if we know what the first character has to be, save it, because that
05260 speeds up unanchored matches no end. If not, see if we can set the
05261 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
05262 start with ^. and also when all branches start with .* for non-DOTALL matches.
05263 */
05264 
05265 if ((options & PCRE_ANCHORED) == 0)
05266   {
05267   int temp_options = options;
05268   if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
05269     re->options |= PCRE_ANCHORED;
05270   else
05271     {
05272     if (firstbyte < 0)
05273       firstbyte = find_firstassertedchar(codestart, &temp_options, false);
05274     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
05275       {
05276       int ch = firstbyte & 255;
05277       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
05278          compile_block.fcc[ch] == ch)? ch : firstbyte;
05279       re->options |= PCRE_FIRSTSET;
05280       }
05281     else if (is_startline(codestart, 0, compile_block.backref_map))
05282       re->options |= PCRE_STARTLINE;
05283     }
05284   }
05285 
05286 /* For an anchored pattern, we use the "required byte" only if it follows a
05287 variable length item in the regex. Remove the caseless flag for non-caseable
05288 chars. */
05289 
05290 if (reqbyte >= 0 &&
05291      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
05292   {
05293   int ch = reqbyte & 255;
05294   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
05295     compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
05296   re->options |= PCRE_REQCHSET;
05297   }
05298 
05299 return (pcre *)re;
05300 }
05301 

int pcre_copy_substring ( const char *  subject,
int *  ovector,
int  stringcount,
int  stringnumber,
char *  buffer,
int  size 
)

Definition at line 811 of file pcre.cpp.

References PCRE_ERROR_NOMEMORY, and PCRE_ERROR_NOSUBSTRING.

Referenced by real_regmatch(), and regexp_match().

00812 {
00813 int yield;
00814 if (stringnumber < 0 || stringnumber >= stringcount)
00815   return PCRE_ERROR_NOSUBSTRING;
00816 stringnumber *= 2;
00817 yield = ovector[stringnumber+1] - ovector[stringnumber];
00818 if (size < yield + 1) return PCRE_ERROR_NOMEMORY;
00819 memcpy(buffer, subject + ovector[stringnumber], yield);
00820 buffer[yield] = 0;
00821 return yield;
00822 }
00823 

int pcre_exec ( const pcre *  external_re,
const pcre_extra *  extra_data,
const char *  subject,
int  length,
int  start_offset,
int  options,
int *  offsets,
int  offsetcount 
)

Definition at line 7090 of file pcre.cpp.

References match_data::callout_data, pcre_extra::callout_data, match_data::capture_last, match_data::ctypes, ctypes_offset, DPRINTF, match_data::end_match_ptr, match_data::end_offset_top, match_data::end_subject, match_data::endonly, fcc_offset, real_pcre::first_byte, pcre_extra::flags, match_data::lcc, lcc_offset, real_pcre::magic_number, MAGIC_NUMBER, match(), match_data::match_call_count, match_isgroup, match_data::match_limit, MATCH_LIMIT, pcre_extra::match_limit, MATCH_MATCH, MATCH_NOMATCH, real_pcre::name_count, real_pcre::name_entry_size, NEWLINE, match_data::notbol, match_data::notempty, match_data::noteol, match_data::offset_end, match_data::offset_max, match_data::offset_overflow, match_data::offset_vector, real_pcre::options, pcre_study_data::options, PCRE_ANCHORED, PCRE_CASELESS, PCRE_DOLLAR_ENDONLY, PCRE_DOTALL, PCRE_ERROR_BADMAGIC, PCRE_ERROR_BADOPTION, PCRE_ERROR_NOMATCH, PCRE_ERROR_NOMEMORY, PCRE_ERROR_NULL, PCRE_EXTRA_CALLOUT_DATA, PCRE_EXTRA_MATCH_LIMIT, PCRE_EXTRA_STUDY_DATA, PCRE_FIRSTSET, PCRE_MULTILINE, PCRE_NOTBOL, PCRE_NOTEMPTY, PCRE_NOTEOL, PCRE_REQCHSET, PCRE_STARTLINE, PCRE_STUDY_MAPPED, PCRE_UTF8, PUBLIC_EXEC_OPTIONS, match_data::recursive, real_pcre::req_byte, REQ_BYTE_MAX, REQ_CASELESS, pcre_study_data::start_bits, match_data::start_code, match_data::start_match, match_data::start_offset, match_data::start_subject, pcre_extra::study_data, real_pcre::tables, real_pcre::top_backref, real_pcre::top_bracket, and match_data::utf8.

Referenced by check_filter(), FUNCTION(), real_regmatch(), real_regrab(), and regexp_match().

07092 {
07093 int rc, resetcount, ocount;
07094 int first_byte = -1;
07095 int req_byte = -1;
07096 int req_byte2 = -1;
07097 unsigned long int ims = 0;
07098 bool using_temporary_offsets = false;
07099 bool anchored;
07100 bool startline;
07101 bool first_byte_caseless = false;
07102 bool req_byte_caseless = false;
07103 match_data match_block;
07104 const uschar *start_bits = NULL;
07105 const uschar *start_match = (const uschar *)subject + start_offset;
07106 const uschar *end_subject;
07107 const uschar *req_byte_ptr = start_match - 1;
07108 const pcre_study_data *study;
07109 const real_pcre *re = (const real_pcre *)external_re;
07110 
07111 /* Plausibility checks */
07112 
07113 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
07114 if (re == NULL || subject == NULL ||
07115    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
07116 
07117 /* Fish out the optional data from the extra_data structure, first setting
07118 the default values. */
07119 
07120 study = NULL;
07121 match_block.match_limit = MATCH_LIMIT;
07122 match_block.callout_data = NULL;
07123 
07124 if (extra_data != NULL)
07125   {
07126   register unsigned int flags = extra_data->flags;
07127   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
07128     study = (const pcre_study_data *)extra_data->study_data;
07129   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
07130     match_block.match_limit = extra_data->match_limit;
07131   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
07132     match_block.callout_data = extra_data->callout_data;
07133   }
07134 
07135 /* Now we have re supposedly pointing to the regex */
07136 
07137 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
07138 
07139 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
07140 startline = (re->options & PCRE_STARTLINE) != 0;
07141 
07142 match_block.start_code =
07143   (const uschar *)re + sizeof(real_pcre) + re->name_count * re->name_entry_size;
07144 match_block.start_subject = (const uschar *)subject;
07145 match_block.start_offset = start_offset;
07146 match_block.end_subject = match_block.start_subject + length;
07147 end_subject = match_block.end_subject;
07148 
07149 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
07150 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
07151 
07152 match_block.notbol = (options & PCRE_NOTBOL) != 0;
07153 match_block.noteol = (options & PCRE_NOTEOL) != 0;
07154 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
07155 
07156 match_block.recursive = NULL;                   /* No recursion at top level */
07157 
07158 match_block.lcc = re->tables + lcc_offset;
07159 match_block.ctypes = re->tables + ctypes_offset;
07160 
07161 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
07162 back the character offset. */
07163 
07164 /* The ims options can vary during the matching as a result of the presence
07165 of (?ims) items in the pattern. They are kept in a local variable so that
07166 restoring at the exit of a group is easy. */
07167 
07168 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
07169 
07170 /* If the expression has got more back references than the offsets supplied can
07171 hold, we get a temporary bit of working store to use during the matching.
07172 Otherwise, we can use the vector supplied, rounding down its size to a multiple
07173 of 3. */
07174 
07175 ocount = offsetcount - (offsetcount % 3);
07176 
07177 if (re->top_backref > 0 && re->top_backref >= ocount/3)
07178   {
07179   ocount = re->top_backref * 3 + 3;
07180   match_block.offset_vector = static_cast<int *>(malloc(ocount * sizeof(int)));
07181   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
07182   using_temporary_offsets = true;
07183   DPRINTF(("Got memory to hold back references\n"));
07184   }
07185 else match_block.offset_vector = offsets;
07186 
07187 match_block.offset_end = ocount;
07188 match_block.offset_max = (2*ocount)/3;
07189 match_block.offset_overflow = false;
07190 match_block.capture_last = -1;
07191 
07192 /* Compute the minimum number of offsets that we need to reset each time. Doing
07193 this makes a huge difference to execution time when there aren't many brackets
07194 in the pattern. */
07195 
07196 resetcount = 2 + re->top_bracket * 2;
07197 if (resetcount > offsetcount) resetcount = ocount;
07198 
07199 /* Reset the working variable associated with each extraction. These should
07200 never be used unless previously set, but they get saved and restored, and so we
07201 initialize them to avoid reading uninitialized locations. */
07202 
07203 if (match_block.offset_vector != NULL)
07204   {
07205   register int *iptr = match_block.offset_vector + ocount;
07206   register int *iend = iptr - resetcount/2 + 1;
07207   while (--iptr >= iend) *iptr = -1;
07208   }
07209 
07210 /* Set up the first character to match, if available. The first_byte value is
07211 never set for an anchored regular expression, but the anchoring may be forced
07212 at run time, so we have to test for anchoring. The first char may be unset for
07213 an unanchored pattern, of course. If there's no first char and the pattern was
07214 studied, there may be a bitmap of possible first characters. */
07215 
07216 if (!anchored)
07217   {
07218   if ((re->options & PCRE_FIRSTSET) != 0)
07219     {
07220     first_byte = re->first_byte & 255;
07221     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == true)
07222       first_byte = match_block.lcc[first_byte];
07223     }
07224   else
07225     if (!startline && study != NULL &&
07226       (study->options & PCRE_STUDY_MAPPED) != 0)
07227         start_bits = study->start_bits;
07228   }
07229 
07230 /* For anchored or unanchored matches, there may be a "last known required
07231 character" set. */
07232 
07233 if ((re->options & PCRE_REQCHSET) != 0)
07234   {
07235   req_byte = re->req_byte & 255;
07236   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
07237   req_byte2 = (re->tables + fcc_offset)[req_byte];  /* case flipped */
07238   }
07239 
07240 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
07241 the loop runs just once. */
07242 
07243 do
07244   {
07245   register int *iptr = match_block.offset_vector;
07246   register int *iend = iptr + resetcount;
07247 
07248   /* Reset the maximum number of extractions we might see. */
07249 
07250   while (iptr < iend) *iptr++ = -1;
07251 
07252   /* Advance to a unique first char if possible */
07253 
07254   if (first_byte >= 0)
07255     {
07256     if (first_byte_caseless)
07257       while (start_match < end_subject &&
07258              match_block.lcc[*start_match] != first_byte)
07259         start_match++;
07260     else
07261       while (start_match < end_subject && *start_match != first_byte)
07262         start_match++;
07263     }
07264 
07265   /* Or to just after \n for a multiline match if possible */
07266 
07267   else if (startline)
07268     {
07269     if (start_match > match_block.start_subject + start_offset)
07270       {
07271       while (start_match < end_subject && start_match[-1] != NEWLINE)
07272         start_match++;
07273       }
07274     }
07275 
07276   /* Or to a non-unique first char after study */
07277 
07278   else if (start_bits != NULL)
07279     {
07280     while (start_match < end_subject)
07281       {
07282       register int c = *start_match;
07283       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
07284       }
07285     }
07286 
07287   /* If req_byte is set, we know that that character must appear in the subject
07288   for the match to succeed. If the first character is set, req_byte must be
07289   later in the subject; otherwise the test starts at the match point. This
07290   optimization can save a huge amount of backtracking in patterns with nested
07291   unlimited repeats that aren't going to match. Writing separate code for
07292   cased/caseless versions makes it go faster, as does using an autoincrement
07293   and backing off on a match.
07294 
07295   HOWEVER: when the subject string is very, very long, searching to its end can
07296   take a long time, and give bad performance on quite ordinary patterns. This
07297   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
07298   don't do this when the string is sufficiently long. */
07299 
07300   if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
07301     {
07302     register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
07303 
07304     /* We don't need to repeat the search if we haven't yet reached the
07305     place we found it at last time. */
07306 
07307     if (p > req_byte_ptr)
07308       {
07309       if (req_byte_caseless)
07310         {
07311         while (p < end_subject)
07312           {
07313           register int pp = *p++;
07314           if (pp == req_byte || pp == req_byte2) { p--; break; }
07315           }
07316         }
07317       else
07318         {
07319         while (p < end_subject)
07320           {
07321           if (*p++ == req_byte) { p--; break; }
07322           }
07323         }
07324 
07325       /* If we can't find the required character, break the matching loop */
07326 
07327       if (p >= end_subject) break;
07328 
07329       /* If we have found the required character, save the point where we
07330       found it, so that we don't search again next time round the loop if
07331       the start hasn't passed this character yet. */
07332 
07333       req_byte_ptr = p;
07334       }
07335     }
07336 
07337   /* When a match occurs, substrings will be set for all internal extractions;
07338   we just need to set up the whole thing as substring 0 before returning. If
07339   there were too many extractions, set the return code to zero. In the case
07340   where we had to get some local store to hold offsets for backreferences, copy
07341   those back references that we can. In this case there need not be overflow
07342   if certain parts of the pattern were not used. */
07343 
07344   match_block.start_match = start_match;
07345   match_block.match_call_count = 0;
07346 
07347   rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
07348     match_isgroup);
07349 
07350   if (rc == MATCH_NOMATCH)
07351     {
07352     start_match++;
07353     continue;
07354     }
07355 
07356   if (rc != MATCH_MATCH)
07357     {
07358     DPRINTF((">>>> error: returning %d\n", rc));
07359     return rc;
07360     }
07361 
07362   /* We have a match! Copy the offset information from temporary store if
07363   necessary */
07364 
07365   if (using_temporary_offsets)
07366     {
07367     if (offsetcount >= 4)
07368       {
07369       memcpy(offsets + 2, match_block.offset_vector + 2,
07370         (offsetcount - 2) * sizeof(int));
07371       DPRINTF(("Copied offsets from temporary memory\n"));
07372       }
07373     if (match_block.end_offset_top > offsetcount)
07374       match_block.offset_overflow = true;
07375 
07376     DPRINTF(("Freeing temporary memory\n"));
07377     free(match_block.offset_vector);
07378     }
07379 
07380   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
07381 
07382   if (offsetcount < 2) rc = 0; else
07383     {
07384     offsets[0] = start_match - match_block.start_subject;
07385     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
07386     }
07387 
07388   DPRINTF((">>>> returning %d\n", rc));
07389   return rc;
07390   }
07391 
07392 /* This "while" is the end of the "do" above */
07393 
07394 while (!anchored && start_match <= end_subject);
07395 
07396 if (using_temporary_offsets)
07397   {
07398   DPRINTF(("Freeing temporary memory\n"));
07399   free(match_block.offset_vector);
07400   }
07401 
07402 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
07403 
07404 return PCRE_ERROR_NOMATCH;
07405 }
07406 

const unsigned char* pcre_maketables ( void   ) 

Definition at line 842 of file pcre.cpp.

References cbit_cntrl, cbit_digit, cbit_graph, cbit_length, cbit_lower, cbit_print, cbit_punct, cbit_space, cbit_upper, cbit_word, cbit_xdigit, ctype_digit, ctype_letter, ctype_meta, ctype_space, ctype_word, ctype_xdigit, and tables_length.

00842 {
00843 unsigned char *yield, *p;
00844 int i;
00845 
00846 yield = static_cast<unsigned char*>(malloc(tables_length));
00847 
00848 if (yield == NULL) return NULL;
00849 p = yield;
00850 
00851 /* First comes the lower casing table */
00852 
00853 for (i = 0; i < 256; i++) *p++ = tolower(i);
00854 
00855 /* Next the case-flipping table */
00856 
00857 for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i);
00858 
00859 /* Then the character class tables. Don't try to be clever and save effort
00860 on exclusive ones - in some locales things may be different. Note that the
00861 table for "space" includes everything "isspace" gives, including VT in the
00862 default locale. This makes it work for the POSIX class [:space:]. */
00863 
00864 memset(p, 0, cbit_length);
00865 for (i = 0; i < 256; i++)
00866   {
00867   if (isdigit(i))
00868     {
00869     p[cbit_digit  + i/8] |= 1 << (i&7);
00870     p[cbit_word   + i/8] |= 1 << (i&7);
00871     }
00872   if (isupper(i))
00873     {
00874     p[cbit_upper  + i/8] |= 1 << (i&7);
00875     p[cbit_word   + i/8] |= 1 << (i&7);
00876     }
00877   if (islower(i))
00878     {
00879     p[cbit_lower  + i/8] |= 1 << (i&7);
00880     p[cbit_word   + i/8] |= 1 << (i&7);
00881     }
00882   if (i == '_')   p[cbit_word   + i/8] |= 1 << (i&7);
00883   if (isspace(i)) p[cbit_space  + i/8] |= 1 << (i&7);
00884   if (isxdigit(i))p[cbit_xdigit + i/8] |= 1 << (i&7);
00885   if (isgraph(i)) p[cbit_graph  + i/8] |= 1 << (i&7);
00886   if (isprint(i)) p[cbit_print  + i/8] |= 1 << (i&7);
00887   if (ispunct(i)) p[cbit_punct  + i/8] |= 1 << (i&7);
00888   if (iscntrl(i)) p[cbit_cntrl  + i/8] |= 1 << (i&7);
00889   }
00890 p += cbit_length;
00891 
00892 /* Finally, the character type table. In this, we exclude VT from the white
00893 space chars, because Perl doesn't recognize it as such for \s and for comments
00894 within regexes. */
00895 
00896 for (i = 0; i < 256; i++)
00897   {
00898   int x = 0;
00899   if (i != 0x0b && isspace(i)) x += ctype_space;
00900   if (isalpha(i)) x += ctype_letter;
00901   if (isdigit(i)) x += ctype_digit;
00902   if (isxdigit(i)) x += ctype_xdigit;
00903   if (isalnum(i) || i == '_') x += ctype_word;
00904   if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta;
00905   *p++ = x;
00906   }
00907 
00908 return yield;
00909 }
00910 

pcre_extra* pcre_study ( const pcre *  external_re,
int  options,
const char **  errorptr 
)

Definition at line 1233 of file pcre.cpp.

References compile_data::cbits, cbits_offset, compile_data::ctypes, ctypes_offset, compile_data::fcc, fcc_offset, pcre_extra::flags, compile_data::lcc, lcc_offset, real_pcre::magic_number, MAGIC_NUMBER, real_pcre::name_count, real_pcre::name_entry_size, real_pcre::options, pcre_study_data::options, PCRE_ANCHORED, PCRE_CASELESS, PCRE_EXTRA_STUDY_DATA, PCRE_FIRSTSET, PCRE_STARTLINE, PCRE_STUDY_MAPPED, PCRE_UTF8, PUBLIC_STUDY_OPTIONS, set_start_bits(), pcre_study_data::size, pcre_study_data::start_bits, pcre_extra::study_data, and real_pcre::tables.

Referenced by CF_HAND(), and real_regrab().

01233 {
01234 uschar start_bits[32];
01235 pcre_extra *extra;
01236 pcre_study_data *study;
01237 const real_pcre *re = (const real_pcre *)external_re;
01238 uschar *code = (uschar *)re + sizeof(real_pcre) +
01239   (re->name_count * re->name_entry_size);
01240 compile_data compile_block;
01241 
01242 *errorptr = NULL;
01243 
01244 if (re == NULL || re->magic_number != MAGIC_NUMBER)
01245   {
01246   *errorptr = "argument is not a compiled regular expression";
01247   return NULL;
01248   }
01249 
01250 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
01251   {
01252   *errorptr = "unknown or incorrect option bit(s) set";
01253   return NULL;
01254   }
01255 
01256 /* For an anchored pattern, or an unanchored pattern that has a first char, or
01257 a multiline pattern that matches only at "line starts", no further processing
01258 at present. */
01259 
01260 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
01261   return NULL;
01262 
01263 /* Set the character tables in the block which is passed around */
01264 
01265 compile_block.lcc = re->tables + lcc_offset;
01266 compile_block.fcc = re->tables + fcc_offset;
01267 compile_block.cbits = re->tables + cbits_offset;
01268 compile_block.ctypes = re->tables + ctypes_offset;
01269 
01270 /* See if we can find a fixed set of initial characters for the pattern. */
01271 
01272 memset(start_bits, 0, 32 * sizeof(uschar));
01273 if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
01274   (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
01275 
01276 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
01277 the latter, which is pointed to by the former, which may also get additional
01278 data set later by the calling program. At the moment, the size of
01279 pcre_study_data is fixed. We nevertheless save it in a field for returning via
01280 the pcre_fullinfo() function so that if it becomes variable in the future, we
01281 don't have to change that code. */
01282 
01283 extra = static_cast<pcre_extra *>(malloc(sizeof(pcre_extra) + sizeof(pcre_study_data)));
01284 
01285 if (extra == NULL)
01286   {
01287   *errorptr = "failed to get memory";
01288   return NULL;
01289   }
01290 
01291 // Hmm.
01292 study = reinterpret_cast<pcre_study_data *>(reinterpret_cast<char*>(extra) + sizeof(pcre_extra));
01293 extra->flags = PCRE_EXTRA_STUDY_DATA;
01294 extra->study_data = study;
01295 
01296 study->size = sizeof(pcre_study_data);
01297 study->options = PCRE_STUDY_MAPPED;
01298 memcpy(study->start_bits, start_bits, sizeof(start_bits));
01299 
01300 return extra;
01301 }
01302 

static const uschar* read_repeat_counts ( const uschar *  p,
int *  minp,
int *  maxp,
const char **  errorptr 
) [static]

Definition at line 1722 of file pcre.cpp.

References ctype_digit, digitab, ERR4, and ERR5.

Referenced by compile_branch(), and pcre_compile().

01722 {
01723 int min = 0;
01724 int max = -1;
01725 
01726 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
01727 
01728 if (*p == '}') max = min; else
01729   {
01730   if (*(++p) != '}')
01731     {
01732     max = 0;
01733     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
01734     if (max < min)
01735       {
01736       *errorptr = ERR4;
01737       return p;
01738       }
01739     }
01740   }
01741 
01742 /* Do paranoid checks, then fill in the required variables, and pass back the
01743 pointer to the terminating '}'. */
01744 
01745 if (min < 0 || 65535 < min ||
01746     max < -1 || 65535 < max)
01747   *errorptr = ERR5;
01748 else
01749   {
01750   *minp = min;
01751   *maxp = max;
01752   }
01753 return p;
01754 }
01755 

static void set_bit ( uschar *  start_bits,
int  c,
bool  caseless,
compile_data *  cd
) [static]

Definition at line 932 of file pcre.cpp.

References ctype_letter, compile_data::ctypes, and compile_data::fcc.

Referenced by set_start_bits().

00932 {
00933 start_bits[c/8] |= (1 << (c&7));
00934 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
00935   start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
00936 }
00937 

static bool set_start_bits ( const uschar *  code,
uschar *  start_bits,
bool  caseless,
bool  utf8,
compile_data *  cd
) [static]

Definition at line 960 of file pcre.cpp.

References cbit_digit, cbit_space, cbit_word, compile_data::cbits, GET, LINK_SIZE, OP_ALT, OP_ASSERT, OP_ASSERT_NOT, OP_ASSERTBACK, OP_ASSERTBACK_NOT, OP_BRA, OP_BRAMINZERO, OP_BRANUMBER, OP_BRAZERO, OP_CALLOUT, OP_CHARS, OP_CLASS, OP_CRMINQUERY, OP_CRMINRANGE, OP_CRMINSTAR, OP_CRQUERY, OP_CRRANGE, OP_CRSTAR, OP_DIGIT, OP_EXACT, OP_MINPLUS, OP_MINQUERY, OP_MINSTAR, OP_MINUPTO, OP_NCLASS, OP_NOT_DIGIT, OP_NOT_WHITESPACE, OP_NOT_WORDCHAR, OP_OPT, OP_PLUS, OP_QUERY, OP_STAR, OP_TYPEEXACT, OP_TYPEMINPLUS, OP_TYPEMINQUERY, OP_TYPEMINSTAR, OP_TYPEMINUPTO, OP_TYPEPLUS, OP_TYPEQUERY, OP_TYPESTAR, OP_TYPEUPTO, OP_UPTO, OP_WHITESPACE, OP_WORDCHAR, PCRE_CASELESS, and set_bit().

Referenced by pcre_study().

00961 {
00962 register int c;
00963 
00964 /* This next statement and the later reference to dummy are here in order to
00965 trick the optimizer of the IBM C compiler for OS/2 into generating correct
00966 code. Apparently IBM isn't going to fix the problem, and we would rather not
00967 disable optimization (in this module it actually makes a big difference, and
00968 the pcre module can use all the optimization it can get). */
00969 
00970 volatile int dummy;
00971 
00972 do
00973   {
00974   const uschar *tcode = code + 1 + LINK_SIZE;
00975   bool try_next = true;
00976 
00977   while (try_next)
00978     {
00979     /* If a branch starts with a bracket or a positive lookahead assertion,
00980     recurse to set bits from within them. That's all for this branch. */
00981 
00982     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
00983       {
00984       if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
00985         return false;
00986       try_next = false;
00987       }
00988 
00989     else switch(*tcode)
00990       {
00991       default:
00992       return false;
00993 
00994       /* Skip over callout */
00995 
00996       case OP_CALLOUT:
00997       tcode += 2;
00998       break;
00999 
01000       /* Skip over extended extraction bracket number */
01001 
01002       case OP_BRANUMBER:
01003       tcode += 3;
01004       break;
01005 
01006       /* Skip over lookbehind and negative lookahead assertions */
01007 
01008       case OP_ASSERT_NOT:
01009       case OP_ASSERTBACK:
01010       case OP_ASSERTBACK_NOT:
01011       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
01012       tcode += 1+LINK_SIZE;
01013       break;
01014 
01015       /* Skip over an option setting, changing the caseless flag */
01016 
01017       case OP_OPT:
01018       caseless = (tcode[1] & PCRE_CASELESS) != 0;
01019       tcode += 2;
01020       break;
01021 
01022       /* BRAZERO does the bracket, but carries on. */
01023 
01024       case OP_BRAZERO:
01025       case OP_BRAMINZERO:
01026       if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
01027         return false;
01028       dummy = 1;
01029       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
01030       tcode += 1+LINK_SIZE;
01031       break;
01032 
01033       /* Single-char * or ? sets the bit and tries the next item */
01034 
01035       case OP_STAR:
01036       case OP_MINSTAR:
01037       case OP_QUERY:
01038       case OP_MINQUERY:
01039       set_bit(start_bits, tcode[1], caseless, cd);
01040       tcode += 2;
01041       break;
01042 
01043       /* Single-char upto sets the bit and tries the next */
01044 
01045       case OP_UPTO:
01046       case OP_MINUPTO:
01047       set_bit(start_bits, tcode[3], caseless, cd);
01048       tcode += 4;
01049       break;
01050 
01051       /* At least one single char sets the bit and stops */
01052 
01053       case OP_EXACT:       /* Fall through */
01054       tcode++;
01055 
01056       case OP_CHARS:       /* Fall through */
01057       tcode++;
01058 
01059       case OP_PLUS:
01060       case OP_MINPLUS:
01061       set_bit(start_bits, tcode[1], caseless, cd);
01062       try_next = false;
01063       break;
01064 
01065       /* Single character type sets the bits and stops */
01066 
01067       case OP_NOT_DIGIT:
01068       for (c = 0; c < 32; c++)
01069         start_bits[c] |= ~cd->cbits[c+cbit_digit];
01070       try_next = false;
01071       break;
01072 
01073       case OP_DIGIT:
01074       for (c = 0; c < 32; c++)
01075         start_bits[c] |= cd->cbits[c+cbit_digit];
01076       try_next = false;
01077       break;
01078 
01079       case OP_NOT_WHITESPACE:
01080       for (c = 0; c < 32; c++)
01081         start_bits[c] |= ~cd->cbits[c+cbit_space];
01082       try_next = false;
01083       break;
01084 
01085       case OP_WHITESPACE:
01086       for (c = 0; c < 32; c++)
01087         start_bits[c] |= cd->cbits[c+cbit_space];
01088       try_next = false;
01089       break;
01090 
01091       case OP_NOT_WORDCHAR:
01092       for (c = 0; c < 32; c++)
01093         start_bits[c] |= ~cd->cbits[c+cbit_word];
01094       try_next = false;
01095       break;
01096 
01097       case OP_WORDCHAR:
01098       for (c = 0; c < 32; c++)
01099         start_bits[c] |= cd->cbits[c+cbit_word];
01100       try_next = false;
01101       break;
01102 
01103       /* One or more character type fudges the pointer and restarts, knowing
01104       it will hit a single character type and stop there. */
01105 
01106       case OP_TYPEPLUS:
01107       case OP_TYPEMINPLUS:
01108       tcode++;
01109       break;
01110 
01111       case OP_TYPEEXACT:
01112       tcode += 3;
01113       break;
01114 
01115       /* Zero or more repeats of character types set the bits and then
01116       try again. */
01117 
01118       case OP_TYPEUPTO:
01119       case OP_TYPEMINUPTO:
01120       tcode += 2;               /* Fall through */
01121 
01122       case OP_TYPESTAR:
01123       case OP_TYPEMINSTAR:
01124       case OP_TYPEQUERY:
01125       case OP_TYPEMINQUERY:
01126       switch(tcode[1])
01127         {
01128         case OP_NOT_DIGIT:
01129         for (c = 0; c < 32; c++)
01130           start_bits[c] |= ~cd->cbits[c+cbit_digit];
01131         break;
01132 
01133         case OP_DIGIT:
01134         for (c = 0; c < 32; c++)
01135           start_bits[c] |= cd->cbits[c+cbit_digit];
01136         break;
01137 
01138         case OP_NOT_WHITESPACE:
01139         for (c = 0; c < 32; c++)
01140           start_bits[c] |= ~cd->cbits[c+cbit_space];
01141         break;
01142 
01143         case OP_WHITESPACE:
01144         for (c = 0; c < 32; c++)
01145           start_bits[c] |= cd->cbits[c+cbit_space];
01146         break;
01147 
01148         case OP_NOT_WORDCHAR:
01149         for (c = 0; c < 32; c++)
01150           start_bits[c] |= ~cd->cbits[c+cbit_word];
01151         break;
01152 
01153         case OP_WORDCHAR:
01154         for (c = 0; c < 32; c++)
01155           start_bits[c] |= cd->cbits[c+cbit_word];
01156         break;
01157         }
01158 
01159       tcode += 2;
01160       break;
01161 
01162       /* Character class where all the information is in a bit map: set the
01163       bits and either carry on or not, according to the repeat count. If it was
01164       a negative class, and we are operating with UTF-8 characters, any byte
01165       with the top-bit set is a potentially valid starter because it may start
01166       a character with a value > 255. (This is sub-optimal in that the
01167       character may be in the range 128-255, and those characters might be
01168       unwanted, but that's as far as we go for the moment.) */
01169 
01170       case OP_NCLASS:
01171       if (utf8) memset(start_bits+16, 0xff, 16);
01172       /* Fall through */
01173 
01174       case OP_CLASS:
01175         {
01176         tcode++;
01177         for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
01178         tcode += 32;
01179         switch (*tcode)
01180           {
01181           case OP_CRSTAR:
01182           case OP_CRMINSTAR:
01183           case OP_CRQUERY:
01184           case OP_CRMINQUERY:
01185           tcode++;
01186           break;
01187 
01188           case OP_CRRANGE:
01189           case OP_CRMINRANGE:
01190           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
01191             else try_next = false;
01192           break;
01193 
01194           default:
01195           try_next = false;
01196           break;
01197           }
01198         }
01199       break; /* End of bitmap class handling */
01200 
01201       }      /* End of switch */
01202     }        /* End of try_next loop */
01203 
01204   code += GET(code, 1);   /* Advance to next branch */
01205   }
01206 while (*code == OP_ALT);
01207 return true;
01208 }
01209 


Variable Documentation

const unsigned char digitab[] [static]

Definition at line 1415 of file pcre.cpp.

Referenced by check_escape(), compile_branch(), is_counted_repeat(), pcre_compile(), and read_repeat_counts().

const short int escapes[] [static]

Initial value:

 {
    0,      0,      0,      0,      0,      0,      0,      0,   
    0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   
  '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   
    0,      0,      0,      0,      0,      0,      0,      0,   
    0, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   
    0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   
  '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   
    0,      0,      0,      0,      0,      0,  ESC_n,      0,   
    0,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   
    0,      0, -ESC_z                                            
}

Definition at line 1353 of file pcre.cpp.

Referenced by check_escape().

const uschar OP_lengths[] = { OP_LENGTHS } [static]

Definition at line 1341 of file pcre.cpp.

Referenced by could_be_empty_branch(), find_bracket(), find_fixedlength(), find_recurse(), and first_significant_code().

int (*pcre_callout)(pcre_callout_block *) = NULL

Definition at line 1489 of file pcre.cpp.

Referenced by match().

unsigned char pcre_default_tables[] [static]

Definition at line 611 of file pcre.cpp.

Referenced by pcre_compile().

const int posix_class_maps[] [static]

Initial value:

 {
  cbit_lower, cbit_upper, -1,             
  cbit_lower, -1,         -1,             
  cbit_upper, -1,         -1,             
  cbit_digit, cbit_lower, cbit_upper,     
  cbit_print, cbit_cntrl, -1,             
  cbit_space, -1,         -1,             
  cbit_cntrl, -1,         -1,             
  cbit_digit, -1,         -1,             
  cbit_graph, -1,         -1,             
  cbit_print, -1,         -1,             
  cbit_punct, -1,         -1,             
  cbit_space, -1,         -1,             
  cbit_word,  -1,         -1,             
  cbit_xdigit,-1,         -1              
}

Definition at line 1382 of file pcre.cpp.

Referenced by compile_branch().

const uschar posix_name_lengths[] [static]

Initial value:

 {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }

Definition at line 1375 of file pcre.cpp.

Referenced by check_posix_name().

const char* const posix_names[] [static]

Initial value:

 {
  "alpha", "lower", "upper",
  "alnum", "ascii", "blank", "cntrl", "digit", "graph",
  "print", "punct", "space", "word",  "xdigit" }

Definition at line 1370 of file pcre.cpp.

Referenced by check_posix_name().

const char rep_max[] = { 0, 0, 0, 0, 1, 1 } [static]

Definition at line 1346 of file pcre.cpp.

Referenced by match().

const char rep_min[] = { 0, 0, 1, 1, 0, 0 } [static]

Definition at line 1345 of file pcre.cpp.

Referenced by match().


Generated on Mon May 28 04:40:21 2007 for MUX by  doxygen 1.4.7